xref: /petsc/src/mat/impls/aij/seq/matmatmult.c (revision 49abdd8a111d9c2ef7fc48ade253ef64e07f9b37)
1d50806bdSBarry Smith /*
22499ec78SHong Zhang   Defines matrix-matrix product routines for pairs of SeqAIJ matrices
3d50806bdSBarry Smith           C = A * B
4d50806bdSBarry Smith */
5d50806bdSBarry Smith 
6c6db04a5SJed Brown #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
7c6db04a5SJed Brown #include <../src/mat/utils/freespace.h>
8c6db04a5SJed Brown #include <petscbt.h>
9af0996ceSBarry Smith #include <petsc/private/isimpl.h>
1007475bc1SBarry Smith #include <../src/mat/impls/dense/seq/dense.h>
117bab7c10SHong Zhang 
12d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C)
13d71ae5a4SJacob Faibussowitsch {
14df97dc6dSFande Kong   PetscFunctionBegin;
15dbbe0bcdSBarry Smith   if (C->ops->matmultnumeric) PetscCall((*C->ops->matmultnumeric)(A, B, C));
16dbbe0bcdSBarry Smith   else PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A, B, C));
173ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
18df97dc6dSFande Kong }
19df97dc6dSFande Kong 
204222ddf1SHong Zhang /* Modified from MatCreateSeqAIJWithArrays() */
21d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt i[], PetscInt j[], PetscScalar a[], MatType mtype, Mat mat)
22d71ae5a4SJacob Faibussowitsch {
234222ddf1SHong Zhang   PetscInt    ii;
244222ddf1SHong Zhang   Mat_SeqAIJ *aij;
259f0612e4SBarry Smith   PetscBool   isseqaij, ofree_a, ofree_ij;
265c66b693SKris Buschelman 
275c66b693SKris Buschelman   PetscFunctionBegin;
28aed4548fSBarry Smith   PetscCheck(m <= 0 || !i[0], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0");
299566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(mat, m, n, m, n));
304222ddf1SHong Zhang 
31e4e71118SStefano Zampini   if (!mtype) {
329566063dSJacob Faibussowitsch     PetscCall(PetscObjectBaseTypeCompare((PetscObject)mat, MATSEQAIJ, &isseqaij));
339566063dSJacob Faibussowitsch     if (!isseqaij) PetscCall(MatSetType(mat, MATSEQAIJ));
34e4e71118SStefano Zampini   } else {
359566063dSJacob Faibussowitsch     PetscCall(MatSetType(mat, mtype));
36e4e71118SStefano Zampini   }
37cbc6b225SStefano Zampini 
3857508eceSPierre Jolivet   aij      = (Mat_SeqAIJ *)mat->data;
39cbc6b225SStefano Zampini   ofree_a  = aij->free_a;
40cbc6b225SStefano Zampini   ofree_ij = aij->free_ij;
41cbc6b225SStefano Zampini   /* changes the free flags */
429566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(mat, MAT_SKIP_ALLOCATION, NULL));
43cbc6b225SStefano Zampini 
449566063dSJacob Faibussowitsch   PetscCall(PetscFree(aij->ilen));
459566063dSJacob Faibussowitsch   PetscCall(PetscFree(aij->imax));
469566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &aij->imax));
479566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &aij->ilen));
48cbc6b225SStefano Zampini   for (ii = 0, aij->nonzerorowcnt = 0, aij->rmax = 0; ii < m; ii++) {
49cbc6b225SStefano Zampini     const PetscInt rnz = i[ii + 1] - i[ii];
50cbc6b225SStefano Zampini     aij->nonzerorowcnt += !!rnz;
51cbc6b225SStefano Zampini     aij->rmax     = PetscMax(aij->rmax, rnz);
52cbc6b225SStefano Zampini     aij->ilen[ii] = aij->imax[ii] = i[ii + 1] - i[ii];
53cbc6b225SStefano Zampini   }
54cbc6b225SStefano Zampini   aij->maxnz = i[m];
55cbc6b225SStefano Zampini   aij->nz    = i[m];
564222ddf1SHong Zhang 
579f0612e4SBarry Smith   if (ofree_a) PetscCall(PetscShmgetDeallocateArray((void **)&aij->a));
589f0612e4SBarry Smith   if (ofree_ij) PetscCall(PetscShmgetDeallocateArray((void **)&aij->j));
599f0612e4SBarry Smith   if (ofree_ij) PetscCall(PetscShmgetDeallocateArray((void **)&aij->i));
609f0612e4SBarry Smith 
614222ddf1SHong Zhang   aij->i       = i;
624222ddf1SHong Zhang   aij->j       = j;
634222ddf1SHong Zhang   aij->a       = a;
644222ddf1SHong Zhang   aij->nonew   = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */
654222ddf1SHong Zhang   aij->free_a  = PETSC_FALSE;
664222ddf1SHong Zhang   aij->free_ij = PETSC_FALSE;
679566063dSJacob Faibussowitsch   PetscCall(MatCheckCompressedRow(mat, aij->nonzerorowcnt, &aij->compressedrow, aij->i, m, 0.6));
686dc08606SJunchao Zhang   // Always build the diag info when i, j are set
696dc08606SJunchao Zhang   PetscCall(MatMarkDiagonal_SeqAIJ(mat));
703ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
715c66b693SKris Buschelman }
721c24bd37SHong Zhang 
73d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C)
74d71ae5a4SJacob Faibussowitsch {
754222ddf1SHong Zhang   Mat_Product        *product = C->product;
764222ddf1SHong Zhang   MatProductAlgorithm alg;
774222ddf1SHong Zhang   PetscBool           flg;
784222ddf1SHong Zhang 
794222ddf1SHong Zhang   PetscFunctionBegin;
804222ddf1SHong Zhang   if (product) {
814222ddf1SHong Zhang     alg = product->alg;
824222ddf1SHong Zhang   } else {
834222ddf1SHong Zhang     alg = "sorted";
844222ddf1SHong Zhang   }
854222ddf1SHong Zhang   /* sorted */
869566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "sorted", &flg));
874222ddf1SHong Zhang   if (flg) {
889566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A, B, fill, C));
893ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
904222ddf1SHong Zhang   }
914222ddf1SHong Zhang 
924222ddf1SHong Zhang   /* scalable */
939566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "scalable", &flg));
944222ddf1SHong Zhang   if (flg) {
959566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A, B, fill, C));
963ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
974222ddf1SHong Zhang   }
984222ddf1SHong Zhang 
994222ddf1SHong Zhang   /* scalable_fast */
1009566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "scalable_fast", &flg));
1014222ddf1SHong Zhang   if (flg) {
1029566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A, B, fill, C));
1033ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
1044222ddf1SHong Zhang   }
1054222ddf1SHong Zhang 
1064222ddf1SHong Zhang   /* heap */
1079566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "heap", &flg));
1084222ddf1SHong Zhang   if (flg) {
1099566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A, B, fill, C));
1103ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
1114222ddf1SHong Zhang   }
1124222ddf1SHong Zhang 
1134222ddf1SHong Zhang   /* btheap */
1149566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "btheap", &flg));
1154222ddf1SHong Zhang   if (flg) {
1169566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A, B, fill, C));
1173ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
1184222ddf1SHong Zhang   }
1194222ddf1SHong Zhang 
1204222ddf1SHong Zhang   /* llcondensed */
1219566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "llcondensed", &flg));
1224222ddf1SHong Zhang   if (flg) {
1239566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A, B, fill, C));
1243ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
1254222ddf1SHong Zhang   }
1264222ddf1SHong Zhang 
1274222ddf1SHong Zhang   /* rowmerge */
1289566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "rowmerge", &flg));
1294222ddf1SHong Zhang   if (flg) {
1309566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A, B, fill, C));
1313ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
1324222ddf1SHong Zhang   }
1334222ddf1SHong Zhang 
1344222ddf1SHong Zhang #if defined(PETSC_HAVE_HYPRE)
1359566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "hypre", &flg));
1364222ddf1SHong Zhang   if (flg) {
1379566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A, B, fill, C));
1383ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
1394222ddf1SHong Zhang   }
1404222ddf1SHong Zhang #endif
1414222ddf1SHong Zhang 
1424222ddf1SHong Zhang   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported");
1434222ddf1SHong Zhang }
1444222ddf1SHong Zhang 
145d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A, Mat B, PetscReal fill, Mat C)
146d71ae5a4SJacob Faibussowitsch {
147b561aa9dSHong Zhang   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
148971236abSHong Zhang   PetscInt          *ai = a->i, *bi = b->i, *ci, *cj;
149c1ba5575SJed Brown   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
150b561aa9dSHong Zhang   PetscReal          afill;
151eca6b86aSHong Zhang   PetscInt           i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax;
152eec179cfSJacob Faibussowitsch   PetscHMapI         ta;
153fb908031SHong Zhang   PetscBT            lnkbt;
1540298fd71SBarry Smith   PetscFreeSpaceList free_space = NULL, current_space = NULL;
155b561aa9dSHong Zhang 
156b561aa9dSHong Zhang   PetscFunctionBegin;
157bd958071SHong Zhang   /* Get ci and cj */
158fb908031SHong Zhang   /* Allocate ci array, arrays for fill computation and */
159fb908031SHong Zhang   /* free space for accumulating nonzero column info */
1609566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 2, &ci));
161fb908031SHong Zhang   ci[0] = 0;
162fb908031SHong Zhang 
163fb908031SHong Zhang   /* create and initialize a linked list */
164eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapICreateWithSize(bn, &ta));
165c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b, bm, ta);
166eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIGetSize(ta, &Crmax));
167eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIDestroy(&ta));
168eca6b86aSHong Zhang 
1699566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedCreate(Crmax, bn, &lnk, &lnkbt));
170fb908031SHong Zhang 
171fb908031SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
1729566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
1732205254eSKarl Rupp 
174fb908031SHong Zhang   current_space = free_space;
175fb908031SHong Zhang 
176fb908031SHong Zhang   /* Determine ci and cj */
177fb908031SHong Zhang   for (i = 0; i < am; i++) {
178fb908031SHong Zhang     anzi = ai[i + 1] - ai[i];
179fb908031SHong Zhang     aj   = a->j + ai[i];
180fb908031SHong Zhang     for (j = 0; j < anzi; j++) {
181fb908031SHong Zhang       brow = aj[j];
182fb908031SHong Zhang       bnzj = bi[brow + 1] - bi[brow];
183fb908031SHong Zhang       bj   = b->j + bi[brow];
184fb908031SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
1859566063dSJacob Faibussowitsch       PetscCall(PetscLLCondensedAddSorted(bnzj, bj, lnk, lnkbt));
186fb908031SHong Zhang     }
1878c78258cSHong Zhang     /* add possible missing diagonal entry */
18848a46eb9SPierre Jolivet     if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted(1, &i, lnk, lnkbt));
189fb908031SHong Zhang     cnzi = lnk[0];
190fb908031SHong Zhang 
191fb908031SHong Zhang     /* If free space is not available, make more free space */
192fb908031SHong Zhang     /* Double the amount of total space in the list */
193fb908031SHong Zhang     if (current_space->local_remaining < cnzi) {
1949566063dSJacob Faibussowitsch       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), &current_space));
195fb908031SHong Zhang       ndouble++;
196fb908031SHong Zhang     }
197fb908031SHong Zhang 
198fb908031SHong Zhang     /* Copy data into free space, then initialize lnk */
1999566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedClean(bn, cnzi, current_space->array, lnk, lnkbt));
2002205254eSKarl Rupp 
201fb908031SHong Zhang     current_space->array += cnzi;
202fb908031SHong Zhang     current_space->local_used += cnzi;
203fb908031SHong Zhang     current_space->local_remaining -= cnzi;
2042205254eSKarl Rupp 
205fb908031SHong Zhang     ci[i + 1] = ci[i] + cnzi;
206fb908031SHong Zhang   }
207fb908031SHong Zhang 
208fb908031SHong Zhang   /* Column indices are in the list of free space */
209fb908031SHong Zhang   /* Allocate space for cj, initialize cj, and */
210fb908031SHong Zhang   /* destroy list of free space and other temporary array(s) */
2119566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ci[am] + 1, &cj));
2129566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
2139566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedDestroy(lnk, lnkbt));
214b561aa9dSHong Zhang 
21526be0446SHong Zhang   /* put together the new symbolic matrix */
2169566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
2179566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
21858c24d83SHong Zhang 
21958c24d83SHong Zhang   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
22058c24d83SHong Zhang   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
221f4f49eeaSPierre Jolivet   c          = (Mat_SeqAIJ *)C->data;
222aa1aec99SHong Zhang   c->free_a  = PETSC_FALSE;
223e6b907acSBarry Smith   c->free_ij = PETSC_TRUE;
22458c24d83SHong Zhang   c->nonew   = 0;
2254222ddf1SHong Zhang 
2264222ddf1SHong Zhang   /* fast, needs non-scalable O(bn) array 'abdense' */
2274222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
2280b7e3e3dSHong Zhang 
2297212da7cSHong Zhang   /* set MatInfo */
2307212da7cSHong Zhang   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
231f2b054eeSHong Zhang   if (afill < 1.0) afill = 1.0;
2324222ddf1SHong Zhang   C->info.mallocs           = ndouble;
2334222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
2344222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
2357212da7cSHong Zhang 
2367212da7cSHong Zhang #if defined(PETSC_USE_INFO)
2377212da7cSHong Zhang   if (ci[am]) {
2389566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
2399566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
240f2b054eeSHong Zhang   } else {
2419566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
242be0fcf8dSHong Zhang   }
243f2b054eeSHong Zhang #endif
2443ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
24558c24d83SHong Zhang }
246d50806bdSBarry Smith 
247d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, Mat C)
248d71ae5a4SJacob Faibussowitsch {
249d13dce4bSSatish Balay   PetscLogDouble     flops = 0.0;
250d50806bdSBarry Smith   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
251d50806bdSBarry Smith   Mat_SeqAIJ        *b     = (Mat_SeqAIJ *)B->data;
252d50806bdSBarry Smith   Mat_SeqAIJ        *c     = (Mat_SeqAIJ *)C->data;
25338baddfdSBarry Smith   PetscInt          *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j;
254c58ca2e3SHong Zhang   PetscInt           am = A->rmap->n, cm = C->rmap->n;
255519eb980SHong Zhang   PetscInt           i, j, k, anzi, bnzi, cnzi, brow;
2562e5835c6SStefano Zampini   PetscScalar       *ca, valtmp;
257aa1aec99SHong Zhang   PetscScalar       *ab_dense;
2586718818eSStefano Zampini   PetscContainer     cab_dense;
2592e5835c6SStefano Zampini   const PetscScalar *aa, *ba, *baj;
260d50806bdSBarry Smith 
261d50806bdSBarry Smith   PetscFunctionBegin;
2629566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJGetArrayRead(A, &aa));
2639566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJGetArrayRead(B, &ba));
264aa1aec99SHong Zhang   if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
2659566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(ci[cm] + 1, &ca));
266aa1aec99SHong Zhang     c->a      = ca;
267aa1aec99SHong Zhang     c->free_a = PETSC_TRUE;
2686718818eSStefano Zampini   } else ca = c->a;
2696718818eSStefano Zampini 
2706718818eSStefano Zampini   /* TODO this should be done in the symbolic phase */
2716718818eSStefano Zampini   /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers
2726718818eSStefano Zampini      that is hard to eradicate) */
2739566063dSJacob Faibussowitsch   PetscCall(PetscObjectQuery((PetscObject)C, "__PETSc__ab_dense", (PetscObject *)&cab_dense));
2746718818eSStefano Zampini   if (!cab_dense) {
2759566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(B->cmap->N, &ab_dense));
276*49abdd8aSBarry Smith     PetscCall(PetscObjectContainerCompose((PetscObject)C, "__PETSc__ab_dense", ab_dense, PetscCtxDestroyDefault));
27703e76207SPierre Jolivet   } else PetscCall(PetscContainerGetPointer(cab_dense, (void **)&ab_dense));
2789566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(ab_dense, B->cmap->N));
279aa1aec99SHong Zhang 
280c124e916SHong Zhang   /* clean old values in C */
2819566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(ca, ci[cm]));
282d50806bdSBarry Smith   /* Traverse A row-wise. */
283d50806bdSBarry Smith   /* Build the ith row in C by summing over nonzero columns in A, */
284d50806bdSBarry Smith   /* the rows of B corresponding to nonzeros of A. */
285d50806bdSBarry Smith   for (i = 0; i < am; i++) {
286d50806bdSBarry Smith     anzi = ai[i + 1] - ai[i];
287d50806bdSBarry Smith     for (j = 0; j < anzi; j++) {
288519eb980SHong Zhang       brow = aj[j];
289d50806bdSBarry Smith       bnzi = bi[brow + 1] - bi[brow];
2903b51c7ffSStefano Zampini       bjj  = PetscSafePointerPlusOffset(bj, bi[brow]);
2913b51c7ffSStefano Zampini       baj  = PetscSafePointerPlusOffset(ba, bi[brow]);
292519eb980SHong Zhang       /* perform dense axpy */
29336ec6d2dSHong Zhang       valtmp = aa[j];
294ad540459SPierre Jolivet       for (k = 0; k < bnzi; k++) ab_dense[bjj[k]] += valtmp * baj[k];
295519eb980SHong Zhang       flops += 2 * bnzi;
296519eb980SHong Zhang     }
2978e3a54c0SPierre Jolivet     aj = PetscSafePointerPlusOffset(aj, anzi);
2988e3a54c0SPierre Jolivet     aa = PetscSafePointerPlusOffset(aa, anzi);
299c58ca2e3SHong Zhang 
300c58ca2e3SHong Zhang     cnzi = ci[i + 1] - ci[i];
301519eb980SHong Zhang     for (k = 0; k < cnzi; k++) {
302519eb980SHong Zhang       ca[k] += ab_dense[cj[k]];
303519eb980SHong Zhang       ab_dense[cj[k]] = 0.0; /* zero ab_dense */
304519eb980SHong Zhang     }
305519eb980SHong Zhang     flops += cnzi;
3068e3a54c0SPierre Jolivet     cj = PetscSafePointerPlusOffset(cj, cnzi);
3079371c9d4SSatish Balay     ca += cnzi;
308519eb980SHong Zhang   }
3092e5835c6SStefano Zampini #if defined(PETSC_HAVE_DEVICE)
3102e5835c6SStefano Zampini   if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
3112e5835c6SStefano Zampini #endif
3129566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
3139566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
3149566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(flops));
3159566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJRestoreArrayRead(A, &aa));
3169566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJRestoreArrayRead(B, &ba));
3173ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
318c58ca2e3SHong Zhang }
319c58ca2e3SHong Zhang 
320d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, Mat C)
321d71ae5a4SJacob Faibussowitsch {
322c58ca2e3SHong Zhang   PetscLogDouble     flops = 0.0;
323c58ca2e3SHong Zhang   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
324c58ca2e3SHong Zhang   Mat_SeqAIJ        *b     = (Mat_SeqAIJ *)B->data;
325c58ca2e3SHong Zhang   Mat_SeqAIJ        *c     = (Mat_SeqAIJ *)C->data;
326c58ca2e3SHong Zhang   PetscInt          *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j;
327c58ca2e3SHong Zhang   PetscInt           am = A->rmap->N, cm = C->rmap->N;
328c58ca2e3SHong Zhang   PetscInt           i, j, k, anzi, bnzi, cnzi, brow;
3292e5835c6SStefano Zampini   PetscScalar       *ca = c->a, valtmp;
3302e5835c6SStefano Zampini   const PetscScalar *aa, *ba, *baj;
331c58ca2e3SHong Zhang   PetscInt           nextb;
332c58ca2e3SHong Zhang 
333c58ca2e3SHong Zhang   PetscFunctionBegin;
3349566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJGetArrayRead(A, &aa));
3359566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJGetArrayRead(B, &ba));
336cf742fccSHong Zhang   if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
3379566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(ci[cm] + 1, &ca));
338cf742fccSHong Zhang     c->a      = ca;
339cf742fccSHong Zhang     c->free_a = PETSC_TRUE;
340cf742fccSHong Zhang   }
341cf742fccSHong Zhang 
342c58ca2e3SHong Zhang   /* clean old values in C */
3439566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(ca, ci[cm]));
344c58ca2e3SHong Zhang   /* Traverse A row-wise. */
345c58ca2e3SHong Zhang   /* Build the ith row in C by summing over nonzero columns in A, */
346c58ca2e3SHong Zhang   /* the rows of B corresponding to nonzeros of A. */
347519eb980SHong Zhang   for (i = 0; i < am; i++) {
348519eb980SHong Zhang     anzi = ai[i + 1] - ai[i];
349519eb980SHong Zhang     cnzi = ci[i + 1] - ci[i];
350519eb980SHong Zhang     for (j = 0; j < anzi; j++) {
351519eb980SHong Zhang       brow = aj[j];
352519eb980SHong Zhang       bnzi = bi[brow + 1] - bi[brow];
353519eb980SHong Zhang       bjj  = bj + bi[brow];
354519eb980SHong Zhang       baj  = ba + bi[brow];
355519eb980SHong Zhang       /* perform sparse axpy */
35636ec6d2dSHong Zhang       valtmp = aa[j];
357c124e916SHong Zhang       nextb  = 0;
358c124e916SHong Zhang       for (k = 0; nextb < bnzi; k++) {
359c124e916SHong Zhang         if (cj[k] == bjj[nextb]) { /* ccol == bcol */
36036ec6d2dSHong Zhang           ca[k] += valtmp * baj[nextb++];
361c124e916SHong Zhang         }
362d50806bdSBarry Smith       }
363d50806bdSBarry Smith       flops += 2 * bnzi;
364d50806bdSBarry Smith     }
3659371c9d4SSatish Balay     aj += anzi;
3669371c9d4SSatish Balay     aa += anzi;
3679371c9d4SSatish Balay     cj += cnzi;
3689371c9d4SSatish Balay     ca += cnzi;
369d50806bdSBarry Smith   }
3702e5835c6SStefano Zampini #if defined(PETSC_HAVE_DEVICE)
3712e5835c6SStefano Zampini   if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
3722e5835c6SStefano Zampini #endif
3739566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
3749566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
3759566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(flops));
3769566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJRestoreArrayRead(A, &aa));
3779566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJRestoreArrayRead(B, &ba));
3783ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
379d50806bdSBarry Smith }
380bc011b1eSHong Zhang 
381d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A, Mat B, PetscReal fill, Mat C)
382d71ae5a4SJacob Faibussowitsch {
38325296bd5SBarry Smith   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
38425296bd5SBarry Smith   PetscInt          *ai = a->i, *bi = b->i, *ci, *cj;
3853c50cad2SHong Zhang   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
38625296bd5SBarry Smith   MatScalar         *ca;
38725296bd5SBarry Smith   PetscReal          afill;
388eca6b86aSHong Zhang   PetscInt           i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax;
389eec179cfSJacob Faibussowitsch   PetscHMapI         ta;
3900298fd71SBarry Smith   PetscFreeSpaceList free_space = NULL, current_space = NULL;
39125296bd5SBarry Smith 
39225296bd5SBarry Smith   PetscFunctionBegin;
3933c50cad2SHong Zhang   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
3943c50cad2SHong Zhang   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
3959566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 2, &ci));
3963c50cad2SHong Zhang   ci[0] = 0;
3973c50cad2SHong Zhang 
3983c50cad2SHong Zhang   /* create and initialize a linked list */
399eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapICreateWithSize(bn, &ta));
400c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b, bm, ta);
401eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIGetSize(ta, &Crmax));
402eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIDestroy(&ta));
403eca6b86aSHong Zhang 
4049566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedCreate_fast(Crmax, &lnk));
4053c50cad2SHong Zhang 
4063c50cad2SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
4079566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
4083c50cad2SHong Zhang   current_space = free_space;
4093c50cad2SHong Zhang 
4103c50cad2SHong Zhang   /* Determine ci and cj */
4113c50cad2SHong Zhang   for (i = 0; i < am; i++) {
4123c50cad2SHong Zhang     anzi = ai[i + 1] - ai[i];
4133c50cad2SHong Zhang     aj   = a->j + ai[i];
4143c50cad2SHong Zhang     for (j = 0; j < anzi; j++) {
4153c50cad2SHong Zhang       brow = aj[j];
4163c50cad2SHong Zhang       bnzj = bi[brow + 1] - bi[brow];
4173c50cad2SHong Zhang       bj   = b->j + bi[brow];
4183c50cad2SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
4199566063dSJacob Faibussowitsch       PetscCall(PetscLLCondensedAddSorted_fast(bnzj, bj, lnk));
4203c50cad2SHong Zhang     }
4218c78258cSHong Zhang     /* add possible missing diagonal entry */
42248a46eb9SPierre Jolivet     if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_fast(1, &i, lnk));
4233c50cad2SHong Zhang     cnzi = lnk[1];
4243c50cad2SHong Zhang 
4253c50cad2SHong Zhang     /* If free space is not available, make more free space */
4263c50cad2SHong Zhang     /* Double the amount of total space in the list */
4273c50cad2SHong Zhang     if (current_space->local_remaining < cnzi) {
4289566063dSJacob Faibussowitsch       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), &current_space));
4293c50cad2SHong Zhang       ndouble++;
4303c50cad2SHong Zhang     }
4313c50cad2SHong Zhang 
4323c50cad2SHong Zhang     /* Copy data into free space, then initialize lnk */
4339566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedClean_fast(cnzi, current_space->array, lnk));
4342205254eSKarl Rupp 
4353c50cad2SHong Zhang     current_space->array += cnzi;
4363c50cad2SHong Zhang     current_space->local_used += cnzi;
4373c50cad2SHong Zhang     current_space->local_remaining -= cnzi;
4382205254eSKarl Rupp 
4393c50cad2SHong Zhang     ci[i + 1] = ci[i] + cnzi;
4403c50cad2SHong Zhang   }
4413c50cad2SHong Zhang 
4423c50cad2SHong Zhang   /* Column indices are in the list of free space */
4433c50cad2SHong Zhang   /* Allocate space for cj, initialize cj, and */
4443c50cad2SHong Zhang   /* destroy list of free space and other temporary array(s) */
4459566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ci[am] + 1, &cj));
4469566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
4479566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedDestroy_fast(lnk));
44825296bd5SBarry Smith 
44925296bd5SBarry Smith   /* Allocate space for ca */
4509566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(ci[am] + 1, &ca));
45125296bd5SBarry Smith 
45225296bd5SBarry Smith   /* put together the new symbolic matrix */
4539566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C));
4549566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
45525296bd5SBarry Smith 
45625296bd5SBarry Smith   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
45725296bd5SBarry Smith   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
458f4f49eeaSPierre Jolivet   c          = (Mat_SeqAIJ *)C->data;
45925296bd5SBarry Smith   c->free_a  = PETSC_TRUE;
46025296bd5SBarry Smith   c->free_ij = PETSC_TRUE;
46125296bd5SBarry Smith   c->nonew   = 0;
4622205254eSKarl Rupp 
4634222ddf1SHong Zhang   /* slower, less memory */
4644222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
46525296bd5SBarry Smith 
46625296bd5SBarry Smith   /* set MatInfo */
46725296bd5SBarry Smith   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
46825296bd5SBarry Smith   if (afill < 1.0) afill = 1.0;
4694222ddf1SHong Zhang   C->info.mallocs           = ndouble;
4704222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
4714222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
47225296bd5SBarry Smith 
47325296bd5SBarry Smith #if defined(PETSC_USE_INFO)
47425296bd5SBarry Smith   if (ci[am]) {
4759566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
4769566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
47725296bd5SBarry Smith   } else {
4789566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
47925296bd5SBarry Smith   }
48025296bd5SBarry Smith #endif
4813ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
48225296bd5SBarry Smith }
48325296bd5SBarry Smith 
484d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, PetscReal fill, Mat C)
485d71ae5a4SJacob Faibussowitsch {
486e9e4536cSHong Zhang   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
487bf9555e6SHong Zhang   PetscInt          *ai = a->i, *bi = b->i, *ci, *cj;
48825c41797SHong Zhang   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
489e9e4536cSHong Zhang   MatScalar         *ca;
490e9e4536cSHong Zhang   PetscReal          afill;
491eca6b86aSHong Zhang   PetscInt           i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax;
492eec179cfSJacob Faibussowitsch   PetscHMapI         ta;
4930298fd71SBarry Smith   PetscFreeSpaceList free_space = NULL, current_space = NULL;
494e9e4536cSHong Zhang 
495e9e4536cSHong Zhang   PetscFunctionBegin;
496bd958071SHong Zhang   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
497bd958071SHong Zhang   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
4989566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 2, &ci));
499bd958071SHong Zhang   ci[0] = 0;
500bd958071SHong Zhang 
501bd958071SHong Zhang   /* create and initialize a linked list */
502eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapICreateWithSize(bn, &ta));
503c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b, bm, ta);
504eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIGetSize(ta, &Crmax));
505eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIDestroy(&ta));
5069566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk));
507bd958071SHong Zhang 
508bd958071SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
5099566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
510bd958071SHong Zhang   current_space = free_space;
511bd958071SHong Zhang 
512bd958071SHong Zhang   /* Determine ci and cj */
513bd958071SHong Zhang   for (i = 0; i < am; i++) {
514bd958071SHong Zhang     anzi = ai[i + 1] - ai[i];
515bd958071SHong Zhang     aj   = a->j + ai[i];
516bd958071SHong Zhang     for (j = 0; j < anzi; j++) {
517bd958071SHong Zhang       brow = aj[j];
518bd958071SHong Zhang       bnzj = bi[brow + 1] - bi[brow];
519bd958071SHong Zhang       bj   = b->j + bi[brow];
520bd958071SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
5219566063dSJacob Faibussowitsch       PetscCall(PetscLLCondensedAddSorted_Scalable(bnzj, bj, lnk));
522bd958071SHong Zhang     }
5238c78258cSHong Zhang     /* add possible missing diagonal entry */
52448a46eb9SPierre Jolivet     if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_Scalable(1, &i, lnk));
5258c78258cSHong Zhang 
526bd958071SHong Zhang     cnzi = lnk[0];
527bd958071SHong Zhang 
528bd958071SHong Zhang     /* If free space is not available, make more free space */
529bd958071SHong Zhang     /* Double the amount of total space in the list */
530bd958071SHong Zhang     if (current_space->local_remaining < cnzi) {
5319566063dSJacob Faibussowitsch       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), &current_space));
532bd958071SHong Zhang       ndouble++;
533bd958071SHong Zhang     }
534bd958071SHong Zhang 
535bd958071SHong Zhang     /* Copy data into free space, then initialize lnk */
5369566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedClean_Scalable(cnzi, current_space->array, lnk));
5372205254eSKarl Rupp 
538bd958071SHong Zhang     current_space->array += cnzi;
539bd958071SHong Zhang     current_space->local_used += cnzi;
540bd958071SHong Zhang     current_space->local_remaining -= cnzi;
5412205254eSKarl Rupp 
542bd958071SHong Zhang     ci[i + 1] = ci[i] + cnzi;
543bd958071SHong Zhang   }
544bd958071SHong Zhang 
545bd958071SHong Zhang   /* Column indices are in the list of free space */
546bd958071SHong Zhang   /* Allocate space for cj, initialize cj, and */
547bd958071SHong Zhang   /* destroy list of free space and other temporary array(s) */
5489566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ci[am] + 1, &cj));
5499566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
5509566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
551e9e4536cSHong Zhang 
552e9e4536cSHong Zhang   /* Allocate space for ca */
5539566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(ci[am] + 1, &ca));
554e9e4536cSHong Zhang 
555e9e4536cSHong Zhang   /* put together the new symbolic matrix */
5569566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C));
5579566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
558e9e4536cSHong Zhang 
559e9e4536cSHong Zhang   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
560e9e4536cSHong Zhang   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
561f4f49eeaSPierre Jolivet   c          = (Mat_SeqAIJ *)C->data;
562e9e4536cSHong Zhang   c->free_a  = PETSC_TRUE;
563e9e4536cSHong Zhang   c->free_ij = PETSC_TRUE;
564e9e4536cSHong Zhang   c->nonew   = 0;
5652205254eSKarl Rupp 
5664222ddf1SHong Zhang   /* slower, less memory */
5674222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
568e9e4536cSHong Zhang 
569e9e4536cSHong Zhang   /* set MatInfo */
570e9e4536cSHong Zhang   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
571e9e4536cSHong Zhang   if (afill < 1.0) afill = 1.0;
5724222ddf1SHong Zhang   C->info.mallocs           = ndouble;
5734222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
5744222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
575e9e4536cSHong Zhang 
576e9e4536cSHong Zhang #if defined(PETSC_USE_INFO)
577e9e4536cSHong Zhang   if (ci[am]) {
5789566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
5799566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
580e9e4536cSHong Zhang   } else {
5819566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
582e9e4536cSHong Zhang   }
583e9e4536cSHong Zhang #endif
5843ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
585e9e4536cSHong Zhang }
586e9e4536cSHong Zhang 
587d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A, Mat B, PetscReal fill, Mat C)
588d71ae5a4SJacob Faibussowitsch {
5890ced3a2bSJed Brown   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
5900ced3a2bSJed Brown   const PetscInt    *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j;
5910ced3a2bSJed Brown   PetscInt          *ci, *cj, *bb;
5920ced3a2bSJed Brown   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
5930ced3a2bSJed Brown   PetscReal          afill;
5940ced3a2bSJed Brown   PetscInt           i, j, col, ndouble = 0;
5950298fd71SBarry Smith   PetscFreeSpaceList free_space = NULL, current_space = NULL;
5960ced3a2bSJed Brown   PetscHeap          h;
5970ced3a2bSJed Brown 
5980ced3a2bSJed Brown   PetscFunctionBegin;
599cd093f1dSJed Brown   /* Get ci and cj - by merging sorted rows using a heap */
6000ced3a2bSJed Brown   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
6019566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 2, &ci));
6020ced3a2bSJed Brown   ci[0] = 0;
6030ced3a2bSJed Brown 
6040ced3a2bSJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
6059566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
6060ced3a2bSJed Brown   current_space = free_space;
6070ced3a2bSJed Brown 
6089566063dSJacob Faibussowitsch   PetscCall(PetscHeapCreate(a->rmax, &h));
6099566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(a->rmax, &bb));
6100ced3a2bSJed Brown 
6110ced3a2bSJed Brown   /* Determine ci and cj */
6120ced3a2bSJed Brown   for (i = 0; i < am; i++) {
6130ced3a2bSJed Brown     const PetscInt  anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
6140ced3a2bSJed Brown     const PetscInt *acol = aj + ai[i];        /* column indices of nonzero entries in this row */
6150ced3a2bSJed Brown     ci[i + 1]            = ci[i];
6160ced3a2bSJed Brown     /* Populate the min heap */
6170ced3a2bSJed Brown     for (j = 0; j < anzi; j++) {
6180ced3a2bSJed Brown       bb[j] = bi[acol[j]];           /* bb points at the start of the row */
6190ced3a2bSJed Brown       if (bb[j] < bi[acol[j] + 1]) { /* Add if row is nonempty */
6209566063dSJacob Faibussowitsch         PetscCall(PetscHeapAdd(h, j, bj[bb[j]++]));
6210ced3a2bSJed Brown       }
6220ced3a2bSJed Brown     }
6230ced3a2bSJed Brown     /* Pick off the min element, adding it to free space */
6249566063dSJacob Faibussowitsch     PetscCall(PetscHeapPop(h, &j, &col));
6250ced3a2bSJed Brown     while (j >= 0) {
6260ced3a2bSJed Brown       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
6279566063dSJacob Faibussowitsch         PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), &current_space));
6280ced3a2bSJed Brown         ndouble++;
6290ced3a2bSJed Brown       }
6300ced3a2bSJed Brown       *(current_space->array++) = col;
6310ced3a2bSJed Brown       current_space->local_used++;
6320ced3a2bSJed Brown       current_space->local_remaining--;
6330ced3a2bSJed Brown       ci[i + 1]++;
6340ced3a2bSJed Brown 
6350ced3a2bSJed Brown       /* stash if anything else remains in this row of B */
6369566063dSJacob Faibussowitsch       if (bb[j] < bi[acol[j] + 1]) PetscCall(PetscHeapStash(h, j, bj[bb[j]++]));
6370ced3a2bSJed Brown       while (1) { /* pop and stash any other rows of B that also had an entry in this column */
6380ced3a2bSJed Brown         PetscInt j2, col2;
6399566063dSJacob Faibussowitsch         PetscCall(PetscHeapPeek(h, &j2, &col2));
6400ced3a2bSJed Brown         if (col2 != col) break;
6419566063dSJacob Faibussowitsch         PetscCall(PetscHeapPop(h, &j2, &col2));
6429566063dSJacob Faibussowitsch         if (bb[j2] < bi[acol[j2] + 1]) PetscCall(PetscHeapStash(h, j2, bj[bb[j2]++]));
6430ced3a2bSJed Brown       }
6440ced3a2bSJed Brown       /* Put any stashed elements back into the min heap */
6459566063dSJacob Faibussowitsch       PetscCall(PetscHeapUnstash(h));
6469566063dSJacob Faibussowitsch       PetscCall(PetscHeapPop(h, &j, &col));
6470ced3a2bSJed Brown     }
6480ced3a2bSJed Brown   }
6499566063dSJacob Faibussowitsch   PetscCall(PetscFree(bb));
6509566063dSJacob Faibussowitsch   PetscCall(PetscHeapDestroy(&h));
6510ced3a2bSJed Brown 
6520ced3a2bSJed Brown   /* Column indices are in the list of free space */
6530ced3a2bSJed Brown   /* Allocate space for cj, initialize cj, and */
6540ced3a2bSJed Brown   /* destroy list of free space and other temporary array(s) */
6559566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ci[am], &cj));
6569566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
6570ced3a2bSJed Brown 
6580ced3a2bSJed Brown   /* put together the new symbolic matrix */
6599566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
6609566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
6610ced3a2bSJed Brown 
6620ced3a2bSJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
6630ced3a2bSJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
664f4f49eeaSPierre Jolivet   c          = (Mat_SeqAIJ *)C->data;
6650ced3a2bSJed Brown   c->free_a  = PETSC_TRUE;
6660ced3a2bSJed Brown   c->free_ij = PETSC_TRUE;
6670ced3a2bSJed Brown   c->nonew   = 0;
66826fbe8dcSKarl Rupp 
6694222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
6700ced3a2bSJed Brown 
6710ced3a2bSJed Brown   /* set MatInfo */
6720ced3a2bSJed Brown   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
6730ced3a2bSJed Brown   if (afill < 1.0) afill = 1.0;
6744222ddf1SHong Zhang   C->info.mallocs           = ndouble;
6754222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
6764222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
6770ced3a2bSJed Brown 
6780ced3a2bSJed Brown #if defined(PETSC_USE_INFO)
6790ced3a2bSJed Brown   if (ci[am]) {
6809566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
6819566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
6820ced3a2bSJed Brown   } else {
6839566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
6840ced3a2bSJed Brown   }
6850ced3a2bSJed Brown #endif
6863ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
6870ced3a2bSJed Brown }
688e9e4536cSHong Zhang 
689d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A, Mat B, PetscReal fill, Mat C)
690d71ae5a4SJacob Faibussowitsch {
6918a07c6f1SJed Brown   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
6928a07c6f1SJed Brown   const PetscInt    *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j;
6938a07c6f1SJed Brown   PetscInt          *ci, *cj, *bb;
6948a07c6f1SJed Brown   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
6958a07c6f1SJed Brown   PetscReal          afill;
6968a07c6f1SJed Brown   PetscInt           i, j, col, ndouble = 0;
6970298fd71SBarry Smith   PetscFreeSpaceList free_space = NULL, current_space = NULL;
6988a07c6f1SJed Brown   PetscHeap          h;
6998a07c6f1SJed Brown   PetscBT            bt;
7008a07c6f1SJed Brown 
7018a07c6f1SJed Brown   PetscFunctionBegin;
702cd093f1dSJed Brown   /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */
7038a07c6f1SJed Brown   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
7049566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 2, &ci));
7058a07c6f1SJed Brown   ci[0] = 0;
7068a07c6f1SJed Brown 
7078a07c6f1SJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
7089566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
7092205254eSKarl Rupp 
7108a07c6f1SJed Brown   current_space = free_space;
7118a07c6f1SJed Brown 
7129566063dSJacob Faibussowitsch   PetscCall(PetscHeapCreate(a->rmax, &h));
7139566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(a->rmax, &bb));
7149566063dSJacob Faibussowitsch   PetscCall(PetscBTCreate(bn, &bt));
7158a07c6f1SJed Brown 
7168a07c6f1SJed Brown   /* Determine ci and cj */
7178a07c6f1SJed Brown   for (i = 0; i < am; i++) {
7188a07c6f1SJed Brown     const PetscInt  anzi = ai[i + 1] - ai[i];    /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
7198a07c6f1SJed Brown     const PetscInt *acol = aj + ai[i];           /* column indices of nonzero entries in this row */
7208a07c6f1SJed Brown     const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */
7218a07c6f1SJed Brown     ci[i + 1]            = ci[i];
7228a07c6f1SJed Brown     /* Populate the min heap */
7238a07c6f1SJed Brown     for (j = 0; j < anzi; j++) {
7248a07c6f1SJed Brown       PetscInt brow = acol[j];
7258a07c6f1SJed Brown       for (bb[j] = bi[brow]; bb[j] < bi[brow + 1]; bb[j]++) {
7268a07c6f1SJed Brown         PetscInt bcol = bj[bb[j]];
7278a07c6f1SJed Brown         if (!PetscBTLookupSet(bt, bcol)) { /* new entry */
7289566063dSJacob Faibussowitsch           PetscCall(PetscHeapAdd(h, j, bcol));
7298a07c6f1SJed Brown           bb[j]++;
7308a07c6f1SJed Brown           break;
7318a07c6f1SJed Brown         }
7328a07c6f1SJed Brown       }
7338a07c6f1SJed Brown     }
7348a07c6f1SJed Brown     /* Pick off the min element, adding it to free space */
7359566063dSJacob Faibussowitsch     PetscCall(PetscHeapPop(h, &j, &col));
7368a07c6f1SJed Brown     while (j >= 0) {
7378a07c6f1SJed Brown       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
7380298fd71SBarry Smith         fptr = NULL;                            /* need PetscBTMemzero */
7399566063dSJacob Faibussowitsch         PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), &current_space));
7408a07c6f1SJed Brown         ndouble++;
7418a07c6f1SJed Brown       }
7428a07c6f1SJed Brown       *(current_space->array++) = col;
7438a07c6f1SJed Brown       current_space->local_used++;
7448a07c6f1SJed Brown       current_space->local_remaining--;
7458a07c6f1SJed Brown       ci[i + 1]++;
7468a07c6f1SJed Brown 
7478a07c6f1SJed Brown       /* stash if anything else remains in this row of B */
7488a07c6f1SJed Brown       for (; bb[j] < bi[acol[j] + 1]; bb[j]++) {
7498a07c6f1SJed Brown         PetscInt bcol = bj[bb[j]];
7508a07c6f1SJed Brown         if (!PetscBTLookupSet(bt, bcol)) { /* new entry */
7519566063dSJacob Faibussowitsch           PetscCall(PetscHeapAdd(h, j, bcol));
7528a07c6f1SJed Brown           bb[j]++;
7538a07c6f1SJed Brown           break;
7548a07c6f1SJed Brown         }
7558a07c6f1SJed Brown       }
7569566063dSJacob Faibussowitsch       PetscCall(PetscHeapPop(h, &j, &col));
7578a07c6f1SJed Brown     }
7588a07c6f1SJed Brown     if (fptr) { /* Clear the bits for this row */
7599566063dSJacob Faibussowitsch       for (; fptr < current_space->array; fptr++) PetscCall(PetscBTClear(bt, *fptr));
7608a07c6f1SJed Brown     } else { /* We reallocated so we don't remember (easily) how to clear only the bits we changed */
7619566063dSJacob Faibussowitsch       PetscCall(PetscBTMemzero(bn, bt));
7628a07c6f1SJed Brown     }
7638a07c6f1SJed Brown   }
7649566063dSJacob Faibussowitsch   PetscCall(PetscFree(bb));
7659566063dSJacob Faibussowitsch   PetscCall(PetscHeapDestroy(&h));
7669566063dSJacob Faibussowitsch   PetscCall(PetscBTDestroy(&bt));
7678a07c6f1SJed Brown 
7688a07c6f1SJed Brown   /* Column indices are in the list of free space */
7698a07c6f1SJed Brown   /* Allocate space for cj, initialize cj, and */
7708a07c6f1SJed Brown   /* destroy list of free space and other temporary array(s) */
7719566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ci[am], &cj));
7729566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
7738a07c6f1SJed Brown 
7748a07c6f1SJed Brown   /* put together the new symbolic matrix */
7759566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
7769566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
7778a07c6f1SJed Brown 
7788a07c6f1SJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
7798a07c6f1SJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
780f4f49eeaSPierre Jolivet   c          = (Mat_SeqAIJ *)C->data;
7818a07c6f1SJed Brown   c->free_a  = PETSC_TRUE;
7828a07c6f1SJed Brown   c->free_ij = PETSC_TRUE;
7838a07c6f1SJed Brown   c->nonew   = 0;
78426fbe8dcSKarl Rupp 
7854222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
7868a07c6f1SJed Brown 
7878a07c6f1SJed Brown   /* set MatInfo */
7888a07c6f1SJed Brown   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
7898a07c6f1SJed Brown   if (afill < 1.0) afill = 1.0;
7904222ddf1SHong Zhang   C->info.mallocs           = ndouble;
7914222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
7924222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
7938a07c6f1SJed Brown 
7948a07c6f1SJed Brown #if defined(PETSC_USE_INFO)
7958a07c6f1SJed Brown   if (ci[am]) {
7969566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
7979566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
7988a07c6f1SJed Brown   } else {
7999566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
8008a07c6f1SJed Brown   }
8018a07c6f1SJed Brown #endif
8023ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
8038a07c6f1SJed Brown }
8048a07c6f1SJed Brown 
805d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A, Mat B, PetscReal fill, Mat C)
806d71ae5a4SJacob Faibussowitsch {
807d7ed1a4dSandi selinger   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
808d7ed1a4dSandi selinger   const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j, *inputi, *inputj, *inputcol, *inputcol_L1;
809d7ed1a4dSandi selinger   PetscInt       *ci, *cj, *outputj, worki_L1[9], worki_L2[9];
810d7ed1a4dSandi selinger   PetscInt        c_maxmem, a_maxrownnz = 0, a_rownnz;
811d7ed1a4dSandi selinger   const PetscInt  workcol[8] = {0, 1, 2, 3, 4, 5, 6, 7};
812d7ed1a4dSandi selinger   const PetscInt  am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
813d7ed1a4dSandi selinger   const PetscInt *brow_ptr[8], *brow_end[8];
814d7ed1a4dSandi selinger   PetscInt        window[8];
815d7ed1a4dSandi selinger   PetscInt        window_min, old_window_min, ci_nnz, outputi_nnz = 0, L1_nrows, L2_nrows;
816d7ed1a4dSandi selinger   PetscInt        i, k, ndouble = 0, L1_rowsleft, rowsleft;
817d7ed1a4dSandi selinger   PetscReal       afill;
818f83700f2Sandi selinger   PetscInt       *workj_L1, *workj_L2, *workj_L3;
8197660c4dbSandi selinger   PetscInt        L1_nnz, L2_nnz;
820d7ed1a4dSandi selinger 
821d7ed1a4dSandi selinger   /* Step 1: Get upper bound on memory required for allocation.
822d7ed1a4dSandi selinger              Because of the way virtual memory works,
823d7ed1a4dSandi selinger              only the memory pages that are actually needed will be physically allocated. */
824d7ed1a4dSandi selinger   PetscFunctionBegin;
8259566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 1, &ci));
826d7ed1a4dSandi selinger   for (i = 0; i < am; i++) {
827d7ed1a4dSandi selinger     const PetscInt  anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
828d7ed1a4dSandi selinger     const PetscInt *acol = aj + ai[i];        /* column indices of nonzero entries in this row */
829d7ed1a4dSandi selinger     a_rownnz             = 0;
830d7ed1a4dSandi selinger     for (k = 0; k < anzi; ++k) {
831d7ed1a4dSandi selinger       a_rownnz += bi[acol[k] + 1] - bi[acol[k]];
832d7ed1a4dSandi selinger       if (a_rownnz > bn) {
833d7ed1a4dSandi selinger         a_rownnz = bn;
834d7ed1a4dSandi selinger         break;
835d7ed1a4dSandi selinger       }
836d7ed1a4dSandi selinger     }
837d7ed1a4dSandi selinger     a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz);
838d7ed1a4dSandi selinger   }
839d7ed1a4dSandi selinger   /* temporary work areas for merging rows */
8409566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L1));
8419566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L2));
8429566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(a_maxrownnz, &workj_L3));
843d7ed1a4dSandi selinger 
8447660c4dbSandi selinger   /* This should be enough for almost all matrices. If not, memory is reallocated later. */
8457660c4dbSandi selinger   c_maxmem = 8 * (ai[am] + bi[bm]);
846d7ed1a4dSandi selinger   /* Step 2: Populate pattern for C */
8479566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(c_maxmem, &cj));
848d7ed1a4dSandi selinger 
849d7ed1a4dSandi selinger   ci_nnz      = 0;
850d7ed1a4dSandi selinger   ci[0]       = 0;
851d7ed1a4dSandi selinger   worki_L1[0] = 0;
852d7ed1a4dSandi selinger   worki_L2[0] = 0;
853d7ed1a4dSandi selinger   for (i = 0; i < am; i++) {
854d7ed1a4dSandi selinger     const PetscInt  anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
855d7ed1a4dSandi selinger     const PetscInt *acol = aj + ai[i];        /* column indices of nonzero entries in this row */
856d7ed1a4dSandi selinger     rowsleft             = anzi;
857d7ed1a4dSandi selinger     inputcol_L1          = acol;
858d7ed1a4dSandi selinger     L2_nnz               = 0;
8597660c4dbSandi selinger     L2_nrows             = 1; /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1   */
8607660c4dbSandi selinger     worki_L2[1]          = 0;
86108fe1b3cSKarl Rupp     outputi_nnz          = 0;
862d7ed1a4dSandi selinger 
863d7ed1a4dSandi selinger     /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem  -> allocate more memory */
864d7ed1a4dSandi selinger     while (ci_nnz + a_maxrownnz > c_maxmem) {
865d7ed1a4dSandi selinger       c_maxmem *= 2;
866d7ed1a4dSandi selinger       ndouble++;
8679566063dSJacob Faibussowitsch       PetscCall(PetscRealloc(sizeof(PetscInt) * c_maxmem, &cj));
868d7ed1a4dSandi selinger     }
869d7ed1a4dSandi selinger 
870d7ed1a4dSandi selinger     while (rowsleft) {
871d7ed1a4dSandi selinger       L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */
872d7ed1a4dSandi selinger       L1_nrows    = 0;
8737660c4dbSandi selinger       L1_nnz      = 0;
874d7ed1a4dSandi selinger       inputcol    = inputcol_L1;
875d7ed1a4dSandi selinger       inputi      = bi;
876d7ed1a4dSandi selinger       inputj      = bj;
877d7ed1a4dSandi selinger 
878d7ed1a4dSandi selinger       /* The following macro is used to specialize for small rows in A.
879d7ed1a4dSandi selinger          This helps with compiler unrolling, improving performance substantially.
880f83700f2Sandi selinger           Input:  inputj   inputi  inputcol  bn
881d7ed1a4dSandi selinger           Output: outputj  outputi_nnz                       */
882d7ed1a4dSandi selinger #define MatMatMultSymbolic_RowMergeMacro(ANNZ) \
883a8f51744SPierre Jolivet   do { \
884d7ed1a4dSandi selinger     window_min  = bn; \
8857660c4dbSandi selinger     outputi_nnz = 0; \
8867660c4dbSandi selinger     for (k = 0; k < ANNZ; ++k) { \
887d7ed1a4dSandi selinger       brow_ptr[k] = inputj + inputi[inputcol[k]]; \
888d7ed1a4dSandi selinger       brow_end[k] = inputj + inputi[inputcol[k] + 1]; \
889d7ed1a4dSandi selinger       window[k]   = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
890d7ed1a4dSandi selinger       window_min  = PetscMin(window[k], window_min); \
891d7ed1a4dSandi selinger     } \
892d7ed1a4dSandi selinger     while (window_min < bn) { \
893d7ed1a4dSandi selinger       outputj[outputi_nnz++] = window_min; \
894d7ed1a4dSandi selinger       /* advance front and compute new minimum */ \
895d7ed1a4dSandi selinger       old_window_min = window_min; \
896d7ed1a4dSandi selinger       window_min     = bn; \
897d7ed1a4dSandi selinger       for (k = 0; k < ANNZ; ++k) { \
898d7ed1a4dSandi selinger         if (window[k] == old_window_min) { \
899d7ed1a4dSandi selinger           brow_ptr[k]++; \
900d7ed1a4dSandi selinger           window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
901d7ed1a4dSandi selinger         } \
902d7ed1a4dSandi selinger         window_min = PetscMin(window[k], window_min); \
903d7ed1a4dSandi selinger       } \
904a8f51744SPierre Jolivet     } \
905a8f51744SPierre Jolivet   } while (0)
906d7ed1a4dSandi selinger 
907d7ed1a4dSandi selinger       /************** L E V E L  1 ***************/
908d7ed1a4dSandi selinger       /* Merge up to 8 rows of B to L1 work array*/
909d7ed1a4dSandi selinger       while (L1_rowsleft) {
9107660c4dbSandi selinger         outputi_nnz = 0;
9117660c4dbSandi selinger         if (anzi > 8) outputj = workj_L1 + L1_nnz; /* Level 1 rowmerge*/
9127660c4dbSandi selinger         else outputj = cj + ci_nnz;                /* Merge directly to C */
9137660c4dbSandi selinger 
914d7ed1a4dSandi selinger         switch (L1_rowsleft) {
9159371c9d4SSatish Balay         case 1:
9169371c9d4SSatish Balay           brow_ptr[0] = inputj + inputi[inputcol[0]];
917d7ed1a4dSandi selinger           brow_end[0] = inputj + inputi[inputcol[0] + 1];
918d7ed1a4dSandi selinger           for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
919d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
920d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
921d7ed1a4dSandi selinger           L1_rowsleft = 0;
922d7ed1a4dSandi selinger           break;
9239371c9d4SSatish Balay         case 2:
9249371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(2);
925d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
926d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
927d7ed1a4dSandi selinger           L1_rowsleft = 0;
928d7ed1a4dSandi selinger           break;
9299371c9d4SSatish Balay         case 3:
9309371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(3);
931d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
932d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
933d7ed1a4dSandi selinger           L1_rowsleft = 0;
934d7ed1a4dSandi selinger           break;
9359371c9d4SSatish Balay         case 4:
9369371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(4);
937d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
938d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
939d7ed1a4dSandi selinger           L1_rowsleft = 0;
940d7ed1a4dSandi selinger           break;
9419371c9d4SSatish Balay         case 5:
9429371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(5);
943d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
944d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
945d7ed1a4dSandi selinger           L1_rowsleft = 0;
946d7ed1a4dSandi selinger           break;
9479371c9d4SSatish Balay         case 6:
9489371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(6);
949d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
950d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
951d7ed1a4dSandi selinger           L1_rowsleft = 0;
952d7ed1a4dSandi selinger           break;
9539371c9d4SSatish Balay         case 7:
9549371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(7);
955d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
956d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
957d7ed1a4dSandi selinger           L1_rowsleft = 0;
958d7ed1a4dSandi selinger           break;
9599371c9d4SSatish Balay         default:
9609371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(8);
961d7ed1a4dSandi selinger           inputcol += 8;
962d7ed1a4dSandi selinger           rowsleft -= 8;
963d7ed1a4dSandi selinger           L1_rowsleft -= 8;
964d7ed1a4dSandi selinger           break;
965d7ed1a4dSandi selinger         }
966d7ed1a4dSandi selinger         inputcol_L1 = inputcol;
9677660c4dbSandi selinger         L1_nnz += outputi_nnz;
9687660c4dbSandi selinger         worki_L1[++L1_nrows] = L1_nnz;
969d7ed1a4dSandi selinger       }
970d7ed1a4dSandi selinger 
971d7ed1a4dSandi selinger       /********************** L E V E L  2 ************************/
972d7ed1a4dSandi selinger       /* Merge from L1 work array to either C or to L2 work array */
973d7ed1a4dSandi selinger       if (anzi > 8) {
974d7ed1a4dSandi selinger         inputi      = worki_L1;
975d7ed1a4dSandi selinger         inputj      = workj_L1;
976d7ed1a4dSandi selinger         inputcol    = workcol;
977d7ed1a4dSandi selinger         outputi_nnz = 0;
978d7ed1a4dSandi selinger 
979d7ed1a4dSandi selinger         if (anzi <= 64) outputj = cj + ci_nnz; /* Merge from L1 work array to C */
980d7ed1a4dSandi selinger         else outputj = workj_L2 + L2_nnz;      /* Merge from L1 work array to L2 work array */
981d7ed1a4dSandi selinger 
982d7ed1a4dSandi selinger         switch (L1_nrows) {
9839371c9d4SSatish Balay         case 1:
9849371c9d4SSatish Balay           brow_ptr[0] = inputj + inputi[inputcol[0]];
985d7ed1a4dSandi selinger           brow_end[0] = inputj + inputi[inputcol[0] + 1];
986d7ed1a4dSandi selinger           for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
987d7ed1a4dSandi selinger           break;
988d71ae5a4SJacob Faibussowitsch         case 2:
989d71ae5a4SJacob Faibussowitsch           MatMatMultSymbolic_RowMergeMacro(2);
990d71ae5a4SJacob Faibussowitsch           break;
991d71ae5a4SJacob Faibussowitsch         case 3:
992d71ae5a4SJacob Faibussowitsch           MatMatMultSymbolic_RowMergeMacro(3);
993d71ae5a4SJacob Faibussowitsch           break;
994d71ae5a4SJacob Faibussowitsch         case 4:
995d71ae5a4SJacob Faibussowitsch           MatMatMultSymbolic_RowMergeMacro(4);
996d71ae5a4SJacob Faibussowitsch           break;
997d71ae5a4SJacob Faibussowitsch         case 5:
998d71ae5a4SJacob Faibussowitsch           MatMatMultSymbolic_RowMergeMacro(5);
999d71ae5a4SJacob Faibussowitsch           break;
1000d71ae5a4SJacob Faibussowitsch         case 6:
1001d71ae5a4SJacob Faibussowitsch           MatMatMultSymbolic_RowMergeMacro(6);
1002d71ae5a4SJacob Faibussowitsch           break;
1003d71ae5a4SJacob Faibussowitsch         case 7:
1004d71ae5a4SJacob Faibussowitsch           MatMatMultSymbolic_RowMergeMacro(7);
1005d71ae5a4SJacob Faibussowitsch           break;
1006d71ae5a4SJacob Faibussowitsch         case 8:
1007d71ae5a4SJacob Faibussowitsch           MatMatMultSymbolic_RowMergeMacro(8);
1008d71ae5a4SJacob Faibussowitsch           break;
1009d71ae5a4SJacob Faibussowitsch         default:
1010d71ae5a4SJacob Faibussowitsch           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L1 work array!");
1011d7ed1a4dSandi selinger         }
1012d7ed1a4dSandi selinger         L2_nnz += outputi_nnz;
1013d7ed1a4dSandi selinger         worki_L2[++L2_nrows] = L2_nnz;
1014d7ed1a4dSandi selinger 
10157660c4dbSandi selinger         /************************ L E V E L  3 **********************/
10167660c4dbSandi selinger         /* Merge from L2 work array to either C or to L2 work array */
1017d7ed1a4dSandi selinger         if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) {
1018d7ed1a4dSandi selinger           inputi      = worki_L2;
1019d7ed1a4dSandi selinger           inputj      = workj_L2;
1020d7ed1a4dSandi selinger           inputcol    = workcol;
1021d7ed1a4dSandi selinger           outputi_nnz = 0;
1022f83700f2Sandi selinger           if (rowsleft) outputj = workj_L3;
1023d7ed1a4dSandi selinger           else outputj = cj + ci_nnz;
1024d7ed1a4dSandi selinger           switch (L2_nrows) {
10259371c9d4SSatish Balay           case 1:
10269371c9d4SSatish Balay             brow_ptr[0] = inputj + inputi[inputcol[0]];
1027d7ed1a4dSandi selinger             brow_end[0] = inputj + inputi[inputcol[0] + 1];
1028d7ed1a4dSandi selinger             for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
1029d7ed1a4dSandi selinger             break;
1030d71ae5a4SJacob Faibussowitsch           case 2:
1031d71ae5a4SJacob Faibussowitsch             MatMatMultSymbolic_RowMergeMacro(2);
1032d71ae5a4SJacob Faibussowitsch             break;
1033d71ae5a4SJacob Faibussowitsch           case 3:
1034d71ae5a4SJacob Faibussowitsch             MatMatMultSymbolic_RowMergeMacro(3);
1035d71ae5a4SJacob Faibussowitsch             break;
1036d71ae5a4SJacob Faibussowitsch           case 4:
1037d71ae5a4SJacob Faibussowitsch             MatMatMultSymbolic_RowMergeMacro(4);
1038d71ae5a4SJacob Faibussowitsch             break;
1039d71ae5a4SJacob Faibussowitsch           case 5:
1040d71ae5a4SJacob Faibussowitsch             MatMatMultSymbolic_RowMergeMacro(5);
1041d71ae5a4SJacob Faibussowitsch             break;
1042d71ae5a4SJacob Faibussowitsch           case 6:
1043d71ae5a4SJacob Faibussowitsch             MatMatMultSymbolic_RowMergeMacro(6);
1044d71ae5a4SJacob Faibussowitsch             break;
1045d71ae5a4SJacob Faibussowitsch           case 7:
1046d71ae5a4SJacob Faibussowitsch             MatMatMultSymbolic_RowMergeMacro(7);
1047d71ae5a4SJacob Faibussowitsch             break;
1048d71ae5a4SJacob Faibussowitsch           case 8:
1049d71ae5a4SJacob Faibussowitsch             MatMatMultSymbolic_RowMergeMacro(8);
1050d71ae5a4SJacob Faibussowitsch             break;
1051d71ae5a4SJacob Faibussowitsch           default:
1052d71ae5a4SJacob Faibussowitsch             SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L2 work array!");
1053d7ed1a4dSandi selinger           }
1054d7ed1a4dSandi selinger           L2_nrows    = 1;
10557660c4dbSandi selinger           L2_nnz      = outputi_nnz;
10567660c4dbSandi selinger           worki_L2[1] = outputi_nnz;
10577660c4dbSandi selinger           /* Copy to workj_L2 */
1058d7ed1a4dSandi selinger           if (rowsleft) {
10597660c4dbSandi selinger             for (k = 0; k < outputi_nnz; ++k) workj_L2[k] = outputj[k];
1060d7ed1a4dSandi selinger           }
1061d7ed1a4dSandi selinger         }
1062d7ed1a4dSandi selinger       }
1063d7ed1a4dSandi selinger     } /* while (rowsleft) */
1064d7ed1a4dSandi selinger #undef MatMatMultSymbolic_RowMergeMacro
1065d7ed1a4dSandi selinger 
1066d7ed1a4dSandi selinger     /* terminate current row */
1067d7ed1a4dSandi selinger     ci_nnz += outputi_nnz;
1068d7ed1a4dSandi selinger     ci[i + 1] = ci_nnz;
1069d7ed1a4dSandi selinger   }
1070d7ed1a4dSandi selinger 
1071d7ed1a4dSandi selinger   /* Step 3: Create the new symbolic matrix */
10729566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
10739566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
1074d7ed1a4dSandi selinger 
1075d7ed1a4dSandi selinger   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1076d7ed1a4dSandi selinger   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1077f4f49eeaSPierre Jolivet   c          = (Mat_SeqAIJ *)C->data;
1078d7ed1a4dSandi selinger   c->free_a  = PETSC_TRUE;
1079d7ed1a4dSandi selinger   c->free_ij = PETSC_TRUE;
1080d7ed1a4dSandi selinger   c->nonew   = 0;
1081d7ed1a4dSandi selinger 
10824222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1083d7ed1a4dSandi selinger 
1084d7ed1a4dSandi selinger   /* set MatInfo */
1085d7ed1a4dSandi selinger   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
1086d7ed1a4dSandi selinger   if (afill < 1.0) afill = 1.0;
10874222ddf1SHong Zhang   C->info.mallocs           = ndouble;
10884222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
10894222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
1090d7ed1a4dSandi selinger 
1091d7ed1a4dSandi selinger #if defined(PETSC_USE_INFO)
1092d7ed1a4dSandi selinger   if (ci[am]) {
10939566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
10949566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
1095d7ed1a4dSandi selinger   } else {
10969566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
1097d7ed1a4dSandi selinger   }
1098d7ed1a4dSandi selinger #endif
1099d7ed1a4dSandi selinger 
1100d7ed1a4dSandi selinger   /* Step 4: Free temporary work areas */
11019566063dSJacob Faibussowitsch   PetscCall(PetscFree(workj_L1));
11029566063dSJacob Faibussowitsch   PetscCall(PetscFree(workj_L2));
11039566063dSJacob Faibussowitsch   PetscCall(PetscFree(workj_L3));
11043ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1105d7ed1a4dSandi selinger }
1106d7ed1a4dSandi selinger 
1107cd093f1dSJed Brown /* concatenate unique entries and then sort */
1108d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, PetscReal fill, Mat C)
1109d71ae5a4SJacob Faibussowitsch {
1110cd093f1dSJed Brown   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
1111cd093f1dSJed Brown   const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j;
11128c78258cSHong Zhang   PetscInt       *ci, *cj, bcol;
1113cd093f1dSJed Brown   PetscInt        am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
1114cd093f1dSJed Brown   PetscReal       afill;
1115cd093f1dSJed Brown   PetscInt        i, j, ndouble = 0;
1116cd093f1dSJed Brown   PetscSegBuffer  seg, segrow;
1117cd093f1dSJed Brown   char           *seen;
1118cd093f1dSJed Brown 
1119cd093f1dSJed Brown   PetscFunctionBegin;
11209566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 1, &ci));
1121cd093f1dSJed Brown   ci[0] = 0;
1122cd093f1dSJed Brown 
1123cd093f1dSJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
11249566063dSJacob Faibussowitsch   PetscCall(PetscSegBufferCreate(sizeof(PetscInt), (PetscInt)(fill * (ai[am] + bi[bm])), &seg));
11259566063dSJacob Faibussowitsch   PetscCall(PetscSegBufferCreate(sizeof(PetscInt), 100, &segrow));
11269566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(bn, &seen));
1127cd093f1dSJed Brown 
1128cd093f1dSJed Brown   /* Determine ci and cj */
1129cd093f1dSJed Brown   for (i = 0; i < am; i++) {
1130cd093f1dSJed Brown     const PetscInt  anzi = ai[i + 1] - ai[i];                     /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
11318e3a54c0SPierre Jolivet     const PetscInt *acol = PetscSafePointerPlusOffset(aj, ai[i]); /* column indices of nonzero entries in this row */
1132cd093f1dSJed Brown     PetscInt packlen     = 0, *PETSC_RESTRICT crow;
11338c78258cSHong Zhang 
1134cd093f1dSJed Brown     /* Pack segrow */
1135cd093f1dSJed Brown     for (j = 0; j < anzi; j++) {
1136cd093f1dSJed Brown       PetscInt brow = acol[j], bjstart = bi[brow], bjend = bi[brow + 1], k;
1137cd093f1dSJed Brown       for (k = bjstart; k < bjend; k++) {
11388c78258cSHong Zhang         bcol = bj[k];
1139cd093f1dSJed Brown         if (!seen[bcol]) { /* new entry */
1140cd093f1dSJed Brown           PetscInt *PETSC_RESTRICT slot;
11419566063dSJacob Faibussowitsch           PetscCall(PetscSegBufferGetInts(segrow, 1, &slot));
1142cd093f1dSJed Brown           *slot      = bcol;
1143cd093f1dSJed Brown           seen[bcol] = 1;
1144cd093f1dSJed Brown           packlen++;
1145cd093f1dSJed Brown         }
1146cd093f1dSJed Brown       }
1147cd093f1dSJed Brown     }
11488c78258cSHong Zhang 
11498c78258cSHong Zhang     /* Check i-th diagonal entry */
11508c78258cSHong Zhang     if (C->force_diagonals && !seen[i]) {
11518c78258cSHong Zhang       PetscInt *PETSC_RESTRICT slot;
11529566063dSJacob Faibussowitsch       PetscCall(PetscSegBufferGetInts(segrow, 1, &slot));
11538c78258cSHong Zhang       *slot   = i;
11548c78258cSHong Zhang       seen[i] = 1;
11558c78258cSHong Zhang       packlen++;
11568c78258cSHong Zhang     }
11578c78258cSHong Zhang 
11589566063dSJacob Faibussowitsch     PetscCall(PetscSegBufferGetInts(seg, packlen, &crow));
11599566063dSJacob Faibussowitsch     PetscCall(PetscSegBufferExtractTo(segrow, crow));
11609566063dSJacob Faibussowitsch     PetscCall(PetscSortInt(packlen, crow));
1161cd093f1dSJed Brown     ci[i + 1] = ci[i] + packlen;
1162cd093f1dSJed Brown     for (j = 0; j < packlen; j++) seen[crow[j]] = 0;
1163cd093f1dSJed Brown   }
11649566063dSJacob Faibussowitsch   PetscCall(PetscSegBufferDestroy(&segrow));
11659566063dSJacob Faibussowitsch   PetscCall(PetscFree(seen));
1166cd093f1dSJed Brown 
1167cd093f1dSJed Brown   /* Column indices are in the segmented buffer */
11689566063dSJacob Faibussowitsch   PetscCall(PetscSegBufferExtractAlloc(seg, &cj));
11699566063dSJacob Faibussowitsch   PetscCall(PetscSegBufferDestroy(&seg));
1170cd093f1dSJed Brown 
1171cd093f1dSJed Brown   /* put together the new symbolic matrix */
11729566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
11739566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
1174cd093f1dSJed Brown 
1175cd093f1dSJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1176cd093f1dSJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1177f4f49eeaSPierre Jolivet   c          = (Mat_SeqAIJ *)C->data;
1178cd093f1dSJed Brown   c->free_a  = PETSC_TRUE;
1179cd093f1dSJed Brown   c->free_ij = PETSC_TRUE;
1180cd093f1dSJed Brown   c->nonew   = 0;
1181cd093f1dSJed Brown 
11824222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1183cd093f1dSJed Brown 
1184cd093f1dSJed Brown   /* set MatInfo */
11852a09556fSStefano Zampini   afill = (PetscReal)ci[am] / PetscMax(ai[am] + bi[bm], 1) + 1.e-5;
1186cd093f1dSJed Brown   if (afill < 1.0) afill = 1.0;
11874222ddf1SHong Zhang   C->info.mallocs           = ndouble;
11884222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
11894222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
1190cd093f1dSJed Brown 
1191cd093f1dSJed Brown #if defined(PETSC_USE_INFO)
1192cd093f1dSJed Brown   if (ci[am]) {
11939566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
11949566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
1195cd093f1dSJed Brown   } else {
11969566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
1197cd093f1dSJed Brown   }
1198cd093f1dSJed Brown #endif
11993ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1200cd093f1dSJed Brown }
1201cd093f1dSJed Brown 
120266976f2fSJacob Faibussowitsch static PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(void *data)
1203d71ae5a4SJacob Faibussowitsch {
12046718818eSStefano Zampini   Mat_MatMatTransMult *abt = (Mat_MatMatTransMult *)data;
12052128a86cSHong Zhang 
12062128a86cSHong Zhang   PetscFunctionBegin;
12079566063dSJacob Faibussowitsch   PetscCall(MatTransposeColoringDestroy(&abt->matcoloring));
12089566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&abt->Bt_den));
12099566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&abt->ABt_den));
12109566063dSJacob Faibussowitsch   PetscCall(PetscFree(abt));
12113ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
12122128a86cSHong Zhang }
12132128a86cSHong Zhang 
1214d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C)
1215d71ae5a4SJacob Faibussowitsch {
121681d82fe4SHong Zhang   Mat                  Bt;
121740192850SHong Zhang   Mat_MatMatTransMult *abt;
12184222ddf1SHong Zhang   Mat_Product         *product = C->product;
12196718818eSStefano Zampini   char                *alg;
1220d2853540SHong Zhang 
122181d82fe4SHong Zhang   PetscFunctionBegin;
122228b400f6SJacob Faibussowitsch   PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
122328b400f6SJacob Faibussowitsch   PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty");
12246718818eSStefano Zampini 
122581d82fe4SHong Zhang   /* create symbolic Bt */
12267fb60732SBarry Smith   PetscCall(MatTransposeSymbolic(B, &Bt));
12279566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizes(Bt, PetscAbs(A->cmap->bs), PetscAbs(B->cmap->bs)));
12289566063dSJacob Faibussowitsch   PetscCall(MatSetType(Bt, ((PetscObject)A)->type_name));
122981d82fe4SHong Zhang 
123081d82fe4SHong Zhang   /* get symbolic C=A*Bt */
12319566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(product->alg, &alg));
12329566063dSJacob Faibussowitsch   PetscCall(MatProductSetAlgorithm(C, "sorted")); /* set algorithm for C = A*Bt */
12339566063dSJacob Faibussowitsch   PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(A, Bt, fill, C));
12349566063dSJacob Faibussowitsch   PetscCall(MatProductSetAlgorithm(C, alg)); /* resume original algorithm for ABt product */
12359566063dSJacob Faibussowitsch   PetscCall(PetscFree(alg));
123681d82fe4SHong Zhang 
1237a5b23f4aSJose E. Roman   /* create a supporting struct for reuse intermediate dense matrices with matcoloring */
12389566063dSJacob Faibussowitsch   PetscCall(PetscNew(&abt));
12392128a86cSHong Zhang 
12406718818eSStefano Zampini   product->data    = abt;
12416718818eSStefano Zampini   product->destroy = MatDestroy_SeqAIJ_MatMatMultTrans;
12426718818eSStefano Zampini 
12434222ddf1SHong Zhang   C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ;
12442128a86cSHong Zhang 
12454222ddf1SHong Zhang   abt->usecoloring = PETSC_FALSE;
12469566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "color", &abt->usecoloring));
124740192850SHong Zhang   if (abt->usecoloring) {
1248b9af6bddSHong Zhang     /* Create MatTransposeColoring from symbolic C=A*B^T */
1249b9af6bddSHong Zhang     MatTransposeColoring matcoloring;
1250335efc43SPeter Brune     MatColoring          coloring;
12512128a86cSHong Zhang     ISColoring           iscoloring;
12522128a86cSHong Zhang     Mat                  Bt_dense, C_dense;
1253e8354b3eSHong Zhang 
12544222ddf1SHong Zhang     /* inode causes memory problem */
12559566063dSJacob Faibussowitsch     PetscCall(MatSetOption(C, MAT_USE_INODES, PETSC_FALSE));
12564222ddf1SHong Zhang 
12579566063dSJacob Faibussowitsch     PetscCall(MatColoringCreate(C, &coloring));
12589566063dSJacob Faibussowitsch     PetscCall(MatColoringSetDistance(coloring, 2));
12599566063dSJacob Faibussowitsch     PetscCall(MatColoringSetType(coloring, MATCOLORINGSL));
12609566063dSJacob Faibussowitsch     PetscCall(MatColoringSetFromOptions(coloring));
12619566063dSJacob Faibussowitsch     PetscCall(MatColoringApply(coloring, &iscoloring));
12629566063dSJacob Faibussowitsch     PetscCall(MatColoringDestroy(&coloring));
12639566063dSJacob Faibussowitsch     PetscCall(MatTransposeColoringCreate(C, iscoloring, &matcoloring));
12642205254eSKarl Rupp 
126540192850SHong Zhang     abt->matcoloring = matcoloring;
12662205254eSKarl Rupp 
12679566063dSJacob Faibussowitsch     PetscCall(ISColoringDestroy(&iscoloring));
12682128a86cSHong Zhang 
12692128a86cSHong Zhang     /* Create Bt_dense and C_dense = A*Bt_dense */
12709566063dSJacob Faibussowitsch     PetscCall(MatCreate(PETSC_COMM_SELF, &Bt_dense));
12719566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(Bt_dense, A->cmap->n, matcoloring->ncolors, A->cmap->n, matcoloring->ncolors));
12729566063dSJacob Faibussowitsch     PetscCall(MatSetType(Bt_dense, MATSEQDENSE));
12739566063dSJacob Faibussowitsch     PetscCall(MatSeqDenseSetPreallocation(Bt_dense, NULL));
12742205254eSKarl Rupp 
12752128a86cSHong Zhang     Bt_dense->assembled = PETSC_TRUE;
127640192850SHong Zhang     abt->Bt_den         = Bt_dense;
12772128a86cSHong Zhang 
12789566063dSJacob Faibussowitsch     PetscCall(MatCreate(PETSC_COMM_SELF, &C_dense));
12799566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(C_dense, A->rmap->n, matcoloring->ncolors, A->rmap->n, matcoloring->ncolors));
12809566063dSJacob Faibussowitsch     PetscCall(MatSetType(C_dense, MATSEQDENSE));
12819566063dSJacob Faibussowitsch     PetscCall(MatSeqDenseSetPreallocation(C_dense, NULL));
12822205254eSKarl Rupp 
12832128a86cSHong Zhang     Bt_dense->assembled = PETSC_TRUE;
128440192850SHong Zhang     abt->ABt_den        = C_dense;
1285f94ccd6cSHong Zhang 
1286f94ccd6cSHong Zhang #if defined(PETSC_USE_INFO)
12871ce71dffSSatish Balay     {
12884222ddf1SHong Zhang       Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data;
12899371c9d4SSatish Balay       PetscCall(PetscInfo(C, "Use coloring of C=A*B^T; B^T: %" PetscInt_FMT " %" PetscInt_FMT ", Bt_dense: %" PetscInt_FMT ",%" PetscInt_FMT "; Cnz %" PetscInt_FMT " / (cm*ncolors %" PetscInt_FMT ") = %g\n", B->cmap->n, B->rmap->n, Bt_dense->rmap->n,
1290f4f49eeaSPierre Jolivet                           Bt_dense->cmap->n, c->nz, A->rmap->n * matcoloring->ncolors, (double)(((PetscReal)c->nz) / ((PetscReal)(A->rmap->n * matcoloring->ncolors)))));
12911ce71dffSSatish Balay     }
1292f94ccd6cSHong Zhang #endif
12932128a86cSHong Zhang   }
1294e99be685SHong Zhang   /* clean up */
12959566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&Bt));
12963ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
12975df89d91SHong Zhang }
12985df89d91SHong Zhang 
1299d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C)
1300d71ae5a4SJacob Faibussowitsch {
13015df89d91SHong Zhang   Mat_SeqAIJ          *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data;
1302e2cac8caSJed Brown   PetscInt            *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, anzi, bnzj, nexta, nextb, *acol, *bcol, brow;
130381d82fe4SHong Zhang   PetscInt             cm = C->rmap->n, *ci = c->i, *cj = c->j, i, j, cnzi, *ccol;
13045df89d91SHong Zhang   PetscLogDouble       flops = 0.0;
1305aa1aec99SHong Zhang   MatScalar           *aa = a->a, *aval, *ba = b->a, *bval, *ca, *cval;
13066718818eSStefano Zampini   Mat_MatMatTransMult *abt;
13076718818eSStefano Zampini   Mat_Product         *product = C->product;
13085df89d91SHong Zhang 
13095df89d91SHong Zhang   PetscFunctionBegin;
131028b400f6SJacob Faibussowitsch   PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
13116718818eSStefano Zampini   abt = (Mat_MatMatTransMult *)product->data;
131228b400f6SJacob Faibussowitsch   PetscCheck(abt, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
131358ed3ceaSHong Zhang   /* clear old values in C */
131458ed3ceaSHong Zhang   if (!c->a) {
13159566063dSJacob Faibussowitsch     PetscCall(PetscCalloc1(ci[cm] + 1, &ca));
131658ed3ceaSHong Zhang     c->a      = ca;
131758ed3ceaSHong Zhang     c->free_a = PETSC_TRUE;
131858ed3ceaSHong Zhang   } else {
131958ed3ceaSHong Zhang     ca = c->a;
13209566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(ca, ci[cm] + 1));
132158ed3ceaSHong Zhang   }
132258ed3ceaSHong Zhang 
132340192850SHong Zhang   if (abt->usecoloring) {
132440192850SHong Zhang     MatTransposeColoring matcoloring = abt->matcoloring;
132540192850SHong Zhang     Mat                  Bt_dense, C_dense = abt->ABt_den;
1326c8db22f6SHong Zhang 
1327b9af6bddSHong Zhang     /* Get Bt_dense by Apply MatTransposeColoring to B */
132840192850SHong Zhang     Bt_dense = abt->Bt_den;
13299566063dSJacob Faibussowitsch     PetscCall(MatTransColoringApplySpToDen(matcoloring, B, Bt_dense));
1330c8db22f6SHong Zhang 
1331c8db22f6SHong Zhang     /* C_dense = A*Bt_dense */
13329566063dSJacob Faibussowitsch     PetscCall(MatMatMultNumeric_SeqAIJ_SeqDense(A, Bt_dense, C_dense));
1333c8db22f6SHong Zhang 
13342128a86cSHong Zhang     /* Recover C from C_dense */
13359566063dSJacob Faibussowitsch     PetscCall(MatTransColoringApplyDenToSp(matcoloring, C_dense, C));
13363ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
1337c8db22f6SHong Zhang   }
1338c8db22f6SHong Zhang 
13391fa1209cSHong Zhang   for (i = 0; i < cm; i++) {
134081d82fe4SHong Zhang     anzi = ai[i + 1] - ai[i];
13418e3a54c0SPierre Jolivet     acol = PetscSafePointerPlusOffset(aj, ai[i]);
13428e3a54c0SPierre Jolivet     aval = PetscSafePointerPlusOffset(aa, ai[i]);
13431fa1209cSHong Zhang     cnzi = ci[i + 1] - ci[i];
13448e3a54c0SPierre Jolivet     ccol = PetscSafePointerPlusOffset(cj, ci[i]);
13456973769bSHong Zhang     cval = ca + ci[i];
13461fa1209cSHong Zhang     for (j = 0; j < cnzi; j++) {
134781d82fe4SHong Zhang       brow = ccol[j];
134881d82fe4SHong Zhang       bnzj = bi[brow + 1] - bi[brow];
134981d82fe4SHong Zhang       bcol = bj + bi[brow];
13506973769bSHong Zhang       bval = ba + bi[brow];
13516973769bSHong Zhang 
135281d82fe4SHong Zhang       /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
13539371c9d4SSatish Balay       nexta = 0;
13549371c9d4SSatish Balay       nextb = 0;
135581d82fe4SHong Zhang       while (nexta < anzi && nextb < bnzj) {
13567b6d5e96SMark Adams         while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++;
135781d82fe4SHong Zhang         if (nexta == anzi) break;
13587b6d5e96SMark Adams         while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++;
135981d82fe4SHong Zhang         if (nextb == bnzj) break;
136081d82fe4SHong Zhang         if (acol[nexta] == bcol[nextb]) {
13616973769bSHong Zhang           cval[j] += aval[nexta] * bval[nextb];
13629371c9d4SSatish Balay           nexta++;
13639371c9d4SSatish Balay           nextb++;
136481d82fe4SHong Zhang           flops += 2;
13651fa1209cSHong Zhang         }
13661fa1209cSHong Zhang       }
136781d82fe4SHong Zhang     }
136881d82fe4SHong Zhang   }
13699566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
13709566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
13719566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(flops));
13723ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
13735df89d91SHong Zhang }
13745df89d91SHong Zhang 
1375d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(void *data)
1376d71ae5a4SJacob Faibussowitsch {
13776718818eSStefano Zampini   Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)data;
13786d373c3eSHong Zhang 
13796d373c3eSHong Zhang   PetscFunctionBegin;
13809566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&atb->At));
13811baa6e33SBarry Smith   if (atb->destroy) PetscCall((*atb->destroy)(atb->data));
13829566063dSJacob Faibussowitsch   PetscCall(PetscFree(atb));
13833ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
13846d373c3eSHong Zhang }
13856d373c3eSHong Zhang 
1386d71ae5a4SJacob Faibussowitsch PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C)
1387d71ae5a4SJacob Faibussowitsch {
1388089a957eSStefano Zampini   Mat          At      = NULL;
13894222ddf1SHong Zhang   Mat_Product *product = C->product;
1390089a957eSStefano Zampini   PetscBool    flg, def, square;
1391bc011b1eSHong Zhang 
1392bc011b1eSHong Zhang   PetscFunctionBegin;
1393089a957eSStefano Zampini   MatCheckProduct(C, 4);
1394b94d7dedSBarry Smith   square = (PetscBool)(A == B && A->symmetric == PETSC_BOOL3_TRUE);
13954222ddf1SHong Zhang   /* outerproduct */
13969566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "outerproduct", &flg));
13974222ddf1SHong Zhang   if (flg) {
1398bc011b1eSHong Zhang     /* create symbolic At */
1399089a957eSStefano Zampini     if (!square) {
14007fb60732SBarry Smith       PetscCall(MatTransposeSymbolic(A, &At));
14019566063dSJacob Faibussowitsch       PetscCall(MatSetBlockSizes(At, PetscAbs(A->cmap->bs), PetscAbs(B->cmap->bs)));
14029566063dSJacob Faibussowitsch       PetscCall(MatSetType(At, ((PetscObject)A)->type_name));
1403089a957eSStefano Zampini     }
1404bc011b1eSHong Zhang     /* get symbolic C=At*B */
14059566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C, "sorted"));
14069566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C));
1407bc011b1eSHong Zhang 
1408bc011b1eSHong Zhang     /* clean up */
140948a46eb9SPierre Jolivet     if (!square) PetscCall(MatDestroy(&At));
14106d373c3eSHong Zhang 
14114222ddf1SHong Zhang     C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */
14129566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C, "outerproduct"));
14133ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
14144222ddf1SHong Zhang   }
14154222ddf1SHong Zhang 
14164222ddf1SHong Zhang   /* matmatmult */
14179566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "default", &def));
14189566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "at*b", &flg));
1419089a957eSStefano Zampini   if (flg || def) {
14204222ddf1SHong Zhang     Mat_MatTransMatMult *atb;
14214222ddf1SHong Zhang 
142228b400f6SJacob Faibussowitsch     PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty");
14239566063dSJacob Faibussowitsch     PetscCall(PetscNew(&atb));
142448a46eb9SPierre Jolivet     if (!square) PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &At));
14259566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C, "sorted"));
14269566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C));
14279566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C, "at*b"));
14286718818eSStefano Zampini     product->data    = atb;
14296718818eSStefano Zampini     product->destroy = MatDestroy_SeqAIJ_MatTransMatMult;
14304222ddf1SHong Zhang     atb->At          = At;
14314222ddf1SHong Zhang 
14324222ddf1SHong Zhang     C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */
14333ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
14344222ddf1SHong Zhang   }
14354222ddf1SHong Zhang 
14364222ddf1SHong Zhang   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported");
1437bc011b1eSHong Zhang }
1438bc011b1eSHong Zhang 
1439d71ae5a4SJacob Faibussowitsch PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C)
1440d71ae5a4SJacob Faibussowitsch {
14410fbc74f4SHong Zhang   Mat_SeqAIJ    *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data;
1442d0f46423SBarry Smith   PetscInt       am = A->rmap->n, anzi, *ai = a->i, *aj = a->j, *bi = b->i, *bj, bnzi, nextb;
1443d0f46423SBarry Smith   PetscInt       cm = C->rmap->n, *ci = c->i, *cj = c->j, crow, *cjj, i, j, k;
1444d13dce4bSSatish Balay   PetscLogDouble flops = 0.0;
1445aa1aec99SHong Zhang   MatScalar     *aa    = a->a, *ba, *ca, *caj;
1446bc011b1eSHong Zhang 
1447bc011b1eSHong Zhang   PetscFunctionBegin;
1448aa1aec99SHong Zhang   if (!c->a) {
14499566063dSJacob Faibussowitsch     PetscCall(PetscCalloc1(ci[cm] + 1, &ca));
14502205254eSKarl Rupp 
1451aa1aec99SHong Zhang     c->a      = ca;
1452aa1aec99SHong Zhang     c->free_a = PETSC_TRUE;
1453aa1aec99SHong Zhang   } else {
1454aa1aec99SHong Zhang     ca = c->a;
14559566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(ca, ci[cm]));
1456aa1aec99SHong Zhang   }
1457bc011b1eSHong Zhang 
1458bc011b1eSHong Zhang   /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
1459bc011b1eSHong Zhang   for (i = 0; i < am; i++) {
1460bc011b1eSHong Zhang     bj   = b->j + bi[i];
1461bc011b1eSHong Zhang     ba   = b->a + bi[i];
1462bc011b1eSHong Zhang     bnzi = bi[i + 1] - bi[i];
1463bc011b1eSHong Zhang     anzi = ai[i + 1] - ai[i];
1464bc011b1eSHong Zhang     for (j = 0; j < anzi; j++) {
1465bc011b1eSHong Zhang       nextb = 0;
14660fbc74f4SHong Zhang       crow  = *aj++;
1467bc011b1eSHong Zhang       cjj   = cj + ci[crow];
1468bc011b1eSHong Zhang       caj   = ca + ci[crow];
1469bc011b1eSHong Zhang       /* perform sparse axpy operation.  Note cjj includes bj. */
1470bc011b1eSHong Zhang       for (k = 0; nextb < bnzi; k++) {
14710fbc74f4SHong Zhang         if (cjj[k] == *(bj + nextb)) { /* ccol == bcol */
14720fbc74f4SHong Zhang           caj[k] += (*aa) * (*(ba + nextb));
1473bc011b1eSHong Zhang           nextb++;
1474bc011b1eSHong Zhang         }
1475bc011b1eSHong Zhang       }
1476bc011b1eSHong Zhang       flops += 2 * bnzi;
14770fbc74f4SHong Zhang       aa++;
1478bc011b1eSHong Zhang     }
1479bc011b1eSHong Zhang   }
1480bc011b1eSHong Zhang 
1481bc011b1eSHong Zhang   /* Assemble the final matrix and clean up */
14829566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
14839566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
14849566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(flops));
14853ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1486bc011b1eSHong Zhang }
14879123193aSHong Zhang 
1488d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
1489d71ae5a4SJacob Faibussowitsch {
14909123193aSHong Zhang   PetscFunctionBegin;
14919566063dSJacob Faibussowitsch   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
14924222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense;
14933ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
14949123193aSHong Zhang }
14959123193aSHong Zhang 
1496d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A, Mat B, Mat C, const PetscBool add)
1497d71ae5a4SJacob Faibussowitsch {
1498f32d5d43SBarry Smith   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
14991ca9667aSStefano Zampini   PetscScalar       *c, r1, r2, r3, r4, *c1, *c2, *c3, *c4;
1500a4af7ca8SStefano Zampini   const PetscScalar *aa, *b, *b1, *b2, *b3, *b4, *av;
1501daf57211SHong Zhang   const PetscInt    *aj;
150275f6d85dSStefano Zampini   PetscInt           cm = C->rmap->n, cn = B->cmap->n, bm, am = A->rmap->n;
150375f6d85dSStefano Zampini   PetscInt           clda;
150475f6d85dSStefano Zampini   PetscInt           am4, bm4, col, i, j, n;
15059123193aSHong Zhang 
15069123193aSHong Zhang   PetscFunctionBegin;
15073ba16761SJacob Faibussowitsch   if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS);
15089566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJGetArrayRead(A, &av));
150993aa15f2SStefano Zampini   if (add) {
15109566063dSJacob Faibussowitsch     PetscCall(MatDenseGetArray(C, &c));
151193aa15f2SStefano Zampini   } else {
15129566063dSJacob Faibussowitsch     PetscCall(MatDenseGetArrayWrite(C, &c));
151393aa15f2SStefano Zampini   }
15149566063dSJacob Faibussowitsch   PetscCall(MatDenseGetArrayRead(B, &b));
15159566063dSJacob Faibussowitsch   PetscCall(MatDenseGetLDA(B, &bm));
15169566063dSJacob Faibussowitsch   PetscCall(MatDenseGetLDA(C, &clda));
151775f6d85dSStefano Zampini   am4 = 4 * clda;
151875f6d85dSStefano Zampini   bm4 = 4 * bm;
15195c0db29aSPierre Jolivet   if (b) {
15209371c9d4SSatish Balay     b1 = b;
15219371c9d4SSatish Balay     b2 = b1 + bm;
15229371c9d4SSatish Balay     b3 = b2 + bm;
15239371c9d4SSatish Balay     b4 = b3 + bm;
15245c0db29aSPierre Jolivet   } else b1 = b2 = b3 = b4 = NULL;
15259371c9d4SSatish Balay   c1 = c;
15269371c9d4SSatish Balay   c2 = c1 + clda;
15279371c9d4SSatish Balay   c3 = c2 + clda;
15289371c9d4SSatish Balay   c4 = c3 + clda;
15291ca9667aSStefano Zampini   for (col = 0; col < (cn / 4) * 4; col += 4) { /* over columns of C */
15301ca9667aSStefano Zampini     for (i = 0; i < am; i++) {                  /* over rows of A in those columns */
1531f32d5d43SBarry Smith       r1 = r2 = r3 = r4 = 0.0;
1532f32d5d43SBarry Smith       n                 = a->i[i + 1] - a->i[i];
15338e3a54c0SPierre Jolivet       aj                = PetscSafePointerPlusOffset(a->j, a->i[i]);
15348e3a54c0SPierre Jolivet       aa                = PetscSafePointerPlusOffset(av, a->i[i]);
1535f32d5d43SBarry Smith       for (j = 0; j < n; j++) {
15361ca9667aSStefano Zampini         const PetscScalar aatmp = aa[j];
15371ca9667aSStefano Zampini         const PetscInt    ajtmp = aj[j];
1538730858b9SHong Zhang         r1 += aatmp * b1[ajtmp];
1539730858b9SHong Zhang         r2 += aatmp * b2[ajtmp];
1540730858b9SHong Zhang         r3 += aatmp * b3[ajtmp];
1541730858b9SHong Zhang         r4 += aatmp * b4[ajtmp];
15429123193aSHong Zhang       }
154393aa15f2SStefano Zampini       if (add) {
154487753ddeSHong Zhang         c1[i] += r1;
154587753ddeSHong Zhang         c2[i] += r2;
154687753ddeSHong Zhang         c3[i] += r3;
154787753ddeSHong Zhang         c4[i] += r4;
154893aa15f2SStefano Zampini       } else {
154993aa15f2SStefano Zampini         c1[i] = r1;
155093aa15f2SStefano Zampini         c2[i] = r2;
155193aa15f2SStefano Zampini         c3[i] = r3;
155293aa15f2SStefano Zampini         c4[i] = r4;
155393aa15f2SStefano Zampini       }
1554f32d5d43SBarry Smith     }
15555c0db29aSPierre Jolivet     if (b) {
15569371c9d4SSatish Balay       b1 += bm4;
15579371c9d4SSatish Balay       b2 += bm4;
15589371c9d4SSatish Balay       b3 += bm4;
15599371c9d4SSatish Balay       b4 += bm4;
15605c0db29aSPierre Jolivet     }
15619371c9d4SSatish Balay     c1 += am4;
15629371c9d4SSatish Balay     c2 += am4;
15639371c9d4SSatish Balay     c3 += am4;
15649371c9d4SSatish Balay     c4 += am4;
1565f32d5d43SBarry Smith   }
156693aa15f2SStefano Zampini   /* process remaining columns */
156793aa15f2SStefano Zampini   if (col != cn) {
156893aa15f2SStefano Zampini     PetscInt rc = cn - col;
156993aa15f2SStefano Zampini 
157093aa15f2SStefano Zampini     if (rc == 1) {
157193aa15f2SStefano Zampini       for (i = 0; i < am; i++) {
1572f32d5d43SBarry Smith         r1 = 0.0;
1573f32d5d43SBarry Smith         n  = a->i[i + 1] - a->i[i];
15748e3a54c0SPierre Jolivet         aj = PetscSafePointerPlusOffset(a->j, a->i[i]);
15758e3a54c0SPierre Jolivet         aa = PetscSafePointerPlusOffset(av, a->i[i]);
157693aa15f2SStefano Zampini         for (j = 0; j < n; j++) r1 += aa[j] * b1[aj[j]];
157793aa15f2SStefano Zampini         if (add) c1[i] += r1;
157893aa15f2SStefano Zampini         else c1[i] = r1;
157993aa15f2SStefano Zampini       }
158093aa15f2SStefano Zampini     } else if (rc == 2) {
158193aa15f2SStefano Zampini       for (i = 0; i < am; i++) {
158293aa15f2SStefano Zampini         r1 = r2 = 0.0;
158393aa15f2SStefano Zampini         n       = a->i[i + 1] - a->i[i];
15848e3a54c0SPierre Jolivet         aj      = PetscSafePointerPlusOffset(a->j, a->i[i]);
15858e3a54c0SPierre Jolivet         aa      = PetscSafePointerPlusOffset(av, a->i[i]);
1586f32d5d43SBarry Smith         for (j = 0; j < n; j++) {
158793aa15f2SStefano Zampini           const PetscScalar aatmp = aa[j];
158893aa15f2SStefano Zampini           const PetscInt    ajtmp = aj[j];
158993aa15f2SStefano Zampini           r1 += aatmp * b1[ajtmp];
159093aa15f2SStefano Zampini           r2 += aatmp * b2[ajtmp];
1591f32d5d43SBarry Smith         }
159293aa15f2SStefano Zampini         if (add) {
159387753ddeSHong Zhang           c1[i] += r1;
159493aa15f2SStefano Zampini           c2[i] += r2;
159593aa15f2SStefano Zampini         } else {
159693aa15f2SStefano Zampini           c1[i] = r1;
159793aa15f2SStefano Zampini           c2[i] = r2;
1598f32d5d43SBarry Smith         }
159993aa15f2SStefano Zampini       }
160093aa15f2SStefano Zampini     } else {
160193aa15f2SStefano Zampini       for (i = 0; i < am; i++) {
160293aa15f2SStefano Zampini         r1 = r2 = r3 = 0.0;
160393aa15f2SStefano Zampini         n            = a->i[i + 1] - a->i[i];
16048e3a54c0SPierre Jolivet         aj           = PetscSafePointerPlusOffset(a->j, a->i[i]);
16058e3a54c0SPierre Jolivet         aa           = PetscSafePointerPlusOffset(av, a->i[i]);
160693aa15f2SStefano Zampini         for (j = 0; j < n; j++) {
160793aa15f2SStefano Zampini           const PetscScalar aatmp = aa[j];
160893aa15f2SStefano Zampini           const PetscInt    ajtmp = aj[j];
160993aa15f2SStefano Zampini           r1 += aatmp * b1[ajtmp];
161093aa15f2SStefano Zampini           r2 += aatmp * b2[ajtmp];
161193aa15f2SStefano Zampini           r3 += aatmp * b3[ajtmp];
161293aa15f2SStefano Zampini         }
161393aa15f2SStefano Zampini         if (add) {
161493aa15f2SStefano Zampini           c1[i] += r1;
161593aa15f2SStefano Zampini           c2[i] += r2;
161693aa15f2SStefano Zampini           c3[i] += r3;
161793aa15f2SStefano Zampini         } else {
161893aa15f2SStefano Zampini           c1[i] = r1;
161993aa15f2SStefano Zampini           c2[i] = r2;
162093aa15f2SStefano Zampini           c3[i] = r3;
162193aa15f2SStefano Zampini         }
162293aa15f2SStefano Zampini       }
162393aa15f2SStefano Zampini     }
1624f32d5d43SBarry Smith   }
16259566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(cn * (2.0 * a->nz)));
162693aa15f2SStefano Zampini   if (add) {
16279566063dSJacob Faibussowitsch     PetscCall(MatDenseRestoreArray(C, &c));
162893aa15f2SStefano Zampini   } else {
16299566063dSJacob Faibussowitsch     PetscCall(MatDenseRestoreArrayWrite(C, &c));
163093aa15f2SStefano Zampini   }
16319566063dSJacob Faibussowitsch   PetscCall(MatDenseRestoreArrayRead(B, &b));
16329566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJRestoreArrayRead(A, &av));
16333ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1634f32d5d43SBarry Smith }
1635f32d5d43SBarry Smith 
1636d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A, Mat B, Mat C)
1637d71ae5a4SJacob Faibussowitsch {
1638f32d5d43SBarry Smith   PetscFunctionBegin;
163908401ef6SPierre Jolivet   PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
164008401ef6SPierre Jolivet   PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
164108401ef6SPierre Jolivet   PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
16424324174eSBarry Smith 
16439566063dSJacob Faibussowitsch   PetscCall(MatMatMultNumericAdd_SeqAIJ_SeqDense(A, B, C, PETSC_FALSE));
16443ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16459123193aSHong Zhang }
1646b1683b59SHong Zhang 
1647d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C)
1648d71ae5a4SJacob Faibussowitsch {
16494222ddf1SHong Zhang   PetscFunctionBegin;
16504222ddf1SHong Zhang   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense;
16514222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
16523ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16534222ddf1SHong Zhang }
16544222ddf1SHong Zhang 
16556718818eSStefano Zampini PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat, Mat, PetscReal, Mat);
16566718818eSStefano Zampini 
1657d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C)
1658d71ae5a4SJacob Faibussowitsch {
16594222ddf1SHong Zhang   PetscFunctionBegin;
16606718818eSStefano Zampini   C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
16614222ddf1SHong Zhang   C->ops->productsymbolic          = MatProductSymbolic_AtB;
16623ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16636718818eSStefano Zampini }
16646718818eSStefano Zampini 
1665d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C)
1666d71ae5a4SJacob Faibussowitsch {
16676718818eSStefano Zampini   PetscFunctionBegin;
16686718818eSStefano Zampini   C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
16696718818eSStefano Zampini   C->ops->productsymbolic          = MatProductSymbolic_ABt;
16703ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16714222ddf1SHong Zhang }
16724222ddf1SHong Zhang 
1673d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C)
1674d71ae5a4SJacob Faibussowitsch {
16754222ddf1SHong Zhang   Mat_Product *product = C->product;
16764222ddf1SHong Zhang 
16774222ddf1SHong Zhang   PetscFunctionBegin;
16784222ddf1SHong Zhang   switch (product->type) {
1679d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_AB:
1680d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C));
1681d71ae5a4SJacob Faibussowitsch     break;
1682d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_AtB:
1683d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C));
1684d71ae5a4SJacob Faibussowitsch     break;
1685d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_ABt:
1686d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C));
1687d71ae5a4SJacob Faibussowitsch     break;
1688d71ae5a4SJacob Faibussowitsch   default:
1689d71ae5a4SJacob Faibussowitsch     break;
16904222ddf1SHong Zhang   }
16913ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16924222ddf1SHong Zhang }
16932ef1f0ffSBarry Smith 
1694d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C)
1695d71ae5a4SJacob Faibussowitsch {
16964222ddf1SHong Zhang   Mat_Product *product = C->product;
16974222ddf1SHong Zhang   Mat          A       = product->A;
16984222ddf1SHong Zhang   PetscBool    baij;
16994222ddf1SHong Zhang 
17004222ddf1SHong Zhang   PetscFunctionBegin;
17019566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQBAIJ, &baij));
17024222ddf1SHong Zhang   if (!baij) { /* A is seqsbaij */
17034222ddf1SHong Zhang     PetscBool sbaij;
17049566063dSJacob Faibussowitsch     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQSBAIJ, &sbaij));
170528b400f6SJacob Faibussowitsch     PetscCheck(sbaij, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "Mat must be either seqbaij or seqsbaij format");
17064222ddf1SHong Zhang 
17074222ddf1SHong Zhang     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense;
17084222ddf1SHong Zhang   } else { /* A is seqbaij */
17094222ddf1SHong Zhang     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense;
17104222ddf1SHong Zhang   }
17114222ddf1SHong Zhang 
17124222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
17133ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
17144222ddf1SHong Zhang }
17154222ddf1SHong Zhang 
1716d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C)
1717d71ae5a4SJacob Faibussowitsch {
17184222ddf1SHong Zhang   Mat_Product *product = C->product;
17194222ddf1SHong Zhang 
17204222ddf1SHong Zhang   PetscFunctionBegin;
17216718818eSStefano Zampini   MatCheckProduct(C, 1);
172228b400f6SJacob Faibussowitsch   PetscCheck(product->A, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing A");
1723b94d7dedSBarry Smith   if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric == PETSC_BOOL3_TRUE)) PetscCall(MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C));
17243ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
17254222ddf1SHong Zhang }
17266718818eSStefano Zampini 
1727d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C)
1728d71ae5a4SJacob Faibussowitsch {
17294222ddf1SHong Zhang   PetscFunctionBegin;
17304222ddf1SHong Zhang   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ;
17314222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
17323ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
17334222ddf1SHong Zhang }
17344222ddf1SHong Zhang 
1735d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C)
1736d71ae5a4SJacob Faibussowitsch {
17374222ddf1SHong Zhang   Mat_Product *product = C->product;
17384222ddf1SHong Zhang 
17394222ddf1SHong Zhang   PetscFunctionBegin;
174048a46eb9SPierre Jolivet   if (product->type == MATPRODUCT_AB) PetscCall(MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C));
17413ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
17424222ddf1SHong Zhang }
17434222ddf1SHong Zhang 
1744d71ae5a4SJacob Faibussowitsch PetscErrorCode MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring, Mat B, Mat Btdense)
1745d71ae5a4SJacob Faibussowitsch {
17462f5f1f90SHong Zhang   Mat_SeqAIJ   *b       = (Mat_SeqAIJ *)B->data;
17472f5f1f90SHong Zhang   Mat_SeqDense *btdense = (Mat_SeqDense *)Btdense->data;
17482f5f1f90SHong Zhang   PetscInt     *bi = b->i, *bj = b->j;
17492f5f1f90SHong Zhang   PetscInt      m = Btdense->rmap->n, n = Btdense->cmap->n, j, k, l, col, anz, *btcol, brow, ncolumns;
17502f5f1f90SHong Zhang   MatScalar    *btval, *btval_den, *ba = b->a;
17512f5f1f90SHong Zhang   PetscInt     *columns = coloring->columns, *colorforcol = coloring->colorforcol, ncolors = coloring->ncolors;
1752c8db22f6SHong Zhang 
1753c8db22f6SHong Zhang   PetscFunctionBegin;
17542f5f1f90SHong Zhang   btval_den = btdense->v;
17559566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(btval_den, m * n));
17562f5f1f90SHong Zhang   for (k = 0; k < ncolors; k++) {
17572f5f1f90SHong Zhang     ncolumns = coloring->ncolumns[k];
17582f5f1f90SHong Zhang     for (l = 0; l < ncolumns; l++) { /* insert a row of B to a column of Btdense */
1759d2853540SHong Zhang       col   = *(columns + colorforcol[k] + l);
17602f5f1f90SHong Zhang       btcol = bj + bi[col];
17612f5f1f90SHong Zhang       btval = ba + bi[col];
17622f5f1f90SHong Zhang       anz   = bi[col + 1] - bi[col];
1763d2853540SHong Zhang       for (j = 0; j < anz; j++) {
17642f5f1f90SHong Zhang         brow            = btcol[j];
17652f5f1f90SHong Zhang         btval_den[brow] = btval[j];
1766c8db22f6SHong Zhang       }
1767c8db22f6SHong Zhang     }
17682f5f1f90SHong Zhang     btval_den += m;
1769c8db22f6SHong Zhang   }
17703ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1771c8db22f6SHong Zhang }
1772c8db22f6SHong Zhang 
1773d71ae5a4SJacob Faibussowitsch PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring, Mat Cden, Mat Csp)
1774d71ae5a4SJacob Faibussowitsch {
1775b2d2b04fSHong Zhang   Mat_SeqAIJ        *csp = (Mat_SeqAIJ *)Csp->data;
17761683a169SBarry Smith   const PetscScalar *ca_den, *ca_den_ptr;
17771683a169SBarry Smith   PetscScalar       *ca = csp->a;
1778f99a636bSHong Zhang   PetscInt           k, l, m = Cden->rmap->n, ncolors = matcoloring->ncolors;
1779e88777f2SHong Zhang   PetscInt           brows = matcoloring->brows, *den2sp = matcoloring->den2sp;
1780077f23c2SHong Zhang   PetscInt           nrows, *row, *idx;
1781077f23c2SHong Zhang   PetscInt          *rows = matcoloring->rows, *colorforrow = matcoloring->colorforrow;
17828972f759SHong Zhang 
17838972f759SHong Zhang   PetscFunctionBegin;
17849566063dSJacob Faibussowitsch   PetscCall(MatDenseGetArrayRead(Cden, &ca_den));
1785f99a636bSHong Zhang 
1786077f23c2SHong Zhang   if (brows > 0) {
1787077f23c2SHong Zhang     PetscInt *lstart, row_end, row_start;
1788980ae229SHong Zhang     lstart = matcoloring->lstart;
17899566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(lstart, ncolors));
1790eeb4fd02SHong Zhang 
1791077f23c2SHong Zhang     row_end = brows;
1792eeb4fd02SHong Zhang     if (row_end > m) row_end = m;
1793077f23c2SHong Zhang     for (row_start = 0; row_start < m; row_start += brows) { /* loop over row blocks of Csp */
1794077f23c2SHong Zhang       ca_den_ptr = ca_den;
1795980ae229SHong Zhang       for (k = 0; k < ncolors; k++) { /* loop over colors (columns of Cden) */
1796eeb4fd02SHong Zhang         nrows = matcoloring->nrows[k];
1797eeb4fd02SHong Zhang         row   = rows + colorforrow[k];
1798eeb4fd02SHong Zhang         idx   = den2sp + colorforrow[k];
1799eeb4fd02SHong Zhang         for (l = lstart[k]; l < nrows; l++) {
1800eeb4fd02SHong Zhang           if (row[l] >= row_end) {
1801eeb4fd02SHong Zhang             lstart[k] = l;
1802eeb4fd02SHong Zhang             break;
1803eeb4fd02SHong Zhang           } else {
1804077f23c2SHong Zhang             ca[idx[l]] = ca_den_ptr[row[l]];
1805eeb4fd02SHong Zhang           }
1806eeb4fd02SHong Zhang         }
1807077f23c2SHong Zhang         ca_den_ptr += m;
1808eeb4fd02SHong Zhang       }
1809077f23c2SHong Zhang       row_end += brows;
1810eeb4fd02SHong Zhang       if (row_end > m) row_end = m;
1811eeb4fd02SHong Zhang     }
1812077f23c2SHong Zhang   } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */
1813077f23c2SHong Zhang     ca_den_ptr = ca_den;
1814077f23c2SHong Zhang     for (k = 0; k < ncolors; k++) {
1815077f23c2SHong Zhang       nrows = matcoloring->nrows[k];
1816077f23c2SHong Zhang       row   = rows + colorforrow[k];
1817077f23c2SHong Zhang       idx   = den2sp + colorforrow[k];
1818ad540459SPierre Jolivet       for (l = 0; l < nrows; l++) ca[idx[l]] = ca_den_ptr[row[l]];
1819077f23c2SHong Zhang       ca_den_ptr += m;
1820077f23c2SHong Zhang     }
1821f99a636bSHong Zhang   }
1822f99a636bSHong Zhang 
18239566063dSJacob Faibussowitsch   PetscCall(MatDenseRestoreArrayRead(Cden, &ca_den));
1824f99a636bSHong Zhang #if defined(PETSC_USE_INFO)
1825077f23c2SHong Zhang   if (matcoloring->brows > 0) {
18269566063dSJacob Faibussowitsch     PetscCall(PetscInfo(Csp, "Loop over %" PetscInt_FMT " row blocks for den2sp\n", brows));
1827e88777f2SHong Zhang   } else {
18289566063dSJacob Faibussowitsch     PetscCall(PetscInfo(Csp, "Loop over colors/columns of Cden, inefficient for large sparse matrix product \n"));
1829e88777f2SHong Zhang   }
1830f99a636bSHong Zhang #endif
18313ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
18328972f759SHong Zhang }
18338972f759SHong Zhang 
1834d71ae5a4SJacob Faibussowitsch PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat, ISColoring iscoloring, MatTransposeColoring c)
1835d71ae5a4SJacob Faibussowitsch {
1836e88777f2SHong Zhang   PetscInt        i, n, nrows, Nbs, j, k, m, ncols, col, cm;
18371a83f524SJed Brown   const PetscInt *is, *ci, *cj, *row_idx;
1838b28d80bdSHong Zhang   PetscInt        nis = iscoloring->n, *rowhit, bs = 1;
1839b1683b59SHong Zhang   IS             *isa;
1840b28d80bdSHong Zhang   Mat_SeqAIJ     *csp = (Mat_SeqAIJ *)mat->data;
1841e88777f2SHong Zhang   PetscInt       *colorforrow, *rows, *rows_i, *idxhit, *spidx, *den2sp, *den2sp_i;
1842e88777f2SHong Zhang   PetscInt       *colorforcol, *columns, *columns_i, brows;
1843e88777f2SHong Zhang   PetscBool       flg;
1844b1683b59SHong Zhang 
1845b1683b59SHong Zhang   PetscFunctionBegin;
18469566063dSJacob Faibussowitsch   PetscCall(ISColoringGetIS(iscoloring, PETSC_USE_POINTER, PETSC_IGNORE, &isa));
1847e99be685SHong Zhang 
18487ecccc15SHong Zhang   /* bs >1 is not being tested yet! */
1849e88777f2SHong Zhang   Nbs       = mat->cmap->N / bs;
1850b1683b59SHong Zhang   c->M      = mat->rmap->N / bs; /* set total rows, columns and local rows */
1851e88777f2SHong Zhang   c->N      = Nbs;
1852e88777f2SHong Zhang   c->m      = c->M;
1853b1683b59SHong Zhang   c->rstart = 0;
1854e88777f2SHong Zhang   c->brows  = 100;
1855b1683b59SHong Zhang 
1856b1683b59SHong Zhang   c->ncolors = nis;
18579566063dSJacob Faibussowitsch   PetscCall(PetscMalloc3(nis, &c->ncolumns, nis, &c->nrows, nis + 1, &colorforrow));
18589566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(csp->nz + 1, &rows));
18599566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(csp->nz + 1, &den2sp));
1860e88777f2SHong Zhang 
1861e88777f2SHong Zhang   brows = c->brows;
18629566063dSJacob Faibussowitsch   PetscCall(PetscOptionsGetInt(NULL, NULL, "-matden2sp_brows", &brows, &flg));
1863e88777f2SHong Zhang   if (flg) c->brows = brows;
186448a46eb9SPierre Jolivet   if (brows > 0) PetscCall(PetscMalloc1(nis + 1, &c->lstart));
18652205254eSKarl Rupp 
1866d2853540SHong Zhang   colorforrow[0] = 0;
1867d2853540SHong Zhang   rows_i         = rows;
1868f99a636bSHong Zhang   den2sp_i       = den2sp;
1869b1683b59SHong Zhang 
18709566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nis + 1, &colorforcol));
18719566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(Nbs + 1, &columns));
18722205254eSKarl Rupp 
1873d2853540SHong Zhang   colorforcol[0] = 0;
1874d2853540SHong Zhang   columns_i      = columns;
1875d2853540SHong Zhang 
1876077f23c2SHong Zhang   /* get column-wise storage of mat */
18779566063dSJacob Faibussowitsch   PetscCall(MatGetColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL));
1878b1683b59SHong Zhang 
1879b28d80bdSHong Zhang   cm = c->m;
18809566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(cm + 1, &rowhit));
18819566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(cm + 1, &idxhit));
1882f99a636bSHong Zhang   for (i = 0; i < nis; i++) { /* loop over color */
18839566063dSJacob Faibussowitsch     PetscCall(ISGetLocalSize(isa[i], &n));
18849566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(isa[i], &is));
18852205254eSKarl Rupp 
1886b1683b59SHong Zhang     c->ncolumns[i] = n;
18871baa6e33SBarry Smith     if (n) PetscCall(PetscArraycpy(columns_i, is, n));
1888d2853540SHong Zhang     colorforcol[i + 1] = colorforcol[i] + n;
1889d2853540SHong Zhang     columns_i += n;
1890b1683b59SHong Zhang 
1891b1683b59SHong Zhang     /* fast, crude version requires O(N*N) work */
18929566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(rowhit, cm));
1893e99be685SHong Zhang 
1894b7caf3d6SHong Zhang     for (j = 0; j < n; j++) { /* loop over columns*/
1895b1683b59SHong Zhang       col     = is[j];
1896d2853540SHong Zhang       row_idx = cj + ci[col];
1897b1683b59SHong Zhang       m       = ci[col + 1] - ci[col];
1898b7caf3d6SHong Zhang       for (k = 0; k < m; k++) { /* loop over columns marking them in rowhit */
1899e99be685SHong Zhang         idxhit[*row_idx]   = spidx[ci[col] + k];
1900d2853540SHong Zhang         rowhit[*row_idx++] = col + 1;
1901b1683b59SHong Zhang       }
1902b1683b59SHong Zhang     }
1903b1683b59SHong Zhang     /* count the number of hits */
1904b1683b59SHong Zhang     nrows = 0;
1905e8354b3eSHong Zhang     for (j = 0; j < cm; j++) {
1906b1683b59SHong Zhang       if (rowhit[j]) nrows++;
1907b1683b59SHong Zhang     }
1908b1683b59SHong Zhang     c->nrows[i]        = nrows;
1909d2853540SHong Zhang     colorforrow[i + 1] = colorforrow[i] + nrows;
1910d2853540SHong Zhang 
1911b1683b59SHong Zhang     nrows = 0;
1912b7caf3d6SHong Zhang     for (j = 0; j < cm; j++) { /* loop over rows */
1913b1683b59SHong Zhang       if (rowhit[j]) {
1914d2853540SHong Zhang         rows_i[nrows]   = j;
191512b89a43SHong Zhang         den2sp_i[nrows] = idxhit[j];
1916b1683b59SHong Zhang         nrows++;
1917b1683b59SHong Zhang       }
1918b1683b59SHong Zhang     }
1919e88777f2SHong Zhang     den2sp_i += nrows;
1920077f23c2SHong Zhang 
19219566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(isa[i], &is));
1922f99a636bSHong Zhang     rows_i += nrows;
1923b1683b59SHong Zhang   }
19249566063dSJacob Faibussowitsch   PetscCall(MatRestoreColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL));
19259566063dSJacob Faibussowitsch   PetscCall(PetscFree(rowhit));
19269566063dSJacob Faibussowitsch   PetscCall(ISColoringRestoreIS(iscoloring, PETSC_USE_POINTER, &isa));
19272c71b3e2SJacob Faibussowitsch   PetscCheck(csp->nz == colorforrow[nis], PETSC_COMM_SELF, PETSC_ERR_PLIB, "csp->nz %" PetscInt_FMT " != colorforrow[nis] %" PetscInt_FMT, csp->nz, colorforrow[nis]);
1928b28d80bdSHong Zhang 
1929d2853540SHong Zhang   c->colorforrow = colorforrow;
1930d2853540SHong Zhang   c->rows        = rows;
1931f99a636bSHong Zhang   c->den2sp      = den2sp;
1932d2853540SHong Zhang   c->colorforcol = colorforcol;
1933d2853540SHong Zhang   c->columns     = columns;
19342205254eSKarl Rupp 
19359566063dSJacob Faibussowitsch   PetscCall(PetscFree(idxhit));
19363ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1937b1683b59SHong Zhang }
1938d013fc79Sandi selinger 
1939d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C)
1940d71ae5a4SJacob Faibussowitsch {
19414222ddf1SHong Zhang   Mat_Product *product = C->product;
19424222ddf1SHong Zhang   Mat          A = product->A, B = product->B;
19434222ddf1SHong Zhang 
1944df97dc6dSFande Kong   PetscFunctionBegin;
19454222ddf1SHong Zhang   if (C->ops->mattransposemultnumeric) {
19464222ddf1SHong Zhang     /* Alg: "outerproduct" */
19479566063dSJacob Faibussowitsch     PetscCall((*C->ops->mattransposemultnumeric)(A, B, C));
19484222ddf1SHong Zhang   } else {
19494222ddf1SHong Zhang     /* Alg: "matmatmult" -- C = At*B */
19506718818eSStefano Zampini     Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)product->data;
19514222ddf1SHong Zhang 
195228b400f6SJacob Faibussowitsch     PetscCheck(atb, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
19531cdffd5eSHong Zhang     if (atb->At) {
19541cdffd5eSHong Zhang       /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ();
19551cdffd5eSHong Zhang          user may have called MatProductReplaceMats() to get this A=product->A */
19561cdffd5eSHong Zhang       PetscCall(MatTransposeSetPrecursor(A, atb->At));
19577fb60732SBarry Smith       PetscCall(MatTranspose(A, MAT_REUSE_MATRIX, &atb->At));
19584222ddf1SHong Zhang     }
19597fb60732SBarry Smith     PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ(atb->At ? atb->At : A, B, C));
19604222ddf1SHong Zhang   }
19613ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1962df97dc6dSFande Kong }
1963df97dc6dSFande Kong 
1964d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C)
1965d71ae5a4SJacob Faibussowitsch {
19664222ddf1SHong Zhang   Mat_Product *product = C->product;
19674222ddf1SHong Zhang   Mat          A = product->A, B = product->B;
19684222ddf1SHong Zhang   PetscReal    fill = product->fill;
1969d013fc79Sandi selinger 
1970d013fc79Sandi selinger   PetscFunctionBegin;
19719566063dSJacob Faibussowitsch   PetscCall(MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A, B, fill, C));
19722869b61bSandi selinger 
19734222ddf1SHong Zhang   C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ;
19743ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
19752869b61bSandi selinger }
1976d013fc79Sandi selinger 
1977d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C)
1978d71ae5a4SJacob Faibussowitsch {
19794222ddf1SHong Zhang   Mat_Product *product = C->product;
19804222ddf1SHong Zhang   PetscInt     alg     = 0; /* default algorithm */
19814222ddf1SHong Zhang   PetscBool    flg     = PETSC_FALSE;
19824222ddf1SHong Zhang #if !defined(PETSC_HAVE_HYPRE)
19834222ddf1SHong Zhang   const char *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"};
19844222ddf1SHong Zhang   PetscInt    nalg        = 7;
19854222ddf1SHong Zhang #else
19864222ddf1SHong Zhang   const char *algTypes[8] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge", "hypre"};
19874222ddf1SHong Zhang   PetscInt    nalg        = 8;
19884222ddf1SHong Zhang #endif
19894222ddf1SHong Zhang 
19904222ddf1SHong Zhang   PetscFunctionBegin;
19914222ddf1SHong Zhang   /* Set default algorithm */
19929566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(C->product->alg, "default", &flg));
199348a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
1994d013fc79Sandi selinger 
19954222ddf1SHong Zhang   /* Get runtime option */
19964222ddf1SHong Zhang   if (product->api_user) {
1997d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMult", "Mat");
19989566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-matmatmult_via", "Algorithmic approach", "MatMatMult", algTypes, nalg, algTypes[0], &alg, &flg));
1999d0609cedSBarry Smith     PetscOptionsEnd();
20004222ddf1SHong Zhang   } else {
2001d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AB", "Mat");
20029566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AB", algTypes, nalg, algTypes[0], &alg, &flg));
2003d0609cedSBarry Smith     PetscOptionsEnd();
2004d013fc79Sandi selinger   }
200548a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
2006d013fc79Sandi selinger 
20074222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
20084222ddf1SHong Zhang   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ;
20093ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
20104222ddf1SHong Zhang }
2011d013fc79Sandi selinger 
2012d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C)
2013d71ae5a4SJacob Faibussowitsch {
20144222ddf1SHong Zhang   Mat_Product *product     = C->product;
20154222ddf1SHong Zhang   PetscInt     alg         = 0; /* default algorithm */
20164222ddf1SHong Zhang   PetscBool    flg         = PETSC_FALSE;
2017089a957eSStefano Zampini   const char  *algTypes[3] = {"default", "at*b", "outerproduct"};
2018089a957eSStefano Zampini   PetscInt     nalg        = 3;
2019d013fc79Sandi selinger 
20204222ddf1SHong Zhang   PetscFunctionBegin;
20214222ddf1SHong Zhang   /* Get runtime option */
20224222ddf1SHong Zhang   if (product->api_user) {
2023d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatTransposeMatMult", "Mat");
20249566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mattransposematmult_via", "Algorithmic approach", "MatTransposeMatMult", algTypes, nalg, algTypes[alg], &alg, &flg));
2025d0609cedSBarry Smith     PetscOptionsEnd();
20264222ddf1SHong Zhang   } else {
2027d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AtB", "Mat");
20289566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AtB", algTypes, nalg, algTypes[alg], &alg, &flg));
2029d0609cedSBarry Smith     PetscOptionsEnd();
20304222ddf1SHong Zhang   }
203148a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
2032d013fc79Sandi selinger 
20334222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ;
20343ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
20354222ddf1SHong Zhang }
20364222ddf1SHong Zhang 
2037d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C)
2038d71ae5a4SJacob Faibussowitsch {
20394222ddf1SHong Zhang   Mat_Product *product     = C->product;
20404222ddf1SHong Zhang   PetscInt     alg         = 0; /* default algorithm */
20414222ddf1SHong Zhang   PetscBool    flg         = PETSC_FALSE;
20424222ddf1SHong Zhang   const char  *algTypes[2] = {"default", "color"};
20434222ddf1SHong Zhang   PetscInt     nalg        = 2;
20444222ddf1SHong Zhang 
20454222ddf1SHong Zhang   PetscFunctionBegin;
20464222ddf1SHong Zhang   /* Set default algorithm */
20479566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(C->product->alg, "default", &flg));
20484222ddf1SHong Zhang   if (!flg) {
20494222ddf1SHong Zhang     alg = 1;
20509566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
20514222ddf1SHong Zhang   }
20524222ddf1SHong Zhang 
20534222ddf1SHong Zhang   /* Get runtime option */
20544222ddf1SHong Zhang   if (product->api_user) {
2055d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatTransposeMult", "Mat");
20569566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-matmattransmult_via", "Algorithmic approach", "MatMatTransposeMult", algTypes, nalg, algTypes[alg], &alg, &flg));
2057d0609cedSBarry Smith     PetscOptionsEnd();
20584222ddf1SHong Zhang   } else {
2059d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABt", "Mat");
20609566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABt", algTypes, nalg, algTypes[alg], &alg, &flg));
2061d0609cedSBarry Smith     PetscOptionsEnd();
20624222ddf1SHong Zhang   }
206348a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
20644222ddf1SHong Zhang 
20654222ddf1SHong Zhang   C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ;
20664222ddf1SHong Zhang   C->ops->productsymbolic          = MatProductSymbolic_ABt;
20673ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
20684222ddf1SHong Zhang }
20694222ddf1SHong Zhang 
2070d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C)
2071d71ae5a4SJacob Faibussowitsch {
20724222ddf1SHong Zhang   Mat_Product *product = C->product;
20734222ddf1SHong Zhang   PetscBool    flg     = PETSC_FALSE;
20744222ddf1SHong Zhang   PetscInt     alg     = 0; /* default algorithm -- alg=1 should be default!!! */
20754222ddf1SHong Zhang #if !defined(PETSC_HAVE_HYPRE)
20764222ddf1SHong Zhang   const char *algTypes[2] = {"scalable", "rap"};
20774222ddf1SHong Zhang   PetscInt    nalg        = 2;
20784222ddf1SHong Zhang #else
20794222ddf1SHong Zhang   const char *algTypes[3] = {"scalable", "rap", "hypre"};
20804222ddf1SHong Zhang   PetscInt    nalg        = 3;
20814222ddf1SHong Zhang #endif
20824222ddf1SHong Zhang 
20834222ddf1SHong Zhang   PetscFunctionBegin;
20844222ddf1SHong Zhang   /* Set default algorithm */
20859566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "default", &flg));
208648a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
20874222ddf1SHong Zhang 
20884222ddf1SHong Zhang   /* Get runtime option */
20894222ddf1SHong Zhang   if (product->api_user) {
2090d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatPtAP", "Mat");
20919566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-matptap_via", "Algorithmic approach", "MatPtAP", algTypes, nalg, algTypes[0], &alg, &flg));
2092d0609cedSBarry Smith     PetscOptionsEnd();
20934222ddf1SHong Zhang   } else {
2094d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_PtAP", "Mat");
20959566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_PtAP", algTypes, nalg, algTypes[0], &alg, &flg));
2096d0609cedSBarry Smith     PetscOptionsEnd();
20974222ddf1SHong Zhang   }
209848a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
20994222ddf1SHong Zhang 
21004222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ;
21013ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
21024222ddf1SHong Zhang }
21034222ddf1SHong Zhang 
2104d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C)
2105d71ae5a4SJacob Faibussowitsch {
21064222ddf1SHong Zhang   Mat_Product *product     = C->product;
21074222ddf1SHong Zhang   PetscBool    flg         = PETSC_FALSE;
21084222ddf1SHong Zhang   PetscInt     alg         = 0; /* default algorithm */
21094222ddf1SHong Zhang   const char  *algTypes[3] = {"r*a*rt", "r*art", "coloring_rart"};
21104222ddf1SHong Zhang   PetscInt     nalg        = 3;
21114222ddf1SHong Zhang 
21124222ddf1SHong Zhang   PetscFunctionBegin;
21134222ddf1SHong Zhang   /* Set default algorithm */
21149566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "default", &flg));
211548a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
21164222ddf1SHong Zhang 
21174222ddf1SHong Zhang   /* Get runtime option */
21184222ddf1SHong Zhang   if (product->api_user) {
2119d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatRARt", "Mat");
21209566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-matrart_via", "Algorithmic approach", "MatRARt", algTypes, nalg, algTypes[0], &alg, &flg));
2121d0609cedSBarry Smith     PetscOptionsEnd();
21224222ddf1SHong Zhang   } else {
2123d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_RARt", "Mat");
21249566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_RARt", algTypes, nalg, algTypes[0], &alg, &flg));
2125d0609cedSBarry Smith     PetscOptionsEnd();
21264222ddf1SHong Zhang   }
212748a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
21284222ddf1SHong Zhang 
21294222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ;
21303ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
21314222ddf1SHong Zhang }
21324222ddf1SHong Zhang 
21334222ddf1SHong Zhang /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */
2134d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C)
2135d71ae5a4SJacob Faibussowitsch {
21364222ddf1SHong Zhang   Mat_Product *product     = C->product;
21374222ddf1SHong Zhang   PetscInt     alg         = 0; /* default algorithm */
21384222ddf1SHong Zhang   PetscBool    flg         = PETSC_FALSE;
21394222ddf1SHong Zhang   const char  *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"};
21404222ddf1SHong Zhang   PetscInt     nalg        = 7;
21414222ddf1SHong Zhang 
21424222ddf1SHong Zhang   PetscFunctionBegin;
21434222ddf1SHong Zhang   /* Set default algorithm */
21449566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "default", &flg));
214548a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
21464222ddf1SHong Zhang 
21474222ddf1SHong Zhang   /* Get runtime option */
21484222ddf1SHong Zhang   if (product->api_user) {
2149d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMatMult", "Mat");
21509566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-matmatmatmult_via", "Algorithmic approach", "MatMatMatMult", algTypes, nalg, algTypes[alg], &alg, &flg));
2151d0609cedSBarry Smith     PetscOptionsEnd();
21524222ddf1SHong Zhang   } else {
2153d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABC", "Mat");
21549566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABC", algTypes, nalg, algTypes[alg], &alg, &flg));
2155d0609cedSBarry Smith     PetscOptionsEnd();
21564222ddf1SHong Zhang   }
215748a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
21584222ddf1SHong Zhang 
21594222ddf1SHong Zhang   C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ;
21604222ddf1SHong Zhang   C->ops->productsymbolic    = MatProductSymbolic_ABC;
21613ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
21624222ddf1SHong Zhang }
21634222ddf1SHong Zhang 
2164d71ae5a4SJacob Faibussowitsch PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C)
2165d71ae5a4SJacob Faibussowitsch {
21664222ddf1SHong Zhang   Mat_Product *product = C->product;
21674222ddf1SHong Zhang 
21684222ddf1SHong Zhang   PetscFunctionBegin;
21694222ddf1SHong Zhang   switch (product->type) {
2170d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_AB:
2171d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_AB(C));
2172d71ae5a4SJacob Faibussowitsch     break;
2173d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_AtB:
2174d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_AtB(C));
2175d71ae5a4SJacob Faibussowitsch     break;
2176d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_ABt:
2177d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_ABt(C));
2178d71ae5a4SJacob Faibussowitsch     break;
2179d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_PtAP:
2180d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_PtAP(C));
2181d71ae5a4SJacob Faibussowitsch     break;
2182d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_RARt:
2183d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_RARt(C));
2184d71ae5a4SJacob Faibussowitsch     break;
2185d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_ABC:
2186d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_ABC(C));
2187d71ae5a4SJacob Faibussowitsch     break;
2188d71ae5a4SJacob Faibussowitsch   default:
2189d71ae5a4SJacob Faibussowitsch     break;
21904222ddf1SHong Zhang   }
21913ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2192d013fc79Sandi selinger }
2193