xref: /petsc/src/mat/impls/aij/seq/matmatmult.c (revision 5c0db29a52f0e0a315a431c2b5d53138ecf7c054)
1be1d678aSKris Buschelman 
2d50806bdSBarry Smith /*
32499ec78SHong Zhang   Defines matrix-matrix product routines for pairs of SeqAIJ matrices
4d50806bdSBarry Smith           C = A * B
5d50806bdSBarry Smith */
6d50806bdSBarry Smith 
7c6db04a5SJed Brown #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
8c6db04a5SJed Brown #include <../src/mat/utils/freespace.h>
9c6db04a5SJed Brown #include <petscbt.h>
10af0996ceSBarry Smith #include <petsc/private/isimpl.h>
1107475bc1SBarry Smith #include <../src/mat/impls/dense/seq/dense.h>
127bab7c10SHong Zhang 
13d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C)
14d71ae5a4SJacob Faibussowitsch {
15df97dc6dSFande Kong   PetscFunctionBegin;
16dbbe0bcdSBarry Smith   if (C->ops->matmultnumeric) PetscCall((*C->ops->matmultnumeric)(A, B, C));
17dbbe0bcdSBarry Smith   else PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A, B, C));
183ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
19df97dc6dSFande Kong }
20df97dc6dSFande Kong 
214222ddf1SHong Zhang /* Modified from MatCreateSeqAIJWithArrays() */
22d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt i[], PetscInt j[], PetscScalar a[], MatType mtype, Mat mat)
23d71ae5a4SJacob Faibussowitsch {
244222ddf1SHong Zhang   PetscInt    ii;
254222ddf1SHong Zhang   Mat_SeqAIJ *aij;
26cbc6b225SStefano Zampini   PetscBool   isseqaij, osingle, ofree_a, ofree_ij;
275c66b693SKris Buschelman 
285c66b693SKris Buschelman   PetscFunctionBegin;
29aed4548fSBarry Smith   PetscCheck(m <= 0 || !i[0], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0");
309566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(mat, m, n, m, n));
314222ddf1SHong Zhang 
32e4e71118SStefano Zampini   if (!mtype) {
339566063dSJacob Faibussowitsch     PetscCall(PetscObjectBaseTypeCompare((PetscObject)mat, MATSEQAIJ, &isseqaij));
349566063dSJacob Faibussowitsch     if (!isseqaij) PetscCall(MatSetType(mat, MATSEQAIJ));
35e4e71118SStefano Zampini   } else {
369566063dSJacob Faibussowitsch     PetscCall(MatSetType(mat, mtype));
37e4e71118SStefano Zampini   }
38cbc6b225SStefano Zampini 
394222ddf1SHong Zhang   aij      = (Mat_SeqAIJ *)(mat)->data;
40cbc6b225SStefano Zampini   osingle  = aij->singlemalloc;
41cbc6b225SStefano Zampini   ofree_a  = aij->free_a;
42cbc6b225SStefano Zampini   ofree_ij = aij->free_ij;
43cbc6b225SStefano Zampini   /* changes the free flags */
449566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(mat, MAT_SKIP_ALLOCATION, NULL));
45cbc6b225SStefano Zampini 
469566063dSJacob Faibussowitsch   PetscCall(PetscFree(aij->ilen));
479566063dSJacob Faibussowitsch   PetscCall(PetscFree(aij->imax));
489566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &aij->imax));
499566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &aij->ilen));
50cbc6b225SStefano Zampini   for (ii = 0, aij->nonzerorowcnt = 0, aij->rmax = 0; ii < m; ii++) {
51cbc6b225SStefano Zampini     const PetscInt rnz = i[ii + 1] - i[ii];
52cbc6b225SStefano Zampini     aij->nonzerorowcnt += !!rnz;
53cbc6b225SStefano Zampini     aij->rmax     = PetscMax(aij->rmax, rnz);
54cbc6b225SStefano Zampini     aij->ilen[ii] = aij->imax[ii] = i[ii + 1] - i[ii];
55cbc6b225SStefano Zampini   }
56cbc6b225SStefano Zampini   aij->maxnz = i[m];
57cbc6b225SStefano Zampini   aij->nz    = i[m];
584222ddf1SHong Zhang 
59cbc6b225SStefano Zampini   if (osingle) {
609566063dSJacob Faibussowitsch     PetscCall(PetscFree3(aij->a, aij->j, aij->i));
61cbc6b225SStefano Zampini   } else {
629566063dSJacob Faibussowitsch     if (ofree_a) PetscCall(PetscFree(aij->a));
639566063dSJacob Faibussowitsch     if (ofree_ij) PetscCall(PetscFree(aij->j));
649566063dSJacob Faibussowitsch     if (ofree_ij) PetscCall(PetscFree(aij->i));
65cbc6b225SStefano Zampini   }
664222ddf1SHong Zhang   aij->i     = i;
674222ddf1SHong Zhang   aij->j     = j;
684222ddf1SHong Zhang   aij->a     = a;
694222ddf1SHong Zhang   aij->nonew = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */
70cbc6b225SStefano Zampini   /* default to not retain ownership */
71cbc6b225SStefano Zampini   aij->singlemalloc = PETSC_FALSE;
724222ddf1SHong Zhang   aij->free_a       = PETSC_FALSE;
734222ddf1SHong Zhang   aij->free_ij      = PETSC_FALSE;
749566063dSJacob Faibussowitsch   PetscCall(MatCheckCompressedRow(mat, aij->nonzerorowcnt, &aij->compressedrow, aij->i, m, 0.6));
753ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
765c66b693SKris Buschelman }
771c24bd37SHong Zhang 
78d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C)
79d71ae5a4SJacob Faibussowitsch {
804222ddf1SHong Zhang   Mat_Product        *product = C->product;
814222ddf1SHong Zhang   MatProductAlgorithm alg;
824222ddf1SHong Zhang   PetscBool           flg;
834222ddf1SHong Zhang 
844222ddf1SHong Zhang   PetscFunctionBegin;
854222ddf1SHong Zhang   if (product) {
864222ddf1SHong Zhang     alg = product->alg;
874222ddf1SHong Zhang   } else {
884222ddf1SHong Zhang     alg = "sorted";
894222ddf1SHong Zhang   }
904222ddf1SHong Zhang   /* sorted */
919566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "sorted", &flg));
924222ddf1SHong Zhang   if (flg) {
939566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A, B, fill, C));
943ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
954222ddf1SHong Zhang   }
964222ddf1SHong Zhang 
974222ddf1SHong Zhang   /* scalable */
989566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "scalable", &flg));
994222ddf1SHong Zhang   if (flg) {
1009566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A, B, fill, C));
1013ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
1024222ddf1SHong Zhang   }
1034222ddf1SHong Zhang 
1044222ddf1SHong Zhang   /* scalable_fast */
1059566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "scalable_fast", &flg));
1064222ddf1SHong Zhang   if (flg) {
1079566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A, B, fill, C));
1083ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
1094222ddf1SHong Zhang   }
1104222ddf1SHong Zhang 
1114222ddf1SHong Zhang   /* heap */
1129566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "heap", &flg));
1134222ddf1SHong Zhang   if (flg) {
1149566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A, B, fill, C));
1153ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
1164222ddf1SHong Zhang   }
1174222ddf1SHong Zhang 
1184222ddf1SHong Zhang   /* btheap */
1199566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "btheap", &flg));
1204222ddf1SHong Zhang   if (flg) {
1219566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A, B, fill, C));
1223ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
1234222ddf1SHong Zhang   }
1244222ddf1SHong Zhang 
1254222ddf1SHong Zhang   /* llcondensed */
1269566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "llcondensed", &flg));
1274222ddf1SHong Zhang   if (flg) {
1289566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A, B, fill, C));
1293ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
1304222ddf1SHong Zhang   }
1314222ddf1SHong Zhang 
1324222ddf1SHong Zhang   /* rowmerge */
1339566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "rowmerge", &flg));
1344222ddf1SHong Zhang   if (flg) {
1359566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A, B, fill, C));
1363ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
1374222ddf1SHong Zhang   }
1384222ddf1SHong Zhang 
1394222ddf1SHong Zhang #if defined(PETSC_HAVE_HYPRE)
1409566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "hypre", &flg));
1414222ddf1SHong Zhang   if (flg) {
1429566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A, B, fill, C));
1433ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
1444222ddf1SHong Zhang   }
1454222ddf1SHong Zhang #endif
1464222ddf1SHong Zhang 
1474222ddf1SHong Zhang   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported");
1484222ddf1SHong Zhang }
1494222ddf1SHong Zhang 
150d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A, Mat B, PetscReal fill, Mat C)
151d71ae5a4SJacob Faibussowitsch {
152b561aa9dSHong Zhang   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
153971236abSHong Zhang   PetscInt          *ai = a->i, *bi = b->i, *ci, *cj;
154c1ba5575SJed Brown   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
155b561aa9dSHong Zhang   PetscReal          afill;
156eca6b86aSHong Zhang   PetscInt           i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax;
157eec179cfSJacob Faibussowitsch   PetscHMapI         ta;
158fb908031SHong Zhang   PetscBT            lnkbt;
1590298fd71SBarry Smith   PetscFreeSpaceList free_space = NULL, current_space = NULL;
160b561aa9dSHong Zhang 
161b561aa9dSHong Zhang   PetscFunctionBegin;
162bd958071SHong Zhang   /* Get ci and cj */
163fb908031SHong Zhang   /* Allocate ci array, arrays for fill computation and */
164fb908031SHong Zhang   /* free space for accumulating nonzero column info */
1659566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 2, &ci));
166fb908031SHong Zhang   ci[0] = 0;
167fb908031SHong Zhang 
168fb908031SHong Zhang   /* create and initialize a linked list */
169eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapICreateWithSize(bn, &ta));
170c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b, bm, ta);
171eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIGetSize(ta, &Crmax));
172eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIDestroy(&ta));
173eca6b86aSHong Zhang 
1749566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedCreate(Crmax, bn, &lnk, &lnkbt));
175fb908031SHong Zhang 
176fb908031SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
1779566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
1782205254eSKarl Rupp 
179fb908031SHong Zhang   current_space = free_space;
180fb908031SHong Zhang 
181fb908031SHong Zhang   /* Determine ci and cj */
182fb908031SHong Zhang   for (i = 0; i < am; i++) {
183fb908031SHong Zhang     anzi = ai[i + 1] - ai[i];
184fb908031SHong Zhang     aj   = a->j + ai[i];
185fb908031SHong Zhang     for (j = 0; j < anzi; j++) {
186fb908031SHong Zhang       brow = aj[j];
187fb908031SHong Zhang       bnzj = bi[brow + 1] - bi[brow];
188fb908031SHong Zhang       bj   = b->j + bi[brow];
189fb908031SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
1909566063dSJacob Faibussowitsch       PetscCall(PetscLLCondensedAddSorted(bnzj, bj, lnk, lnkbt));
191fb908031SHong Zhang     }
1928c78258cSHong Zhang     /* add possible missing diagonal entry */
19348a46eb9SPierre Jolivet     if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted(1, &i, lnk, lnkbt));
194fb908031SHong Zhang     cnzi = lnk[0];
195fb908031SHong Zhang 
196fb908031SHong Zhang     /* If free space is not available, make more free space */
197fb908031SHong Zhang     /* Double the amount of total space in the list */
198fb908031SHong Zhang     if (current_space->local_remaining < cnzi) {
1999566063dSJacob Faibussowitsch       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), &current_space));
200fb908031SHong Zhang       ndouble++;
201fb908031SHong Zhang     }
202fb908031SHong Zhang 
203fb908031SHong Zhang     /* Copy data into free space, then initialize lnk */
2049566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedClean(bn, cnzi, current_space->array, lnk, lnkbt));
2052205254eSKarl Rupp 
206fb908031SHong Zhang     current_space->array += cnzi;
207fb908031SHong Zhang     current_space->local_used += cnzi;
208fb908031SHong Zhang     current_space->local_remaining -= cnzi;
2092205254eSKarl Rupp 
210fb908031SHong Zhang     ci[i + 1] = ci[i] + cnzi;
211fb908031SHong Zhang   }
212fb908031SHong Zhang 
213fb908031SHong Zhang   /* Column indices are in the list of free space */
214fb908031SHong Zhang   /* Allocate space for cj, initialize cj, and */
215fb908031SHong Zhang   /* destroy list of free space and other temporary array(s) */
2169566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ci[am] + 1, &cj));
2179566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
2189566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedDestroy(lnk, lnkbt));
219b561aa9dSHong Zhang 
22026be0446SHong Zhang   /* put together the new symbolic matrix */
2219566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
2229566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
22358c24d83SHong Zhang 
22458c24d83SHong Zhang   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
22558c24d83SHong Zhang   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
2264222ddf1SHong Zhang   c          = (Mat_SeqAIJ *)(C->data);
227aa1aec99SHong Zhang   c->free_a  = PETSC_FALSE;
228e6b907acSBarry Smith   c->free_ij = PETSC_TRUE;
22958c24d83SHong Zhang   c->nonew   = 0;
2304222ddf1SHong Zhang 
2314222ddf1SHong Zhang   /* fast, needs non-scalable O(bn) array 'abdense' */
2324222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
2330b7e3e3dSHong Zhang 
2347212da7cSHong Zhang   /* set MatInfo */
2357212da7cSHong Zhang   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
236f2b054eeSHong Zhang   if (afill < 1.0) afill = 1.0;
2374222ddf1SHong Zhang   C->info.mallocs           = ndouble;
2384222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
2394222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
2407212da7cSHong Zhang 
2417212da7cSHong Zhang #if defined(PETSC_USE_INFO)
2427212da7cSHong Zhang   if (ci[am]) {
2439566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
2449566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
245f2b054eeSHong Zhang   } else {
2469566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
247be0fcf8dSHong Zhang   }
248f2b054eeSHong Zhang #endif
2493ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
25058c24d83SHong Zhang }
251d50806bdSBarry Smith 
252d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, Mat C)
253d71ae5a4SJacob Faibussowitsch {
254d13dce4bSSatish Balay   PetscLogDouble     flops = 0.0;
255d50806bdSBarry Smith   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
256d50806bdSBarry Smith   Mat_SeqAIJ        *b     = (Mat_SeqAIJ *)B->data;
257d50806bdSBarry Smith   Mat_SeqAIJ        *c     = (Mat_SeqAIJ *)C->data;
25838baddfdSBarry Smith   PetscInt          *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j;
259c58ca2e3SHong Zhang   PetscInt           am = A->rmap->n, cm = C->rmap->n;
260519eb980SHong Zhang   PetscInt           i, j, k, anzi, bnzi, cnzi, brow;
2612e5835c6SStefano Zampini   PetscScalar       *ca, valtmp;
262aa1aec99SHong Zhang   PetscScalar       *ab_dense;
2636718818eSStefano Zampini   PetscContainer     cab_dense;
2642e5835c6SStefano Zampini   const PetscScalar *aa, *ba, *baj;
265d50806bdSBarry Smith 
266d50806bdSBarry Smith   PetscFunctionBegin;
2679566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJGetArrayRead(A, &aa));
2689566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJGetArrayRead(B, &ba));
269aa1aec99SHong Zhang   if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
2709566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(ci[cm] + 1, &ca));
271aa1aec99SHong Zhang     c->a      = ca;
272aa1aec99SHong Zhang     c->free_a = PETSC_TRUE;
2736718818eSStefano Zampini   } else ca = c->a;
2746718818eSStefano Zampini 
2756718818eSStefano Zampini   /* TODO this should be done in the symbolic phase */
2766718818eSStefano Zampini   /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers
2776718818eSStefano Zampini      that is hard to eradicate) */
2789566063dSJacob Faibussowitsch   PetscCall(PetscObjectQuery((PetscObject)C, "__PETSc__ab_dense", (PetscObject *)&cab_dense));
2796718818eSStefano Zampini   if (!cab_dense) {
2809566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(B->cmap->N, &ab_dense));
2819566063dSJacob Faibussowitsch     PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &cab_dense));
2829566063dSJacob Faibussowitsch     PetscCall(PetscContainerSetPointer(cab_dense, ab_dense));
2839566063dSJacob Faibussowitsch     PetscCall(PetscContainerSetUserDestroy(cab_dense, PetscContainerUserDestroyDefault));
2849566063dSJacob Faibussowitsch     PetscCall(PetscObjectCompose((PetscObject)C, "__PETSc__ab_dense", (PetscObject)cab_dense));
2859566063dSJacob Faibussowitsch     PetscCall(PetscObjectDereference((PetscObject)cab_dense));
286d90d86d0SMatthew G. Knepley   }
2879566063dSJacob Faibussowitsch   PetscCall(PetscContainerGetPointer(cab_dense, (void **)&ab_dense));
2889566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(ab_dense, B->cmap->N));
289aa1aec99SHong Zhang 
290c124e916SHong Zhang   /* clean old values in C */
2919566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(ca, ci[cm]));
292d50806bdSBarry Smith   /* Traverse A row-wise. */
293d50806bdSBarry Smith   /* Build the ith row in C by summing over nonzero columns in A, */
294d50806bdSBarry Smith   /* the rows of B corresponding to nonzeros of A. */
295d50806bdSBarry Smith   for (i = 0; i < am; i++) {
296d50806bdSBarry Smith     anzi = ai[i + 1] - ai[i];
297d50806bdSBarry Smith     for (j = 0; j < anzi; j++) {
298519eb980SHong Zhang       brow = aj[j];
299d50806bdSBarry Smith       bnzi = bi[brow + 1] - bi[brow];
300d50806bdSBarry Smith       bjj  = bj + bi[brow];
301d50806bdSBarry Smith       baj  = ba + bi[brow];
302519eb980SHong Zhang       /* perform dense axpy */
30336ec6d2dSHong Zhang       valtmp = aa[j];
304ad540459SPierre Jolivet       for (k = 0; k < bnzi; k++) ab_dense[bjj[k]] += valtmp * baj[k];
305519eb980SHong Zhang       flops += 2 * bnzi;
306519eb980SHong Zhang     }
3079371c9d4SSatish Balay     aj += anzi;
3089371c9d4SSatish Balay     aa += anzi;
309c58ca2e3SHong Zhang 
310c58ca2e3SHong Zhang     cnzi = ci[i + 1] - ci[i];
311519eb980SHong Zhang     for (k = 0; k < cnzi; k++) {
312519eb980SHong Zhang       ca[k] += ab_dense[cj[k]];
313519eb980SHong Zhang       ab_dense[cj[k]] = 0.0; /* zero ab_dense */
314519eb980SHong Zhang     }
315519eb980SHong Zhang     flops += cnzi;
3169371c9d4SSatish Balay     cj += cnzi;
3179371c9d4SSatish Balay     ca += cnzi;
318519eb980SHong Zhang   }
3192e5835c6SStefano Zampini #if defined(PETSC_HAVE_DEVICE)
3202e5835c6SStefano Zampini   if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
3212e5835c6SStefano Zampini #endif
3229566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
3239566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
3249566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(flops));
3259566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJRestoreArrayRead(A, &aa));
3269566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJRestoreArrayRead(B, &ba));
3273ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
328c58ca2e3SHong Zhang }
329c58ca2e3SHong Zhang 
330d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, Mat C)
331d71ae5a4SJacob Faibussowitsch {
332c58ca2e3SHong Zhang   PetscLogDouble     flops = 0.0;
333c58ca2e3SHong Zhang   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
334c58ca2e3SHong Zhang   Mat_SeqAIJ        *b     = (Mat_SeqAIJ *)B->data;
335c58ca2e3SHong Zhang   Mat_SeqAIJ        *c     = (Mat_SeqAIJ *)C->data;
336c58ca2e3SHong Zhang   PetscInt          *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j;
337c58ca2e3SHong Zhang   PetscInt           am = A->rmap->N, cm = C->rmap->N;
338c58ca2e3SHong Zhang   PetscInt           i, j, k, anzi, bnzi, cnzi, brow;
3392e5835c6SStefano Zampini   PetscScalar       *ca = c->a, valtmp;
3402e5835c6SStefano Zampini   const PetscScalar *aa, *ba, *baj;
341c58ca2e3SHong Zhang   PetscInt           nextb;
342c58ca2e3SHong Zhang 
343c58ca2e3SHong Zhang   PetscFunctionBegin;
3449566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJGetArrayRead(A, &aa));
3459566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJGetArrayRead(B, &ba));
346cf742fccSHong Zhang   if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
3479566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(ci[cm] + 1, &ca));
348cf742fccSHong Zhang     c->a      = ca;
349cf742fccSHong Zhang     c->free_a = PETSC_TRUE;
350cf742fccSHong Zhang   }
351cf742fccSHong Zhang 
352c58ca2e3SHong Zhang   /* clean old values in C */
3539566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(ca, ci[cm]));
354c58ca2e3SHong Zhang   /* Traverse A row-wise. */
355c58ca2e3SHong Zhang   /* Build the ith row in C by summing over nonzero columns in A, */
356c58ca2e3SHong Zhang   /* the rows of B corresponding to nonzeros of A. */
357519eb980SHong Zhang   for (i = 0; i < am; i++) {
358519eb980SHong Zhang     anzi = ai[i + 1] - ai[i];
359519eb980SHong Zhang     cnzi = ci[i + 1] - ci[i];
360519eb980SHong Zhang     for (j = 0; j < anzi; j++) {
361519eb980SHong Zhang       brow = aj[j];
362519eb980SHong Zhang       bnzi = bi[brow + 1] - bi[brow];
363519eb980SHong Zhang       bjj  = bj + bi[brow];
364519eb980SHong Zhang       baj  = ba + bi[brow];
365519eb980SHong Zhang       /* perform sparse axpy */
36636ec6d2dSHong Zhang       valtmp = aa[j];
367c124e916SHong Zhang       nextb  = 0;
368c124e916SHong Zhang       for (k = 0; nextb < bnzi; k++) {
369c124e916SHong Zhang         if (cj[k] == bjj[nextb]) { /* ccol == bcol */
37036ec6d2dSHong Zhang           ca[k] += valtmp * baj[nextb++];
371c124e916SHong Zhang         }
372d50806bdSBarry Smith       }
373d50806bdSBarry Smith       flops += 2 * bnzi;
374d50806bdSBarry Smith     }
3759371c9d4SSatish Balay     aj += anzi;
3769371c9d4SSatish Balay     aa += anzi;
3779371c9d4SSatish Balay     cj += cnzi;
3789371c9d4SSatish Balay     ca += cnzi;
379d50806bdSBarry Smith   }
3802e5835c6SStefano Zampini #if defined(PETSC_HAVE_DEVICE)
3812e5835c6SStefano Zampini   if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
3822e5835c6SStefano Zampini #endif
3839566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
3849566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
3859566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(flops));
3869566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJRestoreArrayRead(A, &aa));
3879566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJRestoreArrayRead(B, &ba));
3883ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
389d50806bdSBarry Smith }
390bc011b1eSHong Zhang 
391d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A, Mat B, PetscReal fill, Mat C)
392d71ae5a4SJacob Faibussowitsch {
39325296bd5SBarry Smith   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
39425296bd5SBarry Smith   PetscInt          *ai = a->i, *bi = b->i, *ci, *cj;
3953c50cad2SHong Zhang   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
39625296bd5SBarry Smith   MatScalar         *ca;
39725296bd5SBarry Smith   PetscReal          afill;
398eca6b86aSHong Zhang   PetscInt           i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax;
399eec179cfSJacob Faibussowitsch   PetscHMapI         ta;
4000298fd71SBarry Smith   PetscFreeSpaceList free_space = NULL, current_space = NULL;
40125296bd5SBarry Smith 
40225296bd5SBarry Smith   PetscFunctionBegin;
4033c50cad2SHong Zhang   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
4043c50cad2SHong Zhang   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
4059566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 2, &ci));
4063c50cad2SHong Zhang   ci[0] = 0;
4073c50cad2SHong Zhang 
4083c50cad2SHong Zhang   /* create and initialize a linked list */
409eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapICreateWithSize(bn, &ta));
410c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b, bm, ta);
411eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIGetSize(ta, &Crmax));
412eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIDestroy(&ta));
413eca6b86aSHong Zhang 
4149566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedCreate_fast(Crmax, &lnk));
4153c50cad2SHong Zhang 
4163c50cad2SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
4179566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
4183c50cad2SHong Zhang   current_space = free_space;
4193c50cad2SHong Zhang 
4203c50cad2SHong Zhang   /* Determine ci and cj */
4213c50cad2SHong Zhang   for (i = 0; i < am; i++) {
4223c50cad2SHong Zhang     anzi = ai[i + 1] - ai[i];
4233c50cad2SHong Zhang     aj   = a->j + ai[i];
4243c50cad2SHong Zhang     for (j = 0; j < anzi; j++) {
4253c50cad2SHong Zhang       brow = aj[j];
4263c50cad2SHong Zhang       bnzj = bi[brow + 1] - bi[brow];
4273c50cad2SHong Zhang       bj   = b->j + bi[brow];
4283c50cad2SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
4299566063dSJacob Faibussowitsch       PetscCall(PetscLLCondensedAddSorted_fast(bnzj, bj, lnk));
4303c50cad2SHong Zhang     }
4318c78258cSHong Zhang     /* add possible missing diagonal entry */
43248a46eb9SPierre Jolivet     if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_fast(1, &i, lnk));
4333c50cad2SHong Zhang     cnzi = lnk[1];
4343c50cad2SHong Zhang 
4353c50cad2SHong Zhang     /* If free space is not available, make more free space */
4363c50cad2SHong Zhang     /* Double the amount of total space in the list */
4373c50cad2SHong Zhang     if (current_space->local_remaining < cnzi) {
4389566063dSJacob Faibussowitsch       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), &current_space));
4393c50cad2SHong Zhang       ndouble++;
4403c50cad2SHong Zhang     }
4413c50cad2SHong Zhang 
4423c50cad2SHong Zhang     /* Copy data into free space, then initialize lnk */
4439566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedClean_fast(cnzi, current_space->array, lnk));
4442205254eSKarl Rupp 
4453c50cad2SHong Zhang     current_space->array += cnzi;
4463c50cad2SHong Zhang     current_space->local_used += cnzi;
4473c50cad2SHong Zhang     current_space->local_remaining -= cnzi;
4482205254eSKarl Rupp 
4493c50cad2SHong Zhang     ci[i + 1] = ci[i] + cnzi;
4503c50cad2SHong Zhang   }
4513c50cad2SHong Zhang 
4523c50cad2SHong Zhang   /* Column indices are in the list of free space */
4533c50cad2SHong Zhang   /* Allocate space for cj, initialize cj, and */
4543c50cad2SHong Zhang   /* destroy list of free space and other temporary array(s) */
4559566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ci[am] + 1, &cj));
4569566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
4579566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedDestroy_fast(lnk));
45825296bd5SBarry Smith 
45925296bd5SBarry Smith   /* Allocate space for ca */
4609566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(ci[am] + 1, &ca));
46125296bd5SBarry Smith 
46225296bd5SBarry Smith   /* put together the new symbolic matrix */
4639566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C));
4649566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
46525296bd5SBarry Smith 
46625296bd5SBarry Smith   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
46725296bd5SBarry Smith   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
4684222ddf1SHong Zhang   c          = (Mat_SeqAIJ *)(C->data);
46925296bd5SBarry Smith   c->free_a  = PETSC_TRUE;
47025296bd5SBarry Smith   c->free_ij = PETSC_TRUE;
47125296bd5SBarry Smith   c->nonew   = 0;
4722205254eSKarl Rupp 
4734222ddf1SHong Zhang   /* slower, less memory */
4744222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
47525296bd5SBarry Smith 
47625296bd5SBarry Smith   /* set MatInfo */
47725296bd5SBarry Smith   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
47825296bd5SBarry Smith   if (afill < 1.0) afill = 1.0;
4794222ddf1SHong Zhang   C->info.mallocs           = ndouble;
4804222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
4814222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
48225296bd5SBarry Smith 
48325296bd5SBarry Smith #if defined(PETSC_USE_INFO)
48425296bd5SBarry Smith   if (ci[am]) {
4859566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
4869566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
48725296bd5SBarry Smith   } else {
4889566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
48925296bd5SBarry Smith   }
49025296bd5SBarry Smith #endif
4913ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
49225296bd5SBarry Smith }
49325296bd5SBarry Smith 
494d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, PetscReal fill, Mat C)
495d71ae5a4SJacob Faibussowitsch {
496e9e4536cSHong Zhang   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
497bf9555e6SHong Zhang   PetscInt          *ai = a->i, *bi = b->i, *ci, *cj;
49825c41797SHong Zhang   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
499e9e4536cSHong Zhang   MatScalar         *ca;
500e9e4536cSHong Zhang   PetscReal          afill;
501eca6b86aSHong Zhang   PetscInt           i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax;
502eec179cfSJacob Faibussowitsch   PetscHMapI         ta;
5030298fd71SBarry Smith   PetscFreeSpaceList free_space = NULL, current_space = NULL;
504e9e4536cSHong Zhang 
505e9e4536cSHong Zhang   PetscFunctionBegin;
506bd958071SHong Zhang   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
507bd958071SHong Zhang   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
5089566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 2, &ci));
509bd958071SHong Zhang   ci[0] = 0;
510bd958071SHong Zhang 
511bd958071SHong Zhang   /* create and initialize a linked list */
512eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapICreateWithSize(bn, &ta));
513c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b, bm, ta);
514eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIGetSize(ta, &Crmax));
515eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIDestroy(&ta));
5169566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk));
517bd958071SHong Zhang 
518bd958071SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
5199566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
520bd958071SHong Zhang   current_space = free_space;
521bd958071SHong Zhang 
522bd958071SHong Zhang   /* Determine ci and cj */
523bd958071SHong Zhang   for (i = 0; i < am; i++) {
524bd958071SHong Zhang     anzi = ai[i + 1] - ai[i];
525bd958071SHong Zhang     aj   = a->j + ai[i];
526bd958071SHong Zhang     for (j = 0; j < anzi; j++) {
527bd958071SHong Zhang       brow = aj[j];
528bd958071SHong Zhang       bnzj = bi[brow + 1] - bi[brow];
529bd958071SHong Zhang       bj   = b->j + bi[brow];
530bd958071SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
5319566063dSJacob Faibussowitsch       PetscCall(PetscLLCondensedAddSorted_Scalable(bnzj, bj, lnk));
532bd958071SHong Zhang     }
5338c78258cSHong Zhang     /* add possible missing diagonal entry */
53448a46eb9SPierre Jolivet     if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_Scalable(1, &i, lnk));
5358c78258cSHong Zhang 
536bd958071SHong Zhang     cnzi = lnk[0];
537bd958071SHong Zhang 
538bd958071SHong Zhang     /* If free space is not available, make more free space */
539bd958071SHong Zhang     /* Double the amount of total space in the list */
540bd958071SHong Zhang     if (current_space->local_remaining < cnzi) {
5419566063dSJacob Faibussowitsch       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), &current_space));
542bd958071SHong Zhang       ndouble++;
543bd958071SHong Zhang     }
544bd958071SHong Zhang 
545bd958071SHong Zhang     /* Copy data into free space, then initialize lnk */
5469566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedClean_Scalable(cnzi, current_space->array, lnk));
5472205254eSKarl Rupp 
548bd958071SHong Zhang     current_space->array += cnzi;
549bd958071SHong Zhang     current_space->local_used += cnzi;
550bd958071SHong Zhang     current_space->local_remaining -= cnzi;
5512205254eSKarl Rupp 
552bd958071SHong Zhang     ci[i + 1] = ci[i] + cnzi;
553bd958071SHong Zhang   }
554bd958071SHong Zhang 
555bd958071SHong Zhang   /* Column indices are in the list of free space */
556bd958071SHong Zhang   /* Allocate space for cj, initialize cj, and */
557bd958071SHong Zhang   /* destroy list of free space and other temporary array(s) */
5589566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ci[am] + 1, &cj));
5599566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
5609566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
561e9e4536cSHong Zhang 
562e9e4536cSHong Zhang   /* Allocate space for ca */
5639566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(ci[am] + 1, &ca));
564e9e4536cSHong Zhang 
565e9e4536cSHong Zhang   /* put together the new symbolic matrix */
5669566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C));
5679566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
568e9e4536cSHong Zhang 
569e9e4536cSHong Zhang   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
570e9e4536cSHong Zhang   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
5714222ddf1SHong Zhang   c          = (Mat_SeqAIJ *)(C->data);
572e9e4536cSHong Zhang   c->free_a  = PETSC_TRUE;
573e9e4536cSHong Zhang   c->free_ij = PETSC_TRUE;
574e9e4536cSHong Zhang   c->nonew   = 0;
5752205254eSKarl Rupp 
5764222ddf1SHong Zhang   /* slower, less memory */
5774222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
578e9e4536cSHong Zhang 
579e9e4536cSHong Zhang   /* set MatInfo */
580e9e4536cSHong Zhang   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
581e9e4536cSHong Zhang   if (afill < 1.0) afill = 1.0;
5824222ddf1SHong Zhang   C->info.mallocs           = ndouble;
5834222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
5844222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
585e9e4536cSHong Zhang 
586e9e4536cSHong Zhang #if defined(PETSC_USE_INFO)
587e9e4536cSHong Zhang   if (ci[am]) {
5889566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
5899566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
590e9e4536cSHong Zhang   } else {
5919566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
592e9e4536cSHong Zhang   }
593e9e4536cSHong Zhang #endif
5943ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
595e9e4536cSHong Zhang }
596e9e4536cSHong Zhang 
597d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A, Mat B, PetscReal fill, Mat C)
598d71ae5a4SJacob Faibussowitsch {
5990ced3a2bSJed Brown   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
6000ced3a2bSJed Brown   const PetscInt    *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j;
6010ced3a2bSJed Brown   PetscInt          *ci, *cj, *bb;
6020ced3a2bSJed Brown   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
6030ced3a2bSJed Brown   PetscReal          afill;
6040ced3a2bSJed Brown   PetscInt           i, j, col, ndouble = 0;
6050298fd71SBarry Smith   PetscFreeSpaceList free_space = NULL, current_space = NULL;
6060ced3a2bSJed Brown   PetscHeap          h;
6070ced3a2bSJed Brown 
6080ced3a2bSJed Brown   PetscFunctionBegin;
609cd093f1dSJed Brown   /* Get ci and cj - by merging sorted rows using a heap */
6100ced3a2bSJed Brown   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
6119566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 2, &ci));
6120ced3a2bSJed Brown   ci[0] = 0;
6130ced3a2bSJed Brown 
6140ced3a2bSJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
6159566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
6160ced3a2bSJed Brown   current_space = free_space;
6170ced3a2bSJed Brown 
6189566063dSJacob Faibussowitsch   PetscCall(PetscHeapCreate(a->rmax, &h));
6199566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(a->rmax, &bb));
6200ced3a2bSJed Brown 
6210ced3a2bSJed Brown   /* Determine ci and cj */
6220ced3a2bSJed Brown   for (i = 0; i < am; i++) {
6230ced3a2bSJed Brown     const PetscInt  anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
6240ced3a2bSJed Brown     const PetscInt *acol = aj + ai[i];        /* column indices of nonzero entries in this row */
6250ced3a2bSJed Brown     ci[i + 1]            = ci[i];
6260ced3a2bSJed Brown     /* Populate the min heap */
6270ced3a2bSJed Brown     for (j = 0; j < anzi; j++) {
6280ced3a2bSJed Brown       bb[j] = bi[acol[j]];           /* bb points at the start of the row */
6290ced3a2bSJed Brown       if (bb[j] < bi[acol[j] + 1]) { /* Add if row is nonempty */
6309566063dSJacob Faibussowitsch         PetscCall(PetscHeapAdd(h, j, bj[bb[j]++]));
6310ced3a2bSJed Brown       }
6320ced3a2bSJed Brown     }
6330ced3a2bSJed Brown     /* Pick off the min element, adding it to free space */
6349566063dSJacob Faibussowitsch     PetscCall(PetscHeapPop(h, &j, &col));
6350ced3a2bSJed Brown     while (j >= 0) {
6360ced3a2bSJed Brown       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
6379566063dSJacob Faibussowitsch         PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), &current_space));
6380ced3a2bSJed Brown         ndouble++;
6390ced3a2bSJed Brown       }
6400ced3a2bSJed Brown       *(current_space->array++) = col;
6410ced3a2bSJed Brown       current_space->local_used++;
6420ced3a2bSJed Brown       current_space->local_remaining--;
6430ced3a2bSJed Brown       ci[i + 1]++;
6440ced3a2bSJed Brown 
6450ced3a2bSJed Brown       /* stash if anything else remains in this row of B */
6469566063dSJacob Faibussowitsch       if (bb[j] < bi[acol[j] + 1]) PetscCall(PetscHeapStash(h, j, bj[bb[j]++]));
6470ced3a2bSJed Brown       while (1) { /* pop and stash any other rows of B that also had an entry in this column */
6480ced3a2bSJed Brown         PetscInt j2, col2;
6499566063dSJacob Faibussowitsch         PetscCall(PetscHeapPeek(h, &j2, &col2));
6500ced3a2bSJed Brown         if (col2 != col) break;
6519566063dSJacob Faibussowitsch         PetscCall(PetscHeapPop(h, &j2, &col2));
6529566063dSJacob Faibussowitsch         if (bb[j2] < bi[acol[j2] + 1]) PetscCall(PetscHeapStash(h, j2, bj[bb[j2]++]));
6530ced3a2bSJed Brown       }
6540ced3a2bSJed Brown       /* Put any stashed elements back into the min heap */
6559566063dSJacob Faibussowitsch       PetscCall(PetscHeapUnstash(h));
6569566063dSJacob Faibussowitsch       PetscCall(PetscHeapPop(h, &j, &col));
6570ced3a2bSJed Brown     }
6580ced3a2bSJed Brown   }
6599566063dSJacob Faibussowitsch   PetscCall(PetscFree(bb));
6609566063dSJacob Faibussowitsch   PetscCall(PetscHeapDestroy(&h));
6610ced3a2bSJed Brown 
6620ced3a2bSJed Brown   /* Column indices are in the list of free space */
6630ced3a2bSJed Brown   /* Allocate space for cj, initialize cj, and */
6640ced3a2bSJed Brown   /* destroy list of free space and other temporary array(s) */
6659566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ci[am], &cj));
6669566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
6670ced3a2bSJed Brown 
6680ced3a2bSJed Brown   /* put together the new symbolic matrix */
6699566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
6709566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
6710ced3a2bSJed Brown 
6720ced3a2bSJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
6730ced3a2bSJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
6744222ddf1SHong Zhang   c          = (Mat_SeqAIJ *)(C->data);
6750ced3a2bSJed Brown   c->free_a  = PETSC_TRUE;
6760ced3a2bSJed Brown   c->free_ij = PETSC_TRUE;
6770ced3a2bSJed Brown   c->nonew   = 0;
67826fbe8dcSKarl Rupp 
6794222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
6800ced3a2bSJed Brown 
6810ced3a2bSJed Brown   /* set MatInfo */
6820ced3a2bSJed Brown   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
6830ced3a2bSJed Brown   if (afill < 1.0) afill = 1.0;
6844222ddf1SHong Zhang   C->info.mallocs           = ndouble;
6854222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
6864222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
6870ced3a2bSJed Brown 
6880ced3a2bSJed Brown #if defined(PETSC_USE_INFO)
6890ced3a2bSJed Brown   if (ci[am]) {
6909566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
6919566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
6920ced3a2bSJed Brown   } else {
6939566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
6940ced3a2bSJed Brown   }
6950ced3a2bSJed Brown #endif
6963ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
6970ced3a2bSJed Brown }
698e9e4536cSHong Zhang 
699d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A, Mat B, PetscReal fill, Mat C)
700d71ae5a4SJacob Faibussowitsch {
7018a07c6f1SJed Brown   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
7028a07c6f1SJed Brown   const PetscInt    *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j;
7038a07c6f1SJed Brown   PetscInt          *ci, *cj, *bb;
7048a07c6f1SJed Brown   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
7058a07c6f1SJed Brown   PetscReal          afill;
7068a07c6f1SJed Brown   PetscInt           i, j, col, ndouble = 0;
7070298fd71SBarry Smith   PetscFreeSpaceList free_space = NULL, current_space = NULL;
7088a07c6f1SJed Brown   PetscHeap          h;
7098a07c6f1SJed Brown   PetscBT            bt;
7108a07c6f1SJed Brown 
7118a07c6f1SJed Brown   PetscFunctionBegin;
712cd093f1dSJed Brown   /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */
7138a07c6f1SJed Brown   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
7149566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 2, &ci));
7158a07c6f1SJed Brown   ci[0] = 0;
7168a07c6f1SJed Brown 
7178a07c6f1SJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
7189566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
7192205254eSKarl Rupp 
7208a07c6f1SJed Brown   current_space = free_space;
7218a07c6f1SJed Brown 
7229566063dSJacob Faibussowitsch   PetscCall(PetscHeapCreate(a->rmax, &h));
7239566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(a->rmax, &bb));
7249566063dSJacob Faibussowitsch   PetscCall(PetscBTCreate(bn, &bt));
7258a07c6f1SJed Brown 
7268a07c6f1SJed Brown   /* Determine ci and cj */
7278a07c6f1SJed Brown   for (i = 0; i < am; i++) {
7288a07c6f1SJed Brown     const PetscInt  anzi = ai[i + 1] - ai[i];    /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
7298a07c6f1SJed Brown     const PetscInt *acol = aj + ai[i];           /* column indices of nonzero entries in this row */
7308a07c6f1SJed Brown     const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */
7318a07c6f1SJed Brown     ci[i + 1]            = ci[i];
7328a07c6f1SJed Brown     /* Populate the min heap */
7338a07c6f1SJed Brown     for (j = 0; j < anzi; j++) {
7348a07c6f1SJed Brown       PetscInt brow = acol[j];
7358a07c6f1SJed Brown       for (bb[j] = bi[brow]; bb[j] < bi[brow + 1]; bb[j]++) {
7368a07c6f1SJed Brown         PetscInt bcol = bj[bb[j]];
7378a07c6f1SJed Brown         if (!PetscBTLookupSet(bt, bcol)) { /* new entry */
7389566063dSJacob Faibussowitsch           PetscCall(PetscHeapAdd(h, j, bcol));
7398a07c6f1SJed Brown           bb[j]++;
7408a07c6f1SJed Brown           break;
7418a07c6f1SJed Brown         }
7428a07c6f1SJed Brown       }
7438a07c6f1SJed Brown     }
7448a07c6f1SJed Brown     /* Pick off the min element, adding it to free space */
7459566063dSJacob Faibussowitsch     PetscCall(PetscHeapPop(h, &j, &col));
7468a07c6f1SJed Brown     while (j >= 0) {
7478a07c6f1SJed Brown       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
7480298fd71SBarry Smith         fptr = NULL;                            /* need PetscBTMemzero */
7499566063dSJacob Faibussowitsch         PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), &current_space));
7508a07c6f1SJed Brown         ndouble++;
7518a07c6f1SJed Brown       }
7528a07c6f1SJed Brown       *(current_space->array++) = col;
7538a07c6f1SJed Brown       current_space->local_used++;
7548a07c6f1SJed Brown       current_space->local_remaining--;
7558a07c6f1SJed Brown       ci[i + 1]++;
7568a07c6f1SJed Brown 
7578a07c6f1SJed Brown       /* stash if anything else remains in this row of B */
7588a07c6f1SJed Brown       for (; bb[j] < bi[acol[j] + 1]; bb[j]++) {
7598a07c6f1SJed Brown         PetscInt bcol = bj[bb[j]];
7608a07c6f1SJed Brown         if (!PetscBTLookupSet(bt, bcol)) { /* new entry */
7619566063dSJacob Faibussowitsch           PetscCall(PetscHeapAdd(h, j, bcol));
7628a07c6f1SJed Brown           bb[j]++;
7638a07c6f1SJed Brown           break;
7648a07c6f1SJed Brown         }
7658a07c6f1SJed Brown       }
7669566063dSJacob Faibussowitsch       PetscCall(PetscHeapPop(h, &j, &col));
7678a07c6f1SJed Brown     }
7688a07c6f1SJed Brown     if (fptr) { /* Clear the bits for this row */
7699566063dSJacob Faibussowitsch       for (; fptr < current_space->array; fptr++) PetscCall(PetscBTClear(bt, *fptr));
7708a07c6f1SJed Brown     } else { /* We reallocated so we don't remember (easily) how to clear only the bits we changed */
7719566063dSJacob Faibussowitsch       PetscCall(PetscBTMemzero(bn, bt));
7728a07c6f1SJed Brown     }
7738a07c6f1SJed Brown   }
7749566063dSJacob Faibussowitsch   PetscCall(PetscFree(bb));
7759566063dSJacob Faibussowitsch   PetscCall(PetscHeapDestroy(&h));
7769566063dSJacob Faibussowitsch   PetscCall(PetscBTDestroy(&bt));
7778a07c6f1SJed Brown 
7788a07c6f1SJed Brown   /* Column indices are in the list of free space */
7798a07c6f1SJed Brown   /* Allocate space for cj, initialize cj, and */
7808a07c6f1SJed Brown   /* destroy list of free space and other temporary array(s) */
7819566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ci[am], &cj));
7829566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
7838a07c6f1SJed Brown 
7848a07c6f1SJed Brown   /* put together the new symbolic matrix */
7859566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
7869566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
7878a07c6f1SJed Brown 
7888a07c6f1SJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
7898a07c6f1SJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
7904222ddf1SHong Zhang   c          = (Mat_SeqAIJ *)(C->data);
7918a07c6f1SJed Brown   c->free_a  = PETSC_TRUE;
7928a07c6f1SJed Brown   c->free_ij = PETSC_TRUE;
7938a07c6f1SJed Brown   c->nonew   = 0;
79426fbe8dcSKarl Rupp 
7954222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
7968a07c6f1SJed Brown 
7978a07c6f1SJed Brown   /* set MatInfo */
7988a07c6f1SJed Brown   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
7998a07c6f1SJed Brown   if (afill < 1.0) afill = 1.0;
8004222ddf1SHong Zhang   C->info.mallocs           = ndouble;
8014222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
8024222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
8038a07c6f1SJed Brown 
8048a07c6f1SJed Brown #if defined(PETSC_USE_INFO)
8058a07c6f1SJed Brown   if (ci[am]) {
8069566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
8079566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
8088a07c6f1SJed Brown   } else {
8099566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
8108a07c6f1SJed Brown   }
8118a07c6f1SJed Brown #endif
8123ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
8138a07c6f1SJed Brown }
8148a07c6f1SJed Brown 
815d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A, Mat B, PetscReal fill, Mat C)
816d71ae5a4SJacob Faibussowitsch {
817d7ed1a4dSandi selinger   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
818d7ed1a4dSandi selinger   const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j, *inputi, *inputj, *inputcol, *inputcol_L1;
819d7ed1a4dSandi selinger   PetscInt       *ci, *cj, *outputj, worki_L1[9], worki_L2[9];
820d7ed1a4dSandi selinger   PetscInt        c_maxmem, a_maxrownnz = 0, a_rownnz;
821d7ed1a4dSandi selinger   const PetscInt  workcol[8] = {0, 1, 2, 3, 4, 5, 6, 7};
822d7ed1a4dSandi selinger   const PetscInt  am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
823d7ed1a4dSandi selinger   const PetscInt *brow_ptr[8], *brow_end[8];
824d7ed1a4dSandi selinger   PetscInt        window[8];
825d7ed1a4dSandi selinger   PetscInt        window_min, old_window_min, ci_nnz, outputi_nnz = 0, L1_nrows, L2_nrows;
826d7ed1a4dSandi selinger   PetscInt        i, k, ndouble = 0, L1_rowsleft, rowsleft;
827d7ed1a4dSandi selinger   PetscReal       afill;
828f83700f2Sandi selinger   PetscInt       *workj_L1, *workj_L2, *workj_L3;
8297660c4dbSandi selinger   PetscInt        L1_nnz, L2_nnz;
830d7ed1a4dSandi selinger 
831d7ed1a4dSandi selinger   /* Step 1: Get upper bound on memory required for allocation.
832d7ed1a4dSandi selinger              Because of the way virtual memory works,
833d7ed1a4dSandi selinger              only the memory pages that are actually needed will be physically allocated. */
834d7ed1a4dSandi selinger   PetscFunctionBegin;
8359566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 1, &ci));
836d7ed1a4dSandi selinger   for (i = 0; i < am; i++) {
837d7ed1a4dSandi selinger     const PetscInt  anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
838d7ed1a4dSandi selinger     const PetscInt *acol = aj + ai[i];        /* column indices of nonzero entries in this row */
839d7ed1a4dSandi selinger     a_rownnz             = 0;
840d7ed1a4dSandi selinger     for (k = 0; k < anzi; ++k) {
841d7ed1a4dSandi selinger       a_rownnz += bi[acol[k] + 1] - bi[acol[k]];
842d7ed1a4dSandi selinger       if (a_rownnz > bn) {
843d7ed1a4dSandi selinger         a_rownnz = bn;
844d7ed1a4dSandi selinger         break;
845d7ed1a4dSandi selinger       }
846d7ed1a4dSandi selinger     }
847d7ed1a4dSandi selinger     a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz);
848d7ed1a4dSandi selinger   }
849d7ed1a4dSandi selinger   /* temporary work areas for merging rows */
8509566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L1));
8519566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L2));
8529566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(a_maxrownnz, &workj_L3));
853d7ed1a4dSandi selinger 
8547660c4dbSandi selinger   /* This should be enough for almost all matrices. If not, memory is reallocated later. */
8557660c4dbSandi selinger   c_maxmem = 8 * (ai[am] + bi[bm]);
856d7ed1a4dSandi selinger   /* Step 2: Populate pattern for C */
8579566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(c_maxmem, &cj));
858d7ed1a4dSandi selinger 
859d7ed1a4dSandi selinger   ci_nnz      = 0;
860d7ed1a4dSandi selinger   ci[0]       = 0;
861d7ed1a4dSandi selinger   worki_L1[0] = 0;
862d7ed1a4dSandi selinger   worki_L2[0] = 0;
863d7ed1a4dSandi selinger   for (i = 0; i < am; i++) {
864d7ed1a4dSandi selinger     const PetscInt  anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
865d7ed1a4dSandi selinger     const PetscInt *acol = aj + ai[i];        /* column indices of nonzero entries in this row */
866d7ed1a4dSandi selinger     rowsleft             = anzi;
867d7ed1a4dSandi selinger     inputcol_L1          = acol;
868d7ed1a4dSandi selinger     L2_nnz               = 0;
8697660c4dbSandi selinger     L2_nrows             = 1; /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1   */
8707660c4dbSandi selinger     worki_L2[1]          = 0;
87108fe1b3cSKarl Rupp     outputi_nnz          = 0;
872d7ed1a4dSandi selinger 
873d7ed1a4dSandi selinger     /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem  -> allocate more memory */
874d7ed1a4dSandi selinger     while (ci_nnz + a_maxrownnz > c_maxmem) {
875d7ed1a4dSandi selinger       c_maxmem *= 2;
876d7ed1a4dSandi selinger       ndouble++;
8779566063dSJacob Faibussowitsch       PetscCall(PetscRealloc(sizeof(PetscInt) * c_maxmem, &cj));
878d7ed1a4dSandi selinger     }
879d7ed1a4dSandi selinger 
880d7ed1a4dSandi selinger     while (rowsleft) {
881d7ed1a4dSandi selinger       L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */
882d7ed1a4dSandi selinger       L1_nrows    = 0;
8837660c4dbSandi selinger       L1_nnz      = 0;
884d7ed1a4dSandi selinger       inputcol    = inputcol_L1;
885d7ed1a4dSandi selinger       inputi      = bi;
886d7ed1a4dSandi selinger       inputj      = bj;
887d7ed1a4dSandi selinger 
888d7ed1a4dSandi selinger       /* The following macro is used to specialize for small rows in A.
889d7ed1a4dSandi selinger          This helps with compiler unrolling, improving performance substantially.
890f83700f2Sandi selinger           Input:  inputj   inputi  inputcol  bn
891d7ed1a4dSandi selinger           Output: outputj  outputi_nnz                       */
892d7ed1a4dSandi selinger #define MatMatMultSymbolic_RowMergeMacro(ANNZ) \
893a8f51744SPierre Jolivet   do { \
894d7ed1a4dSandi selinger     window_min  = bn; \
8957660c4dbSandi selinger     outputi_nnz = 0; \
8967660c4dbSandi selinger     for (k = 0; k < ANNZ; ++k) { \
897d7ed1a4dSandi selinger       brow_ptr[k] = inputj + inputi[inputcol[k]]; \
898d7ed1a4dSandi selinger       brow_end[k] = inputj + inputi[inputcol[k] + 1]; \
899d7ed1a4dSandi selinger       window[k]   = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
900d7ed1a4dSandi selinger       window_min  = PetscMin(window[k], window_min); \
901d7ed1a4dSandi selinger     } \
902d7ed1a4dSandi selinger     while (window_min < bn) { \
903d7ed1a4dSandi selinger       outputj[outputi_nnz++] = window_min; \
904d7ed1a4dSandi selinger       /* advance front and compute new minimum */ \
905d7ed1a4dSandi selinger       old_window_min = window_min; \
906d7ed1a4dSandi selinger       window_min     = bn; \
907d7ed1a4dSandi selinger       for (k = 0; k < ANNZ; ++k) { \
908d7ed1a4dSandi selinger         if (window[k] == old_window_min) { \
909d7ed1a4dSandi selinger           brow_ptr[k]++; \
910d7ed1a4dSandi selinger           window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
911d7ed1a4dSandi selinger         } \
912d7ed1a4dSandi selinger         window_min = PetscMin(window[k], window_min); \
913d7ed1a4dSandi selinger       } \
914a8f51744SPierre Jolivet     } \
915a8f51744SPierre Jolivet   } while (0)
916d7ed1a4dSandi selinger 
917d7ed1a4dSandi selinger       /************** L E V E L  1 ***************/
918d7ed1a4dSandi selinger       /* Merge up to 8 rows of B to L1 work array*/
919d7ed1a4dSandi selinger       while (L1_rowsleft) {
9207660c4dbSandi selinger         outputi_nnz = 0;
9217660c4dbSandi selinger         if (anzi > 8) outputj = workj_L1 + L1_nnz; /* Level 1 rowmerge*/
9227660c4dbSandi selinger         else outputj = cj + ci_nnz;                /* Merge directly to C */
9237660c4dbSandi selinger 
924d7ed1a4dSandi selinger         switch (L1_rowsleft) {
9259371c9d4SSatish Balay         case 1:
9269371c9d4SSatish Balay           brow_ptr[0] = inputj + inputi[inputcol[0]];
927d7ed1a4dSandi selinger           brow_end[0] = inputj + inputi[inputcol[0] + 1];
928d7ed1a4dSandi selinger           for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
929d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
930d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
931d7ed1a4dSandi selinger           L1_rowsleft = 0;
932d7ed1a4dSandi selinger           break;
9339371c9d4SSatish Balay         case 2:
9349371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(2);
935d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
936d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
937d7ed1a4dSandi selinger           L1_rowsleft = 0;
938d7ed1a4dSandi selinger           break;
9399371c9d4SSatish Balay         case 3:
9409371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(3);
941d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
942d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
943d7ed1a4dSandi selinger           L1_rowsleft = 0;
944d7ed1a4dSandi selinger           break;
9459371c9d4SSatish Balay         case 4:
9469371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(4);
947d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
948d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
949d7ed1a4dSandi selinger           L1_rowsleft = 0;
950d7ed1a4dSandi selinger           break;
9519371c9d4SSatish Balay         case 5:
9529371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(5);
953d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
954d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
955d7ed1a4dSandi selinger           L1_rowsleft = 0;
956d7ed1a4dSandi selinger           break;
9579371c9d4SSatish Balay         case 6:
9589371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(6);
959d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
960d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
961d7ed1a4dSandi selinger           L1_rowsleft = 0;
962d7ed1a4dSandi selinger           break;
9639371c9d4SSatish Balay         case 7:
9649371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(7);
965d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
966d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
967d7ed1a4dSandi selinger           L1_rowsleft = 0;
968d7ed1a4dSandi selinger           break;
9699371c9d4SSatish Balay         default:
9709371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(8);
971d7ed1a4dSandi selinger           inputcol += 8;
972d7ed1a4dSandi selinger           rowsleft -= 8;
973d7ed1a4dSandi selinger           L1_rowsleft -= 8;
974d7ed1a4dSandi selinger           break;
975d7ed1a4dSandi selinger         }
976d7ed1a4dSandi selinger         inputcol_L1 = inputcol;
9777660c4dbSandi selinger         L1_nnz += outputi_nnz;
9787660c4dbSandi selinger         worki_L1[++L1_nrows] = L1_nnz;
979d7ed1a4dSandi selinger       }
980d7ed1a4dSandi selinger 
981d7ed1a4dSandi selinger       /********************** L E V E L  2 ************************/
982d7ed1a4dSandi selinger       /* Merge from L1 work array to either C or to L2 work array */
983d7ed1a4dSandi selinger       if (anzi > 8) {
984d7ed1a4dSandi selinger         inputi      = worki_L1;
985d7ed1a4dSandi selinger         inputj      = workj_L1;
986d7ed1a4dSandi selinger         inputcol    = workcol;
987d7ed1a4dSandi selinger         outputi_nnz = 0;
988d7ed1a4dSandi selinger 
989d7ed1a4dSandi selinger         if (anzi <= 64) outputj = cj + ci_nnz; /* Merge from L1 work array to C */
990d7ed1a4dSandi selinger         else outputj = workj_L2 + L2_nnz;      /* Merge from L1 work array to L2 work array */
991d7ed1a4dSandi selinger 
992d7ed1a4dSandi selinger         switch (L1_nrows) {
9939371c9d4SSatish Balay         case 1:
9949371c9d4SSatish Balay           brow_ptr[0] = inputj + inputi[inputcol[0]];
995d7ed1a4dSandi selinger           brow_end[0] = inputj + inputi[inputcol[0] + 1];
996d7ed1a4dSandi selinger           for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
997d7ed1a4dSandi selinger           break;
998d71ae5a4SJacob Faibussowitsch         case 2:
999d71ae5a4SJacob Faibussowitsch           MatMatMultSymbolic_RowMergeMacro(2);
1000d71ae5a4SJacob Faibussowitsch           break;
1001d71ae5a4SJacob Faibussowitsch         case 3:
1002d71ae5a4SJacob Faibussowitsch           MatMatMultSymbolic_RowMergeMacro(3);
1003d71ae5a4SJacob Faibussowitsch           break;
1004d71ae5a4SJacob Faibussowitsch         case 4:
1005d71ae5a4SJacob Faibussowitsch           MatMatMultSymbolic_RowMergeMacro(4);
1006d71ae5a4SJacob Faibussowitsch           break;
1007d71ae5a4SJacob Faibussowitsch         case 5:
1008d71ae5a4SJacob Faibussowitsch           MatMatMultSymbolic_RowMergeMacro(5);
1009d71ae5a4SJacob Faibussowitsch           break;
1010d71ae5a4SJacob Faibussowitsch         case 6:
1011d71ae5a4SJacob Faibussowitsch           MatMatMultSymbolic_RowMergeMacro(6);
1012d71ae5a4SJacob Faibussowitsch           break;
1013d71ae5a4SJacob Faibussowitsch         case 7:
1014d71ae5a4SJacob Faibussowitsch           MatMatMultSymbolic_RowMergeMacro(7);
1015d71ae5a4SJacob Faibussowitsch           break;
1016d71ae5a4SJacob Faibussowitsch         case 8:
1017d71ae5a4SJacob Faibussowitsch           MatMatMultSymbolic_RowMergeMacro(8);
1018d71ae5a4SJacob Faibussowitsch           break;
1019d71ae5a4SJacob Faibussowitsch         default:
1020d71ae5a4SJacob Faibussowitsch           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L1 work array!");
1021d7ed1a4dSandi selinger         }
1022d7ed1a4dSandi selinger         L2_nnz += outputi_nnz;
1023d7ed1a4dSandi selinger         worki_L2[++L2_nrows] = L2_nnz;
1024d7ed1a4dSandi selinger 
10257660c4dbSandi selinger         /************************ L E V E L  3 **********************/
10267660c4dbSandi selinger         /* Merge from L2 work array to either C or to L2 work array */
1027d7ed1a4dSandi selinger         if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) {
1028d7ed1a4dSandi selinger           inputi      = worki_L2;
1029d7ed1a4dSandi selinger           inputj      = workj_L2;
1030d7ed1a4dSandi selinger           inputcol    = workcol;
1031d7ed1a4dSandi selinger           outputi_nnz = 0;
1032f83700f2Sandi selinger           if (rowsleft) outputj = workj_L3;
1033d7ed1a4dSandi selinger           else outputj = cj + ci_nnz;
1034d7ed1a4dSandi selinger           switch (L2_nrows) {
10359371c9d4SSatish Balay           case 1:
10369371c9d4SSatish Balay             brow_ptr[0] = inputj + inputi[inputcol[0]];
1037d7ed1a4dSandi selinger             brow_end[0] = inputj + inputi[inputcol[0] + 1];
1038d7ed1a4dSandi selinger             for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
1039d7ed1a4dSandi selinger             break;
1040d71ae5a4SJacob Faibussowitsch           case 2:
1041d71ae5a4SJacob Faibussowitsch             MatMatMultSymbolic_RowMergeMacro(2);
1042d71ae5a4SJacob Faibussowitsch             break;
1043d71ae5a4SJacob Faibussowitsch           case 3:
1044d71ae5a4SJacob Faibussowitsch             MatMatMultSymbolic_RowMergeMacro(3);
1045d71ae5a4SJacob Faibussowitsch             break;
1046d71ae5a4SJacob Faibussowitsch           case 4:
1047d71ae5a4SJacob Faibussowitsch             MatMatMultSymbolic_RowMergeMacro(4);
1048d71ae5a4SJacob Faibussowitsch             break;
1049d71ae5a4SJacob Faibussowitsch           case 5:
1050d71ae5a4SJacob Faibussowitsch             MatMatMultSymbolic_RowMergeMacro(5);
1051d71ae5a4SJacob Faibussowitsch             break;
1052d71ae5a4SJacob Faibussowitsch           case 6:
1053d71ae5a4SJacob Faibussowitsch             MatMatMultSymbolic_RowMergeMacro(6);
1054d71ae5a4SJacob Faibussowitsch             break;
1055d71ae5a4SJacob Faibussowitsch           case 7:
1056d71ae5a4SJacob Faibussowitsch             MatMatMultSymbolic_RowMergeMacro(7);
1057d71ae5a4SJacob Faibussowitsch             break;
1058d71ae5a4SJacob Faibussowitsch           case 8:
1059d71ae5a4SJacob Faibussowitsch             MatMatMultSymbolic_RowMergeMacro(8);
1060d71ae5a4SJacob Faibussowitsch             break;
1061d71ae5a4SJacob Faibussowitsch           default:
1062d71ae5a4SJacob Faibussowitsch             SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L2 work array!");
1063d7ed1a4dSandi selinger           }
1064d7ed1a4dSandi selinger           L2_nrows    = 1;
10657660c4dbSandi selinger           L2_nnz      = outputi_nnz;
10667660c4dbSandi selinger           worki_L2[1] = outputi_nnz;
10677660c4dbSandi selinger           /* Copy to workj_L2 */
1068d7ed1a4dSandi selinger           if (rowsleft) {
10697660c4dbSandi selinger             for (k = 0; k < outputi_nnz; ++k) workj_L2[k] = outputj[k];
1070d7ed1a4dSandi selinger           }
1071d7ed1a4dSandi selinger         }
1072d7ed1a4dSandi selinger       }
1073d7ed1a4dSandi selinger     } /* while (rowsleft) */
1074d7ed1a4dSandi selinger #undef MatMatMultSymbolic_RowMergeMacro
1075d7ed1a4dSandi selinger 
1076d7ed1a4dSandi selinger     /* terminate current row */
1077d7ed1a4dSandi selinger     ci_nnz += outputi_nnz;
1078d7ed1a4dSandi selinger     ci[i + 1] = ci_nnz;
1079d7ed1a4dSandi selinger   }
1080d7ed1a4dSandi selinger 
1081d7ed1a4dSandi selinger   /* Step 3: Create the new symbolic matrix */
10829566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
10839566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
1084d7ed1a4dSandi selinger 
1085d7ed1a4dSandi selinger   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1086d7ed1a4dSandi selinger   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
10874222ddf1SHong Zhang   c          = (Mat_SeqAIJ *)(C->data);
1088d7ed1a4dSandi selinger   c->free_a  = PETSC_TRUE;
1089d7ed1a4dSandi selinger   c->free_ij = PETSC_TRUE;
1090d7ed1a4dSandi selinger   c->nonew   = 0;
1091d7ed1a4dSandi selinger 
10924222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1093d7ed1a4dSandi selinger 
1094d7ed1a4dSandi selinger   /* set MatInfo */
1095d7ed1a4dSandi selinger   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
1096d7ed1a4dSandi selinger   if (afill < 1.0) afill = 1.0;
10974222ddf1SHong Zhang   C->info.mallocs           = ndouble;
10984222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
10994222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
1100d7ed1a4dSandi selinger 
1101d7ed1a4dSandi selinger #if defined(PETSC_USE_INFO)
1102d7ed1a4dSandi selinger   if (ci[am]) {
11039566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
11049566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
1105d7ed1a4dSandi selinger   } else {
11069566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
1107d7ed1a4dSandi selinger   }
1108d7ed1a4dSandi selinger #endif
1109d7ed1a4dSandi selinger 
1110d7ed1a4dSandi selinger   /* Step 4: Free temporary work areas */
11119566063dSJacob Faibussowitsch   PetscCall(PetscFree(workj_L1));
11129566063dSJacob Faibussowitsch   PetscCall(PetscFree(workj_L2));
11139566063dSJacob Faibussowitsch   PetscCall(PetscFree(workj_L3));
11143ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1115d7ed1a4dSandi selinger }
1116d7ed1a4dSandi selinger 
1117cd093f1dSJed Brown /* concatenate unique entries and then sort */
1118d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, PetscReal fill, Mat C)
1119d71ae5a4SJacob Faibussowitsch {
1120cd093f1dSJed Brown   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
1121cd093f1dSJed Brown   const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j;
11228c78258cSHong Zhang   PetscInt       *ci, *cj, bcol;
1123cd093f1dSJed Brown   PetscInt        am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
1124cd093f1dSJed Brown   PetscReal       afill;
1125cd093f1dSJed Brown   PetscInt        i, j, ndouble = 0;
1126cd093f1dSJed Brown   PetscSegBuffer  seg, segrow;
1127cd093f1dSJed Brown   char           *seen;
1128cd093f1dSJed Brown 
1129cd093f1dSJed Brown   PetscFunctionBegin;
11309566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 1, &ci));
1131cd093f1dSJed Brown   ci[0] = 0;
1132cd093f1dSJed Brown 
1133cd093f1dSJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
11349566063dSJacob Faibussowitsch   PetscCall(PetscSegBufferCreate(sizeof(PetscInt), (PetscInt)(fill * (ai[am] + bi[bm])), &seg));
11359566063dSJacob Faibussowitsch   PetscCall(PetscSegBufferCreate(sizeof(PetscInt), 100, &segrow));
11369566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(bn, &seen));
1137cd093f1dSJed Brown 
1138cd093f1dSJed Brown   /* Determine ci and cj */
1139cd093f1dSJed Brown   for (i = 0; i < am; i++) {
1140cd093f1dSJed Brown     const PetscInt  anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
1141cd093f1dSJed Brown     const PetscInt *acol = aj + ai[i];        /* column indices of nonzero entries in this row */
1142cd093f1dSJed Brown     PetscInt packlen     = 0, *PETSC_RESTRICT crow;
11438c78258cSHong Zhang 
1144cd093f1dSJed Brown     /* Pack segrow */
1145cd093f1dSJed Brown     for (j = 0; j < anzi; j++) {
1146cd093f1dSJed Brown       PetscInt brow = acol[j], bjstart = bi[brow], bjend = bi[brow + 1], k;
1147cd093f1dSJed Brown       for (k = bjstart; k < bjend; k++) {
11488c78258cSHong Zhang         bcol = bj[k];
1149cd093f1dSJed Brown         if (!seen[bcol]) { /* new entry */
1150cd093f1dSJed Brown           PetscInt *PETSC_RESTRICT slot;
11519566063dSJacob Faibussowitsch           PetscCall(PetscSegBufferGetInts(segrow, 1, &slot));
1152cd093f1dSJed Brown           *slot      = bcol;
1153cd093f1dSJed Brown           seen[bcol] = 1;
1154cd093f1dSJed Brown           packlen++;
1155cd093f1dSJed Brown         }
1156cd093f1dSJed Brown       }
1157cd093f1dSJed Brown     }
11588c78258cSHong Zhang 
11598c78258cSHong Zhang     /* Check i-th diagonal entry */
11608c78258cSHong Zhang     if (C->force_diagonals && !seen[i]) {
11618c78258cSHong Zhang       PetscInt *PETSC_RESTRICT slot;
11629566063dSJacob Faibussowitsch       PetscCall(PetscSegBufferGetInts(segrow, 1, &slot));
11638c78258cSHong Zhang       *slot   = i;
11648c78258cSHong Zhang       seen[i] = 1;
11658c78258cSHong Zhang       packlen++;
11668c78258cSHong Zhang     }
11678c78258cSHong Zhang 
11689566063dSJacob Faibussowitsch     PetscCall(PetscSegBufferGetInts(seg, packlen, &crow));
11699566063dSJacob Faibussowitsch     PetscCall(PetscSegBufferExtractTo(segrow, crow));
11709566063dSJacob Faibussowitsch     PetscCall(PetscSortInt(packlen, crow));
1171cd093f1dSJed Brown     ci[i + 1] = ci[i] + packlen;
1172cd093f1dSJed Brown     for (j = 0; j < packlen; j++) seen[crow[j]] = 0;
1173cd093f1dSJed Brown   }
11749566063dSJacob Faibussowitsch   PetscCall(PetscSegBufferDestroy(&segrow));
11759566063dSJacob Faibussowitsch   PetscCall(PetscFree(seen));
1176cd093f1dSJed Brown 
1177cd093f1dSJed Brown   /* Column indices are in the segmented buffer */
11789566063dSJacob Faibussowitsch   PetscCall(PetscSegBufferExtractAlloc(seg, &cj));
11799566063dSJacob Faibussowitsch   PetscCall(PetscSegBufferDestroy(&seg));
1180cd093f1dSJed Brown 
1181cd093f1dSJed Brown   /* put together the new symbolic matrix */
11829566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
11839566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
1184cd093f1dSJed Brown 
1185cd093f1dSJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1186cd093f1dSJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
11874222ddf1SHong Zhang   c          = (Mat_SeqAIJ *)(C->data);
1188cd093f1dSJed Brown   c->free_a  = PETSC_TRUE;
1189cd093f1dSJed Brown   c->free_ij = PETSC_TRUE;
1190cd093f1dSJed Brown   c->nonew   = 0;
1191cd093f1dSJed Brown 
11924222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1193cd093f1dSJed Brown 
1194cd093f1dSJed Brown   /* set MatInfo */
11952a09556fSStefano Zampini   afill = (PetscReal)ci[am] / PetscMax(ai[am] + bi[bm], 1) + 1.e-5;
1196cd093f1dSJed Brown   if (afill < 1.0) afill = 1.0;
11974222ddf1SHong Zhang   C->info.mallocs           = ndouble;
11984222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
11994222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
1200cd093f1dSJed Brown 
1201cd093f1dSJed Brown #if defined(PETSC_USE_INFO)
1202cd093f1dSJed Brown   if (ci[am]) {
12039566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
12049566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
1205cd093f1dSJed Brown   } else {
12069566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
1207cd093f1dSJed Brown   }
1208cd093f1dSJed Brown #endif
12093ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1210cd093f1dSJed Brown }
1211cd093f1dSJed Brown 
121266976f2fSJacob Faibussowitsch static PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(void *data)
1213d71ae5a4SJacob Faibussowitsch {
12146718818eSStefano Zampini   Mat_MatMatTransMult *abt = (Mat_MatMatTransMult *)data;
12152128a86cSHong Zhang 
12162128a86cSHong Zhang   PetscFunctionBegin;
12179566063dSJacob Faibussowitsch   PetscCall(MatTransposeColoringDestroy(&abt->matcoloring));
12189566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&abt->Bt_den));
12199566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&abt->ABt_den));
12209566063dSJacob Faibussowitsch   PetscCall(PetscFree(abt));
12213ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
12222128a86cSHong Zhang }
12232128a86cSHong Zhang 
1224d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C)
1225d71ae5a4SJacob Faibussowitsch {
122681d82fe4SHong Zhang   Mat                  Bt;
122740192850SHong Zhang   Mat_MatMatTransMult *abt;
12284222ddf1SHong Zhang   Mat_Product         *product = C->product;
12296718818eSStefano Zampini   char                *alg;
1230d2853540SHong Zhang 
123181d82fe4SHong Zhang   PetscFunctionBegin;
123228b400f6SJacob Faibussowitsch   PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
123328b400f6SJacob Faibussowitsch   PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty");
12346718818eSStefano Zampini 
123581d82fe4SHong Zhang   /* create symbolic Bt */
12367fb60732SBarry Smith   PetscCall(MatTransposeSymbolic(B, &Bt));
12379566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizes(Bt, PetscAbs(A->cmap->bs), PetscAbs(B->cmap->bs)));
12389566063dSJacob Faibussowitsch   PetscCall(MatSetType(Bt, ((PetscObject)A)->type_name));
123981d82fe4SHong Zhang 
124081d82fe4SHong Zhang   /* get symbolic C=A*Bt */
12419566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(product->alg, &alg));
12429566063dSJacob Faibussowitsch   PetscCall(MatProductSetAlgorithm(C, "sorted")); /* set algorithm for C = A*Bt */
12439566063dSJacob Faibussowitsch   PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(A, Bt, fill, C));
12449566063dSJacob Faibussowitsch   PetscCall(MatProductSetAlgorithm(C, alg)); /* resume original algorithm for ABt product */
12459566063dSJacob Faibussowitsch   PetscCall(PetscFree(alg));
124681d82fe4SHong Zhang 
1247a5b23f4aSJose E. Roman   /* create a supporting struct for reuse intermediate dense matrices with matcoloring */
12489566063dSJacob Faibussowitsch   PetscCall(PetscNew(&abt));
12492128a86cSHong Zhang 
12506718818eSStefano Zampini   product->data    = abt;
12516718818eSStefano Zampini   product->destroy = MatDestroy_SeqAIJ_MatMatMultTrans;
12526718818eSStefano Zampini 
12534222ddf1SHong Zhang   C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ;
12542128a86cSHong Zhang 
12554222ddf1SHong Zhang   abt->usecoloring = PETSC_FALSE;
12569566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "color", &abt->usecoloring));
125740192850SHong Zhang   if (abt->usecoloring) {
1258b9af6bddSHong Zhang     /* Create MatTransposeColoring from symbolic C=A*B^T */
1259b9af6bddSHong Zhang     MatTransposeColoring matcoloring;
1260335efc43SPeter Brune     MatColoring          coloring;
12612128a86cSHong Zhang     ISColoring           iscoloring;
12622128a86cSHong Zhang     Mat                  Bt_dense, C_dense;
1263e8354b3eSHong Zhang 
12644222ddf1SHong Zhang     /* inode causes memory problem */
12659566063dSJacob Faibussowitsch     PetscCall(MatSetOption(C, MAT_USE_INODES, PETSC_FALSE));
12664222ddf1SHong Zhang 
12679566063dSJacob Faibussowitsch     PetscCall(MatColoringCreate(C, &coloring));
12689566063dSJacob Faibussowitsch     PetscCall(MatColoringSetDistance(coloring, 2));
12699566063dSJacob Faibussowitsch     PetscCall(MatColoringSetType(coloring, MATCOLORINGSL));
12709566063dSJacob Faibussowitsch     PetscCall(MatColoringSetFromOptions(coloring));
12719566063dSJacob Faibussowitsch     PetscCall(MatColoringApply(coloring, &iscoloring));
12729566063dSJacob Faibussowitsch     PetscCall(MatColoringDestroy(&coloring));
12739566063dSJacob Faibussowitsch     PetscCall(MatTransposeColoringCreate(C, iscoloring, &matcoloring));
12742205254eSKarl Rupp 
127540192850SHong Zhang     abt->matcoloring = matcoloring;
12762205254eSKarl Rupp 
12779566063dSJacob Faibussowitsch     PetscCall(ISColoringDestroy(&iscoloring));
12782128a86cSHong Zhang 
12792128a86cSHong Zhang     /* Create Bt_dense and C_dense = A*Bt_dense */
12809566063dSJacob Faibussowitsch     PetscCall(MatCreate(PETSC_COMM_SELF, &Bt_dense));
12819566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(Bt_dense, A->cmap->n, matcoloring->ncolors, A->cmap->n, matcoloring->ncolors));
12829566063dSJacob Faibussowitsch     PetscCall(MatSetType(Bt_dense, MATSEQDENSE));
12839566063dSJacob Faibussowitsch     PetscCall(MatSeqDenseSetPreallocation(Bt_dense, NULL));
12842205254eSKarl Rupp 
12852128a86cSHong Zhang     Bt_dense->assembled = PETSC_TRUE;
128640192850SHong Zhang     abt->Bt_den         = Bt_dense;
12872128a86cSHong Zhang 
12889566063dSJacob Faibussowitsch     PetscCall(MatCreate(PETSC_COMM_SELF, &C_dense));
12899566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(C_dense, A->rmap->n, matcoloring->ncolors, A->rmap->n, matcoloring->ncolors));
12909566063dSJacob Faibussowitsch     PetscCall(MatSetType(C_dense, MATSEQDENSE));
12919566063dSJacob Faibussowitsch     PetscCall(MatSeqDenseSetPreallocation(C_dense, NULL));
12922205254eSKarl Rupp 
12932128a86cSHong Zhang     Bt_dense->assembled = PETSC_TRUE;
129440192850SHong Zhang     abt->ABt_den        = C_dense;
1295f94ccd6cSHong Zhang 
1296f94ccd6cSHong Zhang #if defined(PETSC_USE_INFO)
12971ce71dffSSatish Balay     {
12984222ddf1SHong Zhang       Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data;
12999371c9d4SSatish Balay       PetscCall(PetscInfo(C, "Use coloring of C=A*B^T; B^T: %" PetscInt_FMT " %" PetscInt_FMT ", Bt_dense: %" PetscInt_FMT ",%" PetscInt_FMT "; Cnz %" PetscInt_FMT " / (cm*ncolors %" PetscInt_FMT ") = %g\n", B->cmap->n, B->rmap->n, Bt_dense->rmap->n,
13009371c9d4SSatish Balay                           Bt_dense->cmap->n, c->nz, A->rmap->n * matcoloring->ncolors, (double)(((PetscReal)(c->nz)) / ((PetscReal)(A->rmap->n * matcoloring->ncolors)))));
13011ce71dffSSatish Balay     }
1302f94ccd6cSHong Zhang #endif
13032128a86cSHong Zhang   }
1304e99be685SHong Zhang   /* clean up */
13059566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&Bt));
13063ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
13075df89d91SHong Zhang }
13085df89d91SHong Zhang 
1309d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C)
1310d71ae5a4SJacob Faibussowitsch {
13115df89d91SHong Zhang   Mat_SeqAIJ          *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data;
1312e2cac8caSJed Brown   PetscInt            *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, anzi, bnzj, nexta, nextb, *acol, *bcol, brow;
131381d82fe4SHong Zhang   PetscInt             cm = C->rmap->n, *ci = c->i, *cj = c->j, i, j, cnzi, *ccol;
13145df89d91SHong Zhang   PetscLogDouble       flops = 0.0;
1315aa1aec99SHong Zhang   MatScalar           *aa = a->a, *aval, *ba = b->a, *bval, *ca, *cval;
13166718818eSStefano Zampini   Mat_MatMatTransMult *abt;
13176718818eSStefano Zampini   Mat_Product         *product = C->product;
13185df89d91SHong Zhang 
13195df89d91SHong Zhang   PetscFunctionBegin;
132028b400f6SJacob Faibussowitsch   PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
13216718818eSStefano Zampini   abt = (Mat_MatMatTransMult *)product->data;
132228b400f6SJacob Faibussowitsch   PetscCheck(abt, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
132358ed3ceaSHong Zhang   /* clear old values in C */
132458ed3ceaSHong Zhang   if (!c->a) {
13259566063dSJacob Faibussowitsch     PetscCall(PetscCalloc1(ci[cm] + 1, &ca));
132658ed3ceaSHong Zhang     c->a      = ca;
132758ed3ceaSHong Zhang     c->free_a = PETSC_TRUE;
132858ed3ceaSHong Zhang   } else {
132958ed3ceaSHong Zhang     ca = c->a;
13309566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(ca, ci[cm] + 1));
133158ed3ceaSHong Zhang   }
133258ed3ceaSHong Zhang 
133340192850SHong Zhang   if (abt->usecoloring) {
133440192850SHong Zhang     MatTransposeColoring matcoloring = abt->matcoloring;
133540192850SHong Zhang     Mat                  Bt_dense, C_dense = abt->ABt_den;
1336c8db22f6SHong Zhang 
1337b9af6bddSHong Zhang     /* Get Bt_dense by Apply MatTransposeColoring to B */
133840192850SHong Zhang     Bt_dense = abt->Bt_den;
13399566063dSJacob Faibussowitsch     PetscCall(MatTransColoringApplySpToDen(matcoloring, B, Bt_dense));
1340c8db22f6SHong Zhang 
1341c8db22f6SHong Zhang     /* C_dense = A*Bt_dense */
13429566063dSJacob Faibussowitsch     PetscCall(MatMatMultNumeric_SeqAIJ_SeqDense(A, Bt_dense, C_dense));
1343c8db22f6SHong Zhang 
13442128a86cSHong Zhang     /* Recover C from C_dense */
13459566063dSJacob Faibussowitsch     PetscCall(MatTransColoringApplyDenToSp(matcoloring, C_dense, C));
13463ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
1347c8db22f6SHong Zhang   }
1348c8db22f6SHong Zhang 
13491fa1209cSHong Zhang   for (i = 0; i < cm; i++) {
135081d82fe4SHong Zhang     anzi = ai[i + 1] - ai[i];
135181d82fe4SHong Zhang     acol = aj + ai[i];
13526973769bSHong Zhang     aval = aa + ai[i];
13531fa1209cSHong Zhang     cnzi = ci[i + 1] - ci[i];
13541fa1209cSHong Zhang     ccol = cj + ci[i];
13556973769bSHong Zhang     cval = ca + ci[i];
13561fa1209cSHong Zhang     for (j = 0; j < cnzi; j++) {
135781d82fe4SHong Zhang       brow = ccol[j];
135881d82fe4SHong Zhang       bnzj = bi[brow + 1] - bi[brow];
135981d82fe4SHong Zhang       bcol = bj + bi[brow];
13606973769bSHong Zhang       bval = ba + bi[brow];
13616973769bSHong Zhang 
136281d82fe4SHong Zhang       /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
13639371c9d4SSatish Balay       nexta = 0;
13649371c9d4SSatish Balay       nextb = 0;
136581d82fe4SHong Zhang       while (nexta < anzi && nextb < bnzj) {
13667b6d5e96SMark Adams         while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++;
136781d82fe4SHong Zhang         if (nexta == anzi) break;
13687b6d5e96SMark Adams         while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++;
136981d82fe4SHong Zhang         if (nextb == bnzj) break;
137081d82fe4SHong Zhang         if (acol[nexta] == bcol[nextb]) {
13716973769bSHong Zhang           cval[j] += aval[nexta] * bval[nextb];
13729371c9d4SSatish Balay           nexta++;
13739371c9d4SSatish Balay           nextb++;
137481d82fe4SHong Zhang           flops += 2;
13751fa1209cSHong Zhang         }
13761fa1209cSHong Zhang       }
137781d82fe4SHong Zhang     }
137881d82fe4SHong Zhang   }
13799566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
13809566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
13819566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(flops));
13823ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
13835df89d91SHong Zhang }
13845df89d91SHong Zhang 
1385d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(void *data)
1386d71ae5a4SJacob Faibussowitsch {
13876718818eSStefano Zampini   Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)data;
13886d373c3eSHong Zhang 
13896d373c3eSHong Zhang   PetscFunctionBegin;
13909566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&atb->At));
13911baa6e33SBarry Smith   if (atb->destroy) PetscCall((*atb->destroy)(atb->data));
13929566063dSJacob Faibussowitsch   PetscCall(PetscFree(atb));
13933ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
13946d373c3eSHong Zhang }
13956d373c3eSHong Zhang 
1396d71ae5a4SJacob Faibussowitsch PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C)
1397d71ae5a4SJacob Faibussowitsch {
1398089a957eSStefano Zampini   Mat          At      = NULL;
13994222ddf1SHong Zhang   Mat_Product *product = C->product;
1400089a957eSStefano Zampini   PetscBool    flg, def, square;
1401bc011b1eSHong Zhang 
1402bc011b1eSHong Zhang   PetscFunctionBegin;
1403089a957eSStefano Zampini   MatCheckProduct(C, 4);
1404b94d7dedSBarry Smith   square = (PetscBool)(A == B && A->symmetric == PETSC_BOOL3_TRUE);
14054222ddf1SHong Zhang   /* outerproduct */
14069566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "outerproduct", &flg));
14074222ddf1SHong Zhang   if (flg) {
1408bc011b1eSHong Zhang     /* create symbolic At */
1409089a957eSStefano Zampini     if (!square) {
14107fb60732SBarry Smith       PetscCall(MatTransposeSymbolic(A, &At));
14119566063dSJacob Faibussowitsch       PetscCall(MatSetBlockSizes(At, PetscAbs(A->cmap->bs), PetscAbs(B->cmap->bs)));
14129566063dSJacob Faibussowitsch       PetscCall(MatSetType(At, ((PetscObject)A)->type_name));
1413089a957eSStefano Zampini     }
1414bc011b1eSHong Zhang     /* get symbolic C=At*B */
14159566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C, "sorted"));
14169566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C));
1417bc011b1eSHong Zhang 
1418bc011b1eSHong Zhang     /* clean up */
141948a46eb9SPierre Jolivet     if (!square) PetscCall(MatDestroy(&At));
14206d373c3eSHong Zhang 
14214222ddf1SHong Zhang     C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */
14229566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C, "outerproduct"));
14233ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
14244222ddf1SHong Zhang   }
14254222ddf1SHong Zhang 
14264222ddf1SHong Zhang   /* matmatmult */
14279566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "default", &def));
14289566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "at*b", &flg));
1429089a957eSStefano Zampini   if (flg || def) {
14304222ddf1SHong Zhang     Mat_MatTransMatMult *atb;
14314222ddf1SHong Zhang 
143228b400f6SJacob Faibussowitsch     PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty");
14339566063dSJacob Faibussowitsch     PetscCall(PetscNew(&atb));
143448a46eb9SPierre Jolivet     if (!square) PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &At));
14359566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C, "sorted"));
14369566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C));
14379566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C, "at*b"));
14386718818eSStefano Zampini     product->data    = atb;
14396718818eSStefano Zampini     product->destroy = MatDestroy_SeqAIJ_MatTransMatMult;
14404222ddf1SHong Zhang     atb->At          = At;
14414222ddf1SHong Zhang 
14424222ddf1SHong Zhang     C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */
14433ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
14444222ddf1SHong Zhang   }
14454222ddf1SHong Zhang 
14464222ddf1SHong Zhang   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported");
1447bc011b1eSHong Zhang }
1448bc011b1eSHong Zhang 
1449d71ae5a4SJacob Faibussowitsch PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C)
1450d71ae5a4SJacob Faibussowitsch {
14510fbc74f4SHong Zhang   Mat_SeqAIJ    *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data;
1452d0f46423SBarry Smith   PetscInt       am = A->rmap->n, anzi, *ai = a->i, *aj = a->j, *bi = b->i, *bj, bnzi, nextb;
1453d0f46423SBarry Smith   PetscInt       cm = C->rmap->n, *ci = c->i, *cj = c->j, crow, *cjj, i, j, k;
1454d13dce4bSSatish Balay   PetscLogDouble flops = 0.0;
1455aa1aec99SHong Zhang   MatScalar     *aa    = a->a, *ba, *ca, *caj;
1456bc011b1eSHong Zhang 
1457bc011b1eSHong Zhang   PetscFunctionBegin;
1458aa1aec99SHong Zhang   if (!c->a) {
14599566063dSJacob Faibussowitsch     PetscCall(PetscCalloc1(ci[cm] + 1, &ca));
14602205254eSKarl Rupp 
1461aa1aec99SHong Zhang     c->a      = ca;
1462aa1aec99SHong Zhang     c->free_a = PETSC_TRUE;
1463aa1aec99SHong Zhang   } else {
1464aa1aec99SHong Zhang     ca = c->a;
14659566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(ca, ci[cm]));
1466aa1aec99SHong Zhang   }
1467bc011b1eSHong Zhang 
1468bc011b1eSHong Zhang   /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
1469bc011b1eSHong Zhang   for (i = 0; i < am; i++) {
1470bc011b1eSHong Zhang     bj   = b->j + bi[i];
1471bc011b1eSHong Zhang     ba   = b->a + bi[i];
1472bc011b1eSHong Zhang     bnzi = bi[i + 1] - bi[i];
1473bc011b1eSHong Zhang     anzi = ai[i + 1] - ai[i];
1474bc011b1eSHong Zhang     for (j = 0; j < anzi; j++) {
1475bc011b1eSHong Zhang       nextb = 0;
14760fbc74f4SHong Zhang       crow  = *aj++;
1477bc011b1eSHong Zhang       cjj   = cj + ci[crow];
1478bc011b1eSHong Zhang       caj   = ca + ci[crow];
1479bc011b1eSHong Zhang       /* perform sparse axpy operation.  Note cjj includes bj. */
1480bc011b1eSHong Zhang       for (k = 0; nextb < bnzi; k++) {
14810fbc74f4SHong Zhang         if (cjj[k] == *(bj + nextb)) { /* ccol == bcol */
14820fbc74f4SHong Zhang           caj[k] += (*aa) * (*(ba + nextb));
1483bc011b1eSHong Zhang           nextb++;
1484bc011b1eSHong Zhang         }
1485bc011b1eSHong Zhang       }
1486bc011b1eSHong Zhang       flops += 2 * bnzi;
14870fbc74f4SHong Zhang       aa++;
1488bc011b1eSHong Zhang     }
1489bc011b1eSHong Zhang   }
1490bc011b1eSHong Zhang 
1491bc011b1eSHong Zhang   /* Assemble the final matrix and clean up */
14929566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
14939566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
14949566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(flops));
14953ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1496bc011b1eSHong Zhang }
14979123193aSHong Zhang 
1498d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
1499d71ae5a4SJacob Faibussowitsch {
15009123193aSHong Zhang   PetscFunctionBegin;
15019566063dSJacob Faibussowitsch   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
15024222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense;
15033ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
15049123193aSHong Zhang }
15059123193aSHong Zhang 
1506d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A, Mat B, Mat C, const PetscBool add)
1507d71ae5a4SJacob Faibussowitsch {
1508f32d5d43SBarry Smith   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
15091ca9667aSStefano Zampini   PetscScalar       *c, r1, r2, r3, r4, *c1, *c2, *c3, *c4;
1510a4af7ca8SStefano Zampini   const PetscScalar *aa, *b, *b1, *b2, *b3, *b4, *av;
1511daf57211SHong Zhang   const PetscInt    *aj;
151275f6d85dSStefano Zampini   PetscInt           cm = C->rmap->n, cn = B->cmap->n, bm, am = A->rmap->n;
151375f6d85dSStefano Zampini   PetscInt           clda;
151475f6d85dSStefano Zampini   PetscInt           am4, bm4, col, i, j, n;
15159123193aSHong Zhang 
15169123193aSHong Zhang   PetscFunctionBegin;
15173ba16761SJacob Faibussowitsch   if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS);
15189566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJGetArrayRead(A, &av));
151993aa15f2SStefano Zampini   if (add) {
15209566063dSJacob Faibussowitsch     PetscCall(MatDenseGetArray(C, &c));
152193aa15f2SStefano Zampini   } else {
15229566063dSJacob Faibussowitsch     PetscCall(MatDenseGetArrayWrite(C, &c));
152393aa15f2SStefano Zampini   }
15249566063dSJacob Faibussowitsch   PetscCall(MatDenseGetArrayRead(B, &b));
15259566063dSJacob Faibussowitsch   PetscCall(MatDenseGetLDA(B, &bm));
15269566063dSJacob Faibussowitsch   PetscCall(MatDenseGetLDA(C, &clda));
152775f6d85dSStefano Zampini   am4 = 4 * clda;
152875f6d85dSStefano Zampini   bm4 = 4 * bm;
1529*5c0db29aSPierre Jolivet   if (b) {
15309371c9d4SSatish Balay     b1 = b;
15319371c9d4SSatish Balay     b2 = b1 + bm;
15329371c9d4SSatish Balay     b3 = b2 + bm;
15339371c9d4SSatish Balay     b4 = b3 + bm;
1534*5c0db29aSPierre Jolivet   } else b1 = b2 = b3 = b4 = NULL;
15359371c9d4SSatish Balay   c1 = c;
15369371c9d4SSatish Balay   c2 = c1 + clda;
15379371c9d4SSatish Balay   c3 = c2 + clda;
15389371c9d4SSatish Balay   c4 = c3 + clda;
15391ca9667aSStefano Zampini   for (col = 0; col < (cn / 4) * 4; col += 4) { /* over columns of C */
15401ca9667aSStefano Zampini     for (i = 0; i < am; i++) {                  /* over rows of A in those columns */
1541f32d5d43SBarry Smith       r1 = r2 = r3 = r4 = 0.0;
1542f32d5d43SBarry Smith       n                 = a->i[i + 1] - a->i[i];
1543*5c0db29aSPierre Jolivet       aj                = a->j ? a->j + a->i[i] : NULL;
1544*5c0db29aSPierre Jolivet       aa                = av ? av + a->i[i] : NULL;
1545f32d5d43SBarry Smith       for (j = 0; j < n; j++) {
15461ca9667aSStefano Zampini         const PetscScalar aatmp = aa[j];
15471ca9667aSStefano Zampini         const PetscInt    ajtmp = aj[j];
1548730858b9SHong Zhang         r1 += aatmp * b1[ajtmp];
1549730858b9SHong Zhang         r2 += aatmp * b2[ajtmp];
1550730858b9SHong Zhang         r3 += aatmp * b3[ajtmp];
1551730858b9SHong Zhang         r4 += aatmp * b4[ajtmp];
15529123193aSHong Zhang       }
155393aa15f2SStefano Zampini       if (add) {
155487753ddeSHong Zhang         c1[i] += r1;
155587753ddeSHong Zhang         c2[i] += r2;
155687753ddeSHong Zhang         c3[i] += r3;
155787753ddeSHong Zhang         c4[i] += r4;
155893aa15f2SStefano Zampini       } else {
155993aa15f2SStefano Zampini         c1[i] = r1;
156093aa15f2SStefano Zampini         c2[i] = r2;
156193aa15f2SStefano Zampini         c3[i] = r3;
156293aa15f2SStefano Zampini         c4[i] = r4;
156393aa15f2SStefano Zampini       }
1564f32d5d43SBarry Smith     }
1565*5c0db29aSPierre Jolivet     if (b) {
15669371c9d4SSatish Balay       b1 += bm4;
15679371c9d4SSatish Balay       b2 += bm4;
15689371c9d4SSatish Balay       b3 += bm4;
15699371c9d4SSatish Balay       b4 += bm4;
1570*5c0db29aSPierre Jolivet     }
15719371c9d4SSatish Balay     c1 += am4;
15729371c9d4SSatish Balay     c2 += am4;
15739371c9d4SSatish Balay     c3 += am4;
15749371c9d4SSatish Balay     c4 += am4;
1575f32d5d43SBarry Smith   }
157693aa15f2SStefano Zampini   /* process remaining columns */
157793aa15f2SStefano Zampini   if (col != cn) {
157893aa15f2SStefano Zampini     PetscInt rc = cn - col;
157993aa15f2SStefano Zampini 
158093aa15f2SStefano Zampini     if (rc == 1) {
158193aa15f2SStefano Zampini       for (i = 0; i < am; i++) {
1582f32d5d43SBarry Smith         r1 = 0.0;
1583f32d5d43SBarry Smith         n  = a->i[i + 1] - a->i[i];
1584f32d5d43SBarry Smith         aj = a->j + a->i[i];
1585a4af7ca8SStefano Zampini         aa = av + a->i[i];
158693aa15f2SStefano Zampini         for (j = 0; j < n; j++) r1 += aa[j] * b1[aj[j]];
158793aa15f2SStefano Zampini         if (add) c1[i] += r1;
158893aa15f2SStefano Zampini         else c1[i] = r1;
158993aa15f2SStefano Zampini       }
159093aa15f2SStefano Zampini     } else if (rc == 2) {
159193aa15f2SStefano Zampini       for (i = 0; i < am; i++) {
159293aa15f2SStefano Zampini         r1 = r2 = 0.0;
159393aa15f2SStefano Zampini         n       = a->i[i + 1] - a->i[i];
159493aa15f2SStefano Zampini         aj      = a->j + a->i[i];
159593aa15f2SStefano Zampini         aa      = av + a->i[i];
1596f32d5d43SBarry Smith         for (j = 0; j < n; j++) {
159793aa15f2SStefano Zampini           const PetscScalar aatmp = aa[j];
159893aa15f2SStefano Zampini           const PetscInt    ajtmp = aj[j];
159993aa15f2SStefano Zampini           r1 += aatmp * b1[ajtmp];
160093aa15f2SStefano Zampini           r2 += aatmp * b2[ajtmp];
1601f32d5d43SBarry Smith         }
160293aa15f2SStefano Zampini         if (add) {
160387753ddeSHong Zhang           c1[i] += r1;
160493aa15f2SStefano Zampini           c2[i] += r2;
160593aa15f2SStefano Zampini         } else {
160693aa15f2SStefano Zampini           c1[i] = r1;
160793aa15f2SStefano Zampini           c2[i] = r2;
1608f32d5d43SBarry Smith         }
160993aa15f2SStefano Zampini       }
161093aa15f2SStefano Zampini     } else {
161193aa15f2SStefano Zampini       for (i = 0; i < am; i++) {
161293aa15f2SStefano Zampini         r1 = r2 = r3 = 0.0;
161393aa15f2SStefano Zampini         n            = a->i[i + 1] - a->i[i];
1614*5c0db29aSPierre Jolivet         aj           = a->j ? a->j + a->i[i] : NULL;
1615*5c0db29aSPierre Jolivet         aa           = av ? av + a->i[i] : NULL;
161693aa15f2SStefano Zampini         for (j = 0; j < n; j++) {
161793aa15f2SStefano Zampini           const PetscScalar aatmp = aa[j];
161893aa15f2SStefano Zampini           const PetscInt    ajtmp = aj[j];
161993aa15f2SStefano Zampini           r1 += aatmp * b1[ajtmp];
162093aa15f2SStefano Zampini           r2 += aatmp * b2[ajtmp];
162193aa15f2SStefano Zampini           r3 += aatmp * b3[ajtmp];
162293aa15f2SStefano Zampini         }
162393aa15f2SStefano Zampini         if (add) {
162493aa15f2SStefano Zampini           c1[i] += r1;
162593aa15f2SStefano Zampini           c2[i] += r2;
162693aa15f2SStefano Zampini           c3[i] += r3;
162793aa15f2SStefano Zampini         } else {
162893aa15f2SStefano Zampini           c1[i] = r1;
162993aa15f2SStefano Zampini           c2[i] = r2;
163093aa15f2SStefano Zampini           c3[i] = r3;
163193aa15f2SStefano Zampini         }
163293aa15f2SStefano Zampini       }
163393aa15f2SStefano Zampini     }
1634f32d5d43SBarry Smith   }
16359566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(cn * (2.0 * a->nz)));
163693aa15f2SStefano Zampini   if (add) {
16379566063dSJacob Faibussowitsch     PetscCall(MatDenseRestoreArray(C, &c));
163893aa15f2SStefano Zampini   } else {
16399566063dSJacob Faibussowitsch     PetscCall(MatDenseRestoreArrayWrite(C, &c));
164093aa15f2SStefano Zampini   }
16419566063dSJacob Faibussowitsch   PetscCall(MatDenseRestoreArrayRead(B, &b));
16429566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJRestoreArrayRead(A, &av));
16433ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1644f32d5d43SBarry Smith }
1645f32d5d43SBarry Smith 
1646d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A, Mat B, Mat C)
1647d71ae5a4SJacob Faibussowitsch {
1648f32d5d43SBarry Smith   PetscFunctionBegin;
164908401ef6SPierre Jolivet   PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
165008401ef6SPierre Jolivet   PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
165108401ef6SPierre Jolivet   PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
16524324174eSBarry Smith 
16539566063dSJacob Faibussowitsch   PetscCall(MatMatMultNumericAdd_SeqAIJ_SeqDense(A, B, C, PETSC_FALSE));
16543ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16559123193aSHong Zhang }
1656b1683b59SHong Zhang 
1657d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C)
1658d71ae5a4SJacob Faibussowitsch {
16594222ddf1SHong Zhang   PetscFunctionBegin;
16604222ddf1SHong Zhang   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense;
16614222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
16623ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16634222ddf1SHong Zhang }
16644222ddf1SHong Zhang 
16656718818eSStefano Zampini PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat, Mat, PetscReal, Mat);
16666718818eSStefano Zampini 
1667d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C)
1668d71ae5a4SJacob Faibussowitsch {
16694222ddf1SHong Zhang   PetscFunctionBegin;
16706718818eSStefano Zampini   C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
16714222ddf1SHong Zhang   C->ops->productsymbolic          = MatProductSymbolic_AtB;
16723ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16736718818eSStefano Zampini }
16746718818eSStefano Zampini 
1675d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C)
1676d71ae5a4SJacob Faibussowitsch {
16776718818eSStefano Zampini   PetscFunctionBegin;
16786718818eSStefano Zampini   C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
16796718818eSStefano Zampini   C->ops->productsymbolic          = MatProductSymbolic_ABt;
16803ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16814222ddf1SHong Zhang }
16824222ddf1SHong Zhang 
1683d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C)
1684d71ae5a4SJacob Faibussowitsch {
16854222ddf1SHong Zhang   Mat_Product *product = C->product;
16864222ddf1SHong Zhang 
16874222ddf1SHong Zhang   PetscFunctionBegin;
16884222ddf1SHong Zhang   switch (product->type) {
1689d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_AB:
1690d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C));
1691d71ae5a4SJacob Faibussowitsch     break;
1692d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_AtB:
1693d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C));
1694d71ae5a4SJacob Faibussowitsch     break;
1695d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_ABt:
1696d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C));
1697d71ae5a4SJacob Faibussowitsch     break;
1698d71ae5a4SJacob Faibussowitsch   default:
1699d71ae5a4SJacob Faibussowitsch     break;
17004222ddf1SHong Zhang   }
17013ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
17024222ddf1SHong Zhang }
17032ef1f0ffSBarry Smith 
1704d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C)
1705d71ae5a4SJacob Faibussowitsch {
17064222ddf1SHong Zhang   Mat_Product *product = C->product;
17074222ddf1SHong Zhang   Mat          A       = product->A;
17084222ddf1SHong Zhang   PetscBool    baij;
17094222ddf1SHong Zhang 
17104222ddf1SHong Zhang   PetscFunctionBegin;
17119566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQBAIJ, &baij));
17124222ddf1SHong Zhang   if (!baij) { /* A is seqsbaij */
17134222ddf1SHong Zhang     PetscBool sbaij;
17149566063dSJacob Faibussowitsch     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQSBAIJ, &sbaij));
171528b400f6SJacob Faibussowitsch     PetscCheck(sbaij, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "Mat must be either seqbaij or seqsbaij format");
17164222ddf1SHong Zhang 
17174222ddf1SHong Zhang     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense;
17184222ddf1SHong Zhang   } else { /* A is seqbaij */
17194222ddf1SHong Zhang     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense;
17204222ddf1SHong Zhang   }
17214222ddf1SHong Zhang 
17224222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
17233ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
17244222ddf1SHong Zhang }
17254222ddf1SHong Zhang 
1726d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C)
1727d71ae5a4SJacob Faibussowitsch {
17284222ddf1SHong Zhang   Mat_Product *product = C->product;
17294222ddf1SHong Zhang 
17304222ddf1SHong Zhang   PetscFunctionBegin;
17316718818eSStefano Zampini   MatCheckProduct(C, 1);
173228b400f6SJacob Faibussowitsch   PetscCheck(product->A, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing A");
1733b94d7dedSBarry Smith   if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric == PETSC_BOOL3_TRUE)) PetscCall(MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C));
17343ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
17354222ddf1SHong Zhang }
17366718818eSStefano Zampini 
1737d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C)
1738d71ae5a4SJacob Faibussowitsch {
17394222ddf1SHong Zhang   PetscFunctionBegin;
17404222ddf1SHong Zhang   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ;
17414222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
17423ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
17434222ddf1SHong Zhang }
17444222ddf1SHong Zhang 
1745d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C)
1746d71ae5a4SJacob Faibussowitsch {
17474222ddf1SHong Zhang   Mat_Product *product = C->product;
17484222ddf1SHong Zhang 
17494222ddf1SHong Zhang   PetscFunctionBegin;
175048a46eb9SPierre Jolivet   if (product->type == MATPRODUCT_AB) PetscCall(MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C));
17513ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
17524222ddf1SHong Zhang }
17534222ddf1SHong Zhang 
1754d71ae5a4SJacob Faibussowitsch PetscErrorCode MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring, Mat B, Mat Btdense)
1755d71ae5a4SJacob Faibussowitsch {
17562f5f1f90SHong Zhang   Mat_SeqAIJ   *b       = (Mat_SeqAIJ *)B->data;
17572f5f1f90SHong Zhang   Mat_SeqDense *btdense = (Mat_SeqDense *)Btdense->data;
17582f5f1f90SHong Zhang   PetscInt     *bi = b->i, *bj = b->j;
17592f5f1f90SHong Zhang   PetscInt      m = Btdense->rmap->n, n = Btdense->cmap->n, j, k, l, col, anz, *btcol, brow, ncolumns;
17602f5f1f90SHong Zhang   MatScalar    *btval, *btval_den, *ba = b->a;
17612f5f1f90SHong Zhang   PetscInt     *columns = coloring->columns, *colorforcol = coloring->colorforcol, ncolors = coloring->ncolors;
1762c8db22f6SHong Zhang 
1763c8db22f6SHong Zhang   PetscFunctionBegin;
17642f5f1f90SHong Zhang   btval_den = btdense->v;
17659566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(btval_den, m * n));
17662f5f1f90SHong Zhang   for (k = 0; k < ncolors; k++) {
17672f5f1f90SHong Zhang     ncolumns = coloring->ncolumns[k];
17682f5f1f90SHong Zhang     for (l = 0; l < ncolumns; l++) { /* insert a row of B to a column of Btdense */
1769d2853540SHong Zhang       col   = *(columns + colorforcol[k] + l);
17702f5f1f90SHong Zhang       btcol = bj + bi[col];
17712f5f1f90SHong Zhang       btval = ba + bi[col];
17722f5f1f90SHong Zhang       anz   = bi[col + 1] - bi[col];
1773d2853540SHong Zhang       for (j = 0; j < anz; j++) {
17742f5f1f90SHong Zhang         brow            = btcol[j];
17752f5f1f90SHong Zhang         btval_den[brow] = btval[j];
1776c8db22f6SHong Zhang       }
1777c8db22f6SHong Zhang     }
17782f5f1f90SHong Zhang     btval_den += m;
1779c8db22f6SHong Zhang   }
17803ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1781c8db22f6SHong Zhang }
1782c8db22f6SHong Zhang 
1783d71ae5a4SJacob Faibussowitsch PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring, Mat Cden, Mat Csp)
1784d71ae5a4SJacob Faibussowitsch {
1785b2d2b04fSHong Zhang   Mat_SeqAIJ        *csp = (Mat_SeqAIJ *)Csp->data;
17861683a169SBarry Smith   const PetscScalar *ca_den, *ca_den_ptr;
17871683a169SBarry Smith   PetscScalar       *ca = csp->a;
1788f99a636bSHong Zhang   PetscInt           k, l, m = Cden->rmap->n, ncolors = matcoloring->ncolors;
1789e88777f2SHong Zhang   PetscInt           brows = matcoloring->brows, *den2sp = matcoloring->den2sp;
1790077f23c2SHong Zhang   PetscInt           nrows, *row, *idx;
1791077f23c2SHong Zhang   PetscInt          *rows = matcoloring->rows, *colorforrow = matcoloring->colorforrow;
17928972f759SHong Zhang 
17938972f759SHong Zhang   PetscFunctionBegin;
17949566063dSJacob Faibussowitsch   PetscCall(MatDenseGetArrayRead(Cden, &ca_den));
1795f99a636bSHong Zhang 
1796077f23c2SHong Zhang   if (brows > 0) {
1797077f23c2SHong Zhang     PetscInt *lstart, row_end, row_start;
1798980ae229SHong Zhang     lstart = matcoloring->lstart;
17999566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(lstart, ncolors));
1800eeb4fd02SHong Zhang 
1801077f23c2SHong Zhang     row_end = brows;
1802eeb4fd02SHong Zhang     if (row_end > m) row_end = m;
1803077f23c2SHong Zhang     for (row_start = 0; row_start < m; row_start += brows) { /* loop over row blocks of Csp */
1804077f23c2SHong Zhang       ca_den_ptr = ca_den;
1805980ae229SHong Zhang       for (k = 0; k < ncolors; k++) { /* loop over colors (columns of Cden) */
1806eeb4fd02SHong Zhang         nrows = matcoloring->nrows[k];
1807eeb4fd02SHong Zhang         row   = rows + colorforrow[k];
1808eeb4fd02SHong Zhang         idx   = den2sp + colorforrow[k];
1809eeb4fd02SHong Zhang         for (l = lstart[k]; l < nrows; l++) {
1810eeb4fd02SHong Zhang           if (row[l] >= row_end) {
1811eeb4fd02SHong Zhang             lstart[k] = l;
1812eeb4fd02SHong Zhang             break;
1813eeb4fd02SHong Zhang           } else {
1814077f23c2SHong Zhang             ca[idx[l]] = ca_den_ptr[row[l]];
1815eeb4fd02SHong Zhang           }
1816eeb4fd02SHong Zhang         }
1817077f23c2SHong Zhang         ca_den_ptr += m;
1818eeb4fd02SHong Zhang       }
1819077f23c2SHong Zhang       row_end += brows;
1820eeb4fd02SHong Zhang       if (row_end > m) row_end = m;
1821eeb4fd02SHong Zhang     }
1822077f23c2SHong Zhang   } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */
1823077f23c2SHong Zhang     ca_den_ptr = ca_den;
1824077f23c2SHong Zhang     for (k = 0; k < ncolors; k++) {
1825077f23c2SHong Zhang       nrows = matcoloring->nrows[k];
1826077f23c2SHong Zhang       row   = rows + colorforrow[k];
1827077f23c2SHong Zhang       idx   = den2sp + colorforrow[k];
1828ad540459SPierre Jolivet       for (l = 0; l < nrows; l++) ca[idx[l]] = ca_den_ptr[row[l]];
1829077f23c2SHong Zhang       ca_den_ptr += m;
1830077f23c2SHong Zhang     }
1831f99a636bSHong Zhang   }
1832f99a636bSHong Zhang 
18339566063dSJacob Faibussowitsch   PetscCall(MatDenseRestoreArrayRead(Cden, &ca_den));
1834f99a636bSHong Zhang #if defined(PETSC_USE_INFO)
1835077f23c2SHong Zhang   if (matcoloring->brows > 0) {
18369566063dSJacob Faibussowitsch     PetscCall(PetscInfo(Csp, "Loop over %" PetscInt_FMT " row blocks for den2sp\n", brows));
1837e88777f2SHong Zhang   } else {
18389566063dSJacob Faibussowitsch     PetscCall(PetscInfo(Csp, "Loop over colors/columns of Cden, inefficient for large sparse matrix product \n"));
1839e88777f2SHong Zhang   }
1840f99a636bSHong Zhang #endif
18413ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
18428972f759SHong Zhang }
18438972f759SHong Zhang 
1844d71ae5a4SJacob Faibussowitsch PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat, ISColoring iscoloring, MatTransposeColoring c)
1845d71ae5a4SJacob Faibussowitsch {
1846e88777f2SHong Zhang   PetscInt        i, n, nrows, Nbs, j, k, m, ncols, col, cm;
18471a83f524SJed Brown   const PetscInt *is, *ci, *cj, *row_idx;
1848b28d80bdSHong Zhang   PetscInt        nis = iscoloring->n, *rowhit, bs = 1;
1849b1683b59SHong Zhang   IS             *isa;
1850b28d80bdSHong Zhang   Mat_SeqAIJ     *csp = (Mat_SeqAIJ *)mat->data;
1851e88777f2SHong Zhang   PetscInt       *colorforrow, *rows, *rows_i, *idxhit, *spidx, *den2sp, *den2sp_i;
1852e88777f2SHong Zhang   PetscInt       *colorforcol, *columns, *columns_i, brows;
1853e88777f2SHong Zhang   PetscBool       flg;
1854b1683b59SHong Zhang 
1855b1683b59SHong Zhang   PetscFunctionBegin;
18569566063dSJacob Faibussowitsch   PetscCall(ISColoringGetIS(iscoloring, PETSC_USE_POINTER, PETSC_IGNORE, &isa));
1857e99be685SHong Zhang 
18587ecccc15SHong Zhang   /* bs >1 is not being tested yet! */
1859e88777f2SHong Zhang   Nbs       = mat->cmap->N / bs;
1860b1683b59SHong Zhang   c->M      = mat->rmap->N / bs; /* set total rows, columns and local rows */
1861e88777f2SHong Zhang   c->N      = Nbs;
1862e88777f2SHong Zhang   c->m      = c->M;
1863b1683b59SHong Zhang   c->rstart = 0;
1864e88777f2SHong Zhang   c->brows  = 100;
1865b1683b59SHong Zhang 
1866b1683b59SHong Zhang   c->ncolors = nis;
18679566063dSJacob Faibussowitsch   PetscCall(PetscMalloc3(nis, &c->ncolumns, nis, &c->nrows, nis + 1, &colorforrow));
18689566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(csp->nz + 1, &rows));
18699566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(csp->nz + 1, &den2sp));
1870e88777f2SHong Zhang 
1871e88777f2SHong Zhang   brows = c->brows;
18729566063dSJacob Faibussowitsch   PetscCall(PetscOptionsGetInt(NULL, NULL, "-matden2sp_brows", &brows, &flg));
1873e88777f2SHong Zhang   if (flg) c->brows = brows;
187448a46eb9SPierre Jolivet   if (brows > 0) PetscCall(PetscMalloc1(nis + 1, &c->lstart));
18752205254eSKarl Rupp 
1876d2853540SHong Zhang   colorforrow[0] = 0;
1877d2853540SHong Zhang   rows_i         = rows;
1878f99a636bSHong Zhang   den2sp_i       = den2sp;
1879b1683b59SHong Zhang 
18809566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nis + 1, &colorforcol));
18819566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(Nbs + 1, &columns));
18822205254eSKarl Rupp 
1883d2853540SHong Zhang   colorforcol[0] = 0;
1884d2853540SHong Zhang   columns_i      = columns;
1885d2853540SHong Zhang 
1886077f23c2SHong Zhang   /* get column-wise storage of mat */
18879566063dSJacob Faibussowitsch   PetscCall(MatGetColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL));
1888b1683b59SHong Zhang 
1889b28d80bdSHong Zhang   cm = c->m;
18909566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(cm + 1, &rowhit));
18919566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(cm + 1, &idxhit));
1892f99a636bSHong Zhang   for (i = 0; i < nis; i++) { /* loop over color */
18939566063dSJacob Faibussowitsch     PetscCall(ISGetLocalSize(isa[i], &n));
18949566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(isa[i], &is));
18952205254eSKarl Rupp 
1896b1683b59SHong Zhang     c->ncolumns[i] = n;
18971baa6e33SBarry Smith     if (n) PetscCall(PetscArraycpy(columns_i, is, n));
1898d2853540SHong Zhang     colorforcol[i + 1] = colorforcol[i] + n;
1899d2853540SHong Zhang     columns_i += n;
1900b1683b59SHong Zhang 
1901b1683b59SHong Zhang     /* fast, crude version requires O(N*N) work */
19029566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(rowhit, cm));
1903e99be685SHong Zhang 
1904b7caf3d6SHong Zhang     for (j = 0; j < n; j++) { /* loop over columns*/
1905b1683b59SHong Zhang       col     = is[j];
1906d2853540SHong Zhang       row_idx = cj + ci[col];
1907b1683b59SHong Zhang       m       = ci[col + 1] - ci[col];
1908b7caf3d6SHong Zhang       for (k = 0; k < m; k++) { /* loop over columns marking them in rowhit */
1909e99be685SHong Zhang         idxhit[*row_idx]   = spidx[ci[col] + k];
1910d2853540SHong Zhang         rowhit[*row_idx++] = col + 1;
1911b1683b59SHong Zhang       }
1912b1683b59SHong Zhang     }
1913b1683b59SHong Zhang     /* count the number of hits */
1914b1683b59SHong Zhang     nrows = 0;
1915e8354b3eSHong Zhang     for (j = 0; j < cm; j++) {
1916b1683b59SHong Zhang       if (rowhit[j]) nrows++;
1917b1683b59SHong Zhang     }
1918b1683b59SHong Zhang     c->nrows[i]        = nrows;
1919d2853540SHong Zhang     colorforrow[i + 1] = colorforrow[i] + nrows;
1920d2853540SHong Zhang 
1921b1683b59SHong Zhang     nrows = 0;
1922b7caf3d6SHong Zhang     for (j = 0; j < cm; j++) { /* loop over rows */
1923b1683b59SHong Zhang       if (rowhit[j]) {
1924d2853540SHong Zhang         rows_i[nrows]   = j;
192512b89a43SHong Zhang         den2sp_i[nrows] = idxhit[j];
1926b1683b59SHong Zhang         nrows++;
1927b1683b59SHong Zhang       }
1928b1683b59SHong Zhang     }
1929e88777f2SHong Zhang     den2sp_i += nrows;
1930077f23c2SHong Zhang 
19319566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(isa[i], &is));
1932f99a636bSHong Zhang     rows_i += nrows;
1933b1683b59SHong Zhang   }
19349566063dSJacob Faibussowitsch   PetscCall(MatRestoreColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL));
19359566063dSJacob Faibussowitsch   PetscCall(PetscFree(rowhit));
19369566063dSJacob Faibussowitsch   PetscCall(ISColoringRestoreIS(iscoloring, PETSC_USE_POINTER, &isa));
19372c71b3e2SJacob Faibussowitsch   PetscCheck(csp->nz == colorforrow[nis], PETSC_COMM_SELF, PETSC_ERR_PLIB, "csp->nz %" PetscInt_FMT " != colorforrow[nis] %" PetscInt_FMT, csp->nz, colorforrow[nis]);
1938b28d80bdSHong Zhang 
1939d2853540SHong Zhang   c->colorforrow = colorforrow;
1940d2853540SHong Zhang   c->rows        = rows;
1941f99a636bSHong Zhang   c->den2sp      = den2sp;
1942d2853540SHong Zhang   c->colorforcol = colorforcol;
1943d2853540SHong Zhang   c->columns     = columns;
19442205254eSKarl Rupp 
19459566063dSJacob Faibussowitsch   PetscCall(PetscFree(idxhit));
19463ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1947b1683b59SHong Zhang }
1948d013fc79Sandi selinger 
1949d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C)
1950d71ae5a4SJacob Faibussowitsch {
19514222ddf1SHong Zhang   Mat_Product *product = C->product;
19524222ddf1SHong Zhang   Mat          A = product->A, B = product->B;
19534222ddf1SHong Zhang 
1954df97dc6dSFande Kong   PetscFunctionBegin;
19554222ddf1SHong Zhang   if (C->ops->mattransposemultnumeric) {
19564222ddf1SHong Zhang     /* Alg: "outerproduct" */
19579566063dSJacob Faibussowitsch     PetscCall((*C->ops->mattransposemultnumeric)(A, B, C));
19584222ddf1SHong Zhang   } else {
19594222ddf1SHong Zhang     /* Alg: "matmatmult" -- C = At*B */
19606718818eSStefano Zampini     Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)product->data;
19614222ddf1SHong Zhang 
196228b400f6SJacob Faibussowitsch     PetscCheck(atb, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
19631cdffd5eSHong Zhang     if (atb->At) {
19641cdffd5eSHong Zhang       /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ();
19651cdffd5eSHong Zhang          user may have called MatProductReplaceMats() to get this A=product->A */
19661cdffd5eSHong Zhang       PetscCall(MatTransposeSetPrecursor(A, atb->At));
19677fb60732SBarry Smith       PetscCall(MatTranspose(A, MAT_REUSE_MATRIX, &atb->At));
19684222ddf1SHong Zhang     }
19697fb60732SBarry Smith     PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ(atb->At ? atb->At : A, B, C));
19704222ddf1SHong Zhang   }
19713ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1972df97dc6dSFande Kong }
1973df97dc6dSFande Kong 
1974d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C)
1975d71ae5a4SJacob Faibussowitsch {
19764222ddf1SHong Zhang   Mat_Product *product = C->product;
19774222ddf1SHong Zhang   Mat          A = product->A, B = product->B;
19784222ddf1SHong Zhang   PetscReal    fill = product->fill;
1979d013fc79Sandi selinger 
1980d013fc79Sandi selinger   PetscFunctionBegin;
19819566063dSJacob Faibussowitsch   PetscCall(MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A, B, fill, C));
19822869b61bSandi selinger 
19834222ddf1SHong Zhang   C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ;
19843ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
19852869b61bSandi selinger }
1986d013fc79Sandi selinger 
1987d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C)
1988d71ae5a4SJacob Faibussowitsch {
19894222ddf1SHong Zhang   Mat_Product *product = C->product;
19904222ddf1SHong Zhang   PetscInt     alg     = 0; /* default algorithm */
19914222ddf1SHong Zhang   PetscBool    flg     = PETSC_FALSE;
19924222ddf1SHong Zhang #if !defined(PETSC_HAVE_HYPRE)
19934222ddf1SHong Zhang   const char *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"};
19944222ddf1SHong Zhang   PetscInt    nalg        = 7;
19954222ddf1SHong Zhang #else
19964222ddf1SHong Zhang   const char *algTypes[8] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge", "hypre"};
19974222ddf1SHong Zhang   PetscInt    nalg        = 8;
19984222ddf1SHong Zhang #endif
19994222ddf1SHong Zhang 
20004222ddf1SHong Zhang   PetscFunctionBegin;
20014222ddf1SHong Zhang   /* Set default algorithm */
20029566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(C->product->alg, "default", &flg));
200348a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
2004d013fc79Sandi selinger 
20054222ddf1SHong Zhang   /* Get runtime option */
20064222ddf1SHong Zhang   if (product->api_user) {
2007d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMult", "Mat");
20089566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-matmatmult_via", "Algorithmic approach", "MatMatMult", algTypes, nalg, algTypes[0], &alg, &flg));
2009d0609cedSBarry Smith     PetscOptionsEnd();
20104222ddf1SHong Zhang   } else {
2011d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AB", "Mat");
20129566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AB", algTypes, nalg, algTypes[0], &alg, &flg));
2013d0609cedSBarry Smith     PetscOptionsEnd();
2014d013fc79Sandi selinger   }
201548a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
2016d013fc79Sandi selinger 
20174222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
20184222ddf1SHong Zhang   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ;
20193ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
20204222ddf1SHong Zhang }
2021d013fc79Sandi selinger 
2022d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C)
2023d71ae5a4SJacob Faibussowitsch {
20244222ddf1SHong Zhang   Mat_Product *product     = C->product;
20254222ddf1SHong Zhang   PetscInt     alg         = 0; /* default algorithm */
20264222ddf1SHong Zhang   PetscBool    flg         = PETSC_FALSE;
2027089a957eSStefano Zampini   const char  *algTypes[3] = {"default", "at*b", "outerproduct"};
2028089a957eSStefano Zampini   PetscInt     nalg        = 3;
2029d013fc79Sandi selinger 
20304222ddf1SHong Zhang   PetscFunctionBegin;
20314222ddf1SHong Zhang   /* Get runtime option */
20324222ddf1SHong Zhang   if (product->api_user) {
2033d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatTransposeMatMult", "Mat");
20349566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mattransposematmult_via", "Algorithmic approach", "MatTransposeMatMult", algTypes, nalg, algTypes[alg], &alg, &flg));
2035d0609cedSBarry Smith     PetscOptionsEnd();
20364222ddf1SHong Zhang   } else {
2037d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AtB", "Mat");
20389566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AtB", algTypes, nalg, algTypes[alg], &alg, &flg));
2039d0609cedSBarry Smith     PetscOptionsEnd();
20404222ddf1SHong Zhang   }
204148a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
2042d013fc79Sandi selinger 
20434222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ;
20443ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
20454222ddf1SHong Zhang }
20464222ddf1SHong Zhang 
2047d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C)
2048d71ae5a4SJacob Faibussowitsch {
20494222ddf1SHong Zhang   Mat_Product *product     = C->product;
20504222ddf1SHong Zhang   PetscInt     alg         = 0; /* default algorithm */
20514222ddf1SHong Zhang   PetscBool    flg         = PETSC_FALSE;
20524222ddf1SHong Zhang   const char  *algTypes[2] = {"default", "color"};
20534222ddf1SHong Zhang   PetscInt     nalg        = 2;
20544222ddf1SHong Zhang 
20554222ddf1SHong Zhang   PetscFunctionBegin;
20564222ddf1SHong Zhang   /* Set default algorithm */
20579566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(C->product->alg, "default", &flg));
20584222ddf1SHong Zhang   if (!flg) {
20594222ddf1SHong Zhang     alg = 1;
20609566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
20614222ddf1SHong Zhang   }
20624222ddf1SHong Zhang 
20634222ddf1SHong Zhang   /* Get runtime option */
20644222ddf1SHong Zhang   if (product->api_user) {
2065d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatTransposeMult", "Mat");
20669566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-matmattransmult_via", "Algorithmic approach", "MatMatTransposeMult", algTypes, nalg, algTypes[alg], &alg, &flg));
2067d0609cedSBarry Smith     PetscOptionsEnd();
20684222ddf1SHong Zhang   } else {
2069d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABt", "Mat");
20709566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABt", algTypes, nalg, algTypes[alg], &alg, &flg));
2071d0609cedSBarry Smith     PetscOptionsEnd();
20724222ddf1SHong Zhang   }
207348a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
20744222ddf1SHong Zhang 
20754222ddf1SHong Zhang   C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ;
20764222ddf1SHong Zhang   C->ops->productsymbolic          = MatProductSymbolic_ABt;
20773ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
20784222ddf1SHong Zhang }
20794222ddf1SHong Zhang 
2080d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C)
2081d71ae5a4SJacob Faibussowitsch {
20824222ddf1SHong Zhang   Mat_Product *product = C->product;
20834222ddf1SHong Zhang   PetscBool    flg     = PETSC_FALSE;
20844222ddf1SHong Zhang   PetscInt     alg     = 0; /* default algorithm -- alg=1 should be default!!! */
20854222ddf1SHong Zhang #if !defined(PETSC_HAVE_HYPRE)
20864222ddf1SHong Zhang   const char *algTypes[2] = {"scalable", "rap"};
20874222ddf1SHong Zhang   PetscInt    nalg        = 2;
20884222ddf1SHong Zhang #else
20894222ddf1SHong Zhang   const char *algTypes[3] = {"scalable", "rap", "hypre"};
20904222ddf1SHong Zhang   PetscInt    nalg        = 3;
20914222ddf1SHong Zhang #endif
20924222ddf1SHong Zhang 
20934222ddf1SHong Zhang   PetscFunctionBegin;
20944222ddf1SHong Zhang   /* Set default algorithm */
20959566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "default", &flg));
209648a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
20974222ddf1SHong Zhang 
20984222ddf1SHong Zhang   /* Get runtime option */
20994222ddf1SHong Zhang   if (product->api_user) {
2100d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatPtAP", "Mat");
21019566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-matptap_via", "Algorithmic approach", "MatPtAP", algTypes, nalg, algTypes[0], &alg, &flg));
2102d0609cedSBarry Smith     PetscOptionsEnd();
21034222ddf1SHong Zhang   } else {
2104d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_PtAP", "Mat");
21059566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_PtAP", algTypes, nalg, algTypes[0], &alg, &flg));
2106d0609cedSBarry Smith     PetscOptionsEnd();
21074222ddf1SHong Zhang   }
210848a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
21094222ddf1SHong Zhang 
21104222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ;
21113ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
21124222ddf1SHong Zhang }
21134222ddf1SHong Zhang 
2114d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C)
2115d71ae5a4SJacob Faibussowitsch {
21164222ddf1SHong Zhang   Mat_Product *product     = C->product;
21174222ddf1SHong Zhang   PetscBool    flg         = PETSC_FALSE;
21184222ddf1SHong Zhang   PetscInt     alg         = 0; /* default algorithm */
21194222ddf1SHong Zhang   const char  *algTypes[3] = {"r*a*rt", "r*art", "coloring_rart"};
21204222ddf1SHong Zhang   PetscInt     nalg        = 3;
21214222ddf1SHong Zhang 
21224222ddf1SHong Zhang   PetscFunctionBegin;
21234222ddf1SHong Zhang   /* Set default algorithm */
21249566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "default", &flg));
212548a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
21264222ddf1SHong Zhang 
21274222ddf1SHong Zhang   /* Get runtime option */
21284222ddf1SHong Zhang   if (product->api_user) {
2129d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatRARt", "Mat");
21309566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-matrart_via", "Algorithmic approach", "MatRARt", algTypes, nalg, algTypes[0], &alg, &flg));
2131d0609cedSBarry Smith     PetscOptionsEnd();
21324222ddf1SHong Zhang   } else {
2133d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_RARt", "Mat");
21349566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_RARt", algTypes, nalg, algTypes[0], &alg, &flg));
2135d0609cedSBarry Smith     PetscOptionsEnd();
21364222ddf1SHong Zhang   }
213748a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
21384222ddf1SHong Zhang 
21394222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ;
21403ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
21414222ddf1SHong Zhang }
21424222ddf1SHong Zhang 
21434222ddf1SHong Zhang /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */
2144d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C)
2145d71ae5a4SJacob Faibussowitsch {
21464222ddf1SHong Zhang   Mat_Product *product     = C->product;
21474222ddf1SHong Zhang   PetscInt     alg         = 0; /* default algorithm */
21484222ddf1SHong Zhang   PetscBool    flg         = PETSC_FALSE;
21494222ddf1SHong Zhang   const char  *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"};
21504222ddf1SHong Zhang   PetscInt     nalg        = 7;
21514222ddf1SHong Zhang 
21524222ddf1SHong Zhang   PetscFunctionBegin;
21534222ddf1SHong Zhang   /* Set default algorithm */
21549566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "default", &flg));
215548a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
21564222ddf1SHong Zhang 
21574222ddf1SHong Zhang   /* Get runtime option */
21584222ddf1SHong Zhang   if (product->api_user) {
2159d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMatMult", "Mat");
21609566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-matmatmatmult_via", "Algorithmic approach", "MatMatMatMult", algTypes, nalg, algTypes[alg], &alg, &flg));
2161d0609cedSBarry Smith     PetscOptionsEnd();
21624222ddf1SHong Zhang   } else {
2163d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABC", "Mat");
21649566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABC", algTypes, nalg, algTypes[alg], &alg, &flg));
2165d0609cedSBarry Smith     PetscOptionsEnd();
21664222ddf1SHong Zhang   }
216748a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
21684222ddf1SHong Zhang 
21694222ddf1SHong Zhang   C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ;
21704222ddf1SHong Zhang   C->ops->productsymbolic    = MatProductSymbolic_ABC;
21713ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
21724222ddf1SHong Zhang }
21734222ddf1SHong Zhang 
2174d71ae5a4SJacob Faibussowitsch PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C)
2175d71ae5a4SJacob Faibussowitsch {
21764222ddf1SHong Zhang   Mat_Product *product = C->product;
21774222ddf1SHong Zhang 
21784222ddf1SHong Zhang   PetscFunctionBegin;
21794222ddf1SHong Zhang   switch (product->type) {
2180d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_AB:
2181d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_AB(C));
2182d71ae5a4SJacob Faibussowitsch     break;
2183d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_AtB:
2184d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_AtB(C));
2185d71ae5a4SJacob Faibussowitsch     break;
2186d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_ABt:
2187d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_ABt(C));
2188d71ae5a4SJacob Faibussowitsch     break;
2189d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_PtAP:
2190d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_PtAP(C));
2191d71ae5a4SJacob Faibussowitsch     break;
2192d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_RARt:
2193d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_RARt(C));
2194d71ae5a4SJacob Faibussowitsch     break;
2195d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_ABC:
2196d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_ABC(C));
2197d71ae5a4SJacob Faibussowitsch     break;
2198d71ae5a4SJacob Faibussowitsch   default:
2199d71ae5a4SJacob Faibussowitsch     break;
22004222ddf1SHong Zhang   }
22013ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2202d013fc79Sandi selinger }
2203