xref: /petsc/src/mat/impls/aij/seq/matmatmult.c (revision eec179cf895b6fbcd6a0b58694319392b06e361b)
1be1d678aSKris Buschelman 
2d50806bdSBarry Smith /*
32499ec78SHong Zhang   Defines matrix-matrix product routines for pairs of SeqAIJ matrices
4d50806bdSBarry Smith           C = A * B
5d50806bdSBarry Smith */
6d50806bdSBarry Smith 
7c6db04a5SJed Brown #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
8c6db04a5SJed Brown #include <../src/mat/utils/freespace.h>
9c6db04a5SJed Brown #include <petscbt.h>
10af0996ceSBarry Smith #include <petsc/private/isimpl.h>
1107475bc1SBarry Smith #include <../src/mat/impls/dense/seq/dense.h>
127bab7c10SHong Zhang 
13d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C)
14d71ae5a4SJacob Faibussowitsch {
15df97dc6dSFande Kong   PetscFunctionBegin;
16dbbe0bcdSBarry Smith   if (C->ops->matmultnumeric) PetscCall((*C->ops->matmultnumeric)(A, B, C));
17dbbe0bcdSBarry Smith   else PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A, B, C));
18df97dc6dSFande Kong   PetscFunctionReturn(0);
19df97dc6dSFande Kong }
20df97dc6dSFande Kong 
214222ddf1SHong Zhang /* Modified from MatCreateSeqAIJWithArrays() */
22d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt i[], PetscInt j[], PetscScalar a[], MatType mtype, Mat mat)
23d71ae5a4SJacob Faibussowitsch {
244222ddf1SHong Zhang   PetscInt    ii;
254222ddf1SHong Zhang   Mat_SeqAIJ *aij;
26cbc6b225SStefano Zampini   PetscBool   isseqaij, osingle, ofree_a, ofree_ij;
275c66b693SKris Buschelman 
285c66b693SKris Buschelman   PetscFunctionBegin;
29aed4548fSBarry Smith   PetscCheck(m <= 0 || !i[0], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0");
309566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(mat, m, n, m, n));
314222ddf1SHong Zhang 
32e4e71118SStefano Zampini   if (!mtype) {
339566063dSJacob Faibussowitsch     PetscCall(PetscObjectBaseTypeCompare((PetscObject)mat, MATSEQAIJ, &isseqaij));
349566063dSJacob Faibussowitsch     if (!isseqaij) PetscCall(MatSetType(mat, MATSEQAIJ));
35e4e71118SStefano Zampini   } else {
369566063dSJacob Faibussowitsch     PetscCall(MatSetType(mat, mtype));
37e4e71118SStefano Zampini   }
38cbc6b225SStefano Zampini 
394222ddf1SHong Zhang   aij      = (Mat_SeqAIJ *)(mat)->data;
40cbc6b225SStefano Zampini   osingle  = aij->singlemalloc;
41cbc6b225SStefano Zampini   ofree_a  = aij->free_a;
42cbc6b225SStefano Zampini   ofree_ij = aij->free_ij;
43cbc6b225SStefano Zampini   /* changes the free flags */
449566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(mat, MAT_SKIP_ALLOCATION, NULL));
45cbc6b225SStefano Zampini 
469566063dSJacob Faibussowitsch   PetscCall(PetscFree(aij->ilen));
479566063dSJacob Faibussowitsch   PetscCall(PetscFree(aij->imax));
489566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &aij->imax));
499566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &aij->ilen));
50cbc6b225SStefano Zampini   for (ii = 0, aij->nonzerorowcnt = 0, aij->rmax = 0; ii < m; ii++) {
51cbc6b225SStefano Zampini     const PetscInt rnz = i[ii + 1] - i[ii];
52cbc6b225SStefano Zampini     aij->nonzerorowcnt += !!rnz;
53cbc6b225SStefano Zampini     aij->rmax     = PetscMax(aij->rmax, rnz);
54cbc6b225SStefano Zampini     aij->ilen[ii] = aij->imax[ii] = i[ii + 1] - i[ii];
55cbc6b225SStefano Zampini   }
56cbc6b225SStefano Zampini   aij->maxnz = i[m];
57cbc6b225SStefano Zampini   aij->nz    = i[m];
584222ddf1SHong Zhang 
59cbc6b225SStefano Zampini   if (osingle) {
609566063dSJacob Faibussowitsch     PetscCall(PetscFree3(aij->a, aij->j, aij->i));
61cbc6b225SStefano Zampini   } else {
629566063dSJacob Faibussowitsch     if (ofree_a) PetscCall(PetscFree(aij->a));
639566063dSJacob Faibussowitsch     if (ofree_ij) PetscCall(PetscFree(aij->j));
649566063dSJacob Faibussowitsch     if (ofree_ij) PetscCall(PetscFree(aij->i));
65cbc6b225SStefano Zampini   }
664222ddf1SHong Zhang   aij->i     = i;
674222ddf1SHong Zhang   aij->j     = j;
684222ddf1SHong Zhang   aij->a     = a;
694222ddf1SHong Zhang   aij->nonew = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */
70cbc6b225SStefano Zampini   /* default to not retain ownership */
71cbc6b225SStefano Zampini   aij->singlemalloc = PETSC_FALSE;
724222ddf1SHong Zhang   aij->free_a       = PETSC_FALSE;
734222ddf1SHong Zhang   aij->free_ij      = PETSC_FALSE;
749566063dSJacob Faibussowitsch   PetscCall(MatCheckCompressedRow(mat, aij->nonzerorowcnt, &aij->compressedrow, aij->i, m, 0.6));
755c66b693SKris Buschelman   PetscFunctionReturn(0);
765c66b693SKris Buschelman }
771c24bd37SHong Zhang 
78d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C)
79d71ae5a4SJacob Faibussowitsch {
804222ddf1SHong Zhang   Mat_Product        *product = C->product;
814222ddf1SHong Zhang   MatProductAlgorithm alg;
824222ddf1SHong Zhang   PetscBool           flg;
834222ddf1SHong Zhang 
844222ddf1SHong Zhang   PetscFunctionBegin;
854222ddf1SHong Zhang   if (product) {
864222ddf1SHong Zhang     alg = product->alg;
874222ddf1SHong Zhang   } else {
884222ddf1SHong Zhang     alg = "sorted";
894222ddf1SHong Zhang   }
904222ddf1SHong Zhang   /* sorted */
919566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "sorted", &flg));
924222ddf1SHong Zhang   if (flg) {
939566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A, B, fill, C));
944222ddf1SHong Zhang     PetscFunctionReturn(0);
954222ddf1SHong Zhang   }
964222ddf1SHong Zhang 
974222ddf1SHong Zhang   /* scalable */
989566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "scalable", &flg));
994222ddf1SHong Zhang   if (flg) {
1009566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A, B, fill, C));
1014222ddf1SHong Zhang     PetscFunctionReturn(0);
1024222ddf1SHong Zhang   }
1034222ddf1SHong Zhang 
1044222ddf1SHong Zhang   /* scalable_fast */
1059566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "scalable_fast", &flg));
1064222ddf1SHong Zhang   if (flg) {
1079566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A, B, fill, C));
1084222ddf1SHong Zhang     PetscFunctionReturn(0);
1094222ddf1SHong Zhang   }
1104222ddf1SHong Zhang 
1114222ddf1SHong Zhang   /* heap */
1129566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "heap", &flg));
1134222ddf1SHong Zhang   if (flg) {
1149566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A, B, fill, C));
1154222ddf1SHong Zhang     PetscFunctionReturn(0);
1164222ddf1SHong Zhang   }
1174222ddf1SHong Zhang 
1184222ddf1SHong Zhang   /* btheap */
1199566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "btheap", &flg));
1204222ddf1SHong Zhang   if (flg) {
1219566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A, B, fill, C));
1224222ddf1SHong Zhang     PetscFunctionReturn(0);
1234222ddf1SHong Zhang   }
1244222ddf1SHong Zhang 
1254222ddf1SHong Zhang   /* llcondensed */
1269566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "llcondensed", &flg));
1274222ddf1SHong Zhang   if (flg) {
1289566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A, B, fill, C));
1294222ddf1SHong Zhang     PetscFunctionReturn(0);
1304222ddf1SHong Zhang   }
1314222ddf1SHong Zhang 
1324222ddf1SHong Zhang   /* rowmerge */
1339566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "rowmerge", &flg));
1344222ddf1SHong Zhang   if (flg) {
1359566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A, B, fill, C));
1364222ddf1SHong Zhang     PetscFunctionReturn(0);
1374222ddf1SHong Zhang   }
1384222ddf1SHong Zhang 
1394222ddf1SHong Zhang #if defined(PETSC_HAVE_HYPRE)
1409566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "hypre", &flg));
1414222ddf1SHong Zhang   if (flg) {
1429566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A, B, fill, C));
1434222ddf1SHong Zhang     PetscFunctionReturn(0);
1444222ddf1SHong Zhang   }
1454222ddf1SHong Zhang #endif
1464222ddf1SHong Zhang 
1474222ddf1SHong Zhang   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported");
1484222ddf1SHong Zhang }
1494222ddf1SHong Zhang 
150d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A, Mat B, PetscReal fill, Mat C)
151d71ae5a4SJacob Faibussowitsch {
152b561aa9dSHong Zhang   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
153971236abSHong Zhang   PetscInt          *ai = a->i, *bi = b->i, *ci, *cj;
154c1ba5575SJed Brown   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
155b561aa9dSHong Zhang   PetscReal          afill;
156eca6b86aSHong Zhang   PetscInt           i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax;
157*eec179cfSJacob Faibussowitsch   PetscHMapI         ta;
158fb908031SHong Zhang   PetscBT            lnkbt;
1590298fd71SBarry Smith   PetscFreeSpaceList free_space = NULL, current_space = NULL;
160b561aa9dSHong Zhang 
161b561aa9dSHong Zhang   PetscFunctionBegin;
162bd958071SHong Zhang   /* Get ci and cj */
163bd958071SHong Zhang   /*---------------*/
164fb908031SHong Zhang   /* Allocate ci array, arrays for fill computation and */
165fb908031SHong Zhang   /* free space for accumulating nonzero column info */
1669566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 2, &ci));
167fb908031SHong Zhang   ci[0] = 0;
168fb908031SHong Zhang 
169fb908031SHong Zhang   /* create and initialize a linked list */
170*eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapICreateWithSize(bn, &ta));
171c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b, bm, ta);
172*eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIGetSize(ta, &Crmax));
173*eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIDestroy(&ta));
174eca6b86aSHong Zhang 
1759566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedCreate(Crmax, bn, &lnk, &lnkbt));
176fb908031SHong Zhang 
177fb908031SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
1789566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
1792205254eSKarl Rupp 
180fb908031SHong Zhang   current_space = free_space;
181fb908031SHong Zhang 
182fb908031SHong Zhang   /* Determine ci and cj */
183fb908031SHong Zhang   for (i = 0; i < am; i++) {
184fb908031SHong Zhang     anzi = ai[i + 1] - ai[i];
185fb908031SHong Zhang     aj   = a->j + ai[i];
186fb908031SHong Zhang     for (j = 0; j < anzi; j++) {
187fb908031SHong Zhang       brow = aj[j];
188fb908031SHong Zhang       bnzj = bi[brow + 1] - bi[brow];
189fb908031SHong Zhang       bj   = b->j + bi[brow];
190fb908031SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
1919566063dSJacob Faibussowitsch       PetscCall(PetscLLCondensedAddSorted(bnzj, bj, lnk, lnkbt));
192fb908031SHong Zhang     }
1938c78258cSHong Zhang     /* add possible missing diagonal entry */
19448a46eb9SPierre Jolivet     if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted(1, &i, lnk, lnkbt));
195fb908031SHong Zhang     cnzi = lnk[0];
196fb908031SHong Zhang 
197fb908031SHong Zhang     /* If free space is not available, make more free space */
198fb908031SHong Zhang     /* Double the amount of total space in the list */
199fb908031SHong Zhang     if (current_space->local_remaining < cnzi) {
2009566063dSJacob Faibussowitsch       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), &current_space));
201fb908031SHong Zhang       ndouble++;
202fb908031SHong Zhang     }
203fb908031SHong Zhang 
204fb908031SHong Zhang     /* Copy data into free space, then initialize lnk */
2059566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedClean(bn, cnzi, current_space->array, lnk, lnkbt));
2062205254eSKarl Rupp 
207fb908031SHong Zhang     current_space->array += cnzi;
208fb908031SHong Zhang     current_space->local_used += cnzi;
209fb908031SHong Zhang     current_space->local_remaining -= cnzi;
2102205254eSKarl Rupp 
211fb908031SHong Zhang     ci[i + 1] = ci[i] + cnzi;
212fb908031SHong Zhang   }
213fb908031SHong Zhang 
214fb908031SHong Zhang   /* Column indices are in the list of free space */
215fb908031SHong Zhang   /* Allocate space for cj, initialize cj, and */
216fb908031SHong Zhang   /* destroy list of free space and other temporary array(s) */
2179566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ci[am] + 1, &cj));
2189566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
2199566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedDestroy(lnk, lnkbt));
220b561aa9dSHong Zhang 
22126be0446SHong Zhang   /* put together the new symbolic matrix */
2229566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
2239566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
22458c24d83SHong Zhang 
22558c24d83SHong Zhang   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
22658c24d83SHong Zhang   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
2274222ddf1SHong Zhang   c          = (Mat_SeqAIJ *)(C->data);
228aa1aec99SHong Zhang   c->free_a  = PETSC_FALSE;
229e6b907acSBarry Smith   c->free_ij = PETSC_TRUE;
23058c24d83SHong Zhang   c->nonew   = 0;
2314222ddf1SHong Zhang 
2324222ddf1SHong Zhang   /* fast, needs non-scalable O(bn) array 'abdense' */
2334222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
2340b7e3e3dSHong Zhang 
2357212da7cSHong Zhang   /* set MatInfo */
2367212da7cSHong Zhang   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
237f2b054eeSHong Zhang   if (afill < 1.0) afill = 1.0;
2384222ddf1SHong Zhang   C->info.mallocs           = ndouble;
2394222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
2404222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
2417212da7cSHong Zhang 
2427212da7cSHong Zhang #if defined(PETSC_USE_INFO)
2437212da7cSHong Zhang   if (ci[am]) {
2449566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
2459566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
246f2b054eeSHong Zhang   } else {
2479566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
248be0fcf8dSHong Zhang   }
249f2b054eeSHong Zhang #endif
25058c24d83SHong Zhang   PetscFunctionReturn(0);
25158c24d83SHong Zhang }
252d50806bdSBarry Smith 
253d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, Mat C)
254d71ae5a4SJacob Faibussowitsch {
255d13dce4bSSatish Balay   PetscLogDouble     flops = 0.0;
256d50806bdSBarry Smith   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
257d50806bdSBarry Smith   Mat_SeqAIJ        *b     = (Mat_SeqAIJ *)B->data;
258d50806bdSBarry Smith   Mat_SeqAIJ        *c     = (Mat_SeqAIJ *)C->data;
25938baddfdSBarry Smith   PetscInt          *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j;
260c58ca2e3SHong Zhang   PetscInt           am = A->rmap->n, cm = C->rmap->n;
261519eb980SHong Zhang   PetscInt           i, j, k, anzi, bnzi, cnzi, brow;
2622e5835c6SStefano Zampini   PetscScalar       *ca, valtmp;
263aa1aec99SHong Zhang   PetscScalar       *ab_dense;
2646718818eSStefano Zampini   PetscContainer     cab_dense;
2652e5835c6SStefano Zampini   const PetscScalar *aa, *ba, *baj;
266d50806bdSBarry Smith 
267d50806bdSBarry Smith   PetscFunctionBegin;
2689566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJGetArrayRead(A, &aa));
2699566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJGetArrayRead(B, &ba));
270aa1aec99SHong Zhang   if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
2719566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(ci[cm] + 1, &ca));
272aa1aec99SHong Zhang     c->a      = ca;
273aa1aec99SHong Zhang     c->free_a = PETSC_TRUE;
2746718818eSStefano Zampini   } else ca = c->a;
2756718818eSStefano Zampini 
2766718818eSStefano Zampini   /* TODO this should be done in the symbolic phase */
2776718818eSStefano Zampini   /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers
2786718818eSStefano Zampini      that is hard to eradicate) */
2799566063dSJacob Faibussowitsch   PetscCall(PetscObjectQuery((PetscObject)C, "__PETSc__ab_dense", (PetscObject *)&cab_dense));
2806718818eSStefano Zampini   if (!cab_dense) {
2819566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(B->cmap->N, &ab_dense));
2829566063dSJacob Faibussowitsch     PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &cab_dense));
2839566063dSJacob Faibussowitsch     PetscCall(PetscContainerSetPointer(cab_dense, ab_dense));
2849566063dSJacob Faibussowitsch     PetscCall(PetscContainerSetUserDestroy(cab_dense, PetscContainerUserDestroyDefault));
2859566063dSJacob Faibussowitsch     PetscCall(PetscObjectCompose((PetscObject)C, "__PETSc__ab_dense", (PetscObject)cab_dense));
2869566063dSJacob Faibussowitsch     PetscCall(PetscObjectDereference((PetscObject)cab_dense));
287d90d86d0SMatthew G. Knepley   }
2889566063dSJacob Faibussowitsch   PetscCall(PetscContainerGetPointer(cab_dense, (void **)&ab_dense));
2899566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(ab_dense, B->cmap->N));
290aa1aec99SHong Zhang 
291c124e916SHong Zhang   /* clean old values in C */
2929566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(ca, ci[cm]));
293d50806bdSBarry Smith   /* Traverse A row-wise. */
294d50806bdSBarry Smith   /* Build the ith row in C by summing over nonzero columns in A, */
295d50806bdSBarry Smith   /* the rows of B corresponding to nonzeros of A. */
296d50806bdSBarry Smith   for (i = 0; i < am; i++) {
297d50806bdSBarry Smith     anzi = ai[i + 1] - ai[i];
298d50806bdSBarry Smith     for (j = 0; j < anzi; j++) {
299519eb980SHong Zhang       brow = aj[j];
300d50806bdSBarry Smith       bnzi = bi[brow + 1] - bi[brow];
301d50806bdSBarry Smith       bjj  = bj + bi[brow];
302d50806bdSBarry Smith       baj  = ba + bi[brow];
303519eb980SHong Zhang       /* perform dense axpy */
30436ec6d2dSHong Zhang       valtmp = aa[j];
305ad540459SPierre Jolivet       for (k = 0; k < bnzi; k++) ab_dense[bjj[k]] += valtmp * baj[k];
306519eb980SHong Zhang       flops += 2 * bnzi;
307519eb980SHong Zhang     }
3089371c9d4SSatish Balay     aj += anzi;
3099371c9d4SSatish Balay     aa += anzi;
310c58ca2e3SHong Zhang 
311c58ca2e3SHong Zhang     cnzi = ci[i + 1] - ci[i];
312519eb980SHong Zhang     for (k = 0; k < cnzi; k++) {
313519eb980SHong Zhang       ca[k] += ab_dense[cj[k]];
314519eb980SHong Zhang       ab_dense[cj[k]] = 0.0; /* zero ab_dense */
315519eb980SHong Zhang     }
316519eb980SHong Zhang     flops += cnzi;
3179371c9d4SSatish Balay     cj += cnzi;
3189371c9d4SSatish Balay     ca += cnzi;
319519eb980SHong Zhang   }
3202e5835c6SStefano Zampini #if defined(PETSC_HAVE_DEVICE)
3212e5835c6SStefano Zampini   if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
3222e5835c6SStefano Zampini #endif
3239566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
3249566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
3259566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(flops));
3269566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJRestoreArrayRead(A, &aa));
3279566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJRestoreArrayRead(B, &ba));
328c58ca2e3SHong Zhang   PetscFunctionReturn(0);
329c58ca2e3SHong Zhang }
330c58ca2e3SHong Zhang 
331d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, Mat C)
332d71ae5a4SJacob Faibussowitsch {
333c58ca2e3SHong Zhang   PetscLogDouble     flops = 0.0;
334c58ca2e3SHong Zhang   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
335c58ca2e3SHong Zhang   Mat_SeqAIJ        *b     = (Mat_SeqAIJ *)B->data;
336c58ca2e3SHong Zhang   Mat_SeqAIJ        *c     = (Mat_SeqAIJ *)C->data;
337c58ca2e3SHong Zhang   PetscInt          *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j;
338c58ca2e3SHong Zhang   PetscInt           am = A->rmap->N, cm = C->rmap->N;
339c58ca2e3SHong Zhang   PetscInt           i, j, k, anzi, bnzi, cnzi, brow;
3402e5835c6SStefano Zampini   PetscScalar       *ca = c->a, valtmp;
3412e5835c6SStefano Zampini   const PetscScalar *aa, *ba, *baj;
342c58ca2e3SHong Zhang   PetscInt           nextb;
343c58ca2e3SHong Zhang 
344c58ca2e3SHong Zhang   PetscFunctionBegin;
3459566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJGetArrayRead(A, &aa));
3469566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJGetArrayRead(B, &ba));
347cf742fccSHong Zhang   if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
3489566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(ci[cm] + 1, &ca));
349cf742fccSHong Zhang     c->a      = ca;
350cf742fccSHong Zhang     c->free_a = PETSC_TRUE;
351cf742fccSHong Zhang   }
352cf742fccSHong Zhang 
353c58ca2e3SHong Zhang   /* clean old values in C */
3549566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(ca, ci[cm]));
355c58ca2e3SHong Zhang   /* Traverse A row-wise. */
356c58ca2e3SHong Zhang   /* Build the ith row in C by summing over nonzero columns in A, */
357c58ca2e3SHong Zhang   /* the rows of B corresponding to nonzeros of A. */
358519eb980SHong Zhang   for (i = 0; i < am; i++) {
359519eb980SHong Zhang     anzi = ai[i + 1] - ai[i];
360519eb980SHong Zhang     cnzi = ci[i + 1] - ci[i];
361519eb980SHong Zhang     for (j = 0; j < anzi; j++) {
362519eb980SHong Zhang       brow = aj[j];
363519eb980SHong Zhang       bnzi = bi[brow + 1] - bi[brow];
364519eb980SHong Zhang       bjj  = bj + bi[brow];
365519eb980SHong Zhang       baj  = ba + bi[brow];
366519eb980SHong Zhang       /* perform sparse axpy */
36736ec6d2dSHong Zhang       valtmp = aa[j];
368c124e916SHong Zhang       nextb  = 0;
369c124e916SHong Zhang       for (k = 0; nextb < bnzi; k++) {
370c124e916SHong Zhang         if (cj[k] == bjj[nextb]) { /* ccol == bcol */
37136ec6d2dSHong Zhang           ca[k] += valtmp * baj[nextb++];
372c124e916SHong Zhang         }
373d50806bdSBarry Smith       }
374d50806bdSBarry Smith       flops += 2 * bnzi;
375d50806bdSBarry Smith     }
3769371c9d4SSatish Balay     aj += anzi;
3779371c9d4SSatish Balay     aa += anzi;
3789371c9d4SSatish Balay     cj += cnzi;
3799371c9d4SSatish Balay     ca += cnzi;
380d50806bdSBarry Smith   }
3812e5835c6SStefano Zampini #if defined(PETSC_HAVE_DEVICE)
3822e5835c6SStefano Zampini   if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
3832e5835c6SStefano Zampini #endif
3849566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
3859566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
3869566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(flops));
3879566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJRestoreArrayRead(A, &aa));
3889566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJRestoreArrayRead(B, &ba));
389d50806bdSBarry Smith   PetscFunctionReturn(0);
390d50806bdSBarry Smith }
391bc011b1eSHong Zhang 
392d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A, Mat B, PetscReal fill, Mat C)
393d71ae5a4SJacob Faibussowitsch {
39425296bd5SBarry Smith   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
39525296bd5SBarry Smith   PetscInt          *ai = a->i, *bi = b->i, *ci, *cj;
3963c50cad2SHong Zhang   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
39725296bd5SBarry Smith   MatScalar         *ca;
39825296bd5SBarry Smith   PetscReal          afill;
399eca6b86aSHong Zhang   PetscInt           i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax;
400*eec179cfSJacob Faibussowitsch   PetscHMapI         ta;
4010298fd71SBarry Smith   PetscFreeSpaceList free_space = NULL, current_space = NULL;
40225296bd5SBarry Smith 
40325296bd5SBarry Smith   PetscFunctionBegin;
4043c50cad2SHong Zhang   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
4053c50cad2SHong Zhang   /*-----------------------------------------------------------------------------------------*/
4063c50cad2SHong Zhang   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
4079566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 2, &ci));
4083c50cad2SHong Zhang   ci[0] = 0;
4093c50cad2SHong Zhang 
4103c50cad2SHong Zhang   /* create and initialize a linked list */
411*eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapICreateWithSize(bn, &ta));
412c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b, bm, ta);
413*eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIGetSize(ta, &Crmax));
414*eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIDestroy(&ta));
415eca6b86aSHong Zhang 
4169566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedCreate_fast(Crmax, &lnk));
4173c50cad2SHong Zhang 
4183c50cad2SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
4199566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
4203c50cad2SHong Zhang   current_space = free_space;
4213c50cad2SHong Zhang 
4223c50cad2SHong Zhang   /* Determine ci and cj */
4233c50cad2SHong Zhang   for (i = 0; i < am; i++) {
4243c50cad2SHong Zhang     anzi = ai[i + 1] - ai[i];
4253c50cad2SHong Zhang     aj   = a->j + ai[i];
4263c50cad2SHong Zhang     for (j = 0; j < anzi; j++) {
4273c50cad2SHong Zhang       brow = aj[j];
4283c50cad2SHong Zhang       bnzj = bi[brow + 1] - bi[brow];
4293c50cad2SHong Zhang       bj   = b->j + bi[brow];
4303c50cad2SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
4319566063dSJacob Faibussowitsch       PetscCall(PetscLLCondensedAddSorted_fast(bnzj, bj, lnk));
4323c50cad2SHong Zhang     }
4338c78258cSHong Zhang     /* add possible missing diagonal entry */
43448a46eb9SPierre Jolivet     if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_fast(1, &i, lnk));
4353c50cad2SHong Zhang     cnzi = lnk[1];
4363c50cad2SHong Zhang 
4373c50cad2SHong Zhang     /* If free space is not available, make more free space */
4383c50cad2SHong Zhang     /* Double the amount of total space in the list */
4393c50cad2SHong Zhang     if (current_space->local_remaining < cnzi) {
4409566063dSJacob Faibussowitsch       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), &current_space));
4413c50cad2SHong Zhang       ndouble++;
4423c50cad2SHong Zhang     }
4433c50cad2SHong Zhang 
4443c50cad2SHong Zhang     /* Copy data into free space, then initialize lnk */
4459566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedClean_fast(cnzi, current_space->array, lnk));
4462205254eSKarl Rupp 
4473c50cad2SHong Zhang     current_space->array += cnzi;
4483c50cad2SHong Zhang     current_space->local_used += cnzi;
4493c50cad2SHong Zhang     current_space->local_remaining -= cnzi;
4502205254eSKarl Rupp 
4513c50cad2SHong Zhang     ci[i + 1] = ci[i] + cnzi;
4523c50cad2SHong Zhang   }
4533c50cad2SHong Zhang 
4543c50cad2SHong Zhang   /* Column indices are in the list of free space */
4553c50cad2SHong Zhang   /* Allocate space for cj, initialize cj, and */
4563c50cad2SHong Zhang   /* destroy list of free space and other temporary array(s) */
4579566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ci[am] + 1, &cj));
4589566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
4599566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedDestroy_fast(lnk));
46025296bd5SBarry Smith 
46125296bd5SBarry Smith   /* Allocate space for ca */
4629566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(ci[am] + 1, &ca));
46325296bd5SBarry Smith 
46425296bd5SBarry Smith   /* put together the new symbolic matrix */
4659566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C));
4669566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
46725296bd5SBarry Smith 
46825296bd5SBarry Smith   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
46925296bd5SBarry Smith   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
4704222ddf1SHong Zhang   c          = (Mat_SeqAIJ *)(C->data);
47125296bd5SBarry Smith   c->free_a  = PETSC_TRUE;
47225296bd5SBarry Smith   c->free_ij = PETSC_TRUE;
47325296bd5SBarry Smith   c->nonew   = 0;
4742205254eSKarl Rupp 
4754222ddf1SHong Zhang   /* slower, less memory */
4764222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
47725296bd5SBarry Smith 
47825296bd5SBarry Smith   /* set MatInfo */
47925296bd5SBarry Smith   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
48025296bd5SBarry Smith   if (afill < 1.0) afill = 1.0;
4814222ddf1SHong Zhang   C->info.mallocs           = ndouble;
4824222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
4834222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
48425296bd5SBarry Smith 
48525296bd5SBarry Smith #if defined(PETSC_USE_INFO)
48625296bd5SBarry Smith   if (ci[am]) {
4879566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
4889566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
48925296bd5SBarry Smith   } else {
4909566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
49125296bd5SBarry Smith   }
49225296bd5SBarry Smith #endif
49325296bd5SBarry Smith   PetscFunctionReturn(0);
49425296bd5SBarry Smith }
49525296bd5SBarry Smith 
496d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, PetscReal fill, Mat C)
497d71ae5a4SJacob Faibussowitsch {
498e9e4536cSHong Zhang   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
499bf9555e6SHong Zhang   PetscInt          *ai = a->i, *bi = b->i, *ci, *cj;
50025c41797SHong Zhang   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
501e9e4536cSHong Zhang   MatScalar         *ca;
502e9e4536cSHong Zhang   PetscReal          afill;
503eca6b86aSHong Zhang   PetscInt           i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax;
504*eec179cfSJacob Faibussowitsch   PetscHMapI         ta;
5050298fd71SBarry Smith   PetscFreeSpaceList free_space = NULL, current_space = NULL;
506e9e4536cSHong Zhang 
507e9e4536cSHong Zhang   PetscFunctionBegin;
508bd958071SHong Zhang   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
509bd958071SHong Zhang   /*---------------------------------------------------------------------------------------------*/
510bd958071SHong Zhang   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
5119566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 2, &ci));
512bd958071SHong Zhang   ci[0] = 0;
513bd958071SHong Zhang 
514bd958071SHong Zhang   /* create and initialize a linked list */
515*eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapICreateWithSize(bn, &ta));
516c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b, bm, ta);
517*eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIGetSize(ta, &Crmax));
518*eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIDestroy(&ta));
5199566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk));
520bd958071SHong Zhang 
521bd958071SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
5229566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
523bd958071SHong Zhang   current_space = free_space;
524bd958071SHong Zhang 
525bd958071SHong Zhang   /* Determine ci and cj */
526bd958071SHong Zhang   for (i = 0; i < am; i++) {
527bd958071SHong Zhang     anzi = ai[i + 1] - ai[i];
528bd958071SHong Zhang     aj   = a->j + ai[i];
529bd958071SHong Zhang     for (j = 0; j < anzi; j++) {
530bd958071SHong Zhang       brow = aj[j];
531bd958071SHong Zhang       bnzj = bi[brow + 1] - bi[brow];
532bd958071SHong Zhang       bj   = b->j + bi[brow];
533bd958071SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
5349566063dSJacob Faibussowitsch       PetscCall(PetscLLCondensedAddSorted_Scalable(bnzj, bj, lnk));
535bd958071SHong Zhang     }
5368c78258cSHong Zhang     /* add possible missing diagonal entry */
53748a46eb9SPierre Jolivet     if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_Scalable(1, &i, lnk));
5388c78258cSHong Zhang 
539bd958071SHong Zhang     cnzi = lnk[0];
540bd958071SHong Zhang 
541bd958071SHong Zhang     /* If free space is not available, make more free space */
542bd958071SHong Zhang     /* Double the amount of total space in the list */
543bd958071SHong Zhang     if (current_space->local_remaining < cnzi) {
5449566063dSJacob Faibussowitsch       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), &current_space));
545bd958071SHong Zhang       ndouble++;
546bd958071SHong Zhang     }
547bd958071SHong Zhang 
548bd958071SHong Zhang     /* Copy data into free space, then initialize lnk */
5499566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedClean_Scalable(cnzi, current_space->array, lnk));
5502205254eSKarl Rupp 
551bd958071SHong Zhang     current_space->array += cnzi;
552bd958071SHong Zhang     current_space->local_used += cnzi;
553bd958071SHong Zhang     current_space->local_remaining -= cnzi;
5542205254eSKarl Rupp 
555bd958071SHong Zhang     ci[i + 1] = ci[i] + cnzi;
556bd958071SHong Zhang   }
557bd958071SHong Zhang 
558bd958071SHong Zhang   /* Column indices are in the list of free space */
559bd958071SHong Zhang   /* Allocate space for cj, initialize cj, and */
560bd958071SHong Zhang   /* destroy list of free space and other temporary array(s) */
5619566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ci[am] + 1, &cj));
5629566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
5639566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
564e9e4536cSHong Zhang 
565e9e4536cSHong Zhang   /* Allocate space for ca */
566bd958071SHong Zhang   /*-----------------------*/
5679566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(ci[am] + 1, &ca));
568e9e4536cSHong Zhang 
569e9e4536cSHong Zhang   /* put together the new symbolic matrix */
5709566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C));
5719566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
572e9e4536cSHong Zhang 
573e9e4536cSHong Zhang   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
574e9e4536cSHong Zhang   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
5754222ddf1SHong Zhang   c          = (Mat_SeqAIJ *)(C->data);
576e9e4536cSHong Zhang   c->free_a  = PETSC_TRUE;
577e9e4536cSHong Zhang   c->free_ij = PETSC_TRUE;
578e9e4536cSHong Zhang   c->nonew   = 0;
5792205254eSKarl Rupp 
5804222ddf1SHong Zhang   /* slower, less memory */
5814222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
582e9e4536cSHong Zhang 
583e9e4536cSHong Zhang   /* set MatInfo */
584e9e4536cSHong Zhang   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
585e9e4536cSHong Zhang   if (afill < 1.0) afill = 1.0;
5864222ddf1SHong Zhang   C->info.mallocs           = ndouble;
5874222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
5884222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
589e9e4536cSHong Zhang 
590e9e4536cSHong Zhang #if defined(PETSC_USE_INFO)
591e9e4536cSHong Zhang   if (ci[am]) {
5929566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
5939566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
594e9e4536cSHong Zhang   } else {
5959566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
596e9e4536cSHong Zhang   }
597e9e4536cSHong Zhang #endif
598e9e4536cSHong Zhang   PetscFunctionReturn(0);
599e9e4536cSHong Zhang }
600e9e4536cSHong Zhang 
601d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A, Mat B, PetscReal fill, Mat C)
602d71ae5a4SJacob Faibussowitsch {
6030ced3a2bSJed Brown   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
6040ced3a2bSJed Brown   const PetscInt    *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j;
6050ced3a2bSJed Brown   PetscInt          *ci, *cj, *bb;
6060ced3a2bSJed Brown   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
6070ced3a2bSJed Brown   PetscReal          afill;
6080ced3a2bSJed Brown   PetscInt           i, j, col, ndouble = 0;
6090298fd71SBarry Smith   PetscFreeSpaceList free_space = NULL, current_space = NULL;
6100ced3a2bSJed Brown   PetscHeap          h;
6110ced3a2bSJed Brown 
6120ced3a2bSJed Brown   PetscFunctionBegin;
613cd093f1dSJed Brown   /* Get ci and cj - by merging sorted rows using a heap */
6140ced3a2bSJed Brown   /*---------------------------------------------------------------------------------------------*/
6150ced3a2bSJed Brown   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
6169566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 2, &ci));
6170ced3a2bSJed Brown   ci[0] = 0;
6180ced3a2bSJed Brown 
6190ced3a2bSJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
6209566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
6210ced3a2bSJed Brown   current_space = free_space;
6220ced3a2bSJed Brown 
6239566063dSJacob Faibussowitsch   PetscCall(PetscHeapCreate(a->rmax, &h));
6249566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(a->rmax, &bb));
6250ced3a2bSJed Brown 
6260ced3a2bSJed Brown   /* Determine ci and cj */
6270ced3a2bSJed Brown   for (i = 0; i < am; i++) {
6280ced3a2bSJed Brown     const PetscInt  anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
6290ced3a2bSJed Brown     const PetscInt *acol = aj + ai[i];        /* column indices of nonzero entries in this row */
6300ced3a2bSJed Brown     ci[i + 1]            = ci[i];
6310ced3a2bSJed Brown     /* Populate the min heap */
6320ced3a2bSJed Brown     for (j = 0; j < anzi; j++) {
6330ced3a2bSJed Brown       bb[j] = bi[acol[j]];           /* bb points at the start of the row */
6340ced3a2bSJed Brown       if (bb[j] < bi[acol[j] + 1]) { /* Add if row is nonempty */
6359566063dSJacob Faibussowitsch         PetscCall(PetscHeapAdd(h, j, bj[bb[j]++]));
6360ced3a2bSJed Brown       }
6370ced3a2bSJed Brown     }
6380ced3a2bSJed Brown     /* Pick off the min element, adding it to free space */
6399566063dSJacob Faibussowitsch     PetscCall(PetscHeapPop(h, &j, &col));
6400ced3a2bSJed Brown     while (j >= 0) {
6410ced3a2bSJed Brown       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
6429566063dSJacob Faibussowitsch         PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), &current_space));
6430ced3a2bSJed Brown         ndouble++;
6440ced3a2bSJed Brown       }
6450ced3a2bSJed Brown       *(current_space->array++) = col;
6460ced3a2bSJed Brown       current_space->local_used++;
6470ced3a2bSJed Brown       current_space->local_remaining--;
6480ced3a2bSJed Brown       ci[i + 1]++;
6490ced3a2bSJed Brown 
6500ced3a2bSJed Brown       /* stash if anything else remains in this row of B */
6519566063dSJacob Faibussowitsch       if (bb[j] < bi[acol[j] + 1]) PetscCall(PetscHeapStash(h, j, bj[bb[j]++]));
6520ced3a2bSJed Brown       while (1) { /* pop and stash any other rows of B that also had an entry in this column */
6530ced3a2bSJed Brown         PetscInt j2, col2;
6549566063dSJacob Faibussowitsch         PetscCall(PetscHeapPeek(h, &j2, &col2));
6550ced3a2bSJed Brown         if (col2 != col) break;
6569566063dSJacob Faibussowitsch         PetscCall(PetscHeapPop(h, &j2, &col2));
6579566063dSJacob Faibussowitsch         if (bb[j2] < bi[acol[j2] + 1]) PetscCall(PetscHeapStash(h, j2, bj[bb[j2]++]));
6580ced3a2bSJed Brown       }
6590ced3a2bSJed Brown       /* Put any stashed elements back into the min heap */
6609566063dSJacob Faibussowitsch       PetscCall(PetscHeapUnstash(h));
6619566063dSJacob Faibussowitsch       PetscCall(PetscHeapPop(h, &j, &col));
6620ced3a2bSJed Brown     }
6630ced3a2bSJed Brown   }
6649566063dSJacob Faibussowitsch   PetscCall(PetscFree(bb));
6659566063dSJacob Faibussowitsch   PetscCall(PetscHeapDestroy(&h));
6660ced3a2bSJed Brown 
6670ced3a2bSJed Brown   /* Column indices are in the list of free space */
6680ced3a2bSJed Brown   /* Allocate space for cj, initialize cj, and */
6690ced3a2bSJed Brown   /* destroy list of free space and other temporary array(s) */
6709566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ci[am], &cj));
6719566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
6720ced3a2bSJed Brown 
6730ced3a2bSJed Brown   /* put together the new symbolic matrix */
6749566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
6759566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
6760ced3a2bSJed Brown 
6770ced3a2bSJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
6780ced3a2bSJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
6794222ddf1SHong Zhang   c          = (Mat_SeqAIJ *)(C->data);
6800ced3a2bSJed Brown   c->free_a  = PETSC_TRUE;
6810ced3a2bSJed Brown   c->free_ij = PETSC_TRUE;
6820ced3a2bSJed Brown   c->nonew   = 0;
68326fbe8dcSKarl Rupp 
6844222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
6850ced3a2bSJed Brown 
6860ced3a2bSJed Brown   /* set MatInfo */
6870ced3a2bSJed Brown   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
6880ced3a2bSJed Brown   if (afill < 1.0) afill = 1.0;
6894222ddf1SHong Zhang   C->info.mallocs           = ndouble;
6904222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
6914222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
6920ced3a2bSJed Brown 
6930ced3a2bSJed Brown #if defined(PETSC_USE_INFO)
6940ced3a2bSJed Brown   if (ci[am]) {
6959566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
6969566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
6970ced3a2bSJed Brown   } else {
6989566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
6990ced3a2bSJed Brown   }
7000ced3a2bSJed Brown #endif
7010ced3a2bSJed Brown   PetscFunctionReturn(0);
7020ced3a2bSJed Brown }
703e9e4536cSHong Zhang 
704d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A, Mat B, PetscReal fill, Mat C)
705d71ae5a4SJacob Faibussowitsch {
7068a07c6f1SJed Brown   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
7078a07c6f1SJed Brown   const PetscInt    *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j;
7088a07c6f1SJed Brown   PetscInt          *ci, *cj, *bb;
7098a07c6f1SJed Brown   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
7108a07c6f1SJed Brown   PetscReal          afill;
7118a07c6f1SJed Brown   PetscInt           i, j, col, ndouble = 0;
7120298fd71SBarry Smith   PetscFreeSpaceList free_space = NULL, current_space = NULL;
7138a07c6f1SJed Brown   PetscHeap          h;
7148a07c6f1SJed Brown   PetscBT            bt;
7158a07c6f1SJed Brown 
7168a07c6f1SJed Brown   PetscFunctionBegin;
717cd093f1dSJed Brown   /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */
7188a07c6f1SJed Brown   /*---------------------------------------------------------------------------------------------*/
7198a07c6f1SJed Brown   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
7209566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 2, &ci));
7218a07c6f1SJed Brown   ci[0] = 0;
7228a07c6f1SJed Brown 
7238a07c6f1SJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
7249566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
7252205254eSKarl Rupp 
7268a07c6f1SJed Brown   current_space = free_space;
7278a07c6f1SJed Brown 
7289566063dSJacob Faibussowitsch   PetscCall(PetscHeapCreate(a->rmax, &h));
7299566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(a->rmax, &bb));
7309566063dSJacob Faibussowitsch   PetscCall(PetscBTCreate(bn, &bt));
7318a07c6f1SJed Brown 
7328a07c6f1SJed Brown   /* Determine ci and cj */
7338a07c6f1SJed Brown   for (i = 0; i < am; i++) {
7348a07c6f1SJed Brown     const PetscInt  anzi = ai[i + 1] - ai[i];    /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
7358a07c6f1SJed Brown     const PetscInt *acol = aj + ai[i];           /* column indices of nonzero entries in this row */
7368a07c6f1SJed Brown     const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */
7378a07c6f1SJed Brown     ci[i + 1]            = ci[i];
7388a07c6f1SJed Brown     /* Populate the min heap */
7398a07c6f1SJed Brown     for (j = 0; j < anzi; j++) {
7408a07c6f1SJed Brown       PetscInt brow = acol[j];
7418a07c6f1SJed Brown       for (bb[j] = bi[brow]; bb[j] < bi[brow + 1]; bb[j]++) {
7428a07c6f1SJed Brown         PetscInt bcol = bj[bb[j]];
7438a07c6f1SJed Brown         if (!PetscBTLookupSet(bt, bcol)) { /* new entry */
7449566063dSJacob Faibussowitsch           PetscCall(PetscHeapAdd(h, j, bcol));
7458a07c6f1SJed Brown           bb[j]++;
7468a07c6f1SJed Brown           break;
7478a07c6f1SJed Brown         }
7488a07c6f1SJed Brown       }
7498a07c6f1SJed Brown     }
7508a07c6f1SJed Brown     /* Pick off the min element, adding it to free space */
7519566063dSJacob Faibussowitsch     PetscCall(PetscHeapPop(h, &j, &col));
7528a07c6f1SJed Brown     while (j >= 0) {
7538a07c6f1SJed Brown       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
7540298fd71SBarry Smith         fptr = NULL;                            /* need PetscBTMemzero */
7559566063dSJacob Faibussowitsch         PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), &current_space));
7568a07c6f1SJed Brown         ndouble++;
7578a07c6f1SJed Brown       }
7588a07c6f1SJed Brown       *(current_space->array++) = col;
7598a07c6f1SJed Brown       current_space->local_used++;
7608a07c6f1SJed Brown       current_space->local_remaining--;
7618a07c6f1SJed Brown       ci[i + 1]++;
7628a07c6f1SJed Brown 
7638a07c6f1SJed Brown       /* stash if anything else remains in this row of B */
7648a07c6f1SJed Brown       for (; bb[j] < bi[acol[j] + 1]; bb[j]++) {
7658a07c6f1SJed Brown         PetscInt bcol = bj[bb[j]];
7668a07c6f1SJed Brown         if (!PetscBTLookupSet(bt, bcol)) { /* new entry */
7679566063dSJacob Faibussowitsch           PetscCall(PetscHeapAdd(h, j, bcol));
7688a07c6f1SJed Brown           bb[j]++;
7698a07c6f1SJed Brown           break;
7708a07c6f1SJed Brown         }
7718a07c6f1SJed Brown       }
7729566063dSJacob Faibussowitsch       PetscCall(PetscHeapPop(h, &j, &col));
7738a07c6f1SJed Brown     }
7748a07c6f1SJed Brown     if (fptr) { /* Clear the bits for this row */
7759566063dSJacob Faibussowitsch       for (; fptr < current_space->array; fptr++) PetscCall(PetscBTClear(bt, *fptr));
7768a07c6f1SJed Brown     } else { /* We reallocated so we don't remember (easily) how to clear only the bits we changed */
7779566063dSJacob Faibussowitsch       PetscCall(PetscBTMemzero(bn, bt));
7788a07c6f1SJed Brown     }
7798a07c6f1SJed Brown   }
7809566063dSJacob Faibussowitsch   PetscCall(PetscFree(bb));
7819566063dSJacob Faibussowitsch   PetscCall(PetscHeapDestroy(&h));
7829566063dSJacob Faibussowitsch   PetscCall(PetscBTDestroy(&bt));
7838a07c6f1SJed Brown 
7848a07c6f1SJed Brown   /* Column indices are in the list of free space */
7858a07c6f1SJed Brown   /* Allocate space for cj, initialize cj, and */
7868a07c6f1SJed Brown   /* destroy list of free space and other temporary array(s) */
7879566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ci[am], &cj));
7889566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
7898a07c6f1SJed Brown 
7908a07c6f1SJed Brown   /* put together the new symbolic matrix */
7919566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
7929566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
7938a07c6f1SJed Brown 
7948a07c6f1SJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
7958a07c6f1SJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
7964222ddf1SHong Zhang   c          = (Mat_SeqAIJ *)(C->data);
7978a07c6f1SJed Brown   c->free_a  = PETSC_TRUE;
7988a07c6f1SJed Brown   c->free_ij = PETSC_TRUE;
7998a07c6f1SJed Brown   c->nonew   = 0;
80026fbe8dcSKarl Rupp 
8014222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
8028a07c6f1SJed Brown 
8038a07c6f1SJed Brown   /* set MatInfo */
8048a07c6f1SJed Brown   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
8058a07c6f1SJed Brown   if (afill < 1.0) afill = 1.0;
8064222ddf1SHong Zhang   C->info.mallocs           = ndouble;
8074222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
8084222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
8098a07c6f1SJed Brown 
8108a07c6f1SJed Brown #if defined(PETSC_USE_INFO)
8118a07c6f1SJed Brown   if (ci[am]) {
8129566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
8139566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
8148a07c6f1SJed Brown   } else {
8159566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
8168a07c6f1SJed Brown   }
8178a07c6f1SJed Brown #endif
8188a07c6f1SJed Brown   PetscFunctionReturn(0);
8198a07c6f1SJed Brown }
8208a07c6f1SJed Brown 
821d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A, Mat B, PetscReal fill, Mat C)
822d71ae5a4SJacob Faibussowitsch {
823d7ed1a4dSandi selinger   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
824d7ed1a4dSandi selinger   const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j, *inputi, *inputj, *inputcol, *inputcol_L1;
825d7ed1a4dSandi selinger   PetscInt       *ci, *cj, *outputj, worki_L1[9], worki_L2[9];
826d7ed1a4dSandi selinger   PetscInt        c_maxmem, a_maxrownnz = 0, a_rownnz;
827d7ed1a4dSandi selinger   const PetscInt  workcol[8] = {0, 1, 2, 3, 4, 5, 6, 7};
828d7ed1a4dSandi selinger   const PetscInt  am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
829d7ed1a4dSandi selinger   const PetscInt *brow_ptr[8], *brow_end[8];
830d7ed1a4dSandi selinger   PetscInt        window[8];
831d7ed1a4dSandi selinger   PetscInt        window_min, old_window_min, ci_nnz, outputi_nnz = 0, L1_nrows, L2_nrows;
832d7ed1a4dSandi selinger   PetscInt        i, k, ndouble = 0, L1_rowsleft, rowsleft;
833d7ed1a4dSandi selinger   PetscReal       afill;
834f83700f2Sandi selinger   PetscInt       *workj_L1, *workj_L2, *workj_L3;
8357660c4dbSandi selinger   PetscInt        L1_nnz, L2_nnz;
836d7ed1a4dSandi selinger 
837d7ed1a4dSandi selinger   /* Step 1: Get upper bound on memory required for allocation.
838d7ed1a4dSandi selinger              Because of the way virtual memory works,
839d7ed1a4dSandi selinger              only the memory pages that are actually needed will be physically allocated. */
840d7ed1a4dSandi selinger   PetscFunctionBegin;
8419566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 1, &ci));
842d7ed1a4dSandi selinger   for (i = 0; i < am; i++) {
843d7ed1a4dSandi selinger     const PetscInt  anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
844d7ed1a4dSandi selinger     const PetscInt *acol = aj + ai[i];        /* column indices of nonzero entries in this row */
845d7ed1a4dSandi selinger     a_rownnz             = 0;
846d7ed1a4dSandi selinger     for (k = 0; k < anzi; ++k) {
847d7ed1a4dSandi selinger       a_rownnz += bi[acol[k] + 1] - bi[acol[k]];
848d7ed1a4dSandi selinger       if (a_rownnz > bn) {
849d7ed1a4dSandi selinger         a_rownnz = bn;
850d7ed1a4dSandi selinger         break;
851d7ed1a4dSandi selinger       }
852d7ed1a4dSandi selinger     }
853d7ed1a4dSandi selinger     a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz);
854d7ed1a4dSandi selinger   }
855d7ed1a4dSandi selinger   /* temporary work areas for merging rows */
8569566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L1));
8579566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L2));
8589566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(a_maxrownnz, &workj_L3));
859d7ed1a4dSandi selinger 
8607660c4dbSandi selinger   /* This should be enough for almost all matrices. If not, memory is reallocated later. */
8617660c4dbSandi selinger   c_maxmem = 8 * (ai[am] + bi[bm]);
862d7ed1a4dSandi selinger   /* Step 2: Populate pattern for C */
8639566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(c_maxmem, &cj));
864d7ed1a4dSandi selinger 
865d7ed1a4dSandi selinger   ci_nnz      = 0;
866d7ed1a4dSandi selinger   ci[0]       = 0;
867d7ed1a4dSandi selinger   worki_L1[0] = 0;
868d7ed1a4dSandi selinger   worki_L2[0] = 0;
869d7ed1a4dSandi selinger   for (i = 0; i < am; i++) {
870d7ed1a4dSandi selinger     const PetscInt  anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
871d7ed1a4dSandi selinger     const PetscInt *acol = aj + ai[i];        /* column indices of nonzero entries in this row */
872d7ed1a4dSandi selinger     rowsleft             = anzi;
873d7ed1a4dSandi selinger     inputcol_L1          = acol;
874d7ed1a4dSandi selinger     L2_nnz               = 0;
8757660c4dbSandi selinger     L2_nrows             = 1; /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1   */
8767660c4dbSandi selinger     worki_L2[1]          = 0;
87708fe1b3cSKarl Rupp     outputi_nnz          = 0;
878d7ed1a4dSandi selinger 
879d7ed1a4dSandi selinger     /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem  -> allocate more memory */
880d7ed1a4dSandi selinger     while (ci_nnz + a_maxrownnz > c_maxmem) {
881d7ed1a4dSandi selinger       c_maxmem *= 2;
882d7ed1a4dSandi selinger       ndouble++;
8839566063dSJacob Faibussowitsch       PetscCall(PetscRealloc(sizeof(PetscInt) * c_maxmem, &cj));
884d7ed1a4dSandi selinger     }
885d7ed1a4dSandi selinger 
886d7ed1a4dSandi selinger     while (rowsleft) {
887d7ed1a4dSandi selinger       L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */
888d7ed1a4dSandi selinger       L1_nrows    = 0;
8897660c4dbSandi selinger       L1_nnz      = 0;
890d7ed1a4dSandi selinger       inputcol    = inputcol_L1;
891d7ed1a4dSandi selinger       inputi      = bi;
892d7ed1a4dSandi selinger       inputj      = bj;
893d7ed1a4dSandi selinger 
894d7ed1a4dSandi selinger       /* The following macro is used to specialize for small rows in A.
895d7ed1a4dSandi selinger          This helps with compiler unrolling, improving performance substantially.
896f83700f2Sandi selinger           Input:  inputj   inputi  inputcol  bn
897d7ed1a4dSandi selinger           Output: outputj  outputi_nnz                       */
898d7ed1a4dSandi selinger #define MatMatMultSymbolic_RowMergeMacro(ANNZ) \
899d7ed1a4dSandi selinger   window_min  = bn; \
9007660c4dbSandi selinger   outputi_nnz = 0; \
9017660c4dbSandi selinger   for (k = 0; k < ANNZ; ++k) { \
902d7ed1a4dSandi selinger     brow_ptr[k] = inputj + inputi[inputcol[k]]; \
903d7ed1a4dSandi selinger     brow_end[k] = inputj + inputi[inputcol[k] + 1]; \
904d7ed1a4dSandi selinger     window[k]   = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
905d7ed1a4dSandi selinger     window_min  = PetscMin(window[k], window_min); \
906d7ed1a4dSandi selinger   } \
907d7ed1a4dSandi selinger   while (window_min < bn) { \
908d7ed1a4dSandi selinger     outputj[outputi_nnz++] = window_min; \
909d7ed1a4dSandi selinger     /* advance front and compute new minimum */ \
910d7ed1a4dSandi selinger     old_window_min = window_min; \
911d7ed1a4dSandi selinger     window_min     = bn; \
912d7ed1a4dSandi selinger     for (k = 0; k < ANNZ; ++k) { \
913d7ed1a4dSandi selinger       if (window[k] == old_window_min) { \
914d7ed1a4dSandi selinger         brow_ptr[k]++; \
915d7ed1a4dSandi selinger         window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
916d7ed1a4dSandi selinger       } \
917d7ed1a4dSandi selinger       window_min = PetscMin(window[k], window_min); \
918d7ed1a4dSandi selinger     } \
919d7ed1a4dSandi selinger   }
920d7ed1a4dSandi selinger 
921d7ed1a4dSandi selinger       /************** L E V E L  1 ***************/
922d7ed1a4dSandi selinger       /* Merge up to 8 rows of B to L1 work array*/
923d7ed1a4dSandi selinger       while (L1_rowsleft) {
9247660c4dbSandi selinger         outputi_nnz = 0;
9257660c4dbSandi selinger         if (anzi > 8) outputj = workj_L1 + L1_nnz; /* Level 1 rowmerge*/
9267660c4dbSandi selinger         else outputj = cj + ci_nnz;                /* Merge directly to C */
9277660c4dbSandi selinger 
928d7ed1a4dSandi selinger         switch (L1_rowsleft) {
9299371c9d4SSatish Balay         case 1:
9309371c9d4SSatish Balay           brow_ptr[0] = inputj + inputi[inputcol[0]];
931d7ed1a4dSandi selinger           brow_end[0] = inputj + inputi[inputcol[0] + 1];
932d7ed1a4dSandi selinger           for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
933d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
934d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
935d7ed1a4dSandi selinger           L1_rowsleft = 0;
936d7ed1a4dSandi selinger           break;
9379371c9d4SSatish Balay         case 2:
9389371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(2);
939d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
940d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
941d7ed1a4dSandi selinger           L1_rowsleft = 0;
942d7ed1a4dSandi selinger           break;
9439371c9d4SSatish Balay         case 3:
9449371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(3);
945d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
946d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
947d7ed1a4dSandi selinger           L1_rowsleft = 0;
948d7ed1a4dSandi selinger           break;
9499371c9d4SSatish Balay         case 4:
9509371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(4);
951d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
952d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
953d7ed1a4dSandi selinger           L1_rowsleft = 0;
954d7ed1a4dSandi selinger           break;
9559371c9d4SSatish Balay         case 5:
9569371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(5);
957d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
958d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
959d7ed1a4dSandi selinger           L1_rowsleft = 0;
960d7ed1a4dSandi selinger           break;
9619371c9d4SSatish Balay         case 6:
9629371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(6);
963d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
964d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
965d7ed1a4dSandi selinger           L1_rowsleft = 0;
966d7ed1a4dSandi selinger           break;
9679371c9d4SSatish Balay         case 7:
9689371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(7);
969d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
970d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
971d7ed1a4dSandi selinger           L1_rowsleft = 0;
972d7ed1a4dSandi selinger           break;
9739371c9d4SSatish Balay         default:
9749371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(8);
975d7ed1a4dSandi selinger           inputcol += 8;
976d7ed1a4dSandi selinger           rowsleft -= 8;
977d7ed1a4dSandi selinger           L1_rowsleft -= 8;
978d7ed1a4dSandi selinger           break;
979d7ed1a4dSandi selinger         }
980d7ed1a4dSandi selinger         inputcol_L1 = inputcol;
9817660c4dbSandi selinger         L1_nnz += outputi_nnz;
9827660c4dbSandi selinger         worki_L1[++L1_nrows] = L1_nnz;
983d7ed1a4dSandi selinger       }
984d7ed1a4dSandi selinger 
985d7ed1a4dSandi selinger       /********************** L E V E L  2 ************************/
986d7ed1a4dSandi selinger       /* Merge from L1 work array to either C or to L2 work array */
987d7ed1a4dSandi selinger       if (anzi > 8) {
988d7ed1a4dSandi selinger         inputi      = worki_L1;
989d7ed1a4dSandi selinger         inputj      = workj_L1;
990d7ed1a4dSandi selinger         inputcol    = workcol;
991d7ed1a4dSandi selinger         outputi_nnz = 0;
992d7ed1a4dSandi selinger 
993d7ed1a4dSandi selinger         if (anzi <= 64) outputj = cj + ci_nnz; /* Merge from L1 work array to C */
994d7ed1a4dSandi selinger         else outputj = workj_L2 + L2_nnz;      /* Merge from L1 work array to L2 work array */
995d7ed1a4dSandi selinger 
996d7ed1a4dSandi selinger         switch (L1_nrows) {
9979371c9d4SSatish Balay         case 1:
9989371c9d4SSatish Balay           brow_ptr[0] = inputj + inputi[inputcol[0]];
999d7ed1a4dSandi selinger           brow_end[0] = inputj + inputi[inputcol[0] + 1];
1000d7ed1a4dSandi selinger           for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
1001d7ed1a4dSandi selinger           break;
1002d71ae5a4SJacob Faibussowitsch         case 2:
1003d71ae5a4SJacob Faibussowitsch           MatMatMultSymbolic_RowMergeMacro(2);
1004d71ae5a4SJacob Faibussowitsch           break;
1005d71ae5a4SJacob Faibussowitsch         case 3:
1006d71ae5a4SJacob Faibussowitsch           MatMatMultSymbolic_RowMergeMacro(3);
1007d71ae5a4SJacob Faibussowitsch           break;
1008d71ae5a4SJacob Faibussowitsch         case 4:
1009d71ae5a4SJacob Faibussowitsch           MatMatMultSymbolic_RowMergeMacro(4);
1010d71ae5a4SJacob Faibussowitsch           break;
1011d71ae5a4SJacob Faibussowitsch         case 5:
1012d71ae5a4SJacob Faibussowitsch           MatMatMultSymbolic_RowMergeMacro(5);
1013d71ae5a4SJacob Faibussowitsch           break;
1014d71ae5a4SJacob Faibussowitsch         case 6:
1015d71ae5a4SJacob Faibussowitsch           MatMatMultSymbolic_RowMergeMacro(6);
1016d71ae5a4SJacob Faibussowitsch           break;
1017d71ae5a4SJacob Faibussowitsch         case 7:
1018d71ae5a4SJacob Faibussowitsch           MatMatMultSymbolic_RowMergeMacro(7);
1019d71ae5a4SJacob Faibussowitsch           break;
1020d71ae5a4SJacob Faibussowitsch         case 8:
1021d71ae5a4SJacob Faibussowitsch           MatMatMultSymbolic_RowMergeMacro(8);
1022d71ae5a4SJacob Faibussowitsch           break;
1023d71ae5a4SJacob Faibussowitsch         default:
1024d71ae5a4SJacob Faibussowitsch           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L1 work array!");
1025d7ed1a4dSandi selinger         }
1026d7ed1a4dSandi selinger         L2_nnz += outputi_nnz;
1027d7ed1a4dSandi selinger         worki_L2[++L2_nrows] = L2_nnz;
1028d7ed1a4dSandi selinger 
10297660c4dbSandi selinger         /************************ L E V E L  3 **********************/
10307660c4dbSandi selinger         /* Merge from L2 work array to either C or to L2 work array */
1031d7ed1a4dSandi selinger         if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) {
1032d7ed1a4dSandi selinger           inputi      = worki_L2;
1033d7ed1a4dSandi selinger           inputj      = workj_L2;
1034d7ed1a4dSandi selinger           inputcol    = workcol;
1035d7ed1a4dSandi selinger           outputi_nnz = 0;
1036f83700f2Sandi selinger           if (rowsleft) outputj = workj_L3;
1037d7ed1a4dSandi selinger           else outputj = cj + ci_nnz;
1038d7ed1a4dSandi selinger           switch (L2_nrows) {
10399371c9d4SSatish Balay           case 1:
10409371c9d4SSatish Balay             brow_ptr[0] = inputj + inputi[inputcol[0]];
1041d7ed1a4dSandi selinger             brow_end[0] = inputj + inputi[inputcol[0] + 1];
1042d7ed1a4dSandi selinger             for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
1043d7ed1a4dSandi selinger             break;
1044d71ae5a4SJacob Faibussowitsch           case 2:
1045d71ae5a4SJacob Faibussowitsch             MatMatMultSymbolic_RowMergeMacro(2);
1046d71ae5a4SJacob Faibussowitsch             break;
1047d71ae5a4SJacob Faibussowitsch           case 3:
1048d71ae5a4SJacob Faibussowitsch             MatMatMultSymbolic_RowMergeMacro(3);
1049d71ae5a4SJacob Faibussowitsch             break;
1050d71ae5a4SJacob Faibussowitsch           case 4:
1051d71ae5a4SJacob Faibussowitsch             MatMatMultSymbolic_RowMergeMacro(4);
1052d71ae5a4SJacob Faibussowitsch             break;
1053d71ae5a4SJacob Faibussowitsch           case 5:
1054d71ae5a4SJacob Faibussowitsch             MatMatMultSymbolic_RowMergeMacro(5);
1055d71ae5a4SJacob Faibussowitsch             break;
1056d71ae5a4SJacob Faibussowitsch           case 6:
1057d71ae5a4SJacob Faibussowitsch             MatMatMultSymbolic_RowMergeMacro(6);
1058d71ae5a4SJacob Faibussowitsch             break;
1059d71ae5a4SJacob Faibussowitsch           case 7:
1060d71ae5a4SJacob Faibussowitsch             MatMatMultSymbolic_RowMergeMacro(7);
1061d71ae5a4SJacob Faibussowitsch             break;
1062d71ae5a4SJacob Faibussowitsch           case 8:
1063d71ae5a4SJacob Faibussowitsch             MatMatMultSymbolic_RowMergeMacro(8);
1064d71ae5a4SJacob Faibussowitsch             break;
1065d71ae5a4SJacob Faibussowitsch           default:
1066d71ae5a4SJacob Faibussowitsch             SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L2 work array!");
1067d7ed1a4dSandi selinger           }
1068d7ed1a4dSandi selinger           L2_nrows    = 1;
10697660c4dbSandi selinger           L2_nnz      = outputi_nnz;
10707660c4dbSandi selinger           worki_L2[1] = outputi_nnz;
10717660c4dbSandi selinger           /* Copy to workj_L2 */
1072d7ed1a4dSandi selinger           if (rowsleft) {
10737660c4dbSandi selinger             for (k = 0; k < outputi_nnz; ++k) workj_L2[k] = outputj[k];
1074d7ed1a4dSandi selinger           }
1075d7ed1a4dSandi selinger         }
1076d7ed1a4dSandi selinger       }
1077d7ed1a4dSandi selinger     } /* while (rowsleft) */
1078d7ed1a4dSandi selinger #undef MatMatMultSymbolic_RowMergeMacro
1079d7ed1a4dSandi selinger 
1080d7ed1a4dSandi selinger     /* terminate current row */
1081d7ed1a4dSandi selinger     ci_nnz += outputi_nnz;
1082d7ed1a4dSandi selinger     ci[i + 1] = ci_nnz;
1083d7ed1a4dSandi selinger   }
1084d7ed1a4dSandi selinger 
1085d7ed1a4dSandi selinger   /* Step 3: Create the new symbolic matrix */
10869566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
10879566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
1088d7ed1a4dSandi selinger 
1089d7ed1a4dSandi selinger   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1090d7ed1a4dSandi selinger   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
10914222ddf1SHong Zhang   c          = (Mat_SeqAIJ *)(C->data);
1092d7ed1a4dSandi selinger   c->free_a  = PETSC_TRUE;
1093d7ed1a4dSandi selinger   c->free_ij = PETSC_TRUE;
1094d7ed1a4dSandi selinger   c->nonew   = 0;
1095d7ed1a4dSandi selinger 
10964222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1097d7ed1a4dSandi selinger 
1098d7ed1a4dSandi selinger   /* set MatInfo */
1099d7ed1a4dSandi selinger   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
1100d7ed1a4dSandi selinger   if (afill < 1.0) afill = 1.0;
11014222ddf1SHong Zhang   C->info.mallocs           = ndouble;
11024222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
11034222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
1104d7ed1a4dSandi selinger 
1105d7ed1a4dSandi selinger #if defined(PETSC_USE_INFO)
1106d7ed1a4dSandi selinger   if (ci[am]) {
11079566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
11089566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
1109d7ed1a4dSandi selinger   } else {
11109566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
1111d7ed1a4dSandi selinger   }
1112d7ed1a4dSandi selinger #endif
1113d7ed1a4dSandi selinger 
1114d7ed1a4dSandi selinger   /* Step 4: Free temporary work areas */
11159566063dSJacob Faibussowitsch   PetscCall(PetscFree(workj_L1));
11169566063dSJacob Faibussowitsch   PetscCall(PetscFree(workj_L2));
11179566063dSJacob Faibussowitsch   PetscCall(PetscFree(workj_L3));
1118d7ed1a4dSandi selinger   PetscFunctionReturn(0);
1119d7ed1a4dSandi selinger }
1120d7ed1a4dSandi selinger 
1121cd093f1dSJed Brown /* concatenate unique entries and then sort */
1122d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, PetscReal fill, Mat C)
1123d71ae5a4SJacob Faibussowitsch {
1124cd093f1dSJed Brown   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
1125cd093f1dSJed Brown   const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j;
11268c78258cSHong Zhang   PetscInt       *ci, *cj, bcol;
1127cd093f1dSJed Brown   PetscInt        am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
1128cd093f1dSJed Brown   PetscReal       afill;
1129cd093f1dSJed Brown   PetscInt        i, j, ndouble = 0;
1130cd093f1dSJed Brown   PetscSegBuffer  seg, segrow;
1131cd093f1dSJed Brown   char           *seen;
1132cd093f1dSJed Brown 
1133cd093f1dSJed Brown   PetscFunctionBegin;
11349566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 1, &ci));
1135cd093f1dSJed Brown   ci[0] = 0;
1136cd093f1dSJed Brown 
1137cd093f1dSJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
11389566063dSJacob Faibussowitsch   PetscCall(PetscSegBufferCreate(sizeof(PetscInt), (PetscInt)(fill * (ai[am] + bi[bm])), &seg));
11399566063dSJacob Faibussowitsch   PetscCall(PetscSegBufferCreate(sizeof(PetscInt), 100, &segrow));
11409566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(bn, &seen));
1141cd093f1dSJed Brown 
1142cd093f1dSJed Brown   /* Determine ci and cj */
1143cd093f1dSJed Brown   for (i = 0; i < am; i++) {
1144cd093f1dSJed Brown     const PetscInt  anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
1145cd093f1dSJed Brown     const PetscInt *acol = aj + ai[i];        /* column indices of nonzero entries in this row */
1146cd093f1dSJed Brown     PetscInt packlen     = 0, *PETSC_RESTRICT crow;
11478c78258cSHong Zhang 
1148cd093f1dSJed Brown     /* Pack segrow */
1149cd093f1dSJed Brown     for (j = 0; j < anzi; j++) {
1150cd093f1dSJed Brown       PetscInt brow = acol[j], bjstart = bi[brow], bjend = bi[brow + 1], k;
1151cd093f1dSJed Brown       for (k = bjstart; k < bjend; k++) {
11528c78258cSHong Zhang         bcol = bj[k];
1153cd093f1dSJed Brown         if (!seen[bcol]) { /* new entry */
1154cd093f1dSJed Brown           PetscInt *PETSC_RESTRICT slot;
11559566063dSJacob Faibussowitsch           PetscCall(PetscSegBufferGetInts(segrow, 1, &slot));
1156cd093f1dSJed Brown           *slot      = bcol;
1157cd093f1dSJed Brown           seen[bcol] = 1;
1158cd093f1dSJed Brown           packlen++;
1159cd093f1dSJed Brown         }
1160cd093f1dSJed Brown       }
1161cd093f1dSJed Brown     }
11628c78258cSHong Zhang 
11638c78258cSHong Zhang     /* Check i-th diagonal entry */
11648c78258cSHong Zhang     if (C->force_diagonals && !seen[i]) {
11658c78258cSHong Zhang       PetscInt *PETSC_RESTRICT slot;
11669566063dSJacob Faibussowitsch       PetscCall(PetscSegBufferGetInts(segrow, 1, &slot));
11678c78258cSHong Zhang       *slot   = i;
11688c78258cSHong Zhang       seen[i] = 1;
11698c78258cSHong Zhang       packlen++;
11708c78258cSHong Zhang     }
11718c78258cSHong Zhang 
11729566063dSJacob Faibussowitsch     PetscCall(PetscSegBufferGetInts(seg, packlen, &crow));
11739566063dSJacob Faibussowitsch     PetscCall(PetscSegBufferExtractTo(segrow, crow));
11749566063dSJacob Faibussowitsch     PetscCall(PetscSortInt(packlen, crow));
1175cd093f1dSJed Brown     ci[i + 1] = ci[i] + packlen;
1176cd093f1dSJed Brown     for (j = 0; j < packlen; j++) seen[crow[j]] = 0;
1177cd093f1dSJed Brown   }
11789566063dSJacob Faibussowitsch   PetscCall(PetscSegBufferDestroy(&segrow));
11799566063dSJacob Faibussowitsch   PetscCall(PetscFree(seen));
1180cd093f1dSJed Brown 
1181cd093f1dSJed Brown   /* Column indices are in the segmented buffer */
11829566063dSJacob Faibussowitsch   PetscCall(PetscSegBufferExtractAlloc(seg, &cj));
11839566063dSJacob Faibussowitsch   PetscCall(PetscSegBufferDestroy(&seg));
1184cd093f1dSJed Brown 
1185cd093f1dSJed Brown   /* put together the new symbolic matrix */
11869566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
11879566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
1188cd093f1dSJed Brown 
1189cd093f1dSJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1190cd093f1dSJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
11914222ddf1SHong Zhang   c          = (Mat_SeqAIJ *)(C->data);
1192cd093f1dSJed Brown   c->free_a  = PETSC_TRUE;
1193cd093f1dSJed Brown   c->free_ij = PETSC_TRUE;
1194cd093f1dSJed Brown   c->nonew   = 0;
1195cd093f1dSJed Brown 
11964222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1197cd093f1dSJed Brown 
1198cd093f1dSJed Brown   /* set MatInfo */
11992a09556fSStefano Zampini   afill = (PetscReal)ci[am] / PetscMax(ai[am] + bi[bm], 1) + 1.e-5;
1200cd093f1dSJed Brown   if (afill < 1.0) afill = 1.0;
12014222ddf1SHong Zhang   C->info.mallocs           = ndouble;
12024222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
12034222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
1204cd093f1dSJed Brown 
1205cd093f1dSJed Brown #if defined(PETSC_USE_INFO)
1206cd093f1dSJed Brown   if (ci[am]) {
12079566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
12089566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
1209cd093f1dSJed Brown   } else {
12109566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
1211cd093f1dSJed Brown   }
1212cd093f1dSJed Brown #endif
1213cd093f1dSJed Brown   PetscFunctionReturn(0);
1214cd093f1dSJed Brown }
1215cd093f1dSJed Brown 
1216d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(void *data)
1217d71ae5a4SJacob Faibussowitsch {
12186718818eSStefano Zampini   Mat_MatMatTransMult *abt = (Mat_MatMatTransMult *)data;
12192128a86cSHong Zhang 
12202128a86cSHong Zhang   PetscFunctionBegin;
12219566063dSJacob Faibussowitsch   PetscCall(MatTransposeColoringDestroy(&abt->matcoloring));
12229566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&abt->Bt_den));
12239566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&abt->ABt_den));
12249566063dSJacob Faibussowitsch   PetscCall(PetscFree(abt));
12252128a86cSHong Zhang   PetscFunctionReturn(0);
12262128a86cSHong Zhang }
12272128a86cSHong Zhang 
1228d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C)
1229d71ae5a4SJacob Faibussowitsch {
123081d82fe4SHong Zhang   Mat                  Bt;
123140192850SHong Zhang   Mat_MatMatTransMult *abt;
12324222ddf1SHong Zhang   Mat_Product         *product = C->product;
12336718818eSStefano Zampini   char                *alg;
1234d2853540SHong Zhang 
123581d82fe4SHong Zhang   PetscFunctionBegin;
123628b400f6SJacob Faibussowitsch   PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
123728b400f6SJacob Faibussowitsch   PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty");
12386718818eSStefano Zampini 
123981d82fe4SHong Zhang   /* create symbolic Bt */
12407fb60732SBarry Smith   PetscCall(MatTransposeSymbolic(B, &Bt));
12419566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizes(Bt, PetscAbs(A->cmap->bs), PetscAbs(B->cmap->bs)));
12429566063dSJacob Faibussowitsch   PetscCall(MatSetType(Bt, ((PetscObject)A)->type_name));
124381d82fe4SHong Zhang 
124481d82fe4SHong Zhang   /* get symbolic C=A*Bt */
12459566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(product->alg, &alg));
12469566063dSJacob Faibussowitsch   PetscCall(MatProductSetAlgorithm(C, "sorted")); /* set algorithm for C = A*Bt */
12479566063dSJacob Faibussowitsch   PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(A, Bt, fill, C));
12489566063dSJacob Faibussowitsch   PetscCall(MatProductSetAlgorithm(C, alg)); /* resume original algorithm for ABt product */
12499566063dSJacob Faibussowitsch   PetscCall(PetscFree(alg));
125081d82fe4SHong Zhang 
1251a5b23f4aSJose E. Roman   /* create a supporting struct for reuse intermediate dense matrices with matcoloring */
12529566063dSJacob Faibussowitsch   PetscCall(PetscNew(&abt));
12532128a86cSHong Zhang 
12546718818eSStefano Zampini   product->data    = abt;
12556718818eSStefano Zampini   product->destroy = MatDestroy_SeqAIJ_MatMatMultTrans;
12566718818eSStefano Zampini 
12574222ddf1SHong Zhang   C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ;
12582128a86cSHong Zhang 
12594222ddf1SHong Zhang   abt->usecoloring = PETSC_FALSE;
12609566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "color", &abt->usecoloring));
126140192850SHong Zhang   if (abt->usecoloring) {
1262b9af6bddSHong Zhang     /* Create MatTransposeColoring from symbolic C=A*B^T */
1263b9af6bddSHong Zhang     MatTransposeColoring matcoloring;
1264335efc43SPeter Brune     MatColoring          coloring;
12652128a86cSHong Zhang     ISColoring           iscoloring;
12662128a86cSHong Zhang     Mat                  Bt_dense, C_dense;
1267e8354b3eSHong Zhang 
12684222ddf1SHong Zhang     /* inode causes memory problem */
12699566063dSJacob Faibussowitsch     PetscCall(MatSetOption(C, MAT_USE_INODES, PETSC_FALSE));
12704222ddf1SHong Zhang 
12719566063dSJacob Faibussowitsch     PetscCall(MatColoringCreate(C, &coloring));
12729566063dSJacob Faibussowitsch     PetscCall(MatColoringSetDistance(coloring, 2));
12739566063dSJacob Faibussowitsch     PetscCall(MatColoringSetType(coloring, MATCOLORINGSL));
12749566063dSJacob Faibussowitsch     PetscCall(MatColoringSetFromOptions(coloring));
12759566063dSJacob Faibussowitsch     PetscCall(MatColoringApply(coloring, &iscoloring));
12769566063dSJacob Faibussowitsch     PetscCall(MatColoringDestroy(&coloring));
12779566063dSJacob Faibussowitsch     PetscCall(MatTransposeColoringCreate(C, iscoloring, &matcoloring));
12782205254eSKarl Rupp 
127940192850SHong Zhang     abt->matcoloring = matcoloring;
12802205254eSKarl Rupp 
12819566063dSJacob Faibussowitsch     PetscCall(ISColoringDestroy(&iscoloring));
12822128a86cSHong Zhang 
12832128a86cSHong Zhang     /* Create Bt_dense and C_dense = A*Bt_dense */
12849566063dSJacob Faibussowitsch     PetscCall(MatCreate(PETSC_COMM_SELF, &Bt_dense));
12859566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(Bt_dense, A->cmap->n, matcoloring->ncolors, A->cmap->n, matcoloring->ncolors));
12869566063dSJacob Faibussowitsch     PetscCall(MatSetType(Bt_dense, MATSEQDENSE));
12879566063dSJacob Faibussowitsch     PetscCall(MatSeqDenseSetPreallocation(Bt_dense, NULL));
12882205254eSKarl Rupp 
12892128a86cSHong Zhang     Bt_dense->assembled = PETSC_TRUE;
129040192850SHong Zhang     abt->Bt_den         = Bt_dense;
12912128a86cSHong Zhang 
12929566063dSJacob Faibussowitsch     PetscCall(MatCreate(PETSC_COMM_SELF, &C_dense));
12939566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(C_dense, A->rmap->n, matcoloring->ncolors, A->rmap->n, matcoloring->ncolors));
12949566063dSJacob Faibussowitsch     PetscCall(MatSetType(C_dense, MATSEQDENSE));
12959566063dSJacob Faibussowitsch     PetscCall(MatSeqDenseSetPreallocation(C_dense, NULL));
12962205254eSKarl Rupp 
12972128a86cSHong Zhang     Bt_dense->assembled = PETSC_TRUE;
129840192850SHong Zhang     abt->ABt_den        = C_dense;
1299f94ccd6cSHong Zhang 
1300f94ccd6cSHong Zhang #if defined(PETSC_USE_INFO)
13011ce71dffSSatish Balay     {
13024222ddf1SHong Zhang       Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data;
13039371c9d4SSatish Balay       PetscCall(PetscInfo(C, "Use coloring of C=A*B^T; B^T: %" PetscInt_FMT " %" PetscInt_FMT ", Bt_dense: %" PetscInt_FMT ",%" PetscInt_FMT "; Cnz %" PetscInt_FMT " / (cm*ncolors %" PetscInt_FMT ") = %g\n", B->cmap->n, B->rmap->n, Bt_dense->rmap->n,
13049371c9d4SSatish Balay                           Bt_dense->cmap->n, c->nz, A->rmap->n * matcoloring->ncolors, (double)(((PetscReal)(c->nz)) / ((PetscReal)(A->rmap->n * matcoloring->ncolors)))));
13051ce71dffSSatish Balay     }
1306f94ccd6cSHong Zhang #endif
13072128a86cSHong Zhang   }
1308e99be685SHong Zhang   /* clean up */
13099566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&Bt));
13105df89d91SHong Zhang   PetscFunctionReturn(0);
13115df89d91SHong Zhang }
13125df89d91SHong Zhang 
1313d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C)
1314d71ae5a4SJacob Faibussowitsch {
13155df89d91SHong Zhang   Mat_SeqAIJ          *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data;
1316e2cac8caSJed Brown   PetscInt            *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, anzi, bnzj, nexta, nextb, *acol, *bcol, brow;
131781d82fe4SHong Zhang   PetscInt             cm = C->rmap->n, *ci = c->i, *cj = c->j, i, j, cnzi, *ccol;
13185df89d91SHong Zhang   PetscLogDouble       flops = 0.0;
1319aa1aec99SHong Zhang   MatScalar           *aa = a->a, *aval, *ba = b->a, *bval, *ca, *cval;
13206718818eSStefano Zampini   Mat_MatMatTransMult *abt;
13216718818eSStefano Zampini   Mat_Product         *product = C->product;
13225df89d91SHong Zhang 
13235df89d91SHong Zhang   PetscFunctionBegin;
132428b400f6SJacob Faibussowitsch   PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
13256718818eSStefano Zampini   abt = (Mat_MatMatTransMult *)product->data;
132628b400f6SJacob Faibussowitsch   PetscCheck(abt, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
132758ed3ceaSHong Zhang   /* clear old values in C */
132858ed3ceaSHong Zhang   if (!c->a) {
13299566063dSJacob Faibussowitsch     PetscCall(PetscCalloc1(ci[cm] + 1, &ca));
133058ed3ceaSHong Zhang     c->a      = ca;
133158ed3ceaSHong Zhang     c->free_a = PETSC_TRUE;
133258ed3ceaSHong Zhang   } else {
133358ed3ceaSHong Zhang     ca = c->a;
13349566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(ca, ci[cm] + 1));
133558ed3ceaSHong Zhang   }
133658ed3ceaSHong Zhang 
133740192850SHong Zhang   if (abt->usecoloring) {
133840192850SHong Zhang     MatTransposeColoring matcoloring = abt->matcoloring;
133940192850SHong Zhang     Mat                  Bt_dense, C_dense = abt->ABt_den;
1340c8db22f6SHong Zhang 
1341b9af6bddSHong Zhang     /* Get Bt_dense by Apply MatTransposeColoring to B */
134240192850SHong Zhang     Bt_dense = abt->Bt_den;
13439566063dSJacob Faibussowitsch     PetscCall(MatTransColoringApplySpToDen(matcoloring, B, Bt_dense));
1344c8db22f6SHong Zhang 
1345c8db22f6SHong Zhang     /* C_dense = A*Bt_dense */
13469566063dSJacob Faibussowitsch     PetscCall(MatMatMultNumeric_SeqAIJ_SeqDense(A, Bt_dense, C_dense));
1347c8db22f6SHong Zhang 
13482128a86cSHong Zhang     /* Recover C from C_dense */
13499566063dSJacob Faibussowitsch     PetscCall(MatTransColoringApplyDenToSp(matcoloring, C_dense, C));
1350c8db22f6SHong Zhang     PetscFunctionReturn(0);
1351c8db22f6SHong Zhang   }
1352c8db22f6SHong Zhang 
13531fa1209cSHong Zhang   for (i = 0; i < cm; i++) {
135481d82fe4SHong Zhang     anzi = ai[i + 1] - ai[i];
135581d82fe4SHong Zhang     acol = aj + ai[i];
13566973769bSHong Zhang     aval = aa + ai[i];
13571fa1209cSHong Zhang     cnzi = ci[i + 1] - ci[i];
13581fa1209cSHong Zhang     ccol = cj + ci[i];
13596973769bSHong Zhang     cval = ca + ci[i];
13601fa1209cSHong Zhang     for (j = 0; j < cnzi; j++) {
136181d82fe4SHong Zhang       brow = ccol[j];
136281d82fe4SHong Zhang       bnzj = bi[brow + 1] - bi[brow];
136381d82fe4SHong Zhang       bcol = bj + bi[brow];
13646973769bSHong Zhang       bval = ba + bi[brow];
13656973769bSHong Zhang 
136681d82fe4SHong Zhang       /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
13679371c9d4SSatish Balay       nexta = 0;
13689371c9d4SSatish Balay       nextb = 0;
136981d82fe4SHong Zhang       while (nexta < anzi && nextb < bnzj) {
13707b6d5e96SMark Adams         while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++;
137181d82fe4SHong Zhang         if (nexta == anzi) break;
13727b6d5e96SMark Adams         while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++;
137381d82fe4SHong Zhang         if (nextb == bnzj) break;
137481d82fe4SHong Zhang         if (acol[nexta] == bcol[nextb]) {
13756973769bSHong Zhang           cval[j] += aval[nexta] * bval[nextb];
13769371c9d4SSatish Balay           nexta++;
13779371c9d4SSatish Balay           nextb++;
137881d82fe4SHong Zhang           flops += 2;
13791fa1209cSHong Zhang         }
13801fa1209cSHong Zhang       }
138181d82fe4SHong Zhang     }
138281d82fe4SHong Zhang   }
13839566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
13849566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
13859566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(flops));
13865df89d91SHong Zhang   PetscFunctionReturn(0);
13875df89d91SHong Zhang }
13885df89d91SHong Zhang 
1389d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(void *data)
1390d71ae5a4SJacob Faibussowitsch {
13916718818eSStefano Zampini   Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)data;
13926d373c3eSHong Zhang 
13936d373c3eSHong Zhang   PetscFunctionBegin;
13949566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&atb->At));
13951baa6e33SBarry Smith   if (atb->destroy) PetscCall((*atb->destroy)(atb->data));
13969566063dSJacob Faibussowitsch   PetscCall(PetscFree(atb));
13976d373c3eSHong Zhang   PetscFunctionReturn(0);
13986d373c3eSHong Zhang }
13996d373c3eSHong Zhang 
1400d71ae5a4SJacob Faibussowitsch PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C)
1401d71ae5a4SJacob Faibussowitsch {
1402089a957eSStefano Zampini   Mat          At      = NULL;
14034222ddf1SHong Zhang   Mat_Product *product = C->product;
1404089a957eSStefano Zampini   PetscBool    flg, def, square;
1405bc011b1eSHong Zhang 
1406bc011b1eSHong Zhang   PetscFunctionBegin;
1407089a957eSStefano Zampini   MatCheckProduct(C, 4);
1408b94d7dedSBarry Smith   square = (PetscBool)(A == B && A->symmetric == PETSC_BOOL3_TRUE);
14094222ddf1SHong Zhang   /* outerproduct */
14109566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "outerproduct", &flg));
14114222ddf1SHong Zhang   if (flg) {
1412bc011b1eSHong Zhang     /* create symbolic At */
1413089a957eSStefano Zampini     if (!square) {
14147fb60732SBarry Smith       PetscCall(MatTransposeSymbolic(A, &At));
14159566063dSJacob Faibussowitsch       PetscCall(MatSetBlockSizes(At, PetscAbs(A->cmap->bs), PetscAbs(B->cmap->bs)));
14169566063dSJacob Faibussowitsch       PetscCall(MatSetType(At, ((PetscObject)A)->type_name));
1417089a957eSStefano Zampini     }
1418bc011b1eSHong Zhang     /* get symbolic C=At*B */
14199566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C, "sorted"));
14209566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C));
1421bc011b1eSHong Zhang 
1422bc011b1eSHong Zhang     /* clean up */
142348a46eb9SPierre Jolivet     if (!square) PetscCall(MatDestroy(&At));
14246d373c3eSHong Zhang 
14254222ddf1SHong Zhang     C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */
14269566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C, "outerproduct"));
14274222ddf1SHong Zhang     PetscFunctionReturn(0);
14284222ddf1SHong Zhang   }
14294222ddf1SHong Zhang 
14304222ddf1SHong Zhang   /* matmatmult */
14319566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "default", &def));
14329566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "at*b", &flg));
1433089a957eSStefano Zampini   if (flg || def) {
14344222ddf1SHong Zhang     Mat_MatTransMatMult *atb;
14354222ddf1SHong Zhang 
143628b400f6SJacob Faibussowitsch     PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty");
14379566063dSJacob Faibussowitsch     PetscCall(PetscNew(&atb));
143848a46eb9SPierre Jolivet     if (!square) PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &At));
14399566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C, "sorted"));
14409566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C));
14419566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C, "at*b"));
14426718818eSStefano Zampini     product->data    = atb;
14436718818eSStefano Zampini     product->destroy = MatDestroy_SeqAIJ_MatTransMatMult;
14444222ddf1SHong Zhang     atb->At          = At;
14454222ddf1SHong Zhang 
14464222ddf1SHong Zhang     C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */
14474222ddf1SHong Zhang     PetscFunctionReturn(0);
14484222ddf1SHong Zhang   }
14494222ddf1SHong Zhang 
14504222ddf1SHong Zhang   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported");
1451bc011b1eSHong Zhang }
1452bc011b1eSHong Zhang 
1453d71ae5a4SJacob Faibussowitsch PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C)
1454d71ae5a4SJacob Faibussowitsch {
14550fbc74f4SHong Zhang   Mat_SeqAIJ    *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data;
1456d0f46423SBarry Smith   PetscInt       am = A->rmap->n, anzi, *ai = a->i, *aj = a->j, *bi = b->i, *bj, bnzi, nextb;
1457d0f46423SBarry Smith   PetscInt       cm = C->rmap->n, *ci = c->i, *cj = c->j, crow, *cjj, i, j, k;
1458d13dce4bSSatish Balay   PetscLogDouble flops = 0.0;
1459aa1aec99SHong Zhang   MatScalar     *aa    = a->a, *ba, *ca, *caj;
1460bc011b1eSHong Zhang 
1461bc011b1eSHong Zhang   PetscFunctionBegin;
1462aa1aec99SHong Zhang   if (!c->a) {
14639566063dSJacob Faibussowitsch     PetscCall(PetscCalloc1(ci[cm] + 1, &ca));
14642205254eSKarl Rupp 
1465aa1aec99SHong Zhang     c->a      = ca;
1466aa1aec99SHong Zhang     c->free_a = PETSC_TRUE;
1467aa1aec99SHong Zhang   } else {
1468aa1aec99SHong Zhang     ca = c->a;
14699566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(ca, ci[cm]));
1470aa1aec99SHong Zhang   }
1471bc011b1eSHong Zhang 
1472bc011b1eSHong Zhang   /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
1473bc011b1eSHong Zhang   for (i = 0; i < am; i++) {
1474bc011b1eSHong Zhang     bj   = b->j + bi[i];
1475bc011b1eSHong Zhang     ba   = b->a + bi[i];
1476bc011b1eSHong Zhang     bnzi = bi[i + 1] - bi[i];
1477bc011b1eSHong Zhang     anzi = ai[i + 1] - ai[i];
1478bc011b1eSHong Zhang     for (j = 0; j < anzi; j++) {
1479bc011b1eSHong Zhang       nextb = 0;
14800fbc74f4SHong Zhang       crow  = *aj++;
1481bc011b1eSHong Zhang       cjj   = cj + ci[crow];
1482bc011b1eSHong Zhang       caj   = ca + ci[crow];
1483bc011b1eSHong Zhang       /* perform sparse axpy operation.  Note cjj includes bj. */
1484bc011b1eSHong Zhang       for (k = 0; nextb < bnzi; k++) {
14850fbc74f4SHong Zhang         if (cjj[k] == *(bj + nextb)) { /* ccol == bcol */
14860fbc74f4SHong Zhang           caj[k] += (*aa) * (*(ba + nextb));
1487bc011b1eSHong Zhang           nextb++;
1488bc011b1eSHong Zhang         }
1489bc011b1eSHong Zhang       }
1490bc011b1eSHong Zhang       flops += 2 * bnzi;
14910fbc74f4SHong Zhang       aa++;
1492bc011b1eSHong Zhang     }
1493bc011b1eSHong Zhang   }
1494bc011b1eSHong Zhang 
1495bc011b1eSHong Zhang   /* Assemble the final matrix and clean up */
14969566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
14979566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
14989566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(flops));
1499bc011b1eSHong Zhang   PetscFunctionReturn(0);
1500bc011b1eSHong Zhang }
15019123193aSHong Zhang 
1502d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
1503d71ae5a4SJacob Faibussowitsch {
15049123193aSHong Zhang   PetscFunctionBegin;
15059566063dSJacob Faibussowitsch   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
15064222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense;
15079123193aSHong Zhang   PetscFunctionReturn(0);
15089123193aSHong Zhang }
15099123193aSHong Zhang 
1510d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A, Mat B, Mat C, const PetscBool add)
1511d71ae5a4SJacob Faibussowitsch {
1512f32d5d43SBarry Smith   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
15131ca9667aSStefano Zampini   PetscScalar       *c, r1, r2, r3, r4, *c1, *c2, *c3, *c4;
1514a4af7ca8SStefano Zampini   const PetscScalar *aa, *b, *b1, *b2, *b3, *b4, *av;
1515daf57211SHong Zhang   const PetscInt    *aj;
151675f6d85dSStefano Zampini   PetscInt           cm = C->rmap->n, cn = B->cmap->n, bm, am = A->rmap->n;
151775f6d85dSStefano Zampini   PetscInt           clda;
151875f6d85dSStefano Zampini   PetscInt           am4, bm4, col, i, j, n;
15199123193aSHong Zhang 
15209123193aSHong Zhang   PetscFunctionBegin;
1521f32d5d43SBarry Smith   if (!cm || !cn) PetscFunctionReturn(0);
15229566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJGetArrayRead(A, &av));
152393aa15f2SStefano Zampini   if (add) {
15249566063dSJacob Faibussowitsch     PetscCall(MatDenseGetArray(C, &c));
152593aa15f2SStefano Zampini   } else {
15269566063dSJacob Faibussowitsch     PetscCall(MatDenseGetArrayWrite(C, &c));
152793aa15f2SStefano Zampini   }
15289566063dSJacob Faibussowitsch   PetscCall(MatDenseGetArrayRead(B, &b));
15299566063dSJacob Faibussowitsch   PetscCall(MatDenseGetLDA(B, &bm));
15309566063dSJacob Faibussowitsch   PetscCall(MatDenseGetLDA(C, &clda));
153175f6d85dSStefano Zampini   am4 = 4 * clda;
153275f6d85dSStefano Zampini   bm4 = 4 * bm;
15339371c9d4SSatish Balay   b1  = b;
15349371c9d4SSatish Balay   b2  = b1 + bm;
15359371c9d4SSatish Balay   b3  = b2 + bm;
15369371c9d4SSatish Balay   b4  = b3 + bm;
15379371c9d4SSatish Balay   c1  = c;
15389371c9d4SSatish Balay   c2  = c1 + clda;
15399371c9d4SSatish Balay   c3  = c2 + clda;
15409371c9d4SSatish Balay   c4  = c3 + clda;
15411ca9667aSStefano Zampini   for (col = 0; col < (cn / 4) * 4; col += 4) { /* over columns of C */
15421ca9667aSStefano Zampini     for (i = 0; i < am; i++) {                  /* over rows of A in those columns */
1543f32d5d43SBarry Smith       r1 = r2 = r3 = r4 = 0.0;
1544f32d5d43SBarry Smith       n                 = a->i[i + 1] - a->i[i];
1545f32d5d43SBarry Smith       aj                = a->j + a->i[i];
1546a4af7ca8SStefano Zampini       aa                = av + a->i[i];
1547f32d5d43SBarry Smith       for (j = 0; j < n; j++) {
15481ca9667aSStefano Zampini         const PetscScalar aatmp = aa[j];
15491ca9667aSStefano Zampini         const PetscInt    ajtmp = aj[j];
1550730858b9SHong Zhang         r1 += aatmp * b1[ajtmp];
1551730858b9SHong Zhang         r2 += aatmp * b2[ajtmp];
1552730858b9SHong Zhang         r3 += aatmp * b3[ajtmp];
1553730858b9SHong Zhang         r4 += aatmp * b4[ajtmp];
15549123193aSHong Zhang       }
155593aa15f2SStefano Zampini       if (add) {
155687753ddeSHong Zhang         c1[i] += r1;
155787753ddeSHong Zhang         c2[i] += r2;
155887753ddeSHong Zhang         c3[i] += r3;
155987753ddeSHong Zhang         c4[i] += r4;
156093aa15f2SStefano Zampini       } else {
156193aa15f2SStefano Zampini         c1[i] = r1;
156293aa15f2SStefano Zampini         c2[i] = r2;
156393aa15f2SStefano Zampini         c3[i] = r3;
156493aa15f2SStefano Zampini         c4[i] = r4;
156593aa15f2SStefano Zampini       }
1566f32d5d43SBarry Smith     }
15679371c9d4SSatish Balay     b1 += bm4;
15689371c9d4SSatish Balay     b2 += bm4;
15699371c9d4SSatish Balay     b3 += bm4;
15709371c9d4SSatish Balay     b4 += bm4;
15719371c9d4SSatish Balay     c1 += am4;
15729371c9d4SSatish Balay     c2 += am4;
15739371c9d4SSatish Balay     c3 += am4;
15749371c9d4SSatish Balay     c4 += am4;
1575f32d5d43SBarry Smith   }
157693aa15f2SStefano Zampini   /* process remaining columns */
157793aa15f2SStefano Zampini   if (col != cn) {
157893aa15f2SStefano Zampini     PetscInt rc = cn - col;
157993aa15f2SStefano Zampini 
158093aa15f2SStefano Zampini     if (rc == 1) {
158193aa15f2SStefano Zampini       for (i = 0; i < am; i++) {
1582f32d5d43SBarry Smith         r1 = 0.0;
1583f32d5d43SBarry Smith         n  = a->i[i + 1] - a->i[i];
1584f32d5d43SBarry Smith         aj = a->j + a->i[i];
1585a4af7ca8SStefano Zampini         aa = av + a->i[i];
158693aa15f2SStefano Zampini         for (j = 0; j < n; j++) r1 += aa[j] * b1[aj[j]];
158793aa15f2SStefano Zampini         if (add) c1[i] += r1;
158893aa15f2SStefano Zampini         else c1[i] = r1;
158993aa15f2SStefano Zampini       }
159093aa15f2SStefano Zampini     } else if (rc == 2) {
159193aa15f2SStefano Zampini       for (i = 0; i < am; i++) {
159293aa15f2SStefano Zampini         r1 = r2 = 0.0;
159393aa15f2SStefano Zampini         n       = a->i[i + 1] - a->i[i];
159493aa15f2SStefano Zampini         aj      = a->j + a->i[i];
159593aa15f2SStefano Zampini         aa      = av + a->i[i];
1596f32d5d43SBarry Smith         for (j = 0; j < n; j++) {
159793aa15f2SStefano Zampini           const PetscScalar aatmp = aa[j];
159893aa15f2SStefano Zampini           const PetscInt    ajtmp = aj[j];
159993aa15f2SStefano Zampini           r1 += aatmp * b1[ajtmp];
160093aa15f2SStefano Zampini           r2 += aatmp * b2[ajtmp];
1601f32d5d43SBarry Smith         }
160293aa15f2SStefano Zampini         if (add) {
160387753ddeSHong Zhang           c1[i] += r1;
160493aa15f2SStefano Zampini           c2[i] += r2;
160593aa15f2SStefano Zampini         } else {
160693aa15f2SStefano Zampini           c1[i] = r1;
160793aa15f2SStefano Zampini           c2[i] = r2;
1608f32d5d43SBarry Smith         }
160993aa15f2SStefano Zampini       }
161093aa15f2SStefano Zampini     } else {
161193aa15f2SStefano Zampini       for (i = 0; i < am; i++) {
161293aa15f2SStefano Zampini         r1 = r2 = r3 = 0.0;
161393aa15f2SStefano Zampini         n            = a->i[i + 1] - a->i[i];
161493aa15f2SStefano Zampini         aj           = a->j + a->i[i];
161593aa15f2SStefano Zampini         aa           = av + a->i[i];
161693aa15f2SStefano Zampini         for (j = 0; j < n; j++) {
161793aa15f2SStefano Zampini           const PetscScalar aatmp = aa[j];
161893aa15f2SStefano Zampini           const PetscInt    ajtmp = aj[j];
161993aa15f2SStefano Zampini           r1 += aatmp * b1[ajtmp];
162093aa15f2SStefano Zampini           r2 += aatmp * b2[ajtmp];
162193aa15f2SStefano Zampini           r3 += aatmp * b3[ajtmp];
162293aa15f2SStefano Zampini         }
162393aa15f2SStefano Zampini         if (add) {
162493aa15f2SStefano Zampini           c1[i] += r1;
162593aa15f2SStefano Zampini           c2[i] += r2;
162693aa15f2SStefano Zampini           c3[i] += r3;
162793aa15f2SStefano Zampini         } else {
162893aa15f2SStefano Zampini           c1[i] = r1;
162993aa15f2SStefano Zampini           c2[i] = r2;
163093aa15f2SStefano Zampini           c3[i] = r3;
163193aa15f2SStefano Zampini         }
163293aa15f2SStefano Zampini       }
163393aa15f2SStefano Zampini     }
1634f32d5d43SBarry Smith   }
16359566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(cn * (2.0 * a->nz)));
163693aa15f2SStefano Zampini   if (add) {
16379566063dSJacob Faibussowitsch     PetscCall(MatDenseRestoreArray(C, &c));
163893aa15f2SStefano Zampini   } else {
16399566063dSJacob Faibussowitsch     PetscCall(MatDenseRestoreArrayWrite(C, &c));
164093aa15f2SStefano Zampini   }
16419566063dSJacob Faibussowitsch   PetscCall(MatDenseRestoreArrayRead(B, &b));
16429566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJRestoreArrayRead(A, &av));
1643f32d5d43SBarry Smith   PetscFunctionReturn(0);
1644f32d5d43SBarry Smith }
1645f32d5d43SBarry Smith 
1646d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A, Mat B, Mat C)
1647d71ae5a4SJacob Faibussowitsch {
1648f32d5d43SBarry Smith   PetscFunctionBegin;
164908401ef6SPierre Jolivet   PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
165008401ef6SPierre Jolivet   PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
165108401ef6SPierre Jolivet   PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
16524324174eSBarry Smith 
16539566063dSJacob Faibussowitsch   PetscCall(MatMatMultNumericAdd_SeqAIJ_SeqDense(A, B, C, PETSC_FALSE));
16549123193aSHong Zhang   PetscFunctionReturn(0);
16559123193aSHong Zhang }
1656b1683b59SHong Zhang 
16574222ddf1SHong Zhang /* ------------------------------------------------------- */
1658d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C)
1659d71ae5a4SJacob Faibussowitsch {
16604222ddf1SHong Zhang   PetscFunctionBegin;
16614222ddf1SHong Zhang   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense;
16624222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
16634222ddf1SHong Zhang   PetscFunctionReturn(0);
16644222ddf1SHong Zhang }
16654222ddf1SHong Zhang 
16666718818eSStefano Zampini PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat, Mat, PetscReal, Mat);
16676718818eSStefano Zampini 
1668d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C)
1669d71ae5a4SJacob Faibussowitsch {
16704222ddf1SHong Zhang   PetscFunctionBegin;
16716718818eSStefano Zampini   C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
16724222ddf1SHong Zhang   C->ops->productsymbolic          = MatProductSymbolic_AtB;
16736718818eSStefano Zampini   PetscFunctionReturn(0);
16746718818eSStefano Zampini }
16756718818eSStefano Zampini 
1676d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C)
1677d71ae5a4SJacob Faibussowitsch {
16786718818eSStefano Zampini   PetscFunctionBegin;
16796718818eSStefano Zampini   C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
16806718818eSStefano Zampini   C->ops->productsymbolic          = MatProductSymbolic_ABt;
16814222ddf1SHong Zhang   PetscFunctionReturn(0);
16824222ddf1SHong Zhang }
16834222ddf1SHong Zhang 
1684d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C)
1685d71ae5a4SJacob Faibussowitsch {
16864222ddf1SHong Zhang   Mat_Product *product = C->product;
16874222ddf1SHong Zhang 
16884222ddf1SHong Zhang   PetscFunctionBegin;
16894222ddf1SHong Zhang   switch (product->type) {
1690d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_AB:
1691d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C));
1692d71ae5a4SJacob Faibussowitsch     break;
1693d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_AtB:
1694d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C));
1695d71ae5a4SJacob Faibussowitsch     break;
1696d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_ABt:
1697d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C));
1698d71ae5a4SJacob Faibussowitsch     break;
1699d71ae5a4SJacob Faibussowitsch   default:
1700d71ae5a4SJacob Faibussowitsch     break;
17014222ddf1SHong Zhang   }
17024222ddf1SHong Zhang   PetscFunctionReturn(0);
17034222ddf1SHong Zhang }
17044222ddf1SHong Zhang /* ------------------------------------------------------- */
1705d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C)
1706d71ae5a4SJacob Faibussowitsch {
17074222ddf1SHong Zhang   Mat_Product *product = C->product;
17084222ddf1SHong Zhang   Mat          A       = product->A;
17094222ddf1SHong Zhang   PetscBool    baij;
17104222ddf1SHong Zhang 
17114222ddf1SHong Zhang   PetscFunctionBegin;
17129566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQBAIJ, &baij));
17134222ddf1SHong Zhang   if (!baij) { /* A is seqsbaij */
17144222ddf1SHong Zhang     PetscBool sbaij;
17159566063dSJacob Faibussowitsch     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQSBAIJ, &sbaij));
171628b400f6SJacob Faibussowitsch     PetscCheck(sbaij, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "Mat must be either seqbaij or seqsbaij format");
17174222ddf1SHong Zhang 
17184222ddf1SHong Zhang     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense;
17194222ddf1SHong Zhang   } else { /* A is seqbaij */
17204222ddf1SHong Zhang     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense;
17214222ddf1SHong Zhang   }
17224222ddf1SHong Zhang 
17234222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
17244222ddf1SHong Zhang   PetscFunctionReturn(0);
17254222ddf1SHong Zhang }
17264222ddf1SHong Zhang 
1727d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C)
1728d71ae5a4SJacob Faibussowitsch {
17294222ddf1SHong Zhang   Mat_Product *product = C->product;
17304222ddf1SHong Zhang 
17314222ddf1SHong Zhang   PetscFunctionBegin;
17326718818eSStefano Zampini   MatCheckProduct(C, 1);
173328b400f6SJacob Faibussowitsch   PetscCheck(product->A, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing A");
1734b94d7dedSBarry Smith   if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric == PETSC_BOOL3_TRUE)) PetscCall(MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C));
17354222ddf1SHong Zhang   PetscFunctionReturn(0);
17364222ddf1SHong Zhang }
17376718818eSStefano Zampini 
17384222ddf1SHong Zhang /* ------------------------------------------------------- */
1739d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C)
1740d71ae5a4SJacob Faibussowitsch {
17414222ddf1SHong Zhang   PetscFunctionBegin;
17424222ddf1SHong Zhang   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ;
17434222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
17444222ddf1SHong Zhang   PetscFunctionReturn(0);
17454222ddf1SHong Zhang }
17464222ddf1SHong Zhang 
1747d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C)
1748d71ae5a4SJacob Faibussowitsch {
17494222ddf1SHong Zhang   Mat_Product *product = C->product;
17504222ddf1SHong Zhang 
17514222ddf1SHong Zhang   PetscFunctionBegin;
175248a46eb9SPierre Jolivet   if (product->type == MATPRODUCT_AB) PetscCall(MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C));
17534222ddf1SHong Zhang   PetscFunctionReturn(0);
17544222ddf1SHong Zhang }
17554222ddf1SHong Zhang /* ------------------------------------------------------- */
17564222ddf1SHong Zhang 
1757d71ae5a4SJacob Faibussowitsch PetscErrorCode MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring, Mat B, Mat Btdense)
1758d71ae5a4SJacob Faibussowitsch {
17592f5f1f90SHong Zhang   Mat_SeqAIJ   *b       = (Mat_SeqAIJ *)B->data;
17602f5f1f90SHong Zhang   Mat_SeqDense *btdense = (Mat_SeqDense *)Btdense->data;
17612f5f1f90SHong Zhang   PetscInt     *bi = b->i, *bj = b->j;
17622f5f1f90SHong Zhang   PetscInt      m = Btdense->rmap->n, n = Btdense->cmap->n, j, k, l, col, anz, *btcol, brow, ncolumns;
17632f5f1f90SHong Zhang   MatScalar    *btval, *btval_den, *ba = b->a;
17642f5f1f90SHong Zhang   PetscInt     *columns = coloring->columns, *colorforcol = coloring->colorforcol, ncolors = coloring->ncolors;
1765c8db22f6SHong Zhang 
1766c8db22f6SHong Zhang   PetscFunctionBegin;
17672f5f1f90SHong Zhang   btval_den = btdense->v;
17689566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(btval_den, m * n));
17692f5f1f90SHong Zhang   for (k = 0; k < ncolors; k++) {
17702f5f1f90SHong Zhang     ncolumns = coloring->ncolumns[k];
17712f5f1f90SHong Zhang     for (l = 0; l < ncolumns; l++) { /* insert a row of B to a column of Btdense */
1772d2853540SHong Zhang       col   = *(columns + colorforcol[k] + l);
17732f5f1f90SHong Zhang       btcol = bj + bi[col];
17742f5f1f90SHong Zhang       btval = ba + bi[col];
17752f5f1f90SHong Zhang       anz   = bi[col + 1] - bi[col];
1776d2853540SHong Zhang       for (j = 0; j < anz; j++) {
17772f5f1f90SHong Zhang         brow            = btcol[j];
17782f5f1f90SHong Zhang         btval_den[brow] = btval[j];
1779c8db22f6SHong Zhang       }
1780c8db22f6SHong Zhang     }
17812f5f1f90SHong Zhang     btval_den += m;
1782c8db22f6SHong Zhang   }
1783c8db22f6SHong Zhang   PetscFunctionReturn(0);
1784c8db22f6SHong Zhang }
1785c8db22f6SHong Zhang 
1786d71ae5a4SJacob Faibussowitsch PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring, Mat Cden, Mat Csp)
1787d71ae5a4SJacob Faibussowitsch {
1788b2d2b04fSHong Zhang   Mat_SeqAIJ        *csp = (Mat_SeqAIJ *)Csp->data;
17891683a169SBarry Smith   const PetscScalar *ca_den, *ca_den_ptr;
17901683a169SBarry Smith   PetscScalar       *ca = csp->a;
1791f99a636bSHong Zhang   PetscInt           k, l, m = Cden->rmap->n, ncolors = matcoloring->ncolors;
1792e88777f2SHong Zhang   PetscInt           brows = matcoloring->brows, *den2sp = matcoloring->den2sp;
1793077f23c2SHong Zhang   PetscInt           nrows, *row, *idx;
1794077f23c2SHong Zhang   PetscInt          *rows = matcoloring->rows, *colorforrow = matcoloring->colorforrow;
17958972f759SHong Zhang 
17968972f759SHong Zhang   PetscFunctionBegin;
17979566063dSJacob Faibussowitsch   PetscCall(MatDenseGetArrayRead(Cden, &ca_den));
1798f99a636bSHong Zhang 
1799077f23c2SHong Zhang   if (brows > 0) {
1800077f23c2SHong Zhang     PetscInt *lstart, row_end, row_start;
1801980ae229SHong Zhang     lstart = matcoloring->lstart;
18029566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(lstart, ncolors));
1803eeb4fd02SHong Zhang 
1804077f23c2SHong Zhang     row_end = brows;
1805eeb4fd02SHong Zhang     if (row_end > m) row_end = m;
1806077f23c2SHong Zhang     for (row_start = 0; row_start < m; row_start += brows) { /* loop over row blocks of Csp */
1807077f23c2SHong Zhang       ca_den_ptr = ca_den;
1808980ae229SHong Zhang       for (k = 0; k < ncolors; k++) { /* loop over colors (columns of Cden) */
1809eeb4fd02SHong Zhang         nrows = matcoloring->nrows[k];
1810eeb4fd02SHong Zhang         row   = rows + colorforrow[k];
1811eeb4fd02SHong Zhang         idx   = den2sp + colorforrow[k];
1812eeb4fd02SHong Zhang         for (l = lstart[k]; l < nrows; l++) {
1813eeb4fd02SHong Zhang           if (row[l] >= row_end) {
1814eeb4fd02SHong Zhang             lstart[k] = l;
1815eeb4fd02SHong Zhang             break;
1816eeb4fd02SHong Zhang           } else {
1817077f23c2SHong Zhang             ca[idx[l]] = ca_den_ptr[row[l]];
1818eeb4fd02SHong Zhang           }
1819eeb4fd02SHong Zhang         }
1820077f23c2SHong Zhang         ca_den_ptr += m;
1821eeb4fd02SHong Zhang       }
1822077f23c2SHong Zhang       row_end += brows;
1823eeb4fd02SHong Zhang       if (row_end > m) row_end = m;
1824eeb4fd02SHong Zhang     }
1825077f23c2SHong Zhang   } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */
1826077f23c2SHong Zhang     ca_den_ptr = ca_den;
1827077f23c2SHong Zhang     for (k = 0; k < ncolors; k++) {
1828077f23c2SHong Zhang       nrows = matcoloring->nrows[k];
1829077f23c2SHong Zhang       row   = rows + colorforrow[k];
1830077f23c2SHong Zhang       idx   = den2sp + colorforrow[k];
1831ad540459SPierre Jolivet       for (l = 0; l < nrows; l++) ca[idx[l]] = ca_den_ptr[row[l]];
1832077f23c2SHong Zhang       ca_den_ptr += m;
1833077f23c2SHong Zhang     }
1834f99a636bSHong Zhang   }
1835f99a636bSHong Zhang 
18369566063dSJacob Faibussowitsch   PetscCall(MatDenseRestoreArrayRead(Cden, &ca_den));
1837f99a636bSHong Zhang #if defined(PETSC_USE_INFO)
1838077f23c2SHong Zhang   if (matcoloring->brows > 0) {
18399566063dSJacob Faibussowitsch     PetscCall(PetscInfo(Csp, "Loop over %" PetscInt_FMT " row blocks for den2sp\n", brows));
1840e88777f2SHong Zhang   } else {
18419566063dSJacob Faibussowitsch     PetscCall(PetscInfo(Csp, "Loop over colors/columns of Cden, inefficient for large sparse matrix product \n"));
1842e88777f2SHong Zhang   }
1843f99a636bSHong Zhang #endif
18448972f759SHong Zhang   PetscFunctionReturn(0);
18458972f759SHong Zhang }
18468972f759SHong Zhang 
1847d71ae5a4SJacob Faibussowitsch PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat, ISColoring iscoloring, MatTransposeColoring c)
1848d71ae5a4SJacob Faibussowitsch {
1849e88777f2SHong Zhang   PetscInt        i, n, nrows, Nbs, j, k, m, ncols, col, cm;
18501a83f524SJed Brown   const PetscInt *is, *ci, *cj, *row_idx;
1851b28d80bdSHong Zhang   PetscInt        nis = iscoloring->n, *rowhit, bs = 1;
1852b1683b59SHong Zhang   IS             *isa;
1853b28d80bdSHong Zhang   Mat_SeqAIJ     *csp = (Mat_SeqAIJ *)mat->data;
1854e88777f2SHong Zhang   PetscInt       *colorforrow, *rows, *rows_i, *idxhit, *spidx, *den2sp, *den2sp_i;
1855e88777f2SHong Zhang   PetscInt       *colorforcol, *columns, *columns_i, brows;
1856e88777f2SHong Zhang   PetscBool       flg;
1857b1683b59SHong Zhang 
1858b1683b59SHong Zhang   PetscFunctionBegin;
18599566063dSJacob Faibussowitsch   PetscCall(ISColoringGetIS(iscoloring, PETSC_USE_POINTER, PETSC_IGNORE, &isa));
1860e99be685SHong Zhang 
18617ecccc15SHong Zhang   /* bs >1 is not being tested yet! */
1862e88777f2SHong Zhang   Nbs       = mat->cmap->N / bs;
1863b1683b59SHong Zhang   c->M      = mat->rmap->N / bs; /* set total rows, columns and local rows */
1864e88777f2SHong Zhang   c->N      = Nbs;
1865e88777f2SHong Zhang   c->m      = c->M;
1866b1683b59SHong Zhang   c->rstart = 0;
1867e88777f2SHong Zhang   c->brows  = 100;
1868b1683b59SHong Zhang 
1869b1683b59SHong Zhang   c->ncolors = nis;
18709566063dSJacob Faibussowitsch   PetscCall(PetscMalloc3(nis, &c->ncolumns, nis, &c->nrows, nis + 1, &colorforrow));
18719566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(csp->nz + 1, &rows));
18729566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(csp->nz + 1, &den2sp));
1873e88777f2SHong Zhang 
1874e88777f2SHong Zhang   brows = c->brows;
18759566063dSJacob Faibussowitsch   PetscCall(PetscOptionsGetInt(NULL, NULL, "-matden2sp_brows", &brows, &flg));
1876e88777f2SHong Zhang   if (flg) c->brows = brows;
187748a46eb9SPierre Jolivet   if (brows > 0) PetscCall(PetscMalloc1(nis + 1, &c->lstart));
18782205254eSKarl Rupp 
1879d2853540SHong Zhang   colorforrow[0] = 0;
1880d2853540SHong Zhang   rows_i         = rows;
1881f99a636bSHong Zhang   den2sp_i       = den2sp;
1882b1683b59SHong Zhang 
18839566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nis + 1, &colorforcol));
18849566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(Nbs + 1, &columns));
18852205254eSKarl Rupp 
1886d2853540SHong Zhang   colorforcol[0] = 0;
1887d2853540SHong Zhang   columns_i      = columns;
1888d2853540SHong Zhang 
1889077f23c2SHong Zhang   /* get column-wise storage of mat */
18909566063dSJacob Faibussowitsch   PetscCall(MatGetColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL));
1891b1683b59SHong Zhang 
1892b28d80bdSHong Zhang   cm = c->m;
18939566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(cm + 1, &rowhit));
18949566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(cm + 1, &idxhit));
1895f99a636bSHong Zhang   for (i = 0; i < nis; i++) { /* loop over color */
18969566063dSJacob Faibussowitsch     PetscCall(ISGetLocalSize(isa[i], &n));
18979566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(isa[i], &is));
18982205254eSKarl Rupp 
1899b1683b59SHong Zhang     c->ncolumns[i] = n;
19001baa6e33SBarry Smith     if (n) PetscCall(PetscArraycpy(columns_i, is, n));
1901d2853540SHong Zhang     colorforcol[i + 1] = colorforcol[i] + n;
1902d2853540SHong Zhang     columns_i += n;
1903b1683b59SHong Zhang 
1904b1683b59SHong Zhang     /* fast, crude version requires O(N*N) work */
19059566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(rowhit, cm));
1906e99be685SHong Zhang 
1907b7caf3d6SHong Zhang     for (j = 0; j < n; j++) { /* loop over columns*/
1908b1683b59SHong Zhang       col     = is[j];
1909d2853540SHong Zhang       row_idx = cj + ci[col];
1910b1683b59SHong Zhang       m       = ci[col + 1] - ci[col];
1911b7caf3d6SHong Zhang       for (k = 0; k < m; k++) { /* loop over columns marking them in rowhit */
1912e99be685SHong Zhang         idxhit[*row_idx]   = spidx[ci[col] + k];
1913d2853540SHong Zhang         rowhit[*row_idx++] = col + 1;
1914b1683b59SHong Zhang       }
1915b1683b59SHong Zhang     }
1916b1683b59SHong Zhang     /* count the number of hits */
1917b1683b59SHong Zhang     nrows = 0;
1918e8354b3eSHong Zhang     for (j = 0; j < cm; j++) {
1919b1683b59SHong Zhang       if (rowhit[j]) nrows++;
1920b1683b59SHong Zhang     }
1921b1683b59SHong Zhang     c->nrows[i]        = nrows;
1922d2853540SHong Zhang     colorforrow[i + 1] = colorforrow[i] + nrows;
1923d2853540SHong Zhang 
1924b1683b59SHong Zhang     nrows = 0;
1925b7caf3d6SHong Zhang     for (j = 0; j < cm; j++) { /* loop over rows */
1926b1683b59SHong Zhang       if (rowhit[j]) {
1927d2853540SHong Zhang         rows_i[nrows]   = j;
192812b89a43SHong Zhang         den2sp_i[nrows] = idxhit[j];
1929b1683b59SHong Zhang         nrows++;
1930b1683b59SHong Zhang       }
1931b1683b59SHong Zhang     }
1932e88777f2SHong Zhang     den2sp_i += nrows;
1933077f23c2SHong Zhang 
19349566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(isa[i], &is));
1935f99a636bSHong Zhang     rows_i += nrows;
1936b1683b59SHong Zhang   }
19379566063dSJacob Faibussowitsch   PetscCall(MatRestoreColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL));
19389566063dSJacob Faibussowitsch   PetscCall(PetscFree(rowhit));
19399566063dSJacob Faibussowitsch   PetscCall(ISColoringRestoreIS(iscoloring, PETSC_USE_POINTER, &isa));
19402c71b3e2SJacob Faibussowitsch   PetscCheck(csp->nz == colorforrow[nis], PETSC_COMM_SELF, PETSC_ERR_PLIB, "csp->nz %" PetscInt_FMT " != colorforrow[nis] %" PetscInt_FMT, csp->nz, colorforrow[nis]);
1941b28d80bdSHong Zhang 
1942d2853540SHong Zhang   c->colorforrow = colorforrow;
1943d2853540SHong Zhang   c->rows        = rows;
1944f99a636bSHong Zhang   c->den2sp      = den2sp;
1945d2853540SHong Zhang   c->colorforcol = colorforcol;
1946d2853540SHong Zhang   c->columns     = columns;
19472205254eSKarl Rupp 
19489566063dSJacob Faibussowitsch   PetscCall(PetscFree(idxhit));
1949b1683b59SHong Zhang   PetscFunctionReturn(0);
1950b1683b59SHong Zhang }
1951d013fc79Sandi selinger 
19524222ddf1SHong Zhang /* --------------------------------------------------------------- */
1953d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C)
1954d71ae5a4SJacob Faibussowitsch {
19554222ddf1SHong Zhang   Mat_Product *product = C->product;
19564222ddf1SHong Zhang   Mat          A = product->A, B = product->B;
19574222ddf1SHong Zhang 
1958df97dc6dSFande Kong   PetscFunctionBegin;
19594222ddf1SHong Zhang   if (C->ops->mattransposemultnumeric) {
19604222ddf1SHong Zhang     /* Alg: "outerproduct" */
19619566063dSJacob Faibussowitsch     PetscCall((*C->ops->mattransposemultnumeric)(A, B, C));
19624222ddf1SHong Zhang   } else {
19634222ddf1SHong Zhang     /* Alg: "matmatmult" -- C = At*B */
19646718818eSStefano Zampini     Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)product->data;
19654222ddf1SHong Zhang 
196628b400f6SJacob Faibussowitsch     PetscCheck(atb, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
19671cdffd5eSHong Zhang     if (atb->At) {
19681cdffd5eSHong Zhang       /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ();
19691cdffd5eSHong Zhang          user may have called MatProductReplaceMats() to get this A=product->A */
19701cdffd5eSHong Zhang       PetscCall(MatTransposeSetPrecursor(A, atb->At));
19717fb60732SBarry Smith       PetscCall(MatTranspose(A, MAT_REUSE_MATRIX, &atb->At));
19724222ddf1SHong Zhang     }
19737fb60732SBarry Smith     PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ(atb->At ? atb->At : A, B, C));
19744222ddf1SHong Zhang   }
1975df97dc6dSFande Kong   PetscFunctionReturn(0);
1976df97dc6dSFande Kong }
1977df97dc6dSFande Kong 
1978d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C)
1979d71ae5a4SJacob Faibussowitsch {
19804222ddf1SHong Zhang   Mat_Product *product = C->product;
19814222ddf1SHong Zhang   Mat          A = product->A, B = product->B;
19824222ddf1SHong Zhang   PetscReal    fill = product->fill;
1983d013fc79Sandi selinger 
1984d013fc79Sandi selinger   PetscFunctionBegin;
19859566063dSJacob Faibussowitsch   PetscCall(MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A, B, fill, C));
19862869b61bSandi selinger 
19874222ddf1SHong Zhang   C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ;
19884222ddf1SHong Zhang   PetscFunctionReturn(0);
19892869b61bSandi selinger }
1990d013fc79Sandi selinger 
19914222ddf1SHong Zhang /* --------------------------------------------------------------- */
1992d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C)
1993d71ae5a4SJacob Faibussowitsch {
19944222ddf1SHong Zhang   Mat_Product *product = C->product;
19954222ddf1SHong Zhang   PetscInt     alg     = 0; /* default algorithm */
19964222ddf1SHong Zhang   PetscBool    flg     = PETSC_FALSE;
19974222ddf1SHong Zhang #if !defined(PETSC_HAVE_HYPRE)
19984222ddf1SHong Zhang   const char *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"};
19994222ddf1SHong Zhang   PetscInt    nalg        = 7;
20004222ddf1SHong Zhang #else
20014222ddf1SHong Zhang   const char *algTypes[8] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge", "hypre"};
20024222ddf1SHong Zhang   PetscInt    nalg        = 8;
20034222ddf1SHong Zhang #endif
20044222ddf1SHong Zhang 
20054222ddf1SHong Zhang   PetscFunctionBegin;
20064222ddf1SHong Zhang   /* Set default algorithm */
20079566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(C->product->alg, "default", &flg));
200848a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
2009d013fc79Sandi selinger 
20104222ddf1SHong Zhang   /* Get runtime option */
20114222ddf1SHong Zhang   if (product->api_user) {
2012d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMult", "Mat");
20139566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-matmatmult_via", "Algorithmic approach", "MatMatMult", algTypes, nalg, algTypes[0], &alg, &flg));
2014d0609cedSBarry Smith     PetscOptionsEnd();
20154222ddf1SHong Zhang   } else {
2016d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AB", "Mat");
20179566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AB", algTypes, nalg, algTypes[0], &alg, &flg));
2018d0609cedSBarry Smith     PetscOptionsEnd();
2019d013fc79Sandi selinger   }
202048a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
2021d013fc79Sandi selinger 
20224222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
20234222ddf1SHong Zhang   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ;
20244222ddf1SHong Zhang   PetscFunctionReturn(0);
20254222ddf1SHong Zhang }
2026d013fc79Sandi selinger 
2027d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C)
2028d71ae5a4SJacob Faibussowitsch {
20294222ddf1SHong Zhang   Mat_Product *product     = C->product;
20304222ddf1SHong Zhang   PetscInt     alg         = 0; /* default algorithm */
20314222ddf1SHong Zhang   PetscBool    flg         = PETSC_FALSE;
2032089a957eSStefano Zampini   const char  *algTypes[3] = {"default", "at*b", "outerproduct"};
2033089a957eSStefano Zampini   PetscInt     nalg        = 3;
2034d013fc79Sandi selinger 
20354222ddf1SHong Zhang   PetscFunctionBegin;
20364222ddf1SHong Zhang   /* Get runtime option */
20374222ddf1SHong Zhang   if (product->api_user) {
2038d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatTransposeMatMult", "Mat");
20399566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mattransposematmult_via", "Algorithmic approach", "MatTransposeMatMult", algTypes, nalg, algTypes[alg], &alg, &flg));
2040d0609cedSBarry Smith     PetscOptionsEnd();
20414222ddf1SHong Zhang   } else {
2042d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AtB", "Mat");
20439566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AtB", algTypes, nalg, algTypes[alg], &alg, &flg));
2044d0609cedSBarry Smith     PetscOptionsEnd();
20454222ddf1SHong Zhang   }
204648a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
2047d013fc79Sandi selinger 
20484222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ;
20494222ddf1SHong Zhang   PetscFunctionReturn(0);
20504222ddf1SHong Zhang }
20514222ddf1SHong Zhang 
2052d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C)
2053d71ae5a4SJacob Faibussowitsch {
20544222ddf1SHong Zhang   Mat_Product *product     = C->product;
20554222ddf1SHong Zhang   PetscInt     alg         = 0; /* default algorithm */
20564222ddf1SHong Zhang   PetscBool    flg         = PETSC_FALSE;
20574222ddf1SHong Zhang   const char  *algTypes[2] = {"default", "color"};
20584222ddf1SHong Zhang   PetscInt     nalg        = 2;
20594222ddf1SHong Zhang 
20604222ddf1SHong Zhang   PetscFunctionBegin;
20614222ddf1SHong Zhang   /* Set default algorithm */
20629566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(C->product->alg, "default", &flg));
20634222ddf1SHong Zhang   if (!flg) {
20644222ddf1SHong Zhang     alg = 1;
20659566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
20664222ddf1SHong Zhang   }
20674222ddf1SHong Zhang 
20684222ddf1SHong Zhang   /* Get runtime option */
20694222ddf1SHong Zhang   if (product->api_user) {
2070d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatTransposeMult", "Mat");
20719566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-matmattransmult_via", "Algorithmic approach", "MatMatTransposeMult", algTypes, nalg, algTypes[alg], &alg, &flg));
2072d0609cedSBarry Smith     PetscOptionsEnd();
20734222ddf1SHong Zhang   } else {
2074d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABt", "Mat");
20759566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABt", algTypes, nalg, algTypes[alg], &alg, &flg));
2076d0609cedSBarry Smith     PetscOptionsEnd();
20774222ddf1SHong Zhang   }
207848a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
20794222ddf1SHong Zhang 
20804222ddf1SHong Zhang   C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ;
20814222ddf1SHong Zhang   C->ops->productsymbolic          = MatProductSymbolic_ABt;
20824222ddf1SHong Zhang   PetscFunctionReturn(0);
20834222ddf1SHong Zhang }
20844222ddf1SHong Zhang 
2085d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C)
2086d71ae5a4SJacob Faibussowitsch {
20874222ddf1SHong Zhang   Mat_Product *product = C->product;
20884222ddf1SHong Zhang   PetscBool    flg     = PETSC_FALSE;
20894222ddf1SHong Zhang   PetscInt     alg     = 0; /* default algorithm -- alg=1 should be default!!! */
20904222ddf1SHong Zhang #if !defined(PETSC_HAVE_HYPRE)
20914222ddf1SHong Zhang   const char *algTypes[2] = {"scalable", "rap"};
20924222ddf1SHong Zhang   PetscInt    nalg        = 2;
20934222ddf1SHong Zhang #else
20944222ddf1SHong Zhang   const char *algTypes[3] = {"scalable", "rap", "hypre"};
20954222ddf1SHong Zhang   PetscInt    nalg        = 3;
20964222ddf1SHong Zhang #endif
20974222ddf1SHong Zhang 
20984222ddf1SHong Zhang   PetscFunctionBegin;
20994222ddf1SHong Zhang   /* Set default algorithm */
21009566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "default", &flg));
210148a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
21024222ddf1SHong Zhang 
21034222ddf1SHong Zhang   /* Get runtime option */
21044222ddf1SHong Zhang   if (product->api_user) {
2105d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatPtAP", "Mat");
21069566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-matptap_via", "Algorithmic approach", "MatPtAP", algTypes, nalg, algTypes[0], &alg, &flg));
2107d0609cedSBarry Smith     PetscOptionsEnd();
21084222ddf1SHong Zhang   } else {
2109d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_PtAP", "Mat");
21109566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_PtAP", algTypes, nalg, algTypes[0], &alg, &flg));
2111d0609cedSBarry Smith     PetscOptionsEnd();
21124222ddf1SHong Zhang   }
211348a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
21144222ddf1SHong Zhang 
21154222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ;
21164222ddf1SHong Zhang   PetscFunctionReturn(0);
21174222ddf1SHong Zhang }
21184222ddf1SHong Zhang 
2119d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C)
2120d71ae5a4SJacob Faibussowitsch {
21214222ddf1SHong Zhang   Mat_Product *product     = C->product;
21224222ddf1SHong Zhang   PetscBool    flg         = PETSC_FALSE;
21234222ddf1SHong Zhang   PetscInt     alg         = 0; /* default algorithm */
21244222ddf1SHong Zhang   const char  *algTypes[3] = {"r*a*rt", "r*art", "coloring_rart"};
21254222ddf1SHong Zhang   PetscInt     nalg        = 3;
21264222ddf1SHong Zhang 
21274222ddf1SHong Zhang   PetscFunctionBegin;
21284222ddf1SHong Zhang   /* Set default algorithm */
21299566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "default", &flg));
213048a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
21314222ddf1SHong Zhang 
21324222ddf1SHong Zhang   /* Get runtime option */
21334222ddf1SHong Zhang   if (product->api_user) {
2134d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatRARt", "Mat");
21359566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-matrart_via", "Algorithmic approach", "MatRARt", algTypes, nalg, algTypes[0], &alg, &flg));
2136d0609cedSBarry Smith     PetscOptionsEnd();
21374222ddf1SHong Zhang   } else {
2138d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_RARt", "Mat");
21399566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_RARt", algTypes, nalg, algTypes[0], &alg, &flg));
2140d0609cedSBarry Smith     PetscOptionsEnd();
21414222ddf1SHong Zhang   }
214248a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
21434222ddf1SHong Zhang 
21444222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ;
21454222ddf1SHong Zhang   PetscFunctionReturn(0);
21464222ddf1SHong Zhang }
21474222ddf1SHong Zhang 
21484222ddf1SHong Zhang /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */
2149d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C)
2150d71ae5a4SJacob Faibussowitsch {
21514222ddf1SHong Zhang   Mat_Product *product     = C->product;
21524222ddf1SHong Zhang   PetscInt     alg         = 0; /* default algorithm */
21534222ddf1SHong Zhang   PetscBool    flg         = PETSC_FALSE;
21544222ddf1SHong Zhang   const char  *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"};
21554222ddf1SHong Zhang   PetscInt     nalg        = 7;
21564222ddf1SHong Zhang 
21574222ddf1SHong Zhang   PetscFunctionBegin;
21584222ddf1SHong Zhang   /* Set default algorithm */
21599566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "default", &flg));
216048a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
21614222ddf1SHong Zhang 
21624222ddf1SHong Zhang   /* Get runtime option */
21634222ddf1SHong Zhang   if (product->api_user) {
2164d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMatMult", "Mat");
21659566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-matmatmatmult_via", "Algorithmic approach", "MatMatMatMult", algTypes, nalg, algTypes[alg], &alg, &flg));
2166d0609cedSBarry Smith     PetscOptionsEnd();
21674222ddf1SHong Zhang   } else {
2168d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABC", "Mat");
21699566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABC", algTypes, nalg, algTypes[alg], &alg, &flg));
2170d0609cedSBarry Smith     PetscOptionsEnd();
21714222ddf1SHong Zhang   }
217248a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
21734222ddf1SHong Zhang 
21744222ddf1SHong Zhang   C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ;
21754222ddf1SHong Zhang   C->ops->productsymbolic    = MatProductSymbolic_ABC;
21764222ddf1SHong Zhang   PetscFunctionReturn(0);
21774222ddf1SHong Zhang }
21784222ddf1SHong Zhang 
2179d71ae5a4SJacob Faibussowitsch PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C)
2180d71ae5a4SJacob Faibussowitsch {
21814222ddf1SHong Zhang   Mat_Product *product = C->product;
21824222ddf1SHong Zhang 
21834222ddf1SHong Zhang   PetscFunctionBegin;
21844222ddf1SHong Zhang   switch (product->type) {
2185d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_AB:
2186d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_AB(C));
2187d71ae5a4SJacob Faibussowitsch     break;
2188d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_AtB:
2189d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_AtB(C));
2190d71ae5a4SJacob Faibussowitsch     break;
2191d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_ABt:
2192d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_ABt(C));
2193d71ae5a4SJacob Faibussowitsch     break;
2194d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_PtAP:
2195d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_PtAP(C));
2196d71ae5a4SJacob Faibussowitsch     break;
2197d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_RARt:
2198d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_RARt(C));
2199d71ae5a4SJacob Faibussowitsch     break;
2200d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_ABC:
2201d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_ABC(C));
2202d71ae5a4SJacob Faibussowitsch     break;
2203d71ae5a4SJacob Faibussowitsch   default:
2204d71ae5a4SJacob Faibussowitsch     break;
22054222ddf1SHong Zhang   }
2206d013fc79Sandi selinger   PetscFunctionReturn(0);
2207d013fc79Sandi selinger }
2208