xref: /petsc/src/mat/impls/aij/seq/matmatmult.c (revision 48a46eb9bd028bec07ec0f396b1a3abb43f14558)
1be1d678aSKris Buschelman 
2d50806bdSBarry Smith /*
32499ec78SHong Zhang   Defines matrix-matrix product routines for pairs of SeqAIJ matrices
4d50806bdSBarry Smith           C = A * B
5d50806bdSBarry Smith */
6d50806bdSBarry Smith 
7c6db04a5SJed Brown #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
8c6db04a5SJed Brown #include <../src/mat/utils/freespace.h>
9c6db04a5SJed Brown #include <petscbt.h>
10af0996ceSBarry Smith #include <petsc/private/isimpl.h>
1107475bc1SBarry Smith #include <../src/mat/impls/dense/seq/dense.h>
127bab7c10SHong Zhang 
139371c9d4SSatish Balay PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C) {
14df97dc6dSFande Kong   PetscFunctionBegin;
15dbbe0bcdSBarry Smith   if (C->ops->matmultnumeric) PetscCall((*C->ops->matmultnumeric)(A, B, C));
16dbbe0bcdSBarry Smith   else PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A, B, C));
17df97dc6dSFande Kong   PetscFunctionReturn(0);
18df97dc6dSFande Kong }
19df97dc6dSFande Kong 
204222ddf1SHong Zhang /* Modified from MatCreateSeqAIJWithArrays() */
219371c9d4SSatish Balay PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt i[], PetscInt j[], PetscScalar a[], MatType mtype, Mat mat) {
224222ddf1SHong Zhang   PetscInt    ii;
234222ddf1SHong Zhang   Mat_SeqAIJ *aij;
24cbc6b225SStefano Zampini   PetscBool   isseqaij, osingle, ofree_a, ofree_ij;
255c66b693SKris Buschelman 
265c66b693SKris Buschelman   PetscFunctionBegin;
27aed4548fSBarry Smith   PetscCheck(m <= 0 || !i[0], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0");
289566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(mat, m, n, m, n));
294222ddf1SHong Zhang 
30e4e71118SStefano Zampini   if (!mtype) {
319566063dSJacob Faibussowitsch     PetscCall(PetscObjectBaseTypeCompare((PetscObject)mat, MATSEQAIJ, &isseqaij));
329566063dSJacob Faibussowitsch     if (!isseqaij) PetscCall(MatSetType(mat, MATSEQAIJ));
33e4e71118SStefano Zampini   } else {
349566063dSJacob Faibussowitsch     PetscCall(MatSetType(mat, mtype));
35e4e71118SStefano Zampini   }
36cbc6b225SStefano Zampini 
374222ddf1SHong Zhang   aij      = (Mat_SeqAIJ *)(mat)->data;
38cbc6b225SStefano Zampini   osingle  = aij->singlemalloc;
39cbc6b225SStefano Zampini   ofree_a  = aij->free_a;
40cbc6b225SStefano Zampini   ofree_ij = aij->free_ij;
41cbc6b225SStefano Zampini   /* changes the free flags */
429566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(mat, MAT_SKIP_ALLOCATION, NULL));
43cbc6b225SStefano Zampini 
449566063dSJacob Faibussowitsch   PetscCall(PetscFree(aij->ilen));
459566063dSJacob Faibussowitsch   PetscCall(PetscFree(aij->imax));
469566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &aij->imax));
479566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &aij->ilen));
48cbc6b225SStefano Zampini   for (ii = 0, aij->nonzerorowcnt = 0, aij->rmax = 0; ii < m; ii++) {
49cbc6b225SStefano Zampini     const PetscInt rnz = i[ii + 1] - i[ii];
50cbc6b225SStefano Zampini     aij->nonzerorowcnt += !!rnz;
51cbc6b225SStefano Zampini     aij->rmax     = PetscMax(aij->rmax, rnz);
52cbc6b225SStefano Zampini     aij->ilen[ii] = aij->imax[ii] = i[ii + 1] - i[ii];
53cbc6b225SStefano Zampini   }
54cbc6b225SStefano Zampini   aij->maxnz = i[m];
55cbc6b225SStefano Zampini   aij->nz    = i[m];
564222ddf1SHong Zhang 
57cbc6b225SStefano Zampini   if (osingle) {
589566063dSJacob Faibussowitsch     PetscCall(PetscFree3(aij->a, aij->j, aij->i));
59cbc6b225SStefano Zampini   } else {
609566063dSJacob Faibussowitsch     if (ofree_a) PetscCall(PetscFree(aij->a));
619566063dSJacob Faibussowitsch     if (ofree_ij) PetscCall(PetscFree(aij->j));
629566063dSJacob Faibussowitsch     if (ofree_ij) PetscCall(PetscFree(aij->i));
63cbc6b225SStefano Zampini   }
644222ddf1SHong Zhang   aij->i            = i;
654222ddf1SHong Zhang   aij->j            = j;
664222ddf1SHong Zhang   aij->a            = a;
674222ddf1SHong Zhang   aij->nonew        = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */
68cbc6b225SStefano Zampini   /* default to not retain ownership */
69cbc6b225SStefano Zampini   aij->singlemalloc = PETSC_FALSE;
704222ddf1SHong Zhang   aij->free_a       = PETSC_FALSE;
714222ddf1SHong Zhang   aij->free_ij      = PETSC_FALSE;
729566063dSJacob Faibussowitsch   PetscCall(MatCheckCompressedRow(mat, aij->nonzerorowcnt, &aij->compressedrow, aij->i, m, 0.6));
735c66b693SKris Buschelman   PetscFunctionReturn(0);
745c66b693SKris Buschelman }
751c24bd37SHong Zhang 
769371c9d4SSatish Balay PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C) {
774222ddf1SHong Zhang   Mat_Product        *product = C->product;
784222ddf1SHong Zhang   MatProductAlgorithm alg;
794222ddf1SHong Zhang   PetscBool           flg;
804222ddf1SHong Zhang 
814222ddf1SHong Zhang   PetscFunctionBegin;
824222ddf1SHong Zhang   if (product) {
834222ddf1SHong Zhang     alg = product->alg;
844222ddf1SHong Zhang   } else {
854222ddf1SHong Zhang     alg = "sorted";
864222ddf1SHong Zhang   }
874222ddf1SHong Zhang   /* sorted */
889566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "sorted", &flg));
894222ddf1SHong Zhang   if (flg) {
909566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A, B, fill, C));
914222ddf1SHong Zhang     PetscFunctionReturn(0);
924222ddf1SHong Zhang   }
934222ddf1SHong Zhang 
944222ddf1SHong Zhang   /* scalable */
959566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "scalable", &flg));
964222ddf1SHong Zhang   if (flg) {
979566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A, B, fill, C));
984222ddf1SHong Zhang     PetscFunctionReturn(0);
994222ddf1SHong Zhang   }
1004222ddf1SHong Zhang 
1014222ddf1SHong Zhang   /* scalable_fast */
1029566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "scalable_fast", &flg));
1034222ddf1SHong Zhang   if (flg) {
1049566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A, B, fill, C));
1054222ddf1SHong Zhang     PetscFunctionReturn(0);
1064222ddf1SHong Zhang   }
1074222ddf1SHong Zhang 
1084222ddf1SHong Zhang   /* heap */
1099566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "heap", &flg));
1104222ddf1SHong Zhang   if (flg) {
1119566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A, B, fill, C));
1124222ddf1SHong Zhang     PetscFunctionReturn(0);
1134222ddf1SHong Zhang   }
1144222ddf1SHong Zhang 
1154222ddf1SHong Zhang   /* btheap */
1169566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "btheap", &flg));
1174222ddf1SHong Zhang   if (flg) {
1189566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A, B, fill, C));
1194222ddf1SHong Zhang     PetscFunctionReturn(0);
1204222ddf1SHong Zhang   }
1214222ddf1SHong Zhang 
1224222ddf1SHong Zhang   /* llcondensed */
1239566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "llcondensed", &flg));
1244222ddf1SHong Zhang   if (flg) {
1259566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A, B, fill, C));
1264222ddf1SHong Zhang     PetscFunctionReturn(0);
1274222ddf1SHong Zhang   }
1284222ddf1SHong Zhang 
1294222ddf1SHong Zhang   /* rowmerge */
1309566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "rowmerge", &flg));
1314222ddf1SHong Zhang   if (flg) {
1329566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A, B, fill, C));
1334222ddf1SHong Zhang     PetscFunctionReturn(0);
1344222ddf1SHong Zhang   }
1354222ddf1SHong Zhang 
1364222ddf1SHong Zhang #if defined(PETSC_HAVE_HYPRE)
1379566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "hypre", &flg));
1384222ddf1SHong Zhang   if (flg) {
1399566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A, B, fill, C));
1404222ddf1SHong Zhang     PetscFunctionReturn(0);
1414222ddf1SHong Zhang   }
1424222ddf1SHong Zhang #endif
1434222ddf1SHong Zhang 
1444222ddf1SHong Zhang   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported");
1454222ddf1SHong Zhang }
1464222ddf1SHong Zhang 
1479371c9d4SSatish Balay PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A, Mat B, PetscReal fill, Mat C) {
148b561aa9dSHong Zhang   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
149971236abSHong Zhang   PetscInt          *ai = a->i, *bi = b->i, *ci, *cj;
150c1ba5575SJed Brown   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
151b561aa9dSHong Zhang   PetscReal          afill;
152eca6b86aSHong Zhang   PetscInt           i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax;
153eca6b86aSHong Zhang   PetscTable         ta;
154fb908031SHong Zhang   PetscBT            lnkbt;
1550298fd71SBarry Smith   PetscFreeSpaceList free_space = NULL, current_space = NULL;
156b561aa9dSHong Zhang 
157b561aa9dSHong Zhang   PetscFunctionBegin;
158bd958071SHong Zhang   /* Get ci and cj */
159bd958071SHong Zhang   /*---------------*/
160fb908031SHong Zhang   /* Allocate ci array, arrays for fill computation and */
161fb908031SHong Zhang   /* free space for accumulating nonzero column info */
1629566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 2, &ci));
163fb908031SHong Zhang   ci[0] = 0;
164fb908031SHong Zhang 
165fb908031SHong Zhang   /* create and initialize a linked list */
1669566063dSJacob Faibussowitsch   PetscCall(PetscTableCreate(bn, bn, &ta));
167c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b, bm, ta);
1689566063dSJacob Faibussowitsch   PetscCall(PetscTableGetCount(ta, &Crmax));
1699566063dSJacob Faibussowitsch   PetscCall(PetscTableDestroy(&ta));
170eca6b86aSHong Zhang 
1719566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedCreate(Crmax, bn, &lnk, &lnkbt));
172fb908031SHong Zhang 
173fb908031SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
1749566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
1752205254eSKarl Rupp 
176fb908031SHong Zhang   current_space = free_space;
177fb908031SHong Zhang 
178fb908031SHong Zhang   /* Determine ci and cj */
179fb908031SHong Zhang   for (i = 0; i < am; i++) {
180fb908031SHong Zhang     anzi = ai[i + 1] - ai[i];
181fb908031SHong Zhang     aj   = a->j + ai[i];
182fb908031SHong Zhang     for (j = 0; j < anzi; j++) {
183fb908031SHong Zhang       brow = aj[j];
184fb908031SHong Zhang       bnzj = bi[brow + 1] - bi[brow];
185fb908031SHong Zhang       bj   = b->j + bi[brow];
186fb908031SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
1879566063dSJacob Faibussowitsch       PetscCall(PetscLLCondensedAddSorted(bnzj, bj, lnk, lnkbt));
188fb908031SHong Zhang     }
1898c78258cSHong Zhang     /* add possible missing diagonal entry */
190*48a46eb9SPierre Jolivet     if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted(1, &i, lnk, lnkbt));
191fb908031SHong Zhang     cnzi = lnk[0];
192fb908031SHong Zhang 
193fb908031SHong Zhang     /* If free space is not available, make more free space */
194fb908031SHong Zhang     /* Double the amount of total space in the list */
195fb908031SHong Zhang     if (current_space->local_remaining < cnzi) {
1969566063dSJacob Faibussowitsch       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), &current_space));
197fb908031SHong Zhang       ndouble++;
198fb908031SHong Zhang     }
199fb908031SHong Zhang 
200fb908031SHong Zhang     /* Copy data into free space, then initialize lnk */
2019566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedClean(bn, cnzi, current_space->array, lnk, lnkbt));
2022205254eSKarl Rupp 
203fb908031SHong Zhang     current_space->array += cnzi;
204fb908031SHong Zhang     current_space->local_used += cnzi;
205fb908031SHong Zhang     current_space->local_remaining -= cnzi;
2062205254eSKarl Rupp 
207fb908031SHong Zhang     ci[i + 1] = ci[i] + cnzi;
208fb908031SHong Zhang   }
209fb908031SHong Zhang 
210fb908031SHong Zhang   /* Column indices are in the list of free space */
211fb908031SHong Zhang   /* Allocate space for cj, initialize cj, and */
212fb908031SHong Zhang   /* destroy list of free space and other temporary array(s) */
2139566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ci[am] + 1, &cj));
2149566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
2159566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedDestroy(lnk, lnkbt));
216b561aa9dSHong Zhang 
21726be0446SHong Zhang   /* put together the new symbolic matrix */
2189566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
2199566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
22058c24d83SHong Zhang 
22158c24d83SHong Zhang   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
22258c24d83SHong Zhang   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
2234222ddf1SHong Zhang   c          = (Mat_SeqAIJ *)(C->data);
224aa1aec99SHong Zhang   c->free_a  = PETSC_FALSE;
225e6b907acSBarry Smith   c->free_ij = PETSC_TRUE;
22658c24d83SHong Zhang   c->nonew   = 0;
2274222ddf1SHong Zhang 
2284222ddf1SHong Zhang   /* fast, needs non-scalable O(bn) array 'abdense' */
2294222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
2300b7e3e3dSHong Zhang 
2317212da7cSHong Zhang   /* set MatInfo */
2327212da7cSHong Zhang   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
233f2b054eeSHong Zhang   if (afill < 1.0) afill = 1.0;
2344222ddf1SHong Zhang   C->info.mallocs           = ndouble;
2354222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
2364222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
2377212da7cSHong Zhang 
2387212da7cSHong Zhang #if defined(PETSC_USE_INFO)
2397212da7cSHong Zhang   if (ci[am]) {
2409566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
2419566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
242f2b054eeSHong Zhang   } else {
2439566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
244be0fcf8dSHong Zhang   }
245f2b054eeSHong Zhang #endif
24658c24d83SHong Zhang   PetscFunctionReturn(0);
24758c24d83SHong Zhang }
248d50806bdSBarry Smith 
2499371c9d4SSatish Balay PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, Mat C) {
250d13dce4bSSatish Balay   PetscLogDouble     flops = 0.0;
251d50806bdSBarry Smith   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
252d50806bdSBarry Smith   Mat_SeqAIJ        *b     = (Mat_SeqAIJ *)B->data;
253d50806bdSBarry Smith   Mat_SeqAIJ        *c     = (Mat_SeqAIJ *)C->data;
25438baddfdSBarry Smith   PetscInt          *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j;
255c58ca2e3SHong Zhang   PetscInt           am = A->rmap->n, cm = C->rmap->n;
256519eb980SHong Zhang   PetscInt           i, j, k, anzi, bnzi, cnzi, brow;
2572e5835c6SStefano Zampini   PetscScalar       *ca, valtmp;
258aa1aec99SHong Zhang   PetscScalar       *ab_dense;
2596718818eSStefano Zampini   PetscContainer     cab_dense;
2602e5835c6SStefano Zampini   const PetscScalar *aa, *ba, *baj;
261d50806bdSBarry Smith 
262d50806bdSBarry Smith   PetscFunctionBegin;
2639566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJGetArrayRead(A, &aa));
2649566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJGetArrayRead(B, &ba));
265aa1aec99SHong Zhang   if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
2669566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(ci[cm] + 1, &ca));
267aa1aec99SHong Zhang     c->a      = ca;
268aa1aec99SHong Zhang     c->free_a = PETSC_TRUE;
2696718818eSStefano Zampini   } else ca = c->a;
2706718818eSStefano Zampini 
2716718818eSStefano Zampini   /* TODO this should be done in the symbolic phase */
2726718818eSStefano Zampini   /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers
2736718818eSStefano Zampini      that is hard to eradicate) */
2749566063dSJacob Faibussowitsch   PetscCall(PetscObjectQuery((PetscObject)C, "__PETSc__ab_dense", (PetscObject *)&cab_dense));
2756718818eSStefano Zampini   if (!cab_dense) {
2769566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(B->cmap->N, &ab_dense));
2779566063dSJacob Faibussowitsch     PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &cab_dense));
2789566063dSJacob Faibussowitsch     PetscCall(PetscContainerSetPointer(cab_dense, ab_dense));
2799566063dSJacob Faibussowitsch     PetscCall(PetscContainerSetUserDestroy(cab_dense, PetscContainerUserDestroyDefault));
2809566063dSJacob Faibussowitsch     PetscCall(PetscObjectCompose((PetscObject)C, "__PETSc__ab_dense", (PetscObject)cab_dense));
2819566063dSJacob Faibussowitsch     PetscCall(PetscObjectDereference((PetscObject)cab_dense));
282d90d86d0SMatthew G. Knepley   }
2839566063dSJacob Faibussowitsch   PetscCall(PetscContainerGetPointer(cab_dense, (void **)&ab_dense));
2849566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(ab_dense, B->cmap->N));
285aa1aec99SHong Zhang 
286c124e916SHong Zhang   /* clean old values in C */
2879566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(ca, ci[cm]));
288d50806bdSBarry Smith   /* Traverse A row-wise. */
289d50806bdSBarry Smith   /* Build the ith row in C by summing over nonzero columns in A, */
290d50806bdSBarry Smith   /* the rows of B corresponding to nonzeros of A. */
291d50806bdSBarry Smith   for (i = 0; i < am; i++) {
292d50806bdSBarry Smith     anzi = ai[i + 1] - ai[i];
293d50806bdSBarry Smith     for (j = 0; j < anzi; j++) {
294519eb980SHong Zhang       brow   = aj[j];
295d50806bdSBarry Smith       bnzi   = bi[brow + 1] - bi[brow];
296d50806bdSBarry Smith       bjj    = bj + bi[brow];
297d50806bdSBarry Smith       baj    = ba + bi[brow];
298519eb980SHong Zhang       /* perform dense axpy */
29936ec6d2dSHong Zhang       valtmp = aa[j];
3009371c9d4SSatish Balay       for (k = 0; k < bnzi; k++) { ab_dense[bjj[k]] += valtmp * baj[k]; }
301519eb980SHong Zhang       flops += 2 * bnzi;
302519eb980SHong Zhang     }
3039371c9d4SSatish Balay     aj += anzi;
3049371c9d4SSatish Balay     aa += anzi;
305c58ca2e3SHong Zhang 
306c58ca2e3SHong Zhang     cnzi = ci[i + 1] - ci[i];
307519eb980SHong Zhang     for (k = 0; k < cnzi; k++) {
308519eb980SHong Zhang       ca[k] += ab_dense[cj[k]];
309519eb980SHong Zhang       ab_dense[cj[k]] = 0.0; /* zero ab_dense */
310519eb980SHong Zhang     }
311519eb980SHong Zhang     flops += cnzi;
3129371c9d4SSatish Balay     cj += cnzi;
3139371c9d4SSatish Balay     ca += cnzi;
314519eb980SHong Zhang   }
3152e5835c6SStefano Zampini #if defined(PETSC_HAVE_DEVICE)
3162e5835c6SStefano Zampini   if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
3172e5835c6SStefano Zampini #endif
3189566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
3199566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
3209566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(flops));
3219566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJRestoreArrayRead(A, &aa));
3229566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJRestoreArrayRead(B, &ba));
323c58ca2e3SHong Zhang   PetscFunctionReturn(0);
324c58ca2e3SHong Zhang }
325c58ca2e3SHong Zhang 
3269371c9d4SSatish Balay PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, Mat C) {
327c58ca2e3SHong Zhang   PetscLogDouble     flops = 0.0;
328c58ca2e3SHong Zhang   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
329c58ca2e3SHong Zhang   Mat_SeqAIJ        *b     = (Mat_SeqAIJ *)B->data;
330c58ca2e3SHong Zhang   Mat_SeqAIJ        *c     = (Mat_SeqAIJ *)C->data;
331c58ca2e3SHong Zhang   PetscInt          *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j;
332c58ca2e3SHong Zhang   PetscInt           am = A->rmap->N, cm = C->rmap->N;
333c58ca2e3SHong Zhang   PetscInt           i, j, k, anzi, bnzi, cnzi, brow;
3342e5835c6SStefano Zampini   PetscScalar       *ca = c->a, valtmp;
3352e5835c6SStefano Zampini   const PetscScalar *aa, *ba, *baj;
336c58ca2e3SHong Zhang   PetscInt           nextb;
337c58ca2e3SHong Zhang 
338c58ca2e3SHong Zhang   PetscFunctionBegin;
3399566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJGetArrayRead(A, &aa));
3409566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJGetArrayRead(B, &ba));
341cf742fccSHong Zhang   if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
3429566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(ci[cm] + 1, &ca));
343cf742fccSHong Zhang     c->a      = ca;
344cf742fccSHong Zhang     c->free_a = PETSC_TRUE;
345cf742fccSHong Zhang   }
346cf742fccSHong Zhang 
347c58ca2e3SHong Zhang   /* clean old values in C */
3489566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(ca, ci[cm]));
349c58ca2e3SHong Zhang   /* Traverse A row-wise. */
350c58ca2e3SHong Zhang   /* Build the ith row in C by summing over nonzero columns in A, */
351c58ca2e3SHong Zhang   /* the rows of B corresponding to nonzeros of A. */
352519eb980SHong Zhang   for (i = 0; i < am; i++) {
353519eb980SHong Zhang     anzi = ai[i + 1] - ai[i];
354519eb980SHong Zhang     cnzi = ci[i + 1] - ci[i];
355519eb980SHong Zhang     for (j = 0; j < anzi; j++) {
356519eb980SHong Zhang       brow   = aj[j];
357519eb980SHong Zhang       bnzi   = bi[brow + 1] - bi[brow];
358519eb980SHong Zhang       bjj    = bj + bi[brow];
359519eb980SHong Zhang       baj    = ba + bi[brow];
360519eb980SHong Zhang       /* perform sparse axpy */
36136ec6d2dSHong Zhang       valtmp = aa[j];
362c124e916SHong Zhang       nextb  = 0;
363c124e916SHong Zhang       for (k = 0; nextb < bnzi; k++) {
364c124e916SHong Zhang         if (cj[k] == bjj[nextb]) { /* ccol == bcol */
36536ec6d2dSHong Zhang           ca[k] += valtmp * baj[nextb++];
366c124e916SHong Zhang         }
367d50806bdSBarry Smith       }
368d50806bdSBarry Smith       flops += 2 * bnzi;
369d50806bdSBarry Smith     }
3709371c9d4SSatish Balay     aj += anzi;
3719371c9d4SSatish Balay     aa += anzi;
3729371c9d4SSatish Balay     cj += cnzi;
3739371c9d4SSatish Balay     ca += cnzi;
374d50806bdSBarry Smith   }
3752e5835c6SStefano Zampini #if defined(PETSC_HAVE_DEVICE)
3762e5835c6SStefano Zampini   if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
3772e5835c6SStefano Zampini #endif
3789566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
3799566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
3809566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(flops));
3819566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJRestoreArrayRead(A, &aa));
3829566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJRestoreArrayRead(B, &ba));
383d50806bdSBarry Smith   PetscFunctionReturn(0);
384d50806bdSBarry Smith }
385bc011b1eSHong Zhang 
3869371c9d4SSatish Balay PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A, Mat B, PetscReal fill, Mat C) {
38725296bd5SBarry Smith   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
38825296bd5SBarry Smith   PetscInt          *ai = a->i, *bi = b->i, *ci, *cj;
3893c50cad2SHong Zhang   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
39025296bd5SBarry Smith   MatScalar         *ca;
39125296bd5SBarry Smith   PetscReal          afill;
392eca6b86aSHong Zhang   PetscInt           i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax;
393eca6b86aSHong Zhang   PetscTable         ta;
3940298fd71SBarry Smith   PetscFreeSpaceList free_space = NULL, current_space = NULL;
39525296bd5SBarry Smith 
39625296bd5SBarry Smith   PetscFunctionBegin;
3973c50cad2SHong Zhang   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
3983c50cad2SHong Zhang   /*-----------------------------------------------------------------------------------------*/
3993c50cad2SHong Zhang   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
4009566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 2, &ci));
4013c50cad2SHong Zhang   ci[0] = 0;
4023c50cad2SHong Zhang 
4033c50cad2SHong Zhang   /* create and initialize a linked list */
4049566063dSJacob Faibussowitsch   PetscCall(PetscTableCreate(bn, bn, &ta));
405c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b, bm, ta);
4069566063dSJacob Faibussowitsch   PetscCall(PetscTableGetCount(ta, &Crmax));
4079566063dSJacob Faibussowitsch   PetscCall(PetscTableDestroy(&ta));
408eca6b86aSHong Zhang 
4099566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedCreate_fast(Crmax, &lnk));
4103c50cad2SHong Zhang 
4113c50cad2SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
4129566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
4133c50cad2SHong Zhang   current_space = free_space;
4143c50cad2SHong Zhang 
4153c50cad2SHong Zhang   /* Determine ci and cj */
4163c50cad2SHong Zhang   for (i = 0; i < am; i++) {
4173c50cad2SHong Zhang     anzi = ai[i + 1] - ai[i];
4183c50cad2SHong Zhang     aj   = a->j + ai[i];
4193c50cad2SHong Zhang     for (j = 0; j < anzi; j++) {
4203c50cad2SHong Zhang       brow = aj[j];
4213c50cad2SHong Zhang       bnzj = bi[brow + 1] - bi[brow];
4223c50cad2SHong Zhang       bj   = b->j + bi[brow];
4233c50cad2SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
4249566063dSJacob Faibussowitsch       PetscCall(PetscLLCondensedAddSorted_fast(bnzj, bj, lnk));
4253c50cad2SHong Zhang     }
4268c78258cSHong Zhang     /* add possible missing diagonal entry */
427*48a46eb9SPierre Jolivet     if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_fast(1, &i, lnk));
4283c50cad2SHong Zhang     cnzi = lnk[1];
4293c50cad2SHong Zhang 
4303c50cad2SHong Zhang     /* If free space is not available, make more free space */
4313c50cad2SHong Zhang     /* Double the amount of total space in the list */
4323c50cad2SHong Zhang     if (current_space->local_remaining < cnzi) {
4339566063dSJacob Faibussowitsch       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), &current_space));
4343c50cad2SHong Zhang       ndouble++;
4353c50cad2SHong Zhang     }
4363c50cad2SHong Zhang 
4373c50cad2SHong Zhang     /* Copy data into free space, then initialize lnk */
4389566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedClean_fast(cnzi, current_space->array, lnk));
4392205254eSKarl Rupp 
4403c50cad2SHong Zhang     current_space->array += cnzi;
4413c50cad2SHong Zhang     current_space->local_used += cnzi;
4423c50cad2SHong Zhang     current_space->local_remaining -= cnzi;
4432205254eSKarl Rupp 
4443c50cad2SHong Zhang     ci[i + 1] = ci[i] + cnzi;
4453c50cad2SHong Zhang   }
4463c50cad2SHong Zhang 
4473c50cad2SHong Zhang   /* Column indices are in the list of free space */
4483c50cad2SHong Zhang   /* Allocate space for cj, initialize cj, and */
4493c50cad2SHong Zhang   /* destroy list of free space and other temporary array(s) */
4509566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ci[am] + 1, &cj));
4519566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
4529566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedDestroy_fast(lnk));
45325296bd5SBarry Smith 
45425296bd5SBarry Smith   /* Allocate space for ca */
4559566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(ci[am] + 1, &ca));
45625296bd5SBarry Smith 
45725296bd5SBarry Smith   /* put together the new symbolic matrix */
4589566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C));
4599566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
46025296bd5SBarry Smith 
46125296bd5SBarry Smith   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
46225296bd5SBarry Smith   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
4634222ddf1SHong Zhang   c          = (Mat_SeqAIJ *)(C->data);
46425296bd5SBarry Smith   c->free_a  = PETSC_TRUE;
46525296bd5SBarry Smith   c->free_ij = PETSC_TRUE;
46625296bd5SBarry Smith   c->nonew   = 0;
4672205254eSKarl Rupp 
4684222ddf1SHong Zhang   /* slower, less memory */
4694222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
47025296bd5SBarry Smith 
47125296bd5SBarry Smith   /* set MatInfo */
47225296bd5SBarry Smith   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
47325296bd5SBarry Smith   if (afill < 1.0) afill = 1.0;
4744222ddf1SHong Zhang   C->info.mallocs           = ndouble;
4754222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
4764222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
47725296bd5SBarry Smith 
47825296bd5SBarry Smith #if defined(PETSC_USE_INFO)
47925296bd5SBarry Smith   if (ci[am]) {
4809566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
4819566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
48225296bd5SBarry Smith   } else {
4839566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
48425296bd5SBarry Smith   }
48525296bd5SBarry Smith #endif
48625296bd5SBarry Smith   PetscFunctionReturn(0);
48725296bd5SBarry Smith }
48825296bd5SBarry Smith 
4899371c9d4SSatish Balay PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, PetscReal fill, Mat C) {
490e9e4536cSHong Zhang   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
491bf9555e6SHong Zhang   PetscInt          *ai = a->i, *bi = b->i, *ci, *cj;
49225c41797SHong Zhang   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
493e9e4536cSHong Zhang   MatScalar         *ca;
494e9e4536cSHong Zhang   PetscReal          afill;
495eca6b86aSHong Zhang   PetscInt           i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax;
496eca6b86aSHong Zhang   PetscTable         ta;
4970298fd71SBarry Smith   PetscFreeSpaceList free_space = NULL, current_space = NULL;
498e9e4536cSHong Zhang 
499e9e4536cSHong Zhang   PetscFunctionBegin;
500bd958071SHong Zhang   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
501bd958071SHong Zhang   /*---------------------------------------------------------------------------------------------*/
502bd958071SHong Zhang   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
5039566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 2, &ci));
504bd958071SHong Zhang   ci[0] = 0;
505bd958071SHong Zhang 
506bd958071SHong Zhang   /* create and initialize a linked list */
5079566063dSJacob Faibussowitsch   PetscCall(PetscTableCreate(bn, bn, &ta));
508c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b, bm, ta);
5099566063dSJacob Faibussowitsch   PetscCall(PetscTableGetCount(ta, &Crmax));
5109566063dSJacob Faibussowitsch   PetscCall(PetscTableDestroy(&ta));
5119566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk));
512bd958071SHong Zhang 
513bd958071SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
5149566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
515bd958071SHong Zhang   current_space = free_space;
516bd958071SHong Zhang 
517bd958071SHong Zhang   /* Determine ci and cj */
518bd958071SHong Zhang   for (i = 0; i < am; i++) {
519bd958071SHong Zhang     anzi = ai[i + 1] - ai[i];
520bd958071SHong Zhang     aj   = a->j + ai[i];
521bd958071SHong Zhang     for (j = 0; j < anzi; j++) {
522bd958071SHong Zhang       brow = aj[j];
523bd958071SHong Zhang       bnzj = bi[brow + 1] - bi[brow];
524bd958071SHong Zhang       bj   = b->j + bi[brow];
525bd958071SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
5269566063dSJacob Faibussowitsch       PetscCall(PetscLLCondensedAddSorted_Scalable(bnzj, bj, lnk));
527bd958071SHong Zhang     }
5288c78258cSHong Zhang     /* add possible missing diagonal entry */
529*48a46eb9SPierre Jolivet     if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_Scalable(1, &i, lnk));
5308c78258cSHong Zhang 
531bd958071SHong Zhang     cnzi = lnk[0];
532bd958071SHong Zhang 
533bd958071SHong Zhang     /* If free space is not available, make more free space */
534bd958071SHong Zhang     /* Double the amount of total space in the list */
535bd958071SHong Zhang     if (current_space->local_remaining < cnzi) {
5369566063dSJacob Faibussowitsch       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), &current_space));
537bd958071SHong Zhang       ndouble++;
538bd958071SHong Zhang     }
539bd958071SHong Zhang 
540bd958071SHong Zhang     /* Copy data into free space, then initialize lnk */
5419566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedClean_Scalable(cnzi, current_space->array, lnk));
5422205254eSKarl Rupp 
543bd958071SHong Zhang     current_space->array += cnzi;
544bd958071SHong Zhang     current_space->local_used += cnzi;
545bd958071SHong Zhang     current_space->local_remaining -= cnzi;
5462205254eSKarl Rupp 
547bd958071SHong Zhang     ci[i + 1] = ci[i] + cnzi;
548bd958071SHong Zhang   }
549bd958071SHong Zhang 
550bd958071SHong Zhang   /* Column indices are in the list of free space */
551bd958071SHong Zhang   /* Allocate space for cj, initialize cj, and */
552bd958071SHong Zhang   /* destroy list of free space and other temporary array(s) */
5539566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ci[am] + 1, &cj));
5549566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
5559566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
556e9e4536cSHong Zhang 
557e9e4536cSHong Zhang   /* Allocate space for ca */
558bd958071SHong Zhang   /*-----------------------*/
5599566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(ci[am] + 1, &ca));
560e9e4536cSHong Zhang 
561e9e4536cSHong Zhang   /* put together the new symbolic matrix */
5629566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C));
5639566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
564e9e4536cSHong Zhang 
565e9e4536cSHong Zhang   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
566e9e4536cSHong Zhang   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
5674222ddf1SHong Zhang   c          = (Mat_SeqAIJ *)(C->data);
568e9e4536cSHong Zhang   c->free_a  = PETSC_TRUE;
569e9e4536cSHong Zhang   c->free_ij = PETSC_TRUE;
570e9e4536cSHong Zhang   c->nonew   = 0;
5712205254eSKarl Rupp 
5724222ddf1SHong Zhang   /* slower, less memory */
5734222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
574e9e4536cSHong Zhang 
575e9e4536cSHong Zhang   /* set MatInfo */
576e9e4536cSHong Zhang   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
577e9e4536cSHong Zhang   if (afill < 1.0) afill = 1.0;
5784222ddf1SHong Zhang   C->info.mallocs           = ndouble;
5794222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
5804222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
581e9e4536cSHong Zhang 
582e9e4536cSHong Zhang #if defined(PETSC_USE_INFO)
583e9e4536cSHong Zhang   if (ci[am]) {
5849566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
5859566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
586e9e4536cSHong Zhang   } else {
5879566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
588e9e4536cSHong Zhang   }
589e9e4536cSHong Zhang #endif
590e9e4536cSHong Zhang   PetscFunctionReturn(0);
591e9e4536cSHong Zhang }
592e9e4536cSHong Zhang 
5939371c9d4SSatish Balay PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A, Mat B, PetscReal fill, Mat C) {
5940ced3a2bSJed Brown   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
5950ced3a2bSJed Brown   const PetscInt    *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j;
5960ced3a2bSJed Brown   PetscInt          *ci, *cj, *bb;
5970ced3a2bSJed Brown   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
5980ced3a2bSJed Brown   PetscReal          afill;
5990ced3a2bSJed Brown   PetscInt           i, j, col, ndouble = 0;
6000298fd71SBarry Smith   PetscFreeSpaceList free_space = NULL, current_space = NULL;
6010ced3a2bSJed Brown   PetscHeap          h;
6020ced3a2bSJed Brown 
6030ced3a2bSJed Brown   PetscFunctionBegin;
604cd093f1dSJed Brown   /* Get ci and cj - by merging sorted rows using a heap */
6050ced3a2bSJed Brown   /*---------------------------------------------------------------------------------------------*/
6060ced3a2bSJed Brown   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
6079566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 2, &ci));
6080ced3a2bSJed Brown   ci[0] = 0;
6090ced3a2bSJed Brown 
6100ced3a2bSJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
6119566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
6120ced3a2bSJed Brown   current_space = free_space;
6130ced3a2bSJed Brown 
6149566063dSJacob Faibussowitsch   PetscCall(PetscHeapCreate(a->rmax, &h));
6159566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(a->rmax, &bb));
6160ced3a2bSJed Brown 
6170ced3a2bSJed Brown   /* Determine ci and cj */
6180ced3a2bSJed Brown   for (i = 0; i < am; i++) {
6190ced3a2bSJed Brown     const PetscInt  anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
6200ced3a2bSJed Brown     const PetscInt *acol = aj + ai[i];        /* column indices of nonzero entries in this row */
6210ced3a2bSJed Brown     ci[i + 1]            = ci[i];
6220ced3a2bSJed Brown     /* Populate the min heap */
6230ced3a2bSJed Brown     for (j = 0; j < anzi; j++) {
6240ced3a2bSJed Brown       bb[j] = bi[acol[j]];           /* bb points at the start of the row */
6250ced3a2bSJed Brown       if (bb[j] < bi[acol[j] + 1]) { /* Add if row is nonempty */
6269566063dSJacob Faibussowitsch         PetscCall(PetscHeapAdd(h, j, bj[bb[j]++]));
6270ced3a2bSJed Brown       }
6280ced3a2bSJed Brown     }
6290ced3a2bSJed Brown     /* Pick off the min element, adding it to free space */
6309566063dSJacob Faibussowitsch     PetscCall(PetscHeapPop(h, &j, &col));
6310ced3a2bSJed Brown     while (j >= 0) {
6320ced3a2bSJed Brown       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
6339566063dSJacob Faibussowitsch         PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), &current_space));
6340ced3a2bSJed Brown         ndouble++;
6350ced3a2bSJed Brown       }
6360ced3a2bSJed Brown       *(current_space->array++) = col;
6370ced3a2bSJed Brown       current_space->local_used++;
6380ced3a2bSJed Brown       current_space->local_remaining--;
6390ced3a2bSJed Brown       ci[i + 1]++;
6400ced3a2bSJed Brown 
6410ced3a2bSJed Brown       /* stash if anything else remains in this row of B */
6429566063dSJacob Faibussowitsch       if (bb[j] < bi[acol[j] + 1]) PetscCall(PetscHeapStash(h, j, bj[bb[j]++]));
6430ced3a2bSJed Brown       while (1) { /* pop and stash any other rows of B that also had an entry in this column */
6440ced3a2bSJed Brown         PetscInt j2, col2;
6459566063dSJacob Faibussowitsch         PetscCall(PetscHeapPeek(h, &j2, &col2));
6460ced3a2bSJed Brown         if (col2 != col) break;
6479566063dSJacob Faibussowitsch         PetscCall(PetscHeapPop(h, &j2, &col2));
6489566063dSJacob Faibussowitsch         if (bb[j2] < bi[acol[j2] + 1]) PetscCall(PetscHeapStash(h, j2, bj[bb[j2]++]));
6490ced3a2bSJed Brown       }
6500ced3a2bSJed Brown       /* Put any stashed elements back into the min heap */
6519566063dSJacob Faibussowitsch       PetscCall(PetscHeapUnstash(h));
6529566063dSJacob Faibussowitsch       PetscCall(PetscHeapPop(h, &j, &col));
6530ced3a2bSJed Brown     }
6540ced3a2bSJed Brown   }
6559566063dSJacob Faibussowitsch   PetscCall(PetscFree(bb));
6569566063dSJacob Faibussowitsch   PetscCall(PetscHeapDestroy(&h));
6570ced3a2bSJed Brown 
6580ced3a2bSJed Brown   /* Column indices are in the list of free space */
6590ced3a2bSJed Brown   /* Allocate space for cj, initialize cj, and */
6600ced3a2bSJed Brown   /* destroy list of free space and other temporary array(s) */
6619566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ci[am], &cj));
6629566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
6630ced3a2bSJed Brown 
6640ced3a2bSJed Brown   /* put together the new symbolic matrix */
6659566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
6669566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
6670ced3a2bSJed Brown 
6680ced3a2bSJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
6690ced3a2bSJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
6704222ddf1SHong Zhang   c          = (Mat_SeqAIJ *)(C->data);
6710ced3a2bSJed Brown   c->free_a  = PETSC_TRUE;
6720ced3a2bSJed Brown   c->free_ij = PETSC_TRUE;
6730ced3a2bSJed Brown   c->nonew   = 0;
67426fbe8dcSKarl Rupp 
6754222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
6760ced3a2bSJed Brown 
6770ced3a2bSJed Brown   /* set MatInfo */
6780ced3a2bSJed Brown   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
6790ced3a2bSJed Brown   if (afill < 1.0) afill = 1.0;
6804222ddf1SHong Zhang   C->info.mallocs           = ndouble;
6814222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
6824222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
6830ced3a2bSJed Brown 
6840ced3a2bSJed Brown #if defined(PETSC_USE_INFO)
6850ced3a2bSJed Brown   if (ci[am]) {
6869566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
6879566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
6880ced3a2bSJed Brown   } else {
6899566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
6900ced3a2bSJed Brown   }
6910ced3a2bSJed Brown #endif
6920ced3a2bSJed Brown   PetscFunctionReturn(0);
6930ced3a2bSJed Brown }
694e9e4536cSHong Zhang 
6959371c9d4SSatish Balay PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A, Mat B, PetscReal fill, Mat C) {
6968a07c6f1SJed Brown   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
6978a07c6f1SJed Brown   const PetscInt    *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j;
6988a07c6f1SJed Brown   PetscInt          *ci, *cj, *bb;
6998a07c6f1SJed Brown   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
7008a07c6f1SJed Brown   PetscReal          afill;
7018a07c6f1SJed Brown   PetscInt           i, j, col, ndouble = 0;
7020298fd71SBarry Smith   PetscFreeSpaceList free_space = NULL, current_space = NULL;
7038a07c6f1SJed Brown   PetscHeap          h;
7048a07c6f1SJed Brown   PetscBT            bt;
7058a07c6f1SJed Brown 
7068a07c6f1SJed Brown   PetscFunctionBegin;
707cd093f1dSJed Brown   /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */
7088a07c6f1SJed Brown   /*---------------------------------------------------------------------------------------------*/
7098a07c6f1SJed Brown   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
7109566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 2, &ci));
7118a07c6f1SJed Brown   ci[0] = 0;
7128a07c6f1SJed Brown 
7138a07c6f1SJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
7149566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
7152205254eSKarl Rupp 
7168a07c6f1SJed Brown   current_space = free_space;
7178a07c6f1SJed Brown 
7189566063dSJacob Faibussowitsch   PetscCall(PetscHeapCreate(a->rmax, &h));
7199566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(a->rmax, &bb));
7209566063dSJacob Faibussowitsch   PetscCall(PetscBTCreate(bn, &bt));
7218a07c6f1SJed Brown 
7228a07c6f1SJed Brown   /* Determine ci and cj */
7238a07c6f1SJed Brown   for (i = 0; i < am; i++) {
7248a07c6f1SJed Brown     const PetscInt  anzi = ai[i + 1] - ai[i];    /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
7258a07c6f1SJed Brown     const PetscInt *acol = aj + ai[i];           /* column indices of nonzero entries in this row */
7268a07c6f1SJed Brown     const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */
7278a07c6f1SJed Brown     ci[i + 1]            = ci[i];
7288a07c6f1SJed Brown     /* Populate the min heap */
7298a07c6f1SJed Brown     for (j = 0; j < anzi; j++) {
7308a07c6f1SJed Brown       PetscInt brow = acol[j];
7318a07c6f1SJed Brown       for (bb[j] = bi[brow]; bb[j] < bi[brow + 1]; bb[j]++) {
7328a07c6f1SJed Brown         PetscInt bcol = bj[bb[j]];
7338a07c6f1SJed Brown         if (!PetscBTLookupSet(bt, bcol)) { /* new entry */
7349566063dSJacob Faibussowitsch           PetscCall(PetscHeapAdd(h, j, bcol));
7358a07c6f1SJed Brown           bb[j]++;
7368a07c6f1SJed Brown           break;
7378a07c6f1SJed Brown         }
7388a07c6f1SJed Brown       }
7398a07c6f1SJed Brown     }
7408a07c6f1SJed Brown     /* Pick off the min element, adding it to free space */
7419566063dSJacob Faibussowitsch     PetscCall(PetscHeapPop(h, &j, &col));
7428a07c6f1SJed Brown     while (j >= 0) {
7438a07c6f1SJed Brown       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
7440298fd71SBarry Smith         fptr = NULL;                            /* need PetscBTMemzero */
7459566063dSJacob Faibussowitsch         PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), &current_space));
7468a07c6f1SJed Brown         ndouble++;
7478a07c6f1SJed Brown       }
7488a07c6f1SJed Brown       *(current_space->array++) = col;
7498a07c6f1SJed Brown       current_space->local_used++;
7508a07c6f1SJed Brown       current_space->local_remaining--;
7518a07c6f1SJed Brown       ci[i + 1]++;
7528a07c6f1SJed Brown 
7538a07c6f1SJed Brown       /* stash if anything else remains in this row of B */
7548a07c6f1SJed Brown       for (; bb[j] < bi[acol[j] + 1]; bb[j]++) {
7558a07c6f1SJed Brown         PetscInt bcol = bj[bb[j]];
7568a07c6f1SJed Brown         if (!PetscBTLookupSet(bt, bcol)) { /* new entry */
7579566063dSJacob Faibussowitsch           PetscCall(PetscHeapAdd(h, j, bcol));
7588a07c6f1SJed Brown           bb[j]++;
7598a07c6f1SJed Brown           break;
7608a07c6f1SJed Brown         }
7618a07c6f1SJed Brown       }
7629566063dSJacob Faibussowitsch       PetscCall(PetscHeapPop(h, &j, &col));
7638a07c6f1SJed Brown     }
7648a07c6f1SJed Brown     if (fptr) { /* Clear the bits for this row */
7659566063dSJacob Faibussowitsch       for (; fptr < current_space->array; fptr++) PetscCall(PetscBTClear(bt, *fptr));
7668a07c6f1SJed Brown     } else { /* We reallocated so we don't remember (easily) how to clear only the bits we changed */
7679566063dSJacob Faibussowitsch       PetscCall(PetscBTMemzero(bn, bt));
7688a07c6f1SJed Brown     }
7698a07c6f1SJed Brown   }
7709566063dSJacob Faibussowitsch   PetscCall(PetscFree(bb));
7719566063dSJacob Faibussowitsch   PetscCall(PetscHeapDestroy(&h));
7729566063dSJacob Faibussowitsch   PetscCall(PetscBTDestroy(&bt));
7738a07c6f1SJed Brown 
7748a07c6f1SJed Brown   /* Column indices are in the list of free space */
7758a07c6f1SJed Brown   /* Allocate space for cj, initialize cj, and */
7768a07c6f1SJed Brown   /* destroy list of free space and other temporary array(s) */
7779566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ci[am], &cj));
7789566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
7798a07c6f1SJed Brown 
7808a07c6f1SJed Brown   /* put together the new symbolic matrix */
7819566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
7829566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
7838a07c6f1SJed Brown 
7848a07c6f1SJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
7858a07c6f1SJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
7864222ddf1SHong Zhang   c          = (Mat_SeqAIJ *)(C->data);
7878a07c6f1SJed Brown   c->free_a  = PETSC_TRUE;
7888a07c6f1SJed Brown   c->free_ij = PETSC_TRUE;
7898a07c6f1SJed Brown   c->nonew   = 0;
79026fbe8dcSKarl Rupp 
7914222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
7928a07c6f1SJed Brown 
7938a07c6f1SJed Brown   /* set MatInfo */
7948a07c6f1SJed Brown   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
7958a07c6f1SJed Brown   if (afill < 1.0) afill = 1.0;
7964222ddf1SHong Zhang   C->info.mallocs           = ndouble;
7974222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
7984222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
7998a07c6f1SJed Brown 
8008a07c6f1SJed Brown #if defined(PETSC_USE_INFO)
8018a07c6f1SJed Brown   if (ci[am]) {
8029566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
8039566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
8048a07c6f1SJed Brown   } else {
8059566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
8068a07c6f1SJed Brown   }
8078a07c6f1SJed Brown #endif
8088a07c6f1SJed Brown   PetscFunctionReturn(0);
8098a07c6f1SJed Brown }
8108a07c6f1SJed Brown 
8119371c9d4SSatish Balay PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A, Mat B, PetscReal fill, Mat C) {
812d7ed1a4dSandi selinger   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
813d7ed1a4dSandi selinger   const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j, *inputi, *inputj, *inputcol, *inputcol_L1;
814d7ed1a4dSandi selinger   PetscInt       *ci, *cj, *outputj, worki_L1[9], worki_L2[9];
815d7ed1a4dSandi selinger   PetscInt        c_maxmem, a_maxrownnz = 0, a_rownnz;
816d7ed1a4dSandi selinger   const PetscInt  workcol[8] = {0, 1, 2, 3, 4, 5, 6, 7};
817d7ed1a4dSandi selinger   const PetscInt  am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
818d7ed1a4dSandi selinger   const PetscInt *brow_ptr[8], *brow_end[8];
819d7ed1a4dSandi selinger   PetscInt        window[8];
820d7ed1a4dSandi selinger   PetscInt        window_min, old_window_min, ci_nnz, outputi_nnz = 0, L1_nrows, L2_nrows;
821d7ed1a4dSandi selinger   PetscInt        i, k, ndouble = 0, L1_rowsleft, rowsleft;
822d7ed1a4dSandi selinger   PetscReal       afill;
823f83700f2Sandi selinger   PetscInt       *workj_L1, *workj_L2, *workj_L3;
8247660c4dbSandi selinger   PetscInt        L1_nnz, L2_nnz;
825d7ed1a4dSandi selinger 
826d7ed1a4dSandi selinger   /* Step 1: Get upper bound on memory required for allocation.
827d7ed1a4dSandi selinger              Because of the way virtual memory works,
828d7ed1a4dSandi selinger              only the memory pages that are actually needed will be physically allocated. */
829d7ed1a4dSandi selinger   PetscFunctionBegin;
8309566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 1, &ci));
831d7ed1a4dSandi selinger   for (i = 0; i < am; i++) {
832d7ed1a4dSandi selinger     const PetscInt  anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
833d7ed1a4dSandi selinger     const PetscInt *acol = aj + ai[i];        /* column indices of nonzero entries in this row */
834d7ed1a4dSandi selinger     a_rownnz             = 0;
835d7ed1a4dSandi selinger     for (k = 0; k < anzi; ++k) {
836d7ed1a4dSandi selinger       a_rownnz += bi[acol[k] + 1] - bi[acol[k]];
837d7ed1a4dSandi selinger       if (a_rownnz > bn) {
838d7ed1a4dSandi selinger         a_rownnz = bn;
839d7ed1a4dSandi selinger         break;
840d7ed1a4dSandi selinger       }
841d7ed1a4dSandi selinger     }
842d7ed1a4dSandi selinger     a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz);
843d7ed1a4dSandi selinger   }
844d7ed1a4dSandi selinger   /* temporary work areas for merging rows */
8459566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L1));
8469566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L2));
8479566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(a_maxrownnz, &workj_L3));
848d7ed1a4dSandi selinger 
8497660c4dbSandi selinger   /* This should be enough for almost all matrices. If not, memory is reallocated later. */
8507660c4dbSandi selinger   c_maxmem = 8 * (ai[am] + bi[bm]);
851d7ed1a4dSandi selinger   /* Step 2: Populate pattern for C */
8529566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(c_maxmem, &cj));
853d7ed1a4dSandi selinger 
854d7ed1a4dSandi selinger   ci_nnz      = 0;
855d7ed1a4dSandi selinger   ci[0]       = 0;
856d7ed1a4dSandi selinger   worki_L1[0] = 0;
857d7ed1a4dSandi selinger   worki_L2[0] = 0;
858d7ed1a4dSandi selinger   for (i = 0; i < am; i++) {
859d7ed1a4dSandi selinger     const PetscInt  anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
860d7ed1a4dSandi selinger     const PetscInt *acol = aj + ai[i];        /* column indices of nonzero entries in this row */
861d7ed1a4dSandi selinger     rowsleft             = anzi;
862d7ed1a4dSandi selinger     inputcol_L1          = acol;
863d7ed1a4dSandi selinger     L2_nnz               = 0;
8647660c4dbSandi selinger     L2_nrows             = 1; /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1   */
8657660c4dbSandi selinger     worki_L2[1]          = 0;
86608fe1b3cSKarl Rupp     outputi_nnz          = 0;
867d7ed1a4dSandi selinger 
868d7ed1a4dSandi selinger     /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem  -> allocate more memory */
869d7ed1a4dSandi selinger     while (ci_nnz + a_maxrownnz > c_maxmem) {
870d7ed1a4dSandi selinger       c_maxmem *= 2;
871d7ed1a4dSandi selinger       ndouble++;
8729566063dSJacob Faibussowitsch       PetscCall(PetscRealloc(sizeof(PetscInt) * c_maxmem, &cj));
873d7ed1a4dSandi selinger     }
874d7ed1a4dSandi selinger 
875d7ed1a4dSandi selinger     while (rowsleft) {
876d7ed1a4dSandi selinger       L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */
877d7ed1a4dSandi selinger       L1_nrows    = 0;
8787660c4dbSandi selinger       L1_nnz      = 0;
879d7ed1a4dSandi selinger       inputcol    = inputcol_L1;
880d7ed1a4dSandi selinger       inputi      = bi;
881d7ed1a4dSandi selinger       inputj      = bj;
882d7ed1a4dSandi selinger 
883d7ed1a4dSandi selinger       /* The following macro is used to specialize for small rows in A.
884d7ed1a4dSandi selinger          This helps with compiler unrolling, improving performance substantially.
885f83700f2Sandi selinger           Input:  inputj   inputi  inputcol  bn
886d7ed1a4dSandi selinger           Output: outputj  outputi_nnz                       */
887d7ed1a4dSandi selinger #define MatMatMultSymbolic_RowMergeMacro(ANNZ) \
888d7ed1a4dSandi selinger   window_min  = bn; \
8897660c4dbSandi selinger   outputi_nnz = 0; \
8907660c4dbSandi selinger   for (k = 0; k < ANNZ; ++k) { \
891d7ed1a4dSandi selinger     brow_ptr[k] = inputj + inputi[inputcol[k]]; \
892d7ed1a4dSandi selinger     brow_end[k] = inputj + inputi[inputcol[k] + 1]; \
893d7ed1a4dSandi selinger     window[k]   = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
894d7ed1a4dSandi selinger     window_min  = PetscMin(window[k], window_min); \
895d7ed1a4dSandi selinger   } \
896d7ed1a4dSandi selinger   while (window_min < bn) { \
897d7ed1a4dSandi selinger     outputj[outputi_nnz++] = window_min; \
898d7ed1a4dSandi selinger     /* advance front and compute new minimum */ \
899d7ed1a4dSandi selinger     old_window_min         = window_min; \
900d7ed1a4dSandi selinger     window_min             = bn; \
901d7ed1a4dSandi selinger     for (k = 0; k < ANNZ; ++k) { \
902d7ed1a4dSandi selinger       if (window[k] == old_window_min) { \
903d7ed1a4dSandi selinger         brow_ptr[k]++; \
904d7ed1a4dSandi selinger         window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
905d7ed1a4dSandi selinger       } \
906d7ed1a4dSandi selinger       window_min = PetscMin(window[k], window_min); \
907d7ed1a4dSandi selinger     } \
908d7ed1a4dSandi selinger   }
909d7ed1a4dSandi selinger 
910d7ed1a4dSandi selinger       /************** L E V E L  1 ***************/
911d7ed1a4dSandi selinger       /* Merge up to 8 rows of B to L1 work array*/
912d7ed1a4dSandi selinger       while (L1_rowsleft) {
9137660c4dbSandi selinger         outputi_nnz = 0;
9147660c4dbSandi selinger         if (anzi > 8) outputj = workj_L1 + L1_nnz; /* Level 1 rowmerge*/
9157660c4dbSandi selinger         else outputj = cj + ci_nnz;                /* Merge directly to C */
9167660c4dbSandi selinger 
917d7ed1a4dSandi selinger         switch (L1_rowsleft) {
9189371c9d4SSatish Balay         case 1:
9199371c9d4SSatish Balay           brow_ptr[0] = inputj + inputi[inputcol[0]];
920d7ed1a4dSandi selinger           brow_end[0] = inputj + inputi[inputcol[0] + 1];
921d7ed1a4dSandi selinger           for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
922d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
923d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
924d7ed1a4dSandi selinger           L1_rowsleft = 0;
925d7ed1a4dSandi selinger           break;
9269371c9d4SSatish Balay         case 2:
9279371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(2);
928d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
929d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
930d7ed1a4dSandi selinger           L1_rowsleft = 0;
931d7ed1a4dSandi selinger           break;
9329371c9d4SSatish Balay         case 3:
9339371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(3);
934d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
935d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
936d7ed1a4dSandi selinger           L1_rowsleft = 0;
937d7ed1a4dSandi selinger           break;
9389371c9d4SSatish Balay         case 4:
9399371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(4);
940d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
941d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
942d7ed1a4dSandi selinger           L1_rowsleft = 0;
943d7ed1a4dSandi selinger           break;
9449371c9d4SSatish Balay         case 5:
9459371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(5);
946d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
947d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
948d7ed1a4dSandi selinger           L1_rowsleft = 0;
949d7ed1a4dSandi selinger           break;
9509371c9d4SSatish Balay         case 6:
9519371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(6);
952d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
953d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
954d7ed1a4dSandi selinger           L1_rowsleft = 0;
955d7ed1a4dSandi selinger           break;
9569371c9d4SSatish Balay         case 7:
9579371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(7);
958d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
959d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
960d7ed1a4dSandi selinger           L1_rowsleft = 0;
961d7ed1a4dSandi selinger           break;
9629371c9d4SSatish Balay         default:
9639371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(8);
964d7ed1a4dSandi selinger           inputcol += 8;
965d7ed1a4dSandi selinger           rowsleft -= 8;
966d7ed1a4dSandi selinger           L1_rowsleft -= 8;
967d7ed1a4dSandi selinger           break;
968d7ed1a4dSandi selinger         }
969d7ed1a4dSandi selinger         inputcol_L1 = inputcol;
9707660c4dbSandi selinger         L1_nnz += outputi_nnz;
9717660c4dbSandi selinger         worki_L1[++L1_nrows] = L1_nnz;
972d7ed1a4dSandi selinger       }
973d7ed1a4dSandi selinger 
974d7ed1a4dSandi selinger       /********************** L E V E L  2 ************************/
975d7ed1a4dSandi selinger       /* Merge from L1 work array to either C or to L2 work array */
976d7ed1a4dSandi selinger       if (anzi > 8) {
977d7ed1a4dSandi selinger         inputi      = worki_L1;
978d7ed1a4dSandi selinger         inputj      = workj_L1;
979d7ed1a4dSandi selinger         inputcol    = workcol;
980d7ed1a4dSandi selinger         outputi_nnz = 0;
981d7ed1a4dSandi selinger 
982d7ed1a4dSandi selinger         if (anzi <= 64) outputj = cj + ci_nnz; /* Merge from L1 work array to C */
983d7ed1a4dSandi selinger         else outputj = workj_L2 + L2_nnz;      /* Merge from L1 work array to L2 work array */
984d7ed1a4dSandi selinger 
985d7ed1a4dSandi selinger         switch (L1_nrows) {
9869371c9d4SSatish Balay         case 1:
9879371c9d4SSatish Balay           brow_ptr[0] = inputj + inputi[inputcol[0]];
988d7ed1a4dSandi selinger           brow_end[0] = inputj + inputi[inputcol[0] + 1];
989d7ed1a4dSandi selinger           for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
990d7ed1a4dSandi selinger           break;
991d7ed1a4dSandi selinger         case 2: MatMatMultSymbolic_RowMergeMacro(2); break;
992d7ed1a4dSandi selinger         case 3: MatMatMultSymbolic_RowMergeMacro(3); break;
993d7ed1a4dSandi selinger         case 4: MatMatMultSymbolic_RowMergeMacro(4); break;
994d7ed1a4dSandi selinger         case 5: MatMatMultSymbolic_RowMergeMacro(5); break;
995d7ed1a4dSandi selinger         case 6: MatMatMultSymbolic_RowMergeMacro(6); break;
996d7ed1a4dSandi selinger         case 7: MatMatMultSymbolic_RowMergeMacro(7); break;
997d7ed1a4dSandi selinger         case 8: MatMatMultSymbolic_RowMergeMacro(8); break;
998d7ed1a4dSandi selinger         default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L1 work array!");
999d7ed1a4dSandi selinger         }
1000d7ed1a4dSandi selinger         L2_nnz += outputi_nnz;
1001d7ed1a4dSandi selinger         worki_L2[++L2_nrows] = L2_nnz;
1002d7ed1a4dSandi selinger 
10037660c4dbSandi selinger         /************************ L E V E L  3 **********************/
10047660c4dbSandi selinger         /* Merge from L2 work array to either C or to L2 work array */
1005d7ed1a4dSandi selinger         if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) {
1006d7ed1a4dSandi selinger           inputi      = worki_L2;
1007d7ed1a4dSandi selinger           inputj      = workj_L2;
1008d7ed1a4dSandi selinger           inputcol    = workcol;
1009d7ed1a4dSandi selinger           outputi_nnz = 0;
1010f83700f2Sandi selinger           if (rowsleft) outputj = workj_L3;
1011d7ed1a4dSandi selinger           else outputj = cj + ci_nnz;
1012d7ed1a4dSandi selinger           switch (L2_nrows) {
10139371c9d4SSatish Balay           case 1:
10149371c9d4SSatish Balay             brow_ptr[0] = inputj + inputi[inputcol[0]];
1015d7ed1a4dSandi selinger             brow_end[0] = inputj + inputi[inputcol[0] + 1];
1016d7ed1a4dSandi selinger             for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
1017d7ed1a4dSandi selinger             break;
1018d7ed1a4dSandi selinger           case 2: MatMatMultSymbolic_RowMergeMacro(2); break;
1019d7ed1a4dSandi selinger           case 3: MatMatMultSymbolic_RowMergeMacro(3); break;
1020d7ed1a4dSandi selinger           case 4: MatMatMultSymbolic_RowMergeMacro(4); break;
1021d7ed1a4dSandi selinger           case 5: MatMatMultSymbolic_RowMergeMacro(5); break;
1022d7ed1a4dSandi selinger           case 6: MatMatMultSymbolic_RowMergeMacro(6); break;
1023d7ed1a4dSandi selinger           case 7: MatMatMultSymbolic_RowMergeMacro(7); break;
1024d7ed1a4dSandi selinger           case 8: MatMatMultSymbolic_RowMergeMacro(8); break;
1025d7ed1a4dSandi selinger           default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L2 work array!");
1026d7ed1a4dSandi selinger           }
1027d7ed1a4dSandi selinger           L2_nrows    = 1;
10287660c4dbSandi selinger           L2_nnz      = outputi_nnz;
10297660c4dbSandi selinger           worki_L2[1] = outputi_nnz;
10307660c4dbSandi selinger           /* Copy to workj_L2 */
1031d7ed1a4dSandi selinger           if (rowsleft) {
10327660c4dbSandi selinger             for (k = 0; k < outputi_nnz; ++k) workj_L2[k] = outputj[k];
1033d7ed1a4dSandi selinger           }
1034d7ed1a4dSandi selinger         }
1035d7ed1a4dSandi selinger       }
1036d7ed1a4dSandi selinger     } /* while (rowsleft) */
1037d7ed1a4dSandi selinger #undef MatMatMultSymbolic_RowMergeMacro
1038d7ed1a4dSandi selinger 
1039d7ed1a4dSandi selinger     /* terminate current row */
1040d7ed1a4dSandi selinger     ci_nnz += outputi_nnz;
1041d7ed1a4dSandi selinger     ci[i + 1] = ci_nnz;
1042d7ed1a4dSandi selinger   }
1043d7ed1a4dSandi selinger 
1044d7ed1a4dSandi selinger   /* Step 3: Create the new symbolic matrix */
10459566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
10469566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
1047d7ed1a4dSandi selinger 
1048d7ed1a4dSandi selinger   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1049d7ed1a4dSandi selinger   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
10504222ddf1SHong Zhang   c          = (Mat_SeqAIJ *)(C->data);
1051d7ed1a4dSandi selinger   c->free_a  = PETSC_TRUE;
1052d7ed1a4dSandi selinger   c->free_ij = PETSC_TRUE;
1053d7ed1a4dSandi selinger   c->nonew   = 0;
1054d7ed1a4dSandi selinger 
10554222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1056d7ed1a4dSandi selinger 
1057d7ed1a4dSandi selinger   /* set MatInfo */
1058d7ed1a4dSandi selinger   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
1059d7ed1a4dSandi selinger   if (afill < 1.0) afill = 1.0;
10604222ddf1SHong Zhang   C->info.mallocs           = ndouble;
10614222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
10624222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
1063d7ed1a4dSandi selinger 
1064d7ed1a4dSandi selinger #if defined(PETSC_USE_INFO)
1065d7ed1a4dSandi selinger   if (ci[am]) {
10669566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
10679566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
1068d7ed1a4dSandi selinger   } else {
10699566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
1070d7ed1a4dSandi selinger   }
1071d7ed1a4dSandi selinger #endif
1072d7ed1a4dSandi selinger 
1073d7ed1a4dSandi selinger   /* Step 4: Free temporary work areas */
10749566063dSJacob Faibussowitsch   PetscCall(PetscFree(workj_L1));
10759566063dSJacob Faibussowitsch   PetscCall(PetscFree(workj_L2));
10769566063dSJacob Faibussowitsch   PetscCall(PetscFree(workj_L3));
1077d7ed1a4dSandi selinger   PetscFunctionReturn(0);
1078d7ed1a4dSandi selinger }
1079d7ed1a4dSandi selinger 
1080cd093f1dSJed Brown /* concatenate unique entries and then sort */
10819371c9d4SSatish Balay PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, PetscReal fill, Mat C) {
1082cd093f1dSJed Brown   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
1083cd093f1dSJed Brown   const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j;
10848c78258cSHong Zhang   PetscInt       *ci, *cj, bcol;
1085cd093f1dSJed Brown   PetscInt        am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
1086cd093f1dSJed Brown   PetscReal       afill;
1087cd093f1dSJed Brown   PetscInt        i, j, ndouble = 0;
1088cd093f1dSJed Brown   PetscSegBuffer  seg, segrow;
1089cd093f1dSJed Brown   char           *seen;
1090cd093f1dSJed Brown 
1091cd093f1dSJed Brown   PetscFunctionBegin;
10929566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 1, &ci));
1093cd093f1dSJed Brown   ci[0] = 0;
1094cd093f1dSJed Brown 
1095cd093f1dSJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
10969566063dSJacob Faibussowitsch   PetscCall(PetscSegBufferCreate(sizeof(PetscInt), (PetscInt)(fill * (ai[am] + bi[bm])), &seg));
10979566063dSJacob Faibussowitsch   PetscCall(PetscSegBufferCreate(sizeof(PetscInt), 100, &segrow));
10989566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(bn, &seen));
1099cd093f1dSJed Brown 
1100cd093f1dSJed Brown   /* Determine ci and cj */
1101cd093f1dSJed Brown   for (i = 0; i < am; i++) {
1102cd093f1dSJed Brown     const PetscInt  anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
1103cd093f1dSJed Brown     const PetscInt *acol = aj + ai[i];        /* column indices of nonzero entries in this row */
1104cd093f1dSJed Brown     PetscInt packlen     = 0, *PETSC_RESTRICT crow;
11058c78258cSHong Zhang 
1106cd093f1dSJed Brown     /* Pack segrow */
1107cd093f1dSJed Brown     for (j = 0; j < anzi; j++) {
1108cd093f1dSJed Brown       PetscInt brow = acol[j], bjstart = bi[brow], bjend = bi[brow + 1], k;
1109cd093f1dSJed Brown       for (k = bjstart; k < bjend; k++) {
11108c78258cSHong Zhang         bcol = bj[k];
1111cd093f1dSJed Brown         if (!seen[bcol]) { /* new entry */
1112cd093f1dSJed Brown           PetscInt *PETSC_RESTRICT slot;
11139566063dSJacob Faibussowitsch           PetscCall(PetscSegBufferGetInts(segrow, 1, &slot));
1114cd093f1dSJed Brown           *slot      = bcol;
1115cd093f1dSJed Brown           seen[bcol] = 1;
1116cd093f1dSJed Brown           packlen++;
1117cd093f1dSJed Brown         }
1118cd093f1dSJed Brown       }
1119cd093f1dSJed Brown     }
11208c78258cSHong Zhang 
11218c78258cSHong Zhang     /* Check i-th diagonal entry */
11228c78258cSHong Zhang     if (C->force_diagonals && !seen[i]) {
11238c78258cSHong Zhang       PetscInt *PETSC_RESTRICT slot;
11249566063dSJacob Faibussowitsch       PetscCall(PetscSegBufferGetInts(segrow, 1, &slot));
11258c78258cSHong Zhang       *slot   = i;
11268c78258cSHong Zhang       seen[i] = 1;
11278c78258cSHong Zhang       packlen++;
11288c78258cSHong Zhang     }
11298c78258cSHong Zhang 
11309566063dSJacob Faibussowitsch     PetscCall(PetscSegBufferGetInts(seg, packlen, &crow));
11319566063dSJacob Faibussowitsch     PetscCall(PetscSegBufferExtractTo(segrow, crow));
11329566063dSJacob Faibussowitsch     PetscCall(PetscSortInt(packlen, crow));
1133cd093f1dSJed Brown     ci[i + 1] = ci[i] + packlen;
1134cd093f1dSJed Brown     for (j = 0; j < packlen; j++) seen[crow[j]] = 0;
1135cd093f1dSJed Brown   }
11369566063dSJacob Faibussowitsch   PetscCall(PetscSegBufferDestroy(&segrow));
11379566063dSJacob Faibussowitsch   PetscCall(PetscFree(seen));
1138cd093f1dSJed Brown 
1139cd093f1dSJed Brown   /* Column indices are in the segmented buffer */
11409566063dSJacob Faibussowitsch   PetscCall(PetscSegBufferExtractAlloc(seg, &cj));
11419566063dSJacob Faibussowitsch   PetscCall(PetscSegBufferDestroy(&seg));
1142cd093f1dSJed Brown 
1143cd093f1dSJed Brown   /* put together the new symbolic matrix */
11449566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
11459566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
1146cd093f1dSJed Brown 
1147cd093f1dSJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1148cd093f1dSJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
11494222ddf1SHong Zhang   c          = (Mat_SeqAIJ *)(C->data);
1150cd093f1dSJed Brown   c->free_a  = PETSC_TRUE;
1151cd093f1dSJed Brown   c->free_ij = PETSC_TRUE;
1152cd093f1dSJed Brown   c->nonew   = 0;
1153cd093f1dSJed Brown 
11544222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1155cd093f1dSJed Brown 
1156cd093f1dSJed Brown   /* set MatInfo */
11572a09556fSStefano Zampini   afill = (PetscReal)ci[am] / PetscMax(ai[am] + bi[bm], 1) + 1.e-5;
1158cd093f1dSJed Brown   if (afill < 1.0) afill = 1.0;
11594222ddf1SHong Zhang   C->info.mallocs           = ndouble;
11604222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
11614222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
1162cd093f1dSJed Brown 
1163cd093f1dSJed Brown #if defined(PETSC_USE_INFO)
1164cd093f1dSJed Brown   if (ci[am]) {
11659566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
11669566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
1167cd093f1dSJed Brown   } else {
11689566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
1169cd093f1dSJed Brown   }
1170cd093f1dSJed Brown #endif
1171cd093f1dSJed Brown   PetscFunctionReturn(0);
1172cd093f1dSJed Brown }
1173cd093f1dSJed Brown 
11749371c9d4SSatish Balay PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(void *data) {
11756718818eSStefano Zampini   Mat_MatMatTransMult *abt = (Mat_MatMatTransMult *)data;
11762128a86cSHong Zhang 
11772128a86cSHong Zhang   PetscFunctionBegin;
11789566063dSJacob Faibussowitsch   PetscCall(MatTransposeColoringDestroy(&abt->matcoloring));
11799566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&abt->Bt_den));
11809566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&abt->ABt_den));
11819566063dSJacob Faibussowitsch   PetscCall(PetscFree(abt));
11822128a86cSHong Zhang   PetscFunctionReturn(0);
11832128a86cSHong Zhang }
11842128a86cSHong Zhang 
11859371c9d4SSatish Balay PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C) {
118681d82fe4SHong Zhang   Mat                  Bt;
118740192850SHong Zhang   Mat_MatMatTransMult *abt;
11884222ddf1SHong Zhang   Mat_Product         *product = C->product;
11896718818eSStefano Zampini   char                *alg;
1190d2853540SHong Zhang 
119181d82fe4SHong Zhang   PetscFunctionBegin;
119228b400f6SJacob Faibussowitsch   PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
119328b400f6SJacob Faibussowitsch   PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty");
11946718818eSStefano Zampini 
119581d82fe4SHong Zhang   /* create symbolic Bt */
11967fb60732SBarry Smith   PetscCall(MatTransposeSymbolic(B, &Bt));
11979566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizes(Bt, PetscAbs(A->cmap->bs), PetscAbs(B->cmap->bs)));
11989566063dSJacob Faibussowitsch   PetscCall(MatSetType(Bt, ((PetscObject)A)->type_name));
119981d82fe4SHong Zhang 
120081d82fe4SHong Zhang   /* get symbolic C=A*Bt */
12019566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(product->alg, &alg));
12029566063dSJacob Faibussowitsch   PetscCall(MatProductSetAlgorithm(C, "sorted")); /* set algorithm for C = A*Bt */
12039566063dSJacob Faibussowitsch   PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(A, Bt, fill, C));
12049566063dSJacob Faibussowitsch   PetscCall(MatProductSetAlgorithm(C, alg)); /* resume original algorithm for ABt product */
12059566063dSJacob Faibussowitsch   PetscCall(PetscFree(alg));
120681d82fe4SHong Zhang 
1207a5b23f4aSJose E. Roman   /* create a supporting struct for reuse intermediate dense matrices with matcoloring */
12089566063dSJacob Faibussowitsch   PetscCall(PetscNew(&abt));
12092128a86cSHong Zhang 
12106718818eSStefano Zampini   product->data    = abt;
12116718818eSStefano Zampini   product->destroy = MatDestroy_SeqAIJ_MatMatMultTrans;
12126718818eSStefano Zampini 
12134222ddf1SHong Zhang   C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ;
12142128a86cSHong Zhang 
12154222ddf1SHong Zhang   abt->usecoloring = PETSC_FALSE;
12169566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "color", &abt->usecoloring));
121740192850SHong Zhang   if (abt->usecoloring) {
1218b9af6bddSHong Zhang     /* Create MatTransposeColoring from symbolic C=A*B^T */
1219b9af6bddSHong Zhang     MatTransposeColoring matcoloring;
1220335efc43SPeter Brune     MatColoring          coloring;
12212128a86cSHong Zhang     ISColoring           iscoloring;
12222128a86cSHong Zhang     Mat                  Bt_dense, C_dense;
1223e8354b3eSHong Zhang 
12244222ddf1SHong Zhang     /* inode causes memory problem */
12259566063dSJacob Faibussowitsch     PetscCall(MatSetOption(C, MAT_USE_INODES, PETSC_FALSE));
12264222ddf1SHong Zhang 
12279566063dSJacob Faibussowitsch     PetscCall(MatColoringCreate(C, &coloring));
12289566063dSJacob Faibussowitsch     PetscCall(MatColoringSetDistance(coloring, 2));
12299566063dSJacob Faibussowitsch     PetscCall(MatColoringSetType(coloring, MATCOLORINGSL));
12309566063dSJacob Faibussowitsch     PetscCall(MatColoringSetFromOptions(coloring));
12319566063dSJacob Faibussowitsch     PetscCall(MatColoringApply(coloring, &iscoloring));
12329566063dSJacob Faibussowitsch     PetscCall(MatColoringDestroy(&coloring));
12339566063dSJacob Faibussowitsch     PetscCall(MatTransposeColoringCreate(C, iscoloring, &matcoloring));
12342205254eSKarl Rupp 
123540192850SHong Zhang     abt->matcoloring = matcoloring;
12362205254eSKarl Rupp 
12379566063dSJacob Faibussowitsch     PetscCall(ISColoringDestroy(&iscoloring));
12382128a86cSHong Zhang 
12392128a86cSHong Zhang     /* Create Bt_dense and C_dense = A*Bt_dense */
12409566063dSJacob Faibussowitsch     PetscCall(MatCreate(PETSC_COMM_SELF, &Bt_dense));
12419566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(Bt_dense, A->cmap->n, matcoloring->ncolors, A->cmap->n, matcoloring->ncolors));
12429566063dSJacob Faibussowitsch     PetscCall(MatSetType(Bt_dense, MATSEQDENSE));
12439566063dSJacob Faibussowitsch     PetscCall(MatSeqDenseSetPreallocation(Bt_dense, NULL));
12442205254eSKarl Rupp 
12452128a86cSHong Zhang     Bt_dense->assembled = PETSC_TRUE;
124640192850SHong Zhang     abt->Bt_den         = Bt_dense;
12472128a86cSHong Zhang 
12489566063dSJacob Faibussowitsch     PetscCall(MatCreate(PETSC_COMM_SELF, &C_dense));
12499566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(C_dense, A->rmap->n, matcoloring->ncolors, A->rmap->n, matcoloring->ncolors));
12509566063dSJacob Faibussowitsch     PetscCall(MatSetType(C_dense, MATSEQDENSE));
12519566063dSJacob Faibussowitsch     PetscCall(MatSeqDenseSetPreallocation(C_dense, NULL));
12522205254eSKarl Rupp 
12532128a86cSHong Zhang     Bt_dense->assembled = PETSC_TRUE;
125440192850SHong Zhang     abt->ABt_den        = C_dense;
1255f94ccd6cSHong Zhang 
1256f94ccd6cSHong Zhang #if defined(PETSC_USE_INFO)
12571ce71dffSSatish Balay     {
12584222ddf1SHong Zhang       Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data;
12599371c9d4SSatish Balay       PetscCall(PetscInfo(C, "Use coloring of C=A*B^T; B^T: %" PetscInt_FMT " %" PetscInt_FMT ", Bt_dense: %" PetscInt_FMT ",%" PetscInt_FMT "; Cnz %" PetscInt_FMT " / (cm*ncolors %" PetscInt_FMT ") = %g\n", B->cmap->n, B->rmap->n, Bt_dense->rmap->n,
12609371c9d4SSatish Balay                           Bt_dense->cmap->n, c->nz, A->rmap->n * matcoloring->ncolors, (double)(((PetscReal)(c->nz)) / ((PetscReal)(A->rmap->n * matcoloring->ncolors)))));
12611ce71dffSSatish Balay     }
1262f94ccd6cSHong Zhang #endif
12632128a86cSHong Zhang   }
1264e99be685SHong Zhang   /* clean up */
12659566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&Bt));
12665df89d91SHong Zhang   PetscFunctionReturn(0);
12675df89d91SHong Zhang }
12685df89d91SHong Zhang 
12699371c9d4SSatish Balay PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C) {
12705df89d91SHong Zhang   Mat_SeqAIJ          *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data;
1271e2cac8caSJed Brown   PetscInt            *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, anzi, bnzj, nexta, nextb, *acol, *bcol, brow;
127281d82fe4SHong Zhang   PetscInt             cm = C->rmap->n, *ci = c->i, *cj = c->j, i, j, cnzi, *ccol;
12735df89d91SHong Zhang   PetscLogDouble       flops = 0.0;
1274aa1aec99SHong Zhang   MatScalar           *aa = a->a, *aval, *ba = b->a, *bval, *ca, *cval;
12756718818eSStefano Zampini   Mat_MatMatTransMult *abt;
12766718818eSStefano Zampini   Mat_Product         *product = C->product;
12775df89d91SHong Zhang 
12785df89d91SHong Zhang   PetscFunctionBegin;
127928b400f6SJacob Faibussowitsch   PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
12806718818eSStefano Zampini   abt = (Mat_MatMatTransMult *)product->data;
128128b400f6SJacob Faibussowitsch   PetscCheck(abt, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
128258ed3ceaSHong Zhang   /* clear old values in C */
128358ed3ceaSHong Zhang   if (!c->a) {
12849566063dSJacob Faibussowitsch     PetscCall(PetscCalloc1(ci[cm] + 1, &ca));
128558ed3ceaSHong Zhang     c->a      = ca;
128658ed3ceaSHong Zhang     c->free_a = PETSC_TRUE;
128758ed3ceaSHong Zhang   } else {
128858ed3ceaSHong Zhang     ca = c->a;
12899566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(ca, ci[cm] + 1));
129058ed3ceaSHong Zhang   }
129158ed3ceaSHong Zhang 
129240192850SHong Zhang   if (abt->usecoloring) {
129340192850SHong Zhang     MatTransposeColoring matcoloring = abt->matcoloring;
129440192850SHong Zhang     Mat                  Bt_dense, C_dense = abt->ABt_den;
1295c8db22f6SHong Zhang 
1296b9af6bddSHong Zhang     /* Get Bt_dense by Apply MatTransposeColoring to B */
129740192850SHong Zhang     Bt_dense = abt->Bt_den;
12989566063dSJacob Faibussowitsch     PetscCall(MatTransColoringApplySpToDen(matcoloring, B, Bt_dense));
1299c8db22f6SHong Zhang 
1300c8db22f6SHong Zhang     /* C_dense = A*Bt_dense */
13019566063dSJacob Faibussowitsch     PetscCall(MatMatMultNumeric_SeqAIJ_SeqDense(A, Bt_dense, C_dense));
1302c8db22f6SHong Zhang 
13032128a86cSHong Zhang     /* Recover C from C_dense */
13049566063dSJacob Faibussowitsch     PetscCall(MatTransColoringApplyDenToSp(matcoloring, C_dense, C));
1305c8db22f6SHong Zhang     PetscFunctionReturn(0);
1306c8db22f6SHong Zhang   }
1307c8db22f6SHong Zhang 
13081fa1209cSHong Zhang   for (i = 0; i < cm; i++) {
130981d82fe4SHong Zhang     anzi = ai[i + 1] - ai[i];
131081d82fe4SHong Zhang     acol = aj + ai[i];
13116973769bSHong Zhang     aval = aa + ai[i];
13121fa1209cSHong Zhang     cnzi = ci[i + 1] - ci[i];
13131fa1209cSHong Zhang     ccol = cj + ci[i];
13146973769bSHong Zhang     cval = ca + ci[i];
13151fa1209cSHong Zhang     for (j = 0; j < cnzi; j++) {
131681d82fe4SHong Zhang       brow = ccol[j];
131781d82fe4SHong Zhang       bnzj = bi[brow + 1] - bi[brow];
131881d82fe4SHong Zhang       bcol = bj + bi[brow];
13196973769bSHong Zhang       bval = ba + bi[brow];
13206973769bSHong Zhang 
132181d82fe4SHong Zhang       /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
13229371c9d4SSatish Balay       nexta = 0;
13239371c9d4SSatish Balay       nextb = 0;
132481d82fe4SHong Zhang       while (nexta < anzi && nextb < bnzj) {
13257b6d5e96SMark Adams         while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++;
132681d82fe4SHong Zhang         if (nexta == anzi) break;
13277b6d5e96SMark Adams         while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++;
132881d82fe4SHong Zhang         if (nextb == bnzj) break;
132981d82fe4SHong Zhang         if (acol[nexta] == bcol[nextb]) {
13306973769bSHong Zhang           cval[j] += aval[nexta] * bval[nextb];
13319371c9d4SSatish Balay           nexta++;
13329371c9d4SSatish Balay           nextb++;
133381d82fe4SHong Zhang           flops += 2;
13341fa1209cSHong Zhang         }
13351fa1209cSHong Zhang       }
133681d82fe4SHong Zhang     }
133781d82fe4SHong Zhang   }
13389566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
13399566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
13409566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(flops));
13415df89d91SHong Zhang   PetscFunctionReturn(0);
13425df89d91SHong Zhang }
13435df89d91SHong Zhang 
13449371c9d4SSatish Balay PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(void *data) {
13456718818eSStefano Zampini   Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)data;
13466d373c3eSHong Zhang 
13476d373c3eSHong Zhang   PetscFunctionBegin;
13489566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&atb->At));
13491baa6e33SBarry Smith   if (atb->destroy) PetscCall((*atb->destroy)(atb->data));
13509566063dSJacob Faibussowitsch   PetscCall(PetscFree(atb));
13516d373c3eSHong Zhang   PetscFunctionReturn(0);
13526d373c3eSHong Zhang }
13536d373c3eSHong Zhang 
13549371c9d4SSatish Balay PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C) {
1355089a957eSStefano Zampini   Mat          At      = NULL;
13564222ddf1SHong Zhang   Mat_Product *product = C->product;
1357089a957eSStefano Zampini   PetscBool    flg, def, square;
1358bc011b1eSHong Zhang 
1359bc011b1eSHong Zhang   PetscFunctionBegin;
1360089a957eSStefano Zampini   MatCheckProduct(C, 4);
1361b94d7dedSBarry Smith   square = (PetscBool)(A == B && A->symmetric == PETSC_BOOL3_TRUE);
13624222ddf1SHong Zhang   /* outerproduct */
13639566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "outerproduct", &flg));
13644222ddf1SHong Zhang   if (flg) {
1365bc011b1eSHong Zhang     /* create symbolic At */
1366089a957eSStefano Zampini     if (!square) {
13677fb60732SBarry Smith       PetscCall(MatTransposeSymbolic(A, &At));
13689566063dSJacob Faibussowitsch       PetscCall(MatSetBlockSizes(At, PetscAbs(A->cmap->bs), PetscAbs(B->cmap->bs)));
13699566063dSJacob Faibussowitsch       PetscCall(MatSetType(At, ((PetscObject)A)->type_name));
1370089a957eSStefano Zampini     }
1371bc011b1eSHong Zhang     /* get symbolic C=At*B */
13729566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C, "sorted"));
13739566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C));
1374bc011b1eSHong Zhang 
1375bc011b1eSHong Zhang     /* clean up */
1376*48a46eb9SPierre Jolivet     if (!square) PetscCall(MatDestroy(&At));
13776d373c3eSHong Zhang 
13784222ddf1SHong Zhang     C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */
13799566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C, "outerproduct"));
13804222ddf1SHong Zhang     PetscFunctionReturn(0);
13814222ddf1SHong Zhang   }
13824222ddf1SHong Zhang 
13834222ddf1SHong Zhang   /* matmatmult */
13849566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "default", &def));
13859566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "at*b", &flg));
1386089a957eSStefano Zampini   if (flg || def) {
13874222ddf1SHong Zhang     Mat_MatTransMatMult *atb;
13884222ddf1SHong Zhang 
138928b400f6SJacob Faibussowitsch     PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty");
13909566063dSJacob Faibussowitsch     PetscCall(PetscNew(&atb));
1391*48a46eb9SPierre Jolivet     if (!square) PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &At));
13929566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C, "sorted"));
13939566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C));
13949566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C, "at*b"));
13956718818eSStefano Zampini     product->data    = atb;
13966718818eSStefano Zampini     product->destroy = MatDestroy_SeqAIJ_MatTransMatMult;
13974222ddf1SHong Zhang     atb->At          = At;
13984222ddf1SHong Zhang 
13994222ddf1SHong Zhang     C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */
14004222ddf1SHong Zhang     PetscFunctionReturn(0);
14014222ddf1SHong Zhang   }
14024222ddf1SHong Zhang 
14034222ddf1SHong Zhang   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported");
1404bc011b1eSHong Zhang }
1405bc011b1eSHong Zhang 
14069371c9d4SSatish Balay PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C) {
14070fbc74f4SHong Zhang   Mat_SeqAIJ    *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data;
1408d0f46423SBarry Smith   PetscInt       am = A->rmap->n, anzi, *ai = a->i, *aj = a->j, *bi = b->i, *bj, bnzi, nextb;
1409d0f46423SBarry Smith   PetscInt       cm = C->rmap->n, *ci = c->i, *cj = c->j, crow, *cjj, i, j, k;
1410d13dce4bSSatish Balay   PetscLogDouble flops = 0.0;
1411aa1aec99SHong Zhang   MatScalar     *aa    = a->a, *ba, *ca, *caj;
1412bc011b1eSHong Zhang 
1413bc011b1eSHong Zhang   PetscFunctionBegin;
1414aa1aec99SHong Zhang   if (!c->a) {
14159566063dSJacob Faibussowitsch     PetscCall(PetscCalloc1(ci[cm] + 1, &ca));
14162205254eSKarl Rupp 
1417aa1aec99SHong Zhang     c->a      = ca;
1418aa1aec99SHong Zhang     c->free_a = PETSC_TRUE;
1419aa1aec99SHong Zhang   } else {
1420aa1aec99SHong Zhang     ca = c->a;
14219566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(ca, ci[cm]));
1422aa1aec99SHong Zhang   }
1423bc011b1eSHong Zhang 
1424bc011b1eSHong Zhang   /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
1425bc011b1eSHong Zhang   for (i = 0; i < am; i++) {
1426bc011b1eSHong Zhang     bj   = b->j + bi[i];
1427bc011b1eSHong Zhang     ba   = b->a + bi[i];
1428bc011b1eSHong Zhang     bnzi = bi[i + 1] - bi[i];
1429bc011b1eSHong Zhang     anzi = ai[i + 1] - ai[i];
1430bc011b1eSHong Zhang     for (j = 0; j < anzi; j++) {
1431bc011b1eSHong Zhang       nextb = 0;
14320fbc74f4SHong Zhang       crow  = *aj++;
1433bc011b1eSHong Zhang       cjj   = cj + ci[crow];
1434bc011b1eSHong Zhang       caj   = ca + ci[crow];
1435bc011b1eSHong Zhang       /* perform sparse axpy operation.  Note cjj includes bj. */
1436bc011b1eSHong Zhang       for (k = 0; nextb < bnzi; k++) {
14370fbc74f4SHong Zhang         if (cjj[k] == *(bj + nextb)) { /* ccol == bcol */
14380fbc74f4SHong Zhang           caj[k] += (*aa) * (*(ba + nextb));
1439bc011b1eSHong Zhang           nextb++;
1440bc011b1eSHong Zhang         }
1441bc011b1eSHong Zhang       }
1442bc011b1eSHong Zhang       flops += 2 * bnzi;
14430fbc74f4SHong Zhang       aa++;
1444bc011b1eSHong Zhang     }
1445bc011b1eSHong Zhang   }
1446bc011b1eSHong Zhang 
1447bc011b1eSHong Zhang   /* Assemble the final matrix and clean up */
14489566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
14499566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
14509566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(flops));
1451bc011b1eSHong Zhang   PetscFunctionReturn(0);
1452bc011b1eSHong Zhang }
14539123193aSHong Zhang 
14549371c9d4SSatish Balay PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) {
14559123193aSHong Zhang   PetscFunctionBegin;
14569566063dSJacob Faibussowitsch   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
14574222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense;
14589123193aSHong Zhang   PetscFunctionReturn(0);
14599123193aSHong Zhang }
14609123193aSHong Zhang 
14619371c9d4SSatish Balay PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A, Mat B, Mat C, const PetscBool add) {
1462f32d5d43SBarry Smith   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
14631ca9667aSStefano Zampini   PetscScalar       *c, r1, r2, r3, r4, *c1, *c2, *c3, *c4;
1464a4af7ca8SStefano Zampini   const PetscScalar *aa, *b, *b1, *b2, *b3, *b4, *av;
1465daf57211SHong Zhang   const PetscInt    *aj;
146675f6d85dSStefano Zampini   PetscInt           cm = C->rmap->n, cn = B->cmap->n, bm, am = A->rmap->n;
146775f6d85dSStefano Zampini   PetscInt           clda;
146875f6d85dSStefano Zampini   PetscInt           am4, bm4, col, i, j, n;
14699123193aSHong Zhang 
14709123193aSHong Zhang   PetscFunctionBegin;
1471f32d5d43SBarry Smith   if (!cm || !cn) PetscFunctionReturn(0);
14729566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJGetArrayRead(A, &av));
147393aa15f2SStefano Zampini   if (add) {
14749566063dSJacob Faibussowitsch     PetscCall(MatDenseGetArray(C, &c));
147593aa15f2SStefano Zampini   } else {
14769566063dSJacob Faibussowitsch     PetscCall(MatDenseGetArrayWrite(C, &c));
147793aa15f2SStefano Zampini   }
14789566063dSJacob Faibussowitsch   PetscCall(MatDenseGetArrayRead(B, &b));
14799566063dSJacob Faibussowitsch   PetscCall(MatDenseGetLDA(B, &bm));
14809566063dSJacob Faibussowitsch   PetscCall(MatDenseGetLDA(C, &clda));
148175f6d85dSStefano Zampini   am4 = 4 * clda;
148275f6d85dSStefano Zampini   bm4 = 4 * bm;
14839371c9d4SSatish Balay   b1  = b;
14849371c9d4SSatish Balay   b2  = b1 + bm;
14859371c9d4SSatish Balay   b3  = b2 + bm;
14869371c9d4SSatish Balay   b4  = b3 + bm;
14879371c9d4SSatish Balay   c1  = c;
14889371c9d4SSatish Balay   c2  = c1 + clda;
14899371c9d4SSatish Balay   c3  = c2 + clda;
14909371c9d4SSatish Balay   c4  = c3 + clda;
14911ca9667aSStefano Zampini   for (col = 0; col < (cn / 4) * 4; col += 4) { /* over columns of C */
14921ca9667aSStefano Zampini     for (i = 0; i < am; i++) {                  /* over rows of A in those columns */
1493f32d5d43SBarry Smith       r1 = r2 = r3 = r4 = 0.0;
1494f32d5d43SBarry Smith       n                 = a->i[i + 1] - a->i[i];
1495f32d5d43SBarry Smith       aj                = a->j + a->i[i];
1496a4af7ca8SStefano Zampini       aa                = av + a->i[i];
1497f32d5d43SBarry Smith       for (j = 0; j < n; j++) {
14981ca9667aSStefano Zampini         const PetscScalar aatmp = aa[j];
14991ca9667aSStefano Zampini         const PetscInt    ajtmp = aj[j];
1500730858b9SHong Zhang         r1 += aatmp * b1[ajtmp];
1501730858b9SHong Zhang         r2 += aatmp * b2[ajtmp];
1502730858b9SHong Zhang         r3 += aatmp * b3[ajtmp];
1503730858b9SHong Zhang         r4 += aatmp * b4[ajtmp];
15049123193aSHong Zhang       }
150593aa15f2SStefano Zampini       if (add) {
150687753ddeSHong Zhang         c1[i] += r1;
150787753ddeSHong Zhang         c2[i] += r2;
150887753ddeSHong Zhang         c3[i] += r3;
150987753ddeSHong Zhang         c4[i] += r4;
151093aa15f2SStefano Zampini       } else {
151193aa15f2SStefano Zampini         c1[i] = r1;
151293aa15f2SStefano Zampini         c2[i] = r2;
151393aa15f2SStefano Zampini         c3[i] = r3;
151493aa15f2SStefano Zampini         c4[i] = r4;
151593aa15f2SStefano Zampini       }
1516f32d5d43SBarry Smith     }
15179371c9d4SSatish Balay     b1 += bm4;
15189371c9d4SSatish Balay     b2 += bm4;
15199371c9d4SSatish Balay     b3 += bm4;
15209371c9d4SSatish Balay     b4 += bm4;
15219371c9d4SSatish Balay     c1 += am4;
15229371c9d4SSatish Balay     c2 += am4;
15239371c9d4SSatish Balay     c3 += am4;
15249371c9d4SSatish Balay     c4 += am4;
1525f32d5d43SBarry Smith   }
152693aa15f2SStefano Zampini   /* process remaining columns */
152793aa15f2SStefano Zampini   if (col != cn) {
152893aa15f2SStefano Zampini     PetscInt rc = cn - col;
152993aa15f2SStefano Zampini 
153093aa15f2SStefano Zampini     if (rc == 1) {
153193aa15f2SStefano Zampini       for (i = 0; i < am; i++) {
1532f32d5d43SBarry Smith         r1 = 0.0;
1533f32d5d43SBarry Smith         n  = a->i[i + 1] - a->i[i];
1534f32d5d43SBarry Smith         aj = a->j + a->i[i];
1535a4af7ca8SStefano Zampini         aa = av + a->i[i];
153693aa15f2SStefano Zampini         for (j = 0; j < n; j++) r1 += aa[j] * b1[aj[j]];
153793aa15f2SStefano Zampini         if (add) c1[i] += r1;
153893aa15f2SStefano Zampini         else c1[i] = r1;
153993aa15f2SStefano Zampini       }
154093aa15f2SStefano Zampini     } else if (rc == 2) {
154193aa15f2SStefano Zampini       for (i = 0; i < am; i++) {
154293aa15f2SStefano Zampini         r1 = r2 = 0.0;
154393aa15f2SStefano Zampini         n       = a->i[i + 1] - a->i[i];
154493aa15f2SStefano Zampini         aj      = a->j + a->i[i];
154593aa15f2SStefano Zampini         aa      = av + a->i[i];
1546f32d5d43SBarry Smith         for (j = 0; j < n; j++) {
154793aa15f2SStefano Zampini           const PetscScalar aatmp = aa[j];
154893aa15f2SStefano Zampini           const PetscInt    ajtmp = aj[j];
154993aa15f2SStefano Zampini           r1 += aatmp * b1[ajtmp];
155093aa15f2SStefano Zampini           r2 += aatmp * b2[ajtmp];
1551f32d5d43SBarry Smith         }
155293aa15f2SStefano Zampini         if (add) {
155387753ddeSHong Zhang           c1[i] += r1;
155493aa15f2SStefano Zampini           c2[i] += r2;
155593aa15f2SStefano Zampini         } else {
155693aa15f2SStefano Zampini           c1[i] = r1;
155793aa15f2SStefano Zampini           c2[i] = r2;
1558f32d5d43SBarry Smith         }
155993aa15f2SStefano Zampini       }
156093aa15f2SStefano Zampini     } else {
156193aa15f2SStefano Zampini       for (i = 0; i < am; i++) {
156293aa15f2SStefano Zampini         r1 = r2 = r3 = 0.0;
156393aa15f2SStefano Zampini         n            = a->i[i + 1] - a->i[i];
156493aa15f2SStefano Zampini         aj           = a->j + a->i[i];
156593aa15f2SStefano Zampini         aa           = av + a->i[i];
156693aa15f2SStefano Zampini         for (j = 0; j < n; j++) {
156793aa15f2SStefano Zampini           const PetscScalar aatmp = aa[j];
156893aa15f2SStefano Zampini           const PetscInt    ajtmp = aj[j];
156993aa15f2SStefano Zampini           r1 += aatmp * b1[ajtmp];
157093aa15f2SStefano Zampini           r2 += aatmp * b2[ajtmp];
157193aa15f2SStefano Zampini           r3 += aatmp * b3[ajtmp];
157293aa15f2SStefano Zampini         }
157393aa15f2SStefano Zampini         if (add) {
157493aa15f2SStefano Zampini           c1[i] += r1;
157593aa15f2SStefano Zampini           c2[i] += r2;
157693aa15f2SStefano Zampini           c3[i] += r3;
157793aa15f2SStefano Zampini         } else {
157893aa15f2SStefano Zampini           c1[i] = r1;
157993aa15f2SStefano Zampini           c2[i] = r2;
158093aa15f2SStefano Zampini           c3[i] = r3;
158193aa15f2SStefano Zampini         }
158293aa15f2SStefano Zampini       }
158393aa15f2SStefano Zampini     }
1584f32d5d43SBarry Smith   }
15859566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(cn * (2.0 * a->nz)));
158693aa15f2SStefano Zampini   if (add) {
15879566063dSJacob Faibussowitsch     PetscCall(MatDenseRestoreArray(C, &c));
158893aa15f2SStefano Zampini   } else {
15899566063dSJacob Faibussowitsch     PetscCall(MatDenseRestoreArrayWrite(C, &c));
159093aa15f2SStefano Zampini   }
15919566063dSJacob Faibussowitsch   PetscCall(MatDenseRestoreArrayRead(B, &b));
15929566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJRestoreArrayRead(A, &av));
1593f32d5d43SBarry Smith   PetscFunctionReturn(0);
1594f32d5d43SBarry Smith }
1595f32d5d43SBarry Smith 
15969371c9d4SSatish Balay PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A, Mat B, Mat C) {
1597f32d5d43SBarry Smith   PetscFunctionBegin;
159808401ef6SPierre Jolivet   PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
159908401ef6SPierre Jolivet   PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
160008401ef6SPierre Jolivet   PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
16014324174eSBarry Smith 
16029566063dSJacob Faibussowitsch   PetscCall(MatMatMultNumericAdd_SeqAIJ_SeqDense(A, B, C, PETSC_FALSE));
16039123193aSHong Zhang   PetscFunctionReturn(0);
16049123193aSHong Zhang }
1605b1683b59SHong Zhang 
16064222ddf1SHong Zhang /* ------------------------------------------------------- */
16079371c9d4SSatish Balay static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C) {
16084222ddf1SHong Zhang   PetscFunctionBegin;
16094222ddf1SHong Zhang   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense;
16104222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
16114222ddf1SHong Zhang   PetscFunctionReturn(0);
16124222ddf1SHong Zhang }
16134222ddf1SHong Zhang 
16146718818eSStefano Zampini PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat, Mat, PetscReal, Mat);
16156718818eSStefano Zampini 
16169371c9d4SSatish Balay static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C) {
16174222ddf1SHong Zhang   PetscFunctionBegin;
16186718818eSStefano Zampini   C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
16194222ddf1SHong Zhang   C->ops->productsymbolic          = MatProductSymbolic_AtB;
16206718818eSStefano Zampini   PetscFunctionReturn(0);
16216718818eSStefano Zampini }
16226718818eSStefano Zampini 
16239371c9d4SSatish Balay static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C) {
16246718818eSStefano Zampini   PetscFunctionBegin;
16256718818eSStefano Zampini   C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
16266718818eSStefano Zampini   C->ops->productsymbolic          = MatProductSymbolic_ABt;
16274222ddf1SHong Zhang   PetscFunctionReturn(0);
16284222ddf1SHong Zhang }
16294222ddf1SHong Zhang 
16309371c9d4SSatish Balay PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C) {
16314222ddf1SHong Zhang   Mat_Product *product = C->product;
16324222ddf1SHong Zhang 
16334222ddf1SHong Zhang   PetscFunctionBegin;
16344222ddf1SHong Zhang   switch (product->type) {
16359371c9d4SSatish Balay   case MATPRODUCT_AB: PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C)); break;
16369371c9d4SSatish Balay   case MATPRODUCT_AtB: PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C)); break;
16379371c9d4SSatish Balay   case MATPRODUCT_ABt: PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C)); break;
16389371c9d4SSatish Balay   default: break;
16394222ddf1SHong Zhang   }
16404222ddf1SHong Zhang   PetscFunctionReturn(0);
16414222ddf1SHong Zhang }
16424222ddf1SHong Zhang /* ------------------------------------------------------- */
16439371c9d4SSatish Balay static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C) {
16444222ddf1SHong Zhang   Mat_Product *product = C->product;
16454222ddf1SHong Zhang   Mat          A       = product->A;
16464222ddf1SHong Zhang   PetscBool    baij;
16474222ddf1SHong Zhang 
16484222ddf1SHong Zhang   PetscFunctionBegin;
16499566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQBAIJ, &baij));
16504222ddf1SHong Zhang   if (!baij) { /* A is seqsbaij */
16514222ddf1SHong Zhang     PetscBool sbaij;
16529566063dSJacob Faibussowitsch     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQSBAIJ, &sbaij));
165328b400f6SJacob Faibussowitsch     PetscCheck(sbaij, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "Mat must be either seqbaij or seqsbaij format");
16544222ddf1SHong Zhang 
16554222ddf1SHong Zhang     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense;
16564222ddf1SHong Zhang   } else { /* A is seqbaij */
16574222ddf1SHong Zhang     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense;
16584222ddf1SHong Zhang   }
16594222ddf1SHong Zhang 
16604222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
16614222ddf1SHong Zhang   PetscFunctionReturn(0);
16624222ddf1SHong Zhang }
16634222ddf1SHong Zhang 
16649371c9d4SSatish Balay PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C) {
16654222ddf1SHong Zhang   Mat_Product *product = C->product;
16664222ddf1SHong Zhang 
16674222ddf1SHong Zhang   PetscFunctionBegin;
16686718818eSStefano Zampini   MatCheckProduct(C, 1);
166928b400f6SJacob Faibussowitsch   PetscCheck(product->A, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing A");
1670b94d7dedSBarry Smith   if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric == PETSC_BOOL3_TRUE)) PetscCall(MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C));
16714222ddf1SHong Zhang   PetscFunctionReturn(0);
16724222ddf1SHong Zhang }
16736718818eSStefano Zampini 
16744222ddf1SHong Zhang /* ------------------------------------------------------- */
16759371c9d4SSatish Balay static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C) {
16764222ddf1SHong Zhang   PetscFunctionBegin;
16774222ddf1SHong Zhang   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ;
16784222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
16794222ddf1SHong Zhang   PetscFunctionReturn(0);
16804222ddf1SHong Zhang }
16814222ddf1SHong Zhang 
16829371c9d4SSatish Balay PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C) {
16834222ddf1SHong Zhang   Mat_Product *product = C->product;
16844222ddf1SHong Zhang 
16854222ddf1SHong Zhang   PetscFunctionBegin;
1686*48a46eb9SPierre Jolivet   if (product->type == MATPRODUCT_AB) PetscCall(MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C));
16874222ddf1SHong Zhang   PetscFunctionReturn(0);
16884222ddf1SHong Zhang }
16894222ddf1SHong Zhang /* ------------------------------------------------------- */
16904222ddf1SHong Zhang 
16919371c9d4SSatish Balay PetscErrorCode MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring, Mat B, Mat Btdense) {
16922f5f1f90SHong Zhang   Mat_SeqAIJ   *b       = (Mat_SeqAIJ *)B->data;
16932f5f1f90SHong Zhang   Mat_SeqDense *btdense = (Mat_SeqDense *)Btdense->data;
16942f5f1f90SHong Zhang   PetscInt     *bi = b->i, *bj = b->j;
16952f5f1f90SHong Zhang   PetscInt      m = Btdense->rmap->n, n = Btdense->cmap->n, j, k, l, col, anz, *btcol, brow, ncolumns;
16962f5f1f90SHong Zhang   MatScalar    *btval, *btval_den, *ba = b->a;
16972f5f1f90SHong Zhang   PetscInt     *columns = coloring->columns, *colorforcol = coloring->colorforcol, ncolors = coloring->ncolors;
1698c8db22f6SHong Zhang 
1699c8db22f6SHong Zhang   PetscFunctionBegin;
17002f5f1f90SHong Zhang   btval_den = btdense->v;
17019566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(btval_den, m * n));
17022f5f1f90SHong Zhang   for (k = 0; k < ncolors; k++) {
17032f5f1f90SHong Zhang     ncolumns = coloring->ncolumns[k];
17042f5f1f90SHong Zhang     for (l = 0; l < ncolumns; l++) { /* insert a row of B to a column of Btdense */
1705d2853540SHong Zhang       col   = *(columns + colorforcol[k] + l);
17062f5f1f90SHong Zhang       btcol = bj + bi[col];
17072f5f1f90SHong Zhang       btval = ba + bi[col];
17082f5f1f90SHong Zhang       anz   = bi[col + 1] - bi[col];
1709d2853540SHong Zhang       for (j = 0; j < anz; j++) {
17102f5f1f90SHong Zhang         brow            = btcol[j];
17112f5f1f90SHong Zhang         btval_den[brow] = btval[j];
1712c8db22f6SHong Zhang       }
1713c8db22f6SHong Zhang     }
17142f5f1f90SHong Zhang     btval_den += m;
1715c8db22f6SHong Zhang   }
1716c8db22f6SHong Zhang   PetscFunctionReturn(0);
1717c8db22f6SHong Zhang }
1718c8db22f6SHong Zhang 
17199371c9d4SSatish Balay PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring, Mat Cden, Mat Csp) {
1720b2d2b04fSHong Zhang   Mat_SeqAIJ        *csp = (Mat_SeqAIJ *)Csp->data;
17211683a169SBarry Smith   const PetscScalar *ca_den, *ca_den_ptr;
17221683a169SBarry Smith   PetscScalar       *ca = csp->a;
1723f99a636bSHong Zhang   PetscInt           k, l, m = Cden->rmap->n, ncolors = matcoloring->ncolors;
1724e88777f2SHong Zhang   PetscInt           brows = matcoloring->brows, *den2sp = matcoloring->den2sp;
1725077f23c2SHong Zhang   PetscInt           nrows, *row, *idx;
1726077f23c2SHong Zhang   PetscInt          *rows = matcoloring->rows, *colorforrow = matcoloring->colorforrow;
17278972f759SHong Zhang 
17288972f759SHong Zhang   PetscFunctionBegin;
17299566063dSJacob Faibussowitsch   PetscCall(MatDenseGetArrayRead(Cden, &ca_den));
1730f99a636bSHong Zhang 
1731077f23c2SHong Zhang   if (brows > 0) {
1732077f23c2SHong Zhang     PetscInt *lstart, row_end, row_start;
1733980ae229SHong Zhang     lstart = matcoloring->lstart;
17349566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(lstart, ncolors));
1735eeb4fd02SHong Zhang 
1736077f23c2SHong Zhang     row_end = brows;
1737eeb4fd02SHong Zhang     if (row_end > m) row_end = m;
1738077f23c2SHong Zhang     for (row_start = 0; row_start < m; row_start += brows) { /* loop over row blocks of Csp */
1739077f23c2SHong Zhang       ca_den_ptr = ca_den;
1740980ae229SHong Zhang       for (k = 0; k < ncolors; k++) { /* loop over colors (columns of Cden) */
1741eeb4fd02SHong Zhang         nrows = matcoloring->nrows[k];
1742eeb4fd02SHong Zhang         row   = rows + colorforrow[k];
1743eeb4fd02SHong Zhang         idx   = den2sp + colorforrow[k];
1744eeb4fd02SHong Zhang         for (l = lstart[k]; l < nrows; l++) {
1745eeb4fd02SHong Zhang           if (row[l] >= row_end) {
1746eeb4fd02SHong Zhang             lstart[k] = l;
1747eeb4fd02SHong Zhang             break;
1748eeb4fd02SHong Zhang           } else {
1749077f23c2SHong Zhang             ca[idx[l]] = ca_den_ptr[row[l]];
1750eeb4fd02SHong Zhang           }
1751eeb4fd02SHong Zhang         }
1752077f23c2SHong Zhang         ca_den_ptr += m;
1753eeb4fd02SHong Zhang       }
1754077f23c2SHong Zhang       row_end += brows;
1755eeb4fd02SHong Zhang       if (row_end > m) row_end = m;
1756eeb4fd02SHong Zhang     }
1757077f23c2SHong Zhang   } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */
1758077f23c2SHong Zhang     ca_den_ptr = ca_den;
1759077f23c2SHong Zhang     for (k = 0; k < ncolors; k++) {
1760077f23c2SHong Zhang       nrows = matcoloring->nrows[k];
1761077f23c2SHong Zhang       row   = rows + colorforrow[k];
1762077f23c2SHong Zhang       idx   = den2sp + colorforrow[k];
17639371c9d4SSatish Balay       for (l = 0; l < nrows; l++) { ca[idx[l]] = ca_den_ptr[row[l]]; }
1764077f23c2SHong Zhang       ca_den_ptr += m;
1765077f23c2SHong Zhang     }
1766f99a636bSHong Zhang   }
1767f99a636bSHong Zhang 
17689566063dSJacob Faibussowitsch   PetscCall(MatDenseRestoreArrayRead(Cden, &ca_den));
1769f99a636bSHong Zhang #if defined(PETSC_USE_INFO)
1770077f23c2SHong Zhang   if (matcoloring->brows > 0) {
17719566063dSJacob Faibussowitsch     PetscCall(PetscInfo(Csp, "Loop over %" PetscInt_FMT " row blocks for den2sp\n", brows));
1772e88777f2SHong Zhang   } else {
17739566063dSJacob Faibussowitsch     PetscCall(PetscInfo(Csp, "Loop over colors/columns of Cden, inefficient for large sparse matrix product \n"));
1774e88777f2SHong Zhang   }
1775f99a636bSHong Zhang #endif
17768972f759SHong Zhang   PetscFunctionReturn(0);
17778972f759SHong Zhang }
17788972f759SHong Zhang 
17799371c9d4SSatish Balay PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat, ISColoring iscoloring, MatTransposeColoring c) {
1780e88777f2SHong Zhang   PetscInt        i, n, nrows, Nbs, j, k, m, ncols, col, cm;
17811a83f524SJed Brown   const PetscInt *is, *ci, *cj, *row_idx;
1782b28d80bdSHong Zhang   PetscInt        nis = iscoloring->n, *rowhit, bs = 1;
1783b1683b59SHong Zhang   IS             *isa;
1784b28d80bdSHong Zhang   Mat_SeqAIJ     *csp = (Mat_SeqAIJ *)mat->data;
1785e88777f2SHong Zhang   PetscInt       *colorforrow, *rows, *rows_i, *idxhit, *spidx, *den2sp, *den2sp_i;
1786e88777f2SHong Zhang   PetscInt       *colorforcol, *columns, *columns_i, brows;
1787e88777f2SHong Zhang   PetscBool       flg;
1788b1683b59SHong Zhang 
1789b1683b59SHong Zhang   PetscFunctionBegin;
17909566063dSJacob Faibussowitsch   PetscCall(ISColoringGetIS(iscoloring, PETSC_USE_POINTER, PETSC_IGNORE, &isa));
1791e99be685SHong Zhang 
17927ecccc15SHong Zhang   /* bs >1 is not being tested yet! */
1793e88777f2SHong Zhang   Nbs       = mat->cmap->N / bs;
1794b1683b59SHong Zhang   c->M      = mat->rmap->N / bs; /* set total rows, columns and local rows */
1795e88777f2SHong Zhang   c->N      = Nbs;
1796e88777f2SHong Zhang   c->m      = c->M;
1797b1683b59SHong Zhang   c->rstart = 0;
1798e88777f2SHong Zhang   c->brows  = 100;
1799b1683b59SHong Zhang 
1800b1683b59SHong Zhang   c->ncolors = nis;
18019566063dSJacob Faibussowitsch   PetscCall(PetscMalloc3(nis, &c->ncolumns, nis, &c->nrows, nis + 1, &colorforrow));
18029566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(csp->nz + 1, &rows));
18039566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(csp->nz + 1, &den2sp));
1804e88777f2SHong Zhang 
1805e88777f2SHong Zhang   brows = c->brows;
18069566063dSJacob Faibussowitsch   PetscCall(PetscOptionsGetInt(NULL, NULL, "-matden2sp_brows", &brows, &flg));
1807e88777f2SHong Zhang   if (flg) c->brows = brows;
1808*48a46eb9SPierre Jolivet   if (brows > 0) PetscCall(PetscMalloc1(nis + 1, &c->lstart));
18092205254eSKarl Rupp 
1810d2853540SHong Zhang   colorforrow[0] = 0;
1811d2853540SHong Zhang   rows_i         = rows;
1812f99a636bSHong Zhang   den2sp_i       = den2sp;
1813b1683b59SHong Zhang 
18149566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nis + 1, &colorforcol));
18159566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(Nbs + 1, &columns));
18162205254eSKarl Rupp 
1817d2853540SHong Zhang   colorforcol[0] = 0;
1818d2853540SHong Zhang   columns_i      = columns;
1819d2853540SHong Zhang 
1820077f23c2SHong Zhang   /* get column-wise storage of mat */
18219566063dSJacob Faibussowitsch   PetscCall(MatGetColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL));
1822b1683b59SHong Zhang 
1823b28d80bdSHong Zhang   cm = c->m;
18249566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(cm + 1, &rowhit));
18259566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(cm + 1, &idxhit));
1826f99a636bSHong Zhang   for (i = 0; i < nis; i++) { /* loop over color */
18279566063dSJacob Faibussowitsch     PetscCall(ISGetLocalSize(isa[i], &n));
18289566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(isa[i], &is));
18292205254eSKarl Rupp 
1830b1683b59SHong Zhang     c->ncolumns[i] = n;
18311baa6e33SBarry Smith     if (n) PetscCall(PetscArraycpy(columns_i, is, n));
1832d2853540SHong Zhang     colorforcol[i + 1] = colorforcol[i] + n;
1833d2853540SHong Zhang     columns_i += n;
1834b1683b59SHong Zhang 
1835b1683b59SHong Zhang     /* fast, crude version requires O(N*N) work */
18369566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(rowhit, cm));
1837e99be685SHong Zhang 
1838b7caf3d6SHong Zhang     for (j = 0; j < n; j++) { /* loop over columns*/
1839b1683b59SHong Zhang       col     = is[j];
1840d2853540SHong Zhang       row_idx = cj + ci[col];
1841b1683b59SHong Zhang       m       = ci[col + 1] - ci[col];
1842b7caf3d6SHong Zhang       for (k = 0; k < m; k++) { /* loop over columns marking them in rowhit */
1843e99be685SHong Zhang         idxhit[*row_idx]   = spidx[ci[col] + k];
1844d2853540SHong Zhang         rowhit[*row_idx++] = col + 1;
1845b1683b59SHong Zhang       }
1846b1683b59SHong Zhang     }
1847b1683b59SHong Zhang     /* count the number of hits */
1848b1683b59SHong Zhang     nrows = 0;
1849e8354b3eSHong Zhang     for (j = 0; j < cm; j++) {
1850b1683b59SHong Zhang       if (rowhit[j]) nrows++;
1851b1683b59SHong Zhang     }
1852b1683b59SHong Zhang     c->nrows[i]        = nrows;
1853d2853540SHong Zhang     colorforrow[i + 1] = colorforrow[i] + nrows;
1854d2853540SHong Zhang 
1855b1683b59SHong Zhang     nrows = 0;
1856b7caf3d6SHong Zhang     for (j = 0; j < cm; j++) { /* loop over rows */
1857b1683b59SHong Zhang       if (rowhit[j]) {
1858d2853540SHong Zhang         rows_i[nrows]   = j;
185912b89a43SHong Zhang         den2sp_i[nrows] = idxhit[j];
1860b1683b59SHong Zhang         nrows++;
1861b1683b59SHong Zhang       }
1862b1683b59SHong Zhang     }
1863e88777f2SHong Zhang     den2sp_i += nrows;
1864077f23c2SHong Zhang 
18659566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(isa[i], &is));
1866f99a636bSHong Zhang     rows_i += nrows;
1867b1683b59SHong Zhang   }
18689566063dSJacob Faibussowitsch   PetscCall(MatRestoreColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL));
18699566063dSJacob Faibussowitsch   PetscCall(PetscFree(rowhit));
18709566063dSJacob Faibussowitsch   PetscCall(ISColoringRestoreIS(iscoloring, PETSC_USE_POINTER, &isa));
18712c71b3e2SJacob Faibussowitsch   PetscCheck(csp->nz == colorforrow[nis], PETSC_COMM_SELF, PETSC_ERR_PLIB, "csp->nz %" PetscInt_FMT " != colorforrow[nis] %" PetscInt_FMT, csp->nz, colorforrow[nis]);
1872b28d80bdSHong Zhang 
1873d2853540SHong Zhang   c->colorforrow = colorforrow;
1874d2853540SHong Zhang   c->rows        = rows;
1875f99a636bSHong Zhang   c->den2sp      = den2sp;
1876d2853540SHong Zhang   c->colorforcol = colorforcol;
1877d2853540SHong Zhang   c->columns     = columns;
18782205254eSKarl Rupp 
18799566063dSJacob Faibussowitsch   PetscCall(PetscFree(idxhit));
1880b1683b59SHong Zhang   PetscFunctionReturn(0);
1881b1683b59SHong Zhang }
1882d013fc79Sandi selinger 
18834222ddf1SHong Zhang /* --------------------------------------------------------------- */
18849371c9d4SSatish Balay static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C) {
18854222ddf1SHong Zhang   Mat_Product *product = C->product;
18864222ddf1SHong Zhang   Mat          A = product->A, B = product->B;
18874222ddf1SHong Zhang 
1888df97dc6dSFande Kong   PetscFunctionBegin;
18894222ddf1SHong Zhang   if (C->ops->mattransposemultnumeric) {
18904222ddf1SHong Zhang     /* Alg: "outerproduct" */
18919566063dSJacob Faibussowitsch     PetscCall((*C->ops->mattransposemultnumeric)(A, B, C));
18924222ddf1SHong Zhang   } else {
18934222ddf1SHong Zhang     /* Alg: "matmatmult" -- C = At*B */
18946718818eSStefano Zampini     Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)product->data;
18954222ddf1SHong Zhang 
189628b400f6SJacob Faibussowitsch     PetscCheck(atb, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
18971cdffd5eSHong Zhang     if (atb->At) {
18981cdffd5eSHong Zhang       /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ();
18991cdffd5eSHong Zhang          user may have called MatProductReplaceMats() to get this A=product->A */
19001cdffd5eSHong Zhang       PetscCall(MatTransposeSetPrecursor(A, atb->At));
19017fb60732SBarry Smith       PetscCall(MatTranspose(A, MAT_REUSE_MATRIX, &atb->At));
19024222ddf1SHong Zhang     }
19037fb60732SBarry Smith     PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ(atb->At ? atb->At : A, B, C));
19044222ddf1SHong Zhang   }
1905df97dc6dSFande Kong   PetscFunctionReturn(0);
1906df97dc6dSFande Kong }
1907df97dc6dSFande Kong 
19089371c9d4SSatish Balay static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C) {
19094222ddf1SHong Zhang   Mat_Product *product = C->product;
19104222ddf1SHong Zhang   Mat          A = product->A, B = product->B;
19114222ddf1SHong Zhang   PetscReal    fill = product->fill;
1912d013fc79Sandi selinger 
1913d013fc79Sandi selinger   PetscFunctionBegin;
19149566063dSJacob Faibussowitsch   PetscCall(MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A, B, fill, C));
19152869b61bSandi selinger 
19164222ddf1SHong Zhang   C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ;
19174222ddf1SHong Zhang   PetscFunctionReturn(0);
19182869b61bSandi selinger }
1919d013fc79Sandi selinger 
19204222ddf1SHong Zhang /* --------------------------------------------------------------- */
19219371c9d4SSatish Balay static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C) {
19224222ddf1SHong Zhang   Mat_Product *product = C->product;
19234222ddf1SHong Zhang   PetscInt     alg     = 0; /* default algorithm */
19244222ddf1SHong Zhang   PetscBool    flg     = PETSC_FALSE;
19254222ddf1SHong Zhang #if !defined(PETSC_HAVE_HYPRE)
19264222ddf1SHong Zhang   const char *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"};
19274222ddf1SHong Zhang   PetscInt    nalg        = 7;
19284222ddf1SHong Zhang #else
19294222ddf1SHong Zhang   const char *algTypes[8] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge", "hypre"};
19304222ddf1SHong Zhang   PetscInt    nalg        = 8;
19314222ddf1SHong Zhang #endif
19324222ddf1SHong Zhang 
19334222ddf1SHong Zhang   PetscFunctionBegin;
19344222ddf1SHong Zhang   /* Set default algorithm */
19359566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(C->product->alg, "default", &flg));
1936*48a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
1937d013fc79Sandi selinger 
19384222ddf1SHong Zhang   /* Get runtime option */
19394222ddf1SHong Zhang   if (product->api_user) {
1940d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMult", "Mat");
19419566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-matmatmult_via", "Algorithmic approach", "MatMatMult", algTypes, nalg, algTypes[0], &alg, &flg));
1942d0609cedSBarry Smith     PetscOptionsEnd();
19434222ddf1SHong Zhang   } else {
1944d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AB", "Mat");
19459566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AB", algTypes, nalg, algTypes[0], &alg, &flg));
1946d0609cedSBarry Smith     PetscOptionsEnd();
1947d013fc79Sandi selinger   }
1948*48a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
1949d013fc79Sandi selinger 
19504222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
19514222ddf1SHong Zhang   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ;
19524222ddf1SHong Zhang   PetscFunctionReturn(0);
19534222ddf1SHong Zhang }
1954d013fc79Sandi selinger 
19559371c9d4SSatish Balay static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C) {
19564222ddf1SHong Zhang   Mat_Product *product     = C->product;
19574222ddf1SHong Zhang   PetscInt     alg         = 0; /* default algorithm */
19584222ddf1SHong Zhang   PetscBool    flg         = PETSC_FALSE;
1959089a957eSStefano Zampini   const char  *algTypes[3] = {"default", "at*b", "outerproduct"};
1960089a957eSStefano Zampini   PetscInt     nalg        = 3;
1961d013fc79Sandi selinger 
19624222ddf1SHong Zhang   PetscFunctionBegin;
19634222ddf1SHong Zhang   /* Get runtime option */
19644222ddf1SHong Zhang   if (product->api_user) {
1965d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatTransposeMatMult", "Mat");
19669566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mattransposematmult_via", "Algorithmic approach", "MatTransposeMatMult", algTypes, nalg, algTypes[alg], &alg, &flg));
1967d0609cedSBarry Smith     PetscOptionsEnd();
19684222ddf1SHong Zhang   } else {
1969d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AtB", "Mat");
19709566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AtB", algTypes, nalg, algTypes[alg], &alg, &flg));
1971d0609cedSBarry Smith     PetscOptionsEnd();
19724222ddf1SHong Zhang   }
1973*48a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
1974d013fc79Sandi selinger 
19754222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ;
19764222ddf1SHong Zhang   PetscFunctionReturn(0);
19774222ddf1SHong Zhang }
19784222ddf1SHong Zhang 
19799371c9d4SSatish Balay static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C) {
19804222ddf1SHong Zhang   Mat_Product *product     = C->product;
19814222ddf1SHong Zhang   PetscInt     alg         = 0; /* default algorithm */
19824222ddf1SHong Zhang   PetscBool    flg         = PETSC_FALSE;
19834222ddf1SHong Zhang   const char  *algTypes[2] = {"default", "color"};
19844222ddf1SHong Zhang   PetscInt     nalg        = 2;
19854222ddf1SHong Zhang 
19864222ddf1SHong Zhang   PetscFunctionBegin;
19874222ddf1SHong Zhang   /* Set default algorithm */
19889566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(C->product->alg, "default", &flg));
19894222ddf1SHong Zhang   if (!flg) {
19904222ddf1SHong Zhang     alg = 1;
19919566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
19924222ddf1SHong Zhang   }
19934222ddf1SHong Zhang 
19944222ddf1SHong Zhang   /* Get runtime option */
19954222ddf1SHong Zhang   if (product->api_user) {
1996d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatTransposeMult", "Mat");
19979566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-matmattransmult_via", "Algorithmic approach", "MatMatTransposeMult", algTypes, nalg, algTypes[alg], &alg, &flg));
1998d0609cedSBarry Smith     PetscOptionsEnd();
19994222ddf1SHong Zhang   } else {
2000d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABt", "Mat");
20019566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABt", algTypes, nalg, algTypes[alg], &alg, &flg));
2002d0609cedSBarry Smith     PetscOptionsEnd();
20034222ddf1SHong Zhang   }
2004*48a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
20054222ddf1SHong Zhang 
20064222ddf1SHong Zhang   C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ;
20074222ddf1SHong Zhang   C->ops->productsymbolic          = MatProductSymbolic_ABt;
20084222ddf1SHong Zhang   PetscFunctionReturn(0);
20094222ddf1SHong Zhang }
20104222ddf1SHong Zhang 
20119371c9d4SSatish Balay static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C) {
20124222ddf1SHong Zhang   Mat_Product *product = C->product;
20134222ddf1SHong Zhang   PetscBool    flg     = PETSC_FALSE;
20144222ddf1SHong Zhang   PetscInt     alg     = 0; /* default algorithm -- alg=1 should be default!!! */
20154222ddf1SHong Zhang #if !defined(PETSC_HAVE_HYPRE)
20164222ddf1SHong Zhang   const char *algTypes[2] = {"scalable", "rap"};
20174222ddf1SHong Zhang   PetscInt    nalg        = 2;
20184222ddf1SHong Zhang #else
20194222ddf1SHong Zhang   const char *algTypes[3] = {"scalable", "rap", "hypre"};
20204222ddf1SHong Zhang   PetscInt    nalg        = 3;
20214222ddf1SHong Zhang #endif
20224222ddf1SHong Zhang 
20234222ddf1SHong Zhang   PetscFunctionBegin;
20244222ddf1SHong Zhang   /* Set default algorithm */
20259566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "default", &flg));
2026*48a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
20274222ddf1SHong Zhang 
20284222ddf1SHong Zhang   /* Get runtime option */
20294222ddf1SHong Zhang   if (product->api_user) {
2030d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatPtAP", "Mat");
20319566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-matptap_via", "Algorithmic approach", "MatPtAP", algTypes, nalg, algTypes[0], &alg, &flg));
2032d0609cedSBarry Smith     PetscOptionsEnd();
20334222ddf1SHong Zhang   } else {
2034d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_PtAP", "Mat");
20359566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_PtAP", algTypes, nalg, algTypes[0], &alg, &flg));
2036d0609cedSBarry Smith     PetscOptionsEnd();
20374222ddf1SHong Zhang   }
2038*48a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
20394222ddf1SHong Zhang 
20404222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ;
20414222ddf1SHong Zhang   PetscFunctionReturn(0);
20424222ddf1SHong Zhang }
20434222ddf1SHong Zhang 
20449371c9d4SSatish Balay static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C) {
20454222ddf1SHong Zhang   Mat_Product *product     = C->product;
20464222ddf1SHong Zhang   PetscBool    flg         = PETSC_FALSE;
20474222ddf1SHong Zhang   PetscInt     alg         = 0; /* default algorithm */
20484222ddf1SHong Zhang   const char  *algTypes[3] = {"r*a*rt", "r*art", "coloring_rart"};
20494222ddf1SHong Zhang   PetscInt     nalg        = 3;
20504222ddf1SHong Zhang 
20514222ddf1SHong Zhang   PetscFunctionBegin;
20524222ddf1SHong Zhang   /* Set default algorithm */
20539566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "default", &flg));
2054*48a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
20554222ddf1SHong Zhang 
20564222ddf1SHong Zhang   /* Get runtime option */
20574222ddf1SHong Zhang   if (product->api_user) {
2058d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatRARt", "Mat");
20599566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-matrart_via", "Algorithmic approach", "MatRARt", algTypes, nalg, algTypes[0], &alg, &flg));
2060d0609cedSBarry Smith     PetscOptionsEnd();
20614222ddf1SHong Zhang   } else {
2062d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_RARt", "Mat");
20639566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_RARt", algTypes, nalg, algTypes[0], &alg, &flg));
2064d0609cedSBarry Smith     PetscOptionsEnd();
20654222ddf1SHong Zhang   }
2066*48a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
20674222ddf1SHong Zhang 
20684222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ;
20694222ddf1SHong Zhang   PetscFunctionReturn(0);
20704222ddf1SHong Zhang }
20714222ddf1SHong Zhang 
20724222ddf1SHong Zhang /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */
20739371c9d4SSatish Balay static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C) {
20744222ddf1SHong Zhang   Mat_Product *product     = C->product;
20754222ddf1SHong Zhang   PetscInt     alg         = 0; /* default algorithm */
20764222ddf1SHong Zhang   PetscBool    flg         = PETSC_FALSE;
20774222ddf1SHong Zhang   const char  *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"};
20784222ddf1SHong Zhang   PetscInt     nalg        = 7;
20794222ddf1SHong Zhang 
20804222ddf1SHong Zhang   PetscFunctionBegin;
20814222ddf1SHong Zhang   /* Set default algorithm */
20829566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "default", &flg));
2083*48a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
20844222ddf1SHong Zhang 
20854222ddf1SHong Zhang   /* Get runtime option */
20864222ddf1SHong Zhang   if (product->api_user) {
2087d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMatMult", "Mat");
20889566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-matmatmatmult_via", "Algorithmic approach", "MatMatMatMult", algTypes, nalg, algTypes[alg], &alg, &flg));
2089d0609cedSBarry Smith     PetscOptionsEnd();
20904222ddf1SHong Zhang   } else {
2091d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABC", "Mat");
20929566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABC", algTypes, nalg, algTypes[alg], &alg, &flg));
2093d0609cedSBarry Smith     PetscOptionsEnd();
20944222ddf1SHong Zhang   }
2095*48a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
20964222ddf1SHong Zhang 
20974222ddf1SHong Zhang   C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ;
20984222ddf1SHong Zhang   C->ops->productsymbolic    = MatProductSymbolic_ABC;
20994222ddf1SHong Zhang   PetscFunctionReturn(0);
21004222ddf1SHong Zhang }
21014222ddf1SHong Zhang 
21029371c9d4SSatish Balay PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C) {
21034222ddf1SHong Zhang   Mat_Product *product = C->product;
21044222ddf1SHong Zhang 
21054222ddf1SHong Zhang   PetscFunctionBegin;
21064222ddf1SHong Zhang   switch (product->type) {
21079371c9d4SSatish Balay   case MATPRODUCT_AB: PetscCall(MatProductSetFromOptions_SeqAIJ_AB(C)); break;
21089371c9d4SSatish Balay   case MATPRODUCT_AtB: PetscCall(MatProductSetFromOptions_SeqAIJ_AtB(C)); break;
21099371c9d4SSatish Balay   case MATPRODUCT_ABt: PetscCall(MatProductSetFromOptions_SeqAIJ_ABt(C)); break;
21109371c9d4SSatish Balay   case MATPRODUCT_PtAP: PetscCall(MatProductSetFromOptions_SeqAIJ_PtAP(C)); break;
21119371c9d4SSatish Balay   case MATPRODUCT_RARt: PetscCall(MatProductSetFromOptions_SeqAIJ_RARt(C)); break;
21129371c9d4SSatish Balay   case MATPRODUCT_ABC: PetscCall(MatProductSetFromOptions_SeqAIJ_ABC(C)); break;
21139371c9d4SSatish Balay   default: break;
21144222ddf1SHong Zhang   }
2115d013fc79Sandi selinger   PetscFunctionReturn(0);
2116d013fc79Sandi selinger }
2117