xref: /petsc/src/mat/impls/aij/seq/matmatmult.c (revision f4f49eeac7efa77fffa46b7ff95a3ed169f659ed)
1d50806bdSBarry Smith /*
22499ec78SHong Zhang   Defines matrix-matrix product routines for pairs of SeqAIJ matrices
3d50806bdSBarry Smith           C = A * B
4d50806bdSBarry Smith */
5d50806bdSBarry Smith 
6c6db04a5SJed Brown #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
7c6db04a5SJed Brown #include <../src/mat/utils/freespace.h>
8c6db04a5SJed Brown #include <petscbt.h>
9af0996ceSBarry Smith #include <petsc/private/isimpl.h>
1007475bc1SBarry Smith #include <../src/mat/impls/dense/seq/dense.h>
117bab7c10SHong Zhang 
12d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C)
13d71ae5a4SJacob Faibussowitsch {
14df97dc6dSFande Kong   PetscFunctionBegin;
15dbbe0bcdSBarry Smith   if (C->ops->matmultnumeric) PetscCall((*C->ops->matmultnumeric)(A, B, C));
16dbbe0bcdSBarry Smith   else PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A, B, C));
173ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
18df97dc6dSFande Kong }
19df97dc6dSFande Kong 
204222ddf1SHong Zhang /* Modified from MatCreateSeqAIJWithArrays() */
21d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt i[], PetscInt j[], PetscScalar a[], MatType mtype, Mat mat)
22d71ae5a4SJacob Faibussowitsch {
234222ddf1SHong Zhang   PetscInt    ii;
244222ddf1SHong Zhang   Mat_SeqAIJ *aij;
25cbc6b225SStefano Zampini   PetscBool   isseqaij, osingle, ofree_a, ofree_ij;
265c66b693SKris Buschelman 
275c66b693SKris Buschelman   PetscFunctionBegin;
28aed4548fSBarry Smith   PetscCheck(m <= 0 || !i[0], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0");
299566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(mat, m, n, m, n));
304222ddf1SHong Zhang 
31e4e71118SStefano Zampini   if (!mtype) {
329566063dSJacob Faibussowitsch     PetscCall(PetscObjectBaseTypeCompare((PetscObject)mat, MATSEQAIJ, &isseqaij));
339566063dSJacob Faibussowitsch     if (!isseqaij) PetscCall(MatSetType(mat, MATSEQAIJ));
34e4e71118SStefano Zampini   } else {
359566063dSJacob Faibussowitsch     PetscCall(MatSetType(mat, mtype));
36e4e71118SStefano Zampini   }
37cbc6b225SStefano Zampini 
384222ddf1SHong Zhang   aij      = (Mat_SeqAIJ *)(mat)->data;
39cbc6b225SStefano Zampini   osingle  = aij->singlemalloc;
40cbc6b225SStefano Zampini   ofree_a  = aij->free_a;
41cbc6b225SStefano Zampini   ofree_ij = aij->free_ij;
42cbc6b225SStefano Zampini   /* changes the free flags */
439566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(mat, MAT_SKIP_ALLOCATION, NULL));
44cbc6b225SStefano Zampini 
459566063dSJacob Faibussowitsch   PetscCall(PetscFree(aij->ilen));
469566063dSJacob Faibussowitsch   PetscCall(PetscFree(aij->imax));
479566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &aij->imax));
489566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &aij->ilen));
49cbc6b225SStefano Zampini   for (ii = 0, aij->nonzerorowcnt = 0, aij->rmax = 0; ii < m; ii++) {
50cbc6b225SStefano Zampini     const PetscInt rnz = i[ii + 1] - i[ii];
51cbc6b225SStefano Zampini     aij->nonzerorowcnt += !!rnz;
52cbc6b225SStefano Zampini     aij->rmax     = PetscMax(aij->rmax, rnz);
53cbc6b225SStefano Zampini     aij->ilen[ii] = aij->imax[ii] = i[ii + 1] - i[ii];
54cbc6b225SStefano Zampini   }
55cbc6b225SStefano Zampini   aij->maxnz = i[m];
56cbc6b225SStefano Zampini   aij->nz    = i[m];
574222ddf1SHong Zhang 
58cbc6b225SStefano Zampini   if (osingle) {
599566063dSJacob Faibussowitsch     PetscCall(PetscFree3(aij->a, aij->j, aij->i));
60cbc6b225SStefano Zampini   } else {
619566063dSJacob Faibussowitsch     if (ofree_a) PetscCall(PetscFree(aij->a));
629566063dSJacob Faibussowitsch     if (ofree_ij) PetscCall(PetscFree(aij->j));
639566063dSJacob Faibussowitsch     if (ofree_ij) PetscCall(PetscFree(aij->i));
64cbc6b225SStefano Zampini   }
654222ddf1SHong Zhang   aij->i     = i;
664222ddf1SHong Zhang   aij->j     = j;
674222ddf1SHong Zhang   aij->a     = a;
684222ddf1SHong Zhang   aij->nonew = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */
69cbc6b225SStefano Zampini   /* default to not retain ownership */
70cbc6b225SStefano Zampini   aij->singlemalloc = PETSC_FALSE;
714222ddf1SHong Zhang   aij->free_a       = PETSC_FALSE;
724222ddf1SHong Zhang   aij->free_ij      = PETSC_FALSE;
739566063dSJacob Faibussowitsch   PetscCall(MatCheckCompressedRow(mat, aij->nonzerorowcnt, &aij->compressedrow, aij->i, m, 0.6));
746dc08606SJunchao Zhang   // Always build the diag info when i, j are set
756dc08606SJunchao Zhang   PetscCall(MatMarkDiagonal_SeqAIJ(mat));
763ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
775c66b693SKris Buschelman }
781c24bd37SHong Zhang 
79d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C)
80d71ae5a4SJacob Faibussowitsch {
814222ddf1SHong Zhang   Mat_Product        *product = C->product;
824222ddf1SHong Zhang   MatProductAlgorithm alg;
834222ddf1SHong Zhang   PetscBool           flg;
844222ddf1SHong Zhang 
854222ddf1SHong Zhang   PetscFunctionBegin;
864222ddf1SHong Zhang   if (product) {
874222ddf1SHong Zhang     alg = product->alg;
884222ddf1SHong Zhang   } else {
894222ddf1SHong Zhang     alg = "sorted";
904222ddf1SHong Zhang   }
914222ddf1SHong Zhang   /* sorted */
929566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "sorted", &flg));
934222ddf1SHong Zhang   if (flg) {
949566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A, B, fill, C));
953ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
964222ddf1SHong Zhang   }
974222ddf1SHong Zhang 
984222ddf1SHong Zhang   /* scalable */
999566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "scalable", &flg));
1004222ddf1SHong Zhang   if (flg) {
1019566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A, B, fill, C));
1023ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
1034222ddf1SHong Zhang   }
1044222ddf1SHong Zhang 
1054222ddf1SHong Zhang   /* scalable_fast */
1069566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "scalable_fast", &flg));
1074222ddf1SHong Zhang   if (flg) {
1089566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A, B, fill, C));
1093ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
1104222ddf1SHong Zhang   }
1114222ddf1SHong Zhang 
1124222ddf1SHong Zhang   /* heap */
1139566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "heap", &flg));
1144222ddf1SHong Zhang   if (flg) {
1159566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A, B, fill, C));
1163ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
1174222ddf1SHong Zhang   }
1184222ddf1SHong Zhang 
1194222ddf1SHong Zhang   /* btheap */
1209566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "btheap", &flg));
1214222ddf1SHong Zhang   if (flg) {
1229566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A, B, fill, C));
1233ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
1244222ddf1SHong Zhang   }
1254222ddf1SHong Zhang 
1264222ddf1SHong Zhang   /* llcondensed */
1279566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "llcondensed", &flg));
1284222ddf1SHong Zhang   if (flg) {
1299566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A, B, fill, C));
1303ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
1314222ddf1SHong Zhang   }
1324222ddf1SHong Zhang 
1334222ddf1SHong Zhang   /* rowmerge */
1349566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "rowmerge", &flg));
1354222ddf1SHong Zhang   if (flg) {
1369566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A, B, fill, C));
1373ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
1384222ddf1SHong Zhang   }
1394222ddf1SHong Zhang 
1404222ddf1SHong Zhang #if defined(PETSC_HAVE_HYPRE)
1419566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "hypre", &flg));
1424222ddf1SHong Zhang   if (flg) {
1439566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A, B, fill, C));
1443ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
1454222ddf1SHong Zhang   }
1464222ddf1SHong Zhang #endif
1474222ddf1SHong Zhang 
1484222ddf1SHong Zhang   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported");
1494222ddf1SHong Zhang }
1504222ddf1SHong Zhang 
151d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A, Mat B, PetscReal fill, Mat C)
152d71ae5a4SJacob Faibussowitsch {
153b561aa9dSHong Zhang   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
154971236abSHong Zhang   PetscInt          *ai = a->i, *bi = b->i, *ci, *cj;
155c1ba5575SJed Brown   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
156b561aa9dSHong Zhang   PetscReal          afill;
157eca6b86aSHong Zhang   PetscInt           i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax;
158eec179cfSJacob Faibussowitsch   PetscHMapI         ta;
159fb908031SHong Zhang   PetscBT            lnkbt;
1600298fd71SBarry Smith   PetscFreeSpaceList free_space = NULL, current_space = NULL;
161b561aa9dSHong Zhang 
162b561aa9dSHong Zhang   PetscFunctionBegin;
163bd958071SHong Zhang   /* Get ci and cj */
164fb908031SHong Zhang   /* Allocate ci array, arrays for fill computation and */
165fb908031SHong Zhang   /* free space for accumulating nonzero column info */
1669566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 2, &ci));
167fb908031SHong Zhang   ci[0] = 0;
168fb908031SHong Zhang 
169fb908031SHong Zhang   /* create and initialize a linked list */
170eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapICreateWithSize(bn, &ta));
171c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b, bm, ta);
172eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIGetSize(ta, &Crmax));
173eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIDestroy(&ta));
174eca6b86aSHong Zhang 
1759566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedCreate(Crmax, bn, &lnk, &lnkbt));
176fb908031SHong Zhang 
177fb908031SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
1789566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
1792205254eSKarl Rupp 
180fb908031SHong Zhang   current_space = free_space;
181fb908031SHong Zhang 
182fb908031SHong Zhang   /* Determine ci and cj */
183fb908031SHong Zhang   for (i = 0; i < am; i++) {
184fb908031SHong Zhang     anzi = ai[i + 1] - ai[i];
185fb908031SHong Zhang     aj   = a->j + ai[i];
186fb908031SHong Zhang     for (j = 0; j < anzi; j++) {
187fb908031SHong Zhang       brow = aj[j];
188fb908031SHong Zhang       bnzj = bi[brow + 1] - bi[brow];
189fb908031SHong Zhang       bj   = b->j + bi[brow];
190fb908031SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
1919566063dSJacob Faibussowitsch       PetscCall(PetscLLCondensedAddSorted(bnzj, bj, lnk, lnkbt));
192fb908031SHong Zhang     }
1938c78258cSHong Zhang     /* add possible missing diagonal entry */
19448a46eb9SPierre Jolivet     if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted(1, &i, lnk, lnkbt));
195fb908031SHong Zhang     cnzi = lnk[0];
196fb908031SHong Zhang 
197fb908031SHong Zhang     /* If free space is not available, make more free space */
198fb908031SHong Zhang     /* Double the amount of total space in the list */
199fb908031SHong Zhang     if (current_space->local_remaining < cnzi) {
2009566063dSJacob Faibussowitsch       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), &current_space));
201fb908031SHong Zhang       ndouble++;
202fb908031SHong Zhang     }
203fb908031SHong Zhang 
204fb908031SHong Zhang     /* Copy data into free space, then initialize lnk */
2059566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedClean(bn, cnzi, current_space->array, lnk, lnkbt));
2062205254eSKarl Rupp 
207fb908031SHong Zhang     current_space->array += cnzi;
208fb908031SHong Zhang     current_space->local_used += cnzi;
209fb908031SHong Zhang     current_space->local_remaining -= cnzi;
2102205254eSKarl Rupp 
211fb908031SHong Zhang     ci[i + 1] = ci[i] + cnzi;
212fb908031SHong Zhang   }
213fb908031SHong Zhang 
214fb908031SHong Zhang   /* Column indices are in the list of free space */
215fb908031SHong Zhang   /* Allocate space for cj, initialize cj, and */
216fb908031SHong Zhang   /* destroy list of free space and other temporary array(s) */
2179566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ci[am] + 1, &cj));
2189566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
2199566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedDestroy(lnk, lnkbt));
220b561aa9dSHong Zhang 
22126be0446SHong Zhang   /* put together the new symbolic matrix */
2229566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
2239566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
22458c24d83SHong Zhang 
22558c24d83SHong Zhang   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
22658c24d83SHong Zhang   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
227*f4f49eeaSPierre Jolivet   c          = (Mat_SeqAIJ *)C->data;
228aa1aec99SHong Zhang   c->free_a  = PETSC_FALSE;
229e6b907acSBarry Smith   c->free_ij = PETSC_TRUE;
23058c24d83SHong Zhang   c->nonew   = 0;
2314222ddf1SHong Zhang 
2324222ddf1SHong Zhang   /* fast, needs non-scalable O(bn) array 'abdense' */
2334222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
2340b7e3e3dSHong Zhang 
2357212da7cSHong Zhang   /* set MatInfo */
2367212da7cSHong Zhang   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
237f2b054eeSHong Zhang   if (afill < 1.0) afill = 1.0;
2384222ddf1SHong Zhang   C->info.mallocs           = ndouble;
2394222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
2404222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
2417212da7cSHong Zhang 
2427212da7cSHong Zhang #if defined(PETSC_USE_INFO)
2437212da7cSHong Zhang   if (ci[am]) {
2449566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
2459566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
246f2b054eeSHong Zhang   } else {
2479566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
248be0fcf8dSHong Zhang   }
249f2b054eeSHong Zhang #endif
2503ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
25158c24d83SHong Zhang }
252d50806bdSBarry Smith 
253d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, Mat C)
254d71ae5a4SJacob Faibussowitsch {
255d13dce4bSSatish Balay   PetscLogDouble     flops = 0.0;
256d50806bdSBarry Smith   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
257d50806bdSBarry Smith   Mat_SeqAIJ        *b     = (Mat_SeqAIJ *)B->data;
258d50806bdSBarry Smith   Mat_SeqAIJ        *c     = (Mat_SeqAIJ *)C->data;
25938baddfdSBarry Smith   PetscInt          *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j;
260c58ca2e3SHong Zhang   PetscInt           am = A->rmap->n, cm = C->rmap->n;
261519eb980SHong Zhang   PetscInt           i, j, k, anzi, bnzi, cnzi, brow;
2622e5835c6SStefano Zampini   PetscScalar       *ca, valtmp;
263aa1aec99SHong Zhang   PetscScalar       *ab_dense;
2646718818eSStefano Zampini   PetscContainer     cab_dense;
2652e5835c6SStefano Zampini   const PetscScalar *aa, *ba, *baj;
266d50806bdSBarry Smith 
267d50806bdSBarry Smith   PetscFunctionBegin;
2689566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJGetArrayRead(A, &aa));
2699566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJGetArrayRead(B, &ba));
270aa1aec99SHong Zhang   if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
2719566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(ci[cm] + 1, &ca));
272aa1aec99SHong Zhang     c->a      = ca;
273aa1aec99SHong Zhang     c->free_a = PETSC_TRUE;
2746718818eSStefano Zampini   } else ca = c->a;
2756718818eSStefano Zampini 
2766718818eSStefano Zampini   /* TODO this should be done in the symbolic phase */
2776718818eSStefano Zampini   /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers
2786718818eSStefano Zampini      that is hard to eradicate) */
2799566063dSJacob Faibussowitsch   PetscCall(PetscObjectQuery((PetscObject)C, "__PETSc__ab_dense", (PetscObject *)&cab_dense));
2806718818eSStefano Zampini   if (!cab_dense) {
2819566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(B->cmap->N, &ab_dense));
2829566063dSJacob Faibussowitsch     PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &cab_dense));
2839566063dSJacob Faibussowitsch     PetscCall(PetscContainerSetPointer(cab_dense, ab_dense));
2849566063dSJacob Faibussowitsch     PetscCall(PetscContainerSetUserDestroy(cab_dense, PetscContainerUserDestroyDefault));
2859566063dSJacob Faibussowitsch     PetscCall(PetscObjectCompose((PetscObject)C, "__PETSc__ab_dense", (PetscObject)cab_dense));
2869566063dSJacob Faibussowitsch     PetscCall(PetscObjectDereference((PetscObject)cab_dense));
287d90d86d0SMatthew G. Knepley   }
2889566063dSJacob Faibussowitsch   PetscCall(PetscContainerGetPointer(cab_dense, (void **)&ab_dense));
2899566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(ab_dense, B->cmap->N));
290aa1aec99SHong Zhang 
291c124e916SHong Zhang   /* clean old values in C */
2929566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(ca, ci[cm]));
293d50806bdSBarry Smith   /* Traverse A row-wise. */
294d50806bdSBarry Smith   /* Build the ith row in C by summing over nonzero columns in A, */
295d50806bdSBarry Smith   /* the rows of B corresponding to nonzeros of A. */
296d50806bdSBarry Smith   for (i = 0; i < am; i++) {
297d50806bdSBarry Smith     anzi = ai[i + 1] - ai[i];
298d50806bdSBarry Smith     for (j = 0; j < anzi; j++) {
299519eb980SHong Zhang       brow = aj[j];
300d50806bdSBarry Smith       bnzi = bi[brow + 1] - bi[brow];
301d50806bdSBarry Smith       bjj  = bj + bi[brow];
302d50806bdSBarry Smith       baj  = ba + bi[brow];
303519eb980SHong Zhang       /* perform dense axpy */
30436ec6d2dSHong Zhang       valtmp = aa[j];
305ad540459SPierre Jolivet       for (k = 0; k < bnzi; k++) ab_dense[bjj[k]] += valtmp * baj[k];
306519eb980SHong Zhang       flops += 2 * bnzi;
307519eb980SHong Zhang     }
3088e3a54c0SPierre Jolivet     aj = PetscSafePointerPlusOffset(aj, anzi);
3098e3a54c0SPierre Jolivet     aa = PetscSafePointerPlusOffset(aa, anzi);
310c58ca2e3SHong Zhang 
311c58ca2e3SHong Zhang     cnzi = ci[i + 1] - ci[i];
312519eb980SHong Zhang     for (k = 0; k < cnzi; k++) {
313519eb980SHong Zhang       ca[k] += ab_dense[cj[k]];
314519eb980SHong Zhang       ab_dense[cj[k]] = 0.0; /* zero ab_dense */
315519eb980SHong Zhang     }
316519eb980SHong Zhang     flops += cnzi;
3178e3a54c0SPierre Jolivet     cj = PetscSafePointerPlusOffset(cj, cnzi);
3189371c9d4SSatish Balay     ca += cnzi;
319519eb980SHong Zhang   }
3202e5835c6SStefano Zampini #if defined(PETSC_HAVE_DEVICE)
3212e5835c6SStefano Zampini   if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
3222e5835c6SStefano Zampini #endif
3239566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
3249566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
3259566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(flops));
3269566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJRestoreArrayRead(A, &aa));
3279566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJRestoreArrayRead(B, &ba));
3283ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
329c58ca2e3SHong Zhang }
330c58ca2e3SHong Zhang 
331d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, Mat C)
332d71ae5a4SJacob Faibussowitsch {
333c58ca2e3SHong Zhang   PetscLogDouble     flops = 0.0;
334c58ca2e3SHong Zhang   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
335c58ca2e3SHong Zhang   Mat_SeqAIJ        *b     = (Mat_SeqAIJ *)B->data;
336c58ca2e3SHong Zhang   Mat_SeqAIJ        *c     = (Mat_SeqAIJ *)C->data;
337c58ca2e3SHong Zhang   PetscInt          *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j;
338c58ca2e3SHong Zhang   PetscInt           am = A->rmap->N, cm = C->rmap->N;
339c58ca2e3SHong Zhang   PetscInt           i, j, k, anzi, bnzi, cnzi, brow;
3402e5835c6SStefano Zampini   PetscScalar       *ca = c->a, valtmp;
3412e5835c6SStefano Zampini   const PetscScalar *aa, *ba, *baj;
342c58ca2e3SHong Zhang   PetscInt           nextb;
343c58ca2e3SHong Zhang 
344c58ca2e3SHong Zhang   PetscFunctionBegin;
3459566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJGetArrayRead(A, &aa));
3469566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJGetArrayRead(B, &ba));
347cf742fccSHong Zhang   if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
3489566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(ci[cm] + 1, &ca));
349cf742fccSHong Zhang     c->a      = ca;
350cf742fccSHong Zhang     c->free_a = PETSC_TRUE;
351cf742fccSHong Zhang   }
352cf742fccSHong Zhang 
353c58ca2e3SHong Zhang   /* clean old values in C */
3549566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(ca, ci[cm]));
355c58ca2e3SHong Zhang   /* Traverse A row-wise. */
356c58ca2e3SHong Zhang   /* Build the ith row in C by summing over nonzero columns in A, */
357c58ca2e3SHong Zhang   /* the rows of B corresponding to nonzeros of A. */
358519eb980SHong Zhang   for (i = 0; i < am; i++) {
359519eb980SHong Zhang     anzi = ai[i + 1] - ai[i];
360519eb980SHong Zhang     cnzi = ci[i + 1] - ci[i];
361519eb980SHong Zhang     for (j = 0; j < anzi; j++) {
362519eb980SHong Zhang       brow = aj[j];
363519eb980SHong Zhang       bnzi = bi[brow + 1] - bi[brow];
364519eb980SHong Zhang       bjj  = bj + bi[brow];
365519eb980SHong Zhang       baj  = ba + bi[brow];
366519eb980SHong Zhang       /* perform sparse axpy */
36736ec6d2dSHong Zhang       valtmp = aa[j];
368c124e916SHong Zhang       nextb  = 0;
369c124e916SHong Zhang       for (k = 0; nextb < bnzi; k++) {
370c124e916SHong Zhang         if (cj[k] == bjj[nextb]) { /* ccol == bcol */
37136ec6d2dSHong Zhang           ca[k] += valtmp * baj[nextb++];
372c124e916SHong Zhang         }
373d50806bdSBarry Smith       }
374d50806bdSBarry Smith       flops += 2 * bnzi;
375d50806bdSBarry Smith     }
3769371c9d4SSatish Balay     aj += anzi;
3779371c9d4SSatish Balay     aa += anzi;
3789371c9d4SSatish Balay     cj += cnzi;
3799371c9d4SSatish Balay     ca += cnzi;
380d50806bdSBarry Smith   }
3812e5835c6SStefano Zampini #if defined(PETSC_HAVE_DEVICE)
3822e5835c6SStefano Zampini   if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
3832e5835c6SStefano Zampini #endif
3849566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
3859566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
3869566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(flops));
3879566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJRestoreArrayRead(A, &aa));
3889566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJRestoreArrayRead(B, &ba));
3893ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
390d50806bdSBarry Smith }
391bc011b1eSHong Zhang 
392d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A, Mat B, PetscReal fill, Mat C)
393d71ae5a4SJacob Faibussowitsch {
39425296bd5SBarry Smith   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
39525296bd5SBarry Smith   PetscInt          *ai = a->i, *bi = b->i, *ci, *cj;
3963c50cad2SHong Zhang   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
39725296bd5SBarry Smith   MatScalar         *ca;
39825296bd5SBarry Smith   PetscReal          afill;
399eca6b86aSHong Zhang   PetscInt           i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax;
400eec179cfSJacob Faibussowitsch   PetscHMapI         ta;
4010298fd71SBarry Smith   PetscFreeSpaceList free_space = NULL, current_space = NULL;
40225296bd5SBarry Smith 
40325296bd5SBarry Smith   PetscFunctionBegin;
4043c50cad2SHong Zhang   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
4053c50cad2SHong Zhang   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
4069566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 2, &ci));
4073c50cad2SHong Zhang   ci[0] = 0;
4083c50cad2SHong Zhang 
4093c50cad2SHong Zhang   /* create and initialize a linked list */
410eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapICreateWithSize(bn, &ta));
411c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b, bm, ta);
412eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIGetSize(ta, &Crmax));
413eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIDestroy(&ta));
414eca6b86aSHong Zhang 
4159566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedCreate_fast(Crmax, &lnk));
4163c50cad2SHong Zhang 
4173c50cad2SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
4189566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
4193c50cad2SHong Zhang   current_space = free_space;
4203c50cad2SHong Zhang 
4213c50cad2SHong Zhang   /* Determine ci and cj */
4223c50cad2SHong Zhang   for (i = 0; i < am; i++) {
4233c50cad2SHong Zhang     anzi = ai[i + 1] - ai[i];
4243c50cad2SHong Zhang     aj   = a->j + ai[i];
4253c50cad2SHong Zhang     for (j = 0; j < anzi; j++) {
4263c50cad2SHong Zhang       brow = aj[j];
4273c50cad2SHong Zhang       bnzj = bi[brow + 1] - bi[brow];
4283c50cad2SHong Zhang       bj   = b->j + bi[brow];
4293c50cad2SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
4309566063dSJacob Faibussowitsch       PetscCall(PetscLLCondensedAddSorted_fast(bnzj, bj, lnk));
4313c50cad2SHong Zhang     }
4328c78258cSHong Zhang     /* add possible missing diagonal entry */
43348a46eb9SPierre Jolivet     if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_fast(1, &i, lnk));
4343c50cad2SHong Zhang     cnzi = lnk[1];
4353c50cad2SHong Zhang 
4363c50cad2SHong Zhang     /* If free space is not available, make more free space */
4373c50cad2SHong Zhang     /* Double the amount of total space in the list */
4383c50cad2SHong Zhang     if (current_space->local_remaining < cnzi) {
4399566063dSJacob Faibussowitsch       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), &current_space));
4403c50cad2SHong Zhang       ndouble++;
4413c50cad2SHong Zhang     }
4423c50cad2SHong Zhang 
4433c50cad2SHong Zhang     /* Copy data into free space, then initialize lnk */
4449566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedClean_fast(cnzi, current_space->array, lnk));
4452205254eSKarl Rupp 
4463c50cad2SHong Zhang     current_space->array += cnzi;
4473c50cad2SHong Zhang     current_space->local_used += cnzi;
4483c50cad2SHong Zhang     current_space->local_remaining -= cnzi;
4492205254eSKarl Rupp 
4503c50cad2SHong Zhang     ci[i + 1] = ci[i] + cnzi;
4513c50cad2SHong Zhang   }
4523c50cad2SHong Zhang 
4533c50cad2SHong Zhang   /* Column indices are in the list of free space */
4543c50cad2SHong Zhang   /* Allocate space for cj, initialize cj, and */
4553c50cad2SHong Zhang   /* destroy list of free space and other temporary array(s) */
4569566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ci[am] + 1, &cj));
4579566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
4589566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedDestroy_fast(lnk));
45925296bd5SBarry Smith 
46025296bd5SBarry Smith   /* Allocate space for ca */
4619566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(ci[am] + 1, &ca));
46225296bd5SBarry Smith 
46325296bd5SBarry Smith   /* put together the new symbolic matrix */
4649566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C));
4659566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
46625296bd5SBarry Smith 
46725296bd5SBarry Smith   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
46825296bd5SBarry Smith   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
469*f4f49eeaSPierre Jolivet   c          = (Mat_SeqAIJ *)C->data;
47025296bd5SBarry Smith   c->free_a  = PETSC_TRUE;
47125296bd5SBarry Smith   c->free_ij = PETSC_TRUE;
47225296bd5SBarry Smith   c->nonew   = 0;
4732205254eSKarl Rupp 
4744222ddf1SHong Zhang   /* slower, less memory */
4754222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
47625296bd5SBarry Smith 
47725296bd5SBarry Smith   /* set MatInfo */
47825296bd5SBarry Smith   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
47925296bd5SBarry Smith   if (afill < 1.0) afill = 1.0;
4804222ddf1SHong Zhang   C->info.mallocs           = ndouble;
4814222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
4824222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
48325296bd5SBarry Smith 
48425296bd5SBarry Smith #if defined(PETSC_USE_INFO)
48525296bd5SBarry Smith   if (ci[am]) {
4869566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
4879566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
48825296bd5SBarry Smith   } else {
4899566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
49025296bd5SBarry Smith   }
49125296bd5SBarry Smith #endif
4923ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
49325296bd5SBarry Smith }
49425296bd5SBarry Smith 
495d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, PetscReal fill, Mat C)
496d71ae5a4SJacob Faibussowitsch {
497e9e4536cSHong Zhang   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
498bf9555e6SHong Zhang   PetscInt          *ai = a->i, *bi = b->i, *ci, *cj;
49925c41797SHong Zhang   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
500e9e4536cSHong Zhang   MatScalar         *ca;
501e9e4536cSHong Zhang   PetscReal          afill;
502eca6b86aSHong Zhang   PetscInt           i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax;
503eec179cfSJacob Faibussowitsch   PetscHMapI         ta;
5040298fd71SBarry Smith   PetscFreeSpaceList free_space = NULL, current_space = NULL;
505e9e4536cSHong Zhang 
506e9e4536cSHong Zhang   PetscFunctionBegin;
507bd958071SHong Zhang   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
508bd958071SHong Zhang   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
5099566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 2, &ci));
510bd958071SHong Zhang   ci[0] = 0;
511bd958071SHong Zhang 
512bd958071SHong Zhang   /* create and initialize a linked list */
513eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapICreateWithSize(bn, &ta));
514c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b, bm, ta);
515eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIGetSize(ta, &Crmax));
516eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIDestroy(&ta));
5179566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk));
518bd958071SHong Zhang 
519bd958071SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
5209566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
521bd958071SHong Zhang   current_space = free_space;
522bd958071SHong Zhang 
523bd958071SHong Zhang   /* Determine ci and cj */
524bd958071SHong Zhang   for (i = 0; i < am; i++) {
525bd958071SHong Zhang     anzi = ai[i + 1] - ai[i];
526bd958071SHong Zhang     aj   = a->j + ai[i];
527bd958071SHong Zhang     for (j = 0; j < anzi; j++) {
528bd958071SHong Zhang       brow = aj[j];
529bd958071SHong Zhang       bnzj = bi[brow + 1] - bi[brow];
530bd958071SHong Zhang       bj   = b->j + bi[brow];
531bd958071SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
5329566063dSJacob Faibussowitsch       PetscCall(PetscLLCondensedAddSorted_Scalable(bnzj, bj, lnk));
533bd958071SHong Zhang     }
5348c78258cSHong Zhang     /* add possible missing diagonal entry */
53548a46eb9SPierre Jolivet     if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_Scalable(1, &i, lnk));
5368c78258cSHong Zhang 
537bd958071SHong Zhang     cnzi = lnk[0];
538bd958071SHong Zhang 
539bd958071SHong Zhang     /* If free space is not available, make more free space */
540bd958071SHong Zhang     /* Double the amount of total space in the list */
541bd958071SHong Zhang     if (current_space->local_remaining < cnzi) {
5429566063dSJacob Faibussowitsch       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), &current_space));
543bd958071SHong Zhang       ndouble++;
544bd958071SHong Zhang     }
545bd958071SHong Zhang 
546bd958071SHong Zhang     /* Copy data into free space, then initialize lnk */
5479566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedClean_Scalable(cnzi, current_space->array, lnk));
5482205254eSKarl Rupp 
549bd958071SHong Zhang     current_space->array += cnzi;
550bd958071SHong Zhang     current_space->local_used += cnzi;
551bd958071SHong Zhang     current_space->local_remaining -= cnzi;
5522205254eSKarl Rupp 
553bd958071SHong Zhang     ci[i + 1] = ci[i] + cnzi;
554bd958071SHong Zhang   }
555bd958071SHong Zhang 
556bd958071SHong Zhang   /* Column indices are in the list of free space */
557bd958071SHong Zhang   /* Allocate space for cj, initialize cj, and */
558bd958071SHong Zhang   /* destroy list of free space and other temporary array(s) */
5599566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ci[am] + 1, &cj));
5609566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
5619566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
562e9e4536cSHong Zhang 
563e9e4536cSHong Zhang   /* Allocate space for ca */
5649566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(ci[am] + 1, &ca));
565e9e4536cSHong Zhang 
566e9e4536cSHong Zhang   /* put together the new symbolic matrix */
5679566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C));
5689566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
569e9e4536cSHong Zhang 
570e9e4536cSHong Zhang   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
571e9e4536cSHong Zhang   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
572*f4f49eeaSPierre Jolivet   c          = (Mat_SeqAIJ *)C->data;
573e9e4536cSHong Zhang   c->free_a  = PETSC_TRUE;
574e9e4536cSHong Zhang   c->free_ij = PETSC_TRUE;
575e9e4536cSHong Zhang   c->nonew   = 0;
5762205254eSKarl Rupp 
5774222ddf1SHong Zhang   /* slower, less memory */
5784222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
579e9e4536cSHong Zhang 
580e9e4536cSHong Zhang   /* set MatInfo */
581e9e4536cSHong Zhang   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
582e9e4536cSHong Zhang   if (afill < 1.0) afill = 1.0;
5834222ddf1SHong Zhang   C->info.mallocs           = ndouble;
5844222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
5854222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
586e9e4536cSHong Zhang 
587e9e4536cSHong Zhang #if defined(PETSC_USE_INFO)
588e9e4536cSHong Zhang   if (ci[am]) {
5899566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
5909566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
591e9e4536cSHong Zhang   } else {
5929566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
593e9e4536cSHong Zhang   }
594e9e4536cSHong Zhang #endif
5953ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
596e9e4536cSHong Zhang }
597e9e4536cSHong Zhang 
598d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A, Mat B, PetscReal fill, Mat C)
599d71ae5a4SJacob Faibussowitsch {
6000ced3a2bSJed Brown   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
6010ced3a2bSJed Brown   const PetscInt    *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j;
6020ced3a2bSJed Brown   PetscInt          *ci, *cj, *bb;
6030ced3a2bSJed Brown   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
6040ced3a2bSJed Brown   PetscReal          afill;
6050ced3a2bSJed Brown   PetscInt           i, j, col, ndouble = 0;
6060298fd71SBarry Smith   PetscFreeSpaceList free_space = NULL, current_space = NULL;
6070ced3a2bSJed Brown   PetscHeap          h;
6080ced3a2bSJed Brown 
6090ced3a2bSJed Brown   PetscFunctionBegin;
610cd093f1dSJed Brown   /* Get ci and cj - by merging sorted rows using a heap */
6110ced3a2bSJed Brown   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
6129566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 2, &ci));
6130ced3a2bSJed Brown   ci[0] = 0;
6140ced3a2bSJed Brown 
6150ced3a2bSJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
6169566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
6170ced3a2bSJed Brown   current_space = free_space;
6180ced3a2bSJed Brown 
6199566063dSJacob Faibussowitsch   PetscCall(PetscHeapCreate(a->rmax, &h));
6209566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(a->rmax, &bb));
6210ced3a2bSJed Brown 
6220ced3a2bSJed Brown   /* Determine ci and cj */
6230ced3a2bSJed Brown   for (i = 0; i < am; i++) {
6240ced3a2bSJed Brown     const PetscInt  anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
6250ced3a2bSJed Brown     const PetscInt *acol = aj + ai[i];        /* column indices of nonzero entries in this row */
6260ced3a2bSJed Brown     ci[i + 1]            = ci[i];
6270ced3a2bSJed Brown     /* Populate the min heap */
6280ced3a2bSJed Brown     for (j = 0; j < anzi; j++) {
6290ced3a2bSJed Brown       bb[j] = bi[acol[j]];           /* bb points at the start of the row */
6300ced3a2bSJed Brown       if (bb[j] < bi[acol[j] + 1]) { /* Add if row is nonempty */
6319566063dSJacob Faibussowitsch         PetscCall(PetscHeapAdd(h, j, bj[bb[j]++]));
6320ced3a2bSJed Brown       }
6330ced3a2bSJed Brown     }
6340ced3a2bSJed Brown     /* Pick off the min element, adding it to free space */
6359566063dSJacob Faibussowitsch     PetscCall(PetscHeapPop(h, &j, &col));
6360ced3a2bSJed Brown     while (j >= 0) {
6370ced3a2bSJed Brown       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
6389566063dSJacob Faibussowitsch         PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), &current_space));
6390ced3a2bSJed Brown         ndouble++;
6400ced3a2bSJed Brown       }
6410ced3a2bSJed Brown       *(current_space->array++) = col;
6420ced3a2bSJed Brown       current_space->local_used++;
6430ced3a2bSJed Brown       current_space->local_remaining--;
6440ced3a2bSJed Brown       ci[i + 1]++;
6450ced3a2bSJed Brown 
6460ced3a2bSJed Brown       /* stash if anything else remains in this row of B */
6479566063dSJacob Faibussowitsch       if (bb[j] < bi[acol[j] + 1]) PetscCall(PetscHeapStash(h, j, bj[bb[j]++]));
6480ced3a2bSJed Brown       while (1) { /* pop and stash any other rows of B that also had an entry in this column */
6490ced3a2bSJed Brown         PetscInt j2, col2;
6509566063dSJacob Faibussowitsch         PetscCall(PetscHeapPeek(h, &j2, &col2));
6510ced3a2bSJed Brown         if (col2 != col) break;
6529566063dSJacob Faibussowitsch         PetscCall(PetscHeapPop(h, &j2, &col2));
6539566063dSJacob Faibussowitsch         if (bb[j2] < bi[acol[j2] + 1]) PetscCall(PetscHeapStash(h, j2, bj[bb[j2]++]));
6540ced3a2bSJed Brown       }
6550ced3a2bSJed Brown       /* Put any stashed elements back into the min heap */
6569566063dSJacob Faibussowitsch       PetscCall(PetscHeapUnstash(h));
6579566063dSJacob Faibussowitsch       PetscCall(PetscHeapPop(h, &j, &col));
6580ced3a2bSJed Brown     }
6590ced3a2bSJed Brown   }
6609566063dSJacob Faibussowitsch   PetscCall(PetscFree(bb));
6619566063dSJacob Faibussowitsch   PetscCall(PetscHeapDestroy(&h));
6620ced3a2bSJed Brown 
6630ced3a2bSJed Brown   /* Column indices are in the list of free space */
6640ced3a2bSJed Brown   /* Allocate space for cj, initialize cj, and */
6650ced3a2bSJed Brown   /* destroy list of free space and other temporary array(s) */
6669566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ci[am], &cj));
6679566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
6680ced3a2bSJed Brown 
6690ced3a2bSJed Brown   /* put together the new symbolic matrix */
6709566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
6719566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
6720ced3a2bSJed Brown 
6730ced3a2bSJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
6740ced3a2bSJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
675*f4f49eeaSPierre Jolivet   c          = (Mat_SeqAIJ *)C->data;
6760ced3a2bSJed Brown   c->free_a  = PETSC_TRUE;
6770ced3a2bSJed Brown   c->free_ij = PETSC_TRUE;
6780ced3a2bSJed Brown   c->nonew   = 0;
67926fbe8dcSKarl Rupp 
6804222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
6810ced3a2bSJed Brown 
6820ced3a2bSJed Brown   /* set MatInfo */
6830ced3a2bSJed Brown   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
6840ced3a2bSJed Brown   if (afill < 1.0) afill = 1.0;
6854222ddf1SHong Zhang   C->info.mallocs           = ndouble;
6864222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
6874222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
6880ced3a2bSJed Brown 
6890ced3a2bSJed Brown #if defined(PETSC_USE_INFO)
6900ced3a2bSJed Brown   if (ci[am]) {
6919566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
6929566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
6930ced3a2bSJed Brown   } else {
6949566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
6950ced3a2bSJed Brown   }
6960ced3a2bSJed Brown #endif
6973ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
6980ced3a2bSJed Brown }
699e9e4536cSHong Zhang 
700d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A, Mat B, PetscReal fill, Mat C)
701d71ae5a4SJacob Faibussowitsch {
7028a07c6f1SJed Brown   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
7038a07c6f1SJed Brown   const PetscInt    *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j;
7048a07c6f1SJed Brown   PetscInt          *ci, *cj, *bb;
7058a07c6f1SJed Brown   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
7068a07c6f1SJed Brown   PetscReal          afill;
7078a07c6f1SJed Brown   PetscInt           i, j, col, ndouble = 0;
7080298fd71SBarry Smith   PetscFreeSpaceList free_space = NULL, current_space = NULL;
7098a07c6f1SJed Brown   PetscHeap          h;
7108a07c6f1SJed Brown   PetscBT            bt;
7118a07c6f1SJed Brown 
7128a07c6f1SJed Brown   PetscFunctionBegin;
713cd093f1dSJed Brown   /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */
7148a07c6f1SJed Brown   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
7159566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 2, &ci));
7168a07c6f1SJed Brown   ci[0] = 0;
7178a07c6f1SJed Brown 
7188a07c6f1SJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
7199566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
7202205254eSKarl Rupp 
7218a07c6f1SJed Brown   current_space = free_space;
7228a07c6f1SJed Brown 
7239566063dSJacob Faibussowitsch   PetscCall(PetscHeapCreate(a->rmax, &h));
7249566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(a->rmax, &bb));
7259566063dSJacob Faibussowitsch   PetscCall(PetscBTCreate(bn, &bt));
7268a07c6f1SJed Brown 
7278a07c6f1SJed Brown   /* Determine ci and cj */
7288a07c6f1SJed Brown   for (i = 0; i < am; i++) {
7298a07c6f1SJed Brown     const PetscInt  anzi = ai[i + 1] - ai[i];    /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
7308a07c6f1SJed Brown     const PetscInt *acol = aj + ai[i];           /* column indices of nonzero entries in this row */
7318a07c6f1SJed Brown     const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */
7328a07c6f1SJed Brown     ci[i + 1]            = ci[i];
7338a07c6f1SJed Brown     /* Populate the min heap */
7348a07c6f1SJed Brown     for (j = 0; j < anzi; j++) {
7358a07c6f1SJed Brown       PetscInt brow = acol[j];
7368a07c6f1SJed Brown       for (bb[j] = bi[brow]; bb[j] < bi[brow + 1]; bb[j]++) {
7378a07c6f1SJed Brown         PetscInt bcol = bj[bb[j]];
7388a07c6f1SJed Brown         if (!PetscBTLookupSet(bt, bcol)) { /* new entry */
7399566063dSJacob Faibussowitsch           PetscCall(PetscHeapAdd(h, j, bcol));
7408a07c6f1SJed Brown           bb[j]++;
7418a07c6f1SJed Brown           break;
7428a07c6f1SJed Brown         }
7438a07c6f1SJed Brown       }
7448a07c6f1SJed Brown     }
7458a07c6f1SJed Brown     /* Pick off the min element, adding it to free space */
7469566063dSJacob Faibussowitsch     PetscCall(PetscHeapPop(h, &j, &col));
7478a07c6f1SJed Brown     while (j >= 0) {
7488a07c6f1SJed Brown       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
7490298fd71SBarry Smith         fptr = NULL;                            /* need PetscBTMemzero */
7509566063dSJacob Faibussowitsch         PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), &current_space));
7518a07c6f1SJed Brown         ndouble++;
7528a07c6f1SJed Brown       }
7538a07c6f1SJed Brown       *(current_space->array++) = col;
7548a07c6f1SJed Brown       current_space->local_used++;
7558a07c6f1SJed Brown       current_space->local_remaining--;
7568a07c6f1SJed Brown       ci[i + 1]++;
7578a07c6f1SJed Brown 
7588a07c6f1SJed Brown       /* stash if anything else remains in this row of B */
7598a07c6f1SJed Brown       for (; bb[j] < bi[acol[j] + 1]; bb[j]++) {
7608a07c6f1SJed Brown         PetscInt bcol = bj[bb[j]];
7618a07c6f1SJed Brown         if (!PetscBTLookupSet(bt, bcol)) { /* new entry */
7629566063dSJacob Faibussowitsch           PetscCall(PetscHeapAdd(h, j, bcol));
7638a07c6f1SJed Brown           bb[j]++;
7648a07c6f1SJed Brown           break;
7658a07c6f1SJed Brown         }
7668a07c6f1SJed Brown       }
7679566063dSJacob Faibussowitsch       PetscCall(PetscHeapPop(h, &j, &col));
7688a07c6f1SJed Brown     }
7698a07c6f1SJed Brown     if (fptr) { /* Clear the bits for this row */
7709566063dSJacob Faibussowitsch       for (; fptr < current_space->array; fptr++) PetscCall(PetscBTClear(bt, *fptr));
7718a07c6f1SJed Brown     } else { /* We reallocated so we don't remember (easily) how to clear only the bits we changed */
7729566063dSJacob Faibussowitsch       PetscCall(PetscBTMemzero(bn, bt));
7738a07c6f1SJed Brown     }
7748a07c6f1SJed Brown   }
7759566063dSJacob Faibussowitsch   PetscCall(PetscFree(bb));
7769566063dSJacob Faibussowitsch   PetscCall(PetscHeapDestroy(&h));
7779566063dSJacob Faibussowitsch   PetscCall(PetscBTDestroy(&bt));
7788a07c6f1SJed Brown 
7798a07c6f1SJed Brown   /* Column indices are in the list of free space */
7808a07c6f1SJed Brown   /* Allocate space for cj, initialize cj, and */
7818a07c6f1SJed Brown   /* destroy list of free space and other temporary array(s) */
7829566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ci[am], &cj));
7839566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
7848a07c6f1SJed Brown 
7858a07c6f1SJed Brown   /* put together the new symbolic matrix */
7869566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
7879566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
7888a07c6f1SJed Brown 
7898a07c6f1SJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
7908a07c6f1SJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
791*f4f49eeaSPierre Jolivet   c          = (Mat_SeqAIJ *)C->data;
7928a07c6f1SJed Brown   c->free_a  = PETSC_TRUE;
7938a07c6f1SJed Brown   c->free_ij = PETSC_TRUE;
7948a07c6f1SJed Brown   c->nonew   = 0;
79526fbe8dcSKarl Rupp 
7964222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
7978a07c6f1SJed Brown 
7988a07c6f1SJed Brown   /* set MatInfo */
7998a07c6f1SJed Brown   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
8008a07c6f1SJed Brown   if (afill < 1.0) afill = 1.0;
8014222ddf1SHong Zhang   C->info.mallocs           = ndouble;
8024222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
8034222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
8048a07c6f1SJed Brown 
8058a07c6f1SJed Brown #if defined(PETSC_USE_INFO)
8068a07c6f1SJed Brown   if (ci[am]) {
8079566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
8089566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
8098a07c6f1SJed Brown   } else {
8109566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
8118a07c6f1SJed Brown   }
8128a07c6f1SJed Brown #endif
8133ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
8148a07c6f1SJed Brown }
8158a07c6f1SJed Brown 
816d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A, Mat B, PetscReal fill, Mat C)
817d71ae5a4SJacob Faibussowitsch {
818d7ed1a4dSandi selinger   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
819d7ed1a4dSandi selinger   const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j, *inputi, *inputj, *inputcol, *inputcol_L1;
820d7ed1a4dSandi selinger   PetscInt       *ci, *cj, *outputj, worki_L1[9], worki_L2[9];
821d7ed1a4dSandi selinger   PetscInt        c_maxmem, a_maxrownnz = 0, a_rownnz;
822d7ed1a4dSandi selinger   const PetscInt  workcol[8] = {0, 1, 2, 3, 4, 5, 6, 7};
823d7ed1a4dSandi selinger   const PetscInt  am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
824d7ed1a4dSandi selinger   const PetscInt *brow_ptr[8], *brow_end[8];
825d7ed1a4dSandi selinger   PetscInt        window[8];
826d7ed1a4dSandi selinger   PetscInt        window_min, old_window_min, ci_nnz, outputi_nnz = 0, L1_nrows, L2_nrows;
827d7ed1a4dSandi selinger   PetscInt        i, k, ndouble = 0, L1_rowsleft, rowsleft;
828d7ed1a4dSandi selinger   PetscReal       afill;
829f83700f2Sandi selinger   PetscInt       *workj_L1, *workj_L2, *workj_L3;
8307660c4dbSandi selinger   PetscInt        L1_nnz, L2_nnz;
831d7ed1a4dSandi selinger 
832d7ed1a4dSandi selinger   /* Step 1: Get upper bound on memory required for allocation.
833d7ed1a4dSandi selinger              Because of the way virtual memory works,
834d7ed1a4dSandi selinger              only the memory pages that are actually needed will be physically allocated. */
835d7ed1a4dSandi selinger   PetscFunctionBegin;
8369566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 1, &ci));
837d7ed1a4dSandi selinger   for (i = 0; i < am; i++) {
838d7ed1a4dSandi selinger     const PetscInt  anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
839d7ed1a4dSandi selinger     const PetscInt *acol = aj + ai[i];        /* column indices of nonzero entries in this row */
840d7ed1a4dSandi selinger     a_rownnz             = 0;
841d7ed1a4dSandi selinger     for (k = 0; k < anzi; ++k) {
842d7ed1a4dSandi selinger       a_rownnz += bi[acol[k] + 1] - bi[acol[k]];
843d7ed1a4dSandi selinger       if (a_rownnz > bn) {
844d7ed1a4dSandi selinger         a_rownnz = bn;
845d7ed1a4dSandi selinger         break;
846d7ed1a4dSandi selinger       }
847d7ed1a4dSandi selinger     }
848d7ed1a4dSandi selinger     a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz);
849d7ed1a4dSandi selinger   }
850d7ed1a4dSandi selinger   /* temporary work areas for merging rows */
8519566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L1));
8529566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L2));
8539566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(a_maxrownnz, &workj_L3));
854d7ed1a4dSandi selinger 
8557660c4dbSandi selinger   /* This should be enough for almost all matrices. If not, memory is reallocated later. */
8567660c4dbSandi selinger   c_maxmem = 8 * (ai[am] + bi[bm]);
857d7ed1a4dSandi selinger   /* Step 2: Populate pattern for C */
8589566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(c_maxmem, &cj));
859d7ed1a4dSandi selinger 
860d7ed1a4dSandi selinger   ci_nnz      = 0;
861d7ed1a4dSandi selinger   ci[0]       = 0;
862d7ed1a4dSandi selinger   worki_L1[0] = 0;
863d7ed1a4dSandi selinger   worki_L2[0] = 0;
864d7ed1a4dSandi selinger   for (i = 0; i < am; i++) {
865d7ed1a4dSandi selinger     const PetscInt  anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
866d7ed1a4dSandi selinger     const PetscInt *acol = aj + ai[i];        /* column indices of nonzero entries in this row */
867d7ed1a4dSandi selinger     rowsleft             = anzi;
868d7ed1a4dSandi selinger     inputcol_L1          = acol;
869d7ed1a4dSandi selinger     L2_nnz               = 0;
8707660c4dbSandi selinger     L2_nrows             = 1; /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1   */
8717660c4dbSandi selinger     worki_L2[1]          = 0;
87208fe1b3cSKarl Rupp     outputi_nnz          = 0;
873d7ed1a4dSandi selinger 
874d7ed1a4dSandi selinger     /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem  -> allocate more memory */
875d7ed1a4dSandi selinger     while (ci_nnz + a_maxrownnz > c_maxmem) {
876d7ed1a4dSandi selinger       c_maxmem *= 2;
877d7ed1a4dSandi selinger       ndouble++;
8789566063dSJacob Faibussowitsch       PetscCall(PetscRealloc(sizeof(PetscInt) * c_maxmem, &cj));
879d7ed1a4dSandi selinger     }
880d7ed1a4dSandi selinger 
881d7ed1a4dSandi selinger     while (rowsleft) {
882d7ed1a4dSandi selinger       L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */
883d7ed1a4dSandi selinger       L1_nrows    = 0;
8847660c4dbSandi selinger       L1_nnz      = 0;
885d7ed1a4dSandi selinger       inputcol    = inputcol_L1;
886d7ed1a4dSandi selinger       inputi      = bi;
887d7ed1a4dSandi selinger       inputj      = bj;
888d7ed1a4dSandi selinger 
889d7ed1a4dSandi selinger       /* The following macro is used to specialize for small rows in A.
890d7ed1a4dSandi selinger          This helps with compiler unrolling, improving performance substantially.
891f83700f2Sandi selinger           Input:  inputj   inputi  inputcol  bn
892d7ed1a4dSandi selinger           Output: outputj  outputi_nnz                       */
893d7ed1a4dSandi selinger #define MatMatMultSymbolic_RowMergeMacro(ANNZ) \
894a8f51744SPierre Jolivet   do { \
895d7ed1a4dSandi selinger     window_min  = bn; \
8967660c4dbSandi selinger     outputi_nnz = 0; \
8977660c4dbSandi selinger     for (k = 0; k < ANNZ; ++k) { \
898d7ed1a4dSandi selinger       brow_ptr[k] = inputj + inputi[inputcol[k]]; \
899d7ed1a4dSandi selinger       brow_end[k] = inputj + inputi[inputcol[k] + 1]; \
900d7ed1a4dSandi selinger       window[k]   = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
901d7ed1a4dSandi selinger       window_min  = PetscMin(window[k], window_min); \
902d7ed1a4dSandi selinger     } \
903d7ed1a4dSandi selinger     while (window_min < bn) { \
904d7ed1a4dSandi selinger       outputj[outputi_nnz++] = window_min; \
905d7ed1a4dSandi selinger       /* advance front and compute new minimum */ \
906d7ed1a4dSandi selinger       old_window_min = window_min; \
907d7ed1a4dSandi selinger       window_min     = bn; \
908d7ed1a4dSandi selinger       for (k = 0; k < ANNZ; ++k) { \
909d7ed1a4dSandi selinger         if (window[k] == old_window_min) { \
910d7ed1a4dSandi selinger           brow_ptr[k]++; \
911d7ed1a4dSandi selinger           window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
912d7ed1a4dSandi selinger         } \
913d7ed1a4dSandi selinger         window_min = PetscMin(window[k], window_min); \
914d7ed1a4dSandi selinger       } \
915a8f51744SPierre Jolivet     } \
916a8f51744SPierre Jolivet   } while (0)
917d7ed1a4dSandi selinger 
918d7ed1a4dSandi selinger       /************** L E V E L  1 ***************/
919d7ed1a4dSandi selinger       /* Merge up to 8 rows of B to L1 work array*/
920d7ed1a4dSandi selinger       while (L1_rowsleft) {
9217660c4dbSandi selinger         outputi_nnz = 0;
9227660c4dbSandi selinger         if (anzi > 8) outputj = workj_L1 + L1_nnz; /* Level 1 rowmerge*/
9237660c4dbSandi selinger         else outputj = cj + ci_nnz;                /* Merge directly to C */
9247660c4dbSandi selinger 
925d7ed1a4dSandi selinger         switch (L1_rowsleft) {
9269371c9d4SSatish Balay         case 1:
9279371c9d4SSatish Balay           brow_ptr[0] = inputj + inputi[inputcol[0]];
928d7ed1a4dSandi selinger           brow_end[0] = inputj + inputi[inputcol[0] + 1];
929d7ed1a4dSandi selinger           for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
930d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
931d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
932d7ed1a4dSandi selinger           L1_rowsleft = 0;
933d7ed1a4dSandi selinger           break;
9349371c9d4SSatish Balay         case 2:
9359371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(2);
936d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
937d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
938d7ed1a4dSandi selinger           L1_rowsleft = 0;
939d7ed1a4dSandi selinger           break;
9409371c9d4SSatish Balay         case 3:
9419371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(3);
942d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
943d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
944d7ed1a4dSandi selinger           L1_rowsleft = 0;
945d7ed1a4dSandi selinger           break;
9469371c9d4SSatish Balay         case 4:
9479371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(4);
948d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
949d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
950d7ed1a4dSandi selinger           L1_rowsleft = 0;
951d7ed1a4dSandi selinger           break;
9529371c9d4SSatish Balay         case 5:
9539371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(5);
954d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
955d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
956d7ed1a4dSandi selinger           L1_rowsleft = 0;
957d7ed1a4dSandi selinger           break;
9589371c9d4SSatish Balay         case 6:
9599371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(6);
960d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
961d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
962d7ed1a4dSandi selinger           L1_rowsleft = 0;
963d7ed1a4dSandi selinger           break;
9649371c9d4SSatish Balay         case 7:
9659371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(7);
966d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
967d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
968d7ed1a4dSandi selinger           L1_rowsleft = 0;
969d7ed1a4dSandi selinger           break;
9709371c9d4SSatish Balay         default:
9719371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(8);
972d7ed1a4dSandi selinger           inputcol += 8;
973d7ed1a4dSandi selinger           rowsleft -= 8;
974d7ed1a4dSandi selinger           L1_rowsleft -= 8;
975d7ed1a4dSandi selinger           break;
976d7ed1a4dSandi selinger         }
977d7ed1a4dSandi selinger         inputcol_L1 = inputcol;
9787660c4dbSandi selinger         L1_nnz += outputi_nnz;
9797660c4dbSandi selinger         worki_L1[++L1_nrows] = L1_nnz;
980d7ed1a4dSandi selinger       }
981d7ed1a4dSandi selinger 
982d7ed1a4dSandi selinger       /********************** L E V E L  2 ************************/
983d7ed1a4dSandi selinger       /* Merge from L1 work array to either C or to L2 work array */
984d7ed1a4dSandi selinger       if (anzi > 8) {
985d7ed1a4dSandi selinger         inputi      = worki_L1;
986d7ed1a4dSandi selinger         inputj      = workj_L1;
987d7ed1a4dSandi selinger         inputcol    = workcol;
988d7ed1a4dSandi selinger         outputi_nnz = 0;
989d7ed1a4dSandi selinger 
990d7ed1a4dSandi selinger         if (anzi <= 64) outputj = cj + ci_nnz; /* Merge from L1 work array to C */
991d7ed1a4dSandi selinger         else outputj = workj_L2 + L2_nnz;      /* Merge from L1 work array to L2 work array */
992d7ed1a4dSandi selinger 
993d7ed1a4dSandi selinger         switch (L1_nrows) {
9949371c9d4SSatish Balay         case 1:
9959371c9d4SSatish Balay           brow_ptr[0] = inputj + inputi[inputcol[0]];
996d7ed1a4dSandi selinger           brow_end[0] = inputj + inputi[inputcol[0] + 1];
997d7ed1a4dSandi selinger           for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
998d7ed1a4dSandi selinger           break;
999d71ae5a4SJacob Faibussowitsch         case 2:
1000d71ae5a4SJacob Faibussowitsch           MatMatMultSymbolic_RowMergeMacro(2);
1001d71ae5a4SJacob Faibussowitsch           break;
1002d71ae5a4SJacob Faibussowitsch         case 3:
1003d71ae5a4SJacob Faibussowitsch           MatMatMultSymbolic_RowMergeMacro(3);
1004d71ae5a4SJacob Faibussowitsch           break;
1005d71ae5a4SJacob Faibussowitsch         case 4:
1006d71ae5a4SJacob Faibussowitsch           MatMatMultSymbolic_RowMergeMacro(4);
1007d71ae5a4SJacob Faibussowitsch           break;
1008d71ae5a4SJacob Faibussowitsch         case 5:
1009d71ae5a4SJacob Faibussowitsch           MatMatMultSymbolic_RowMergeMacro(5);
1010d71ae5a4SJacob Faibussowitsch           break;
1011d71ae5a4SJacob Faibussowitsch         case 6:
1012d71ae5a4SJacob Faibussowitsch           MatMatMultSymbolic_RowMergeMacro(6);
1013d71ae5a4SJacob Faibussowitsch           break;
1014d71ae5a4SJacob Faibussowitsch         case 7:
1015d71ae5a4SJacob Faibussowitsch           MatMatMultSymbolic_RowMergeMacro(7);
1016d71ae5a4SJacob Faibussowitsch           break;
1017d71ae5a4SJacob Faibussowitsch         case 8:
1018d71ae5a4SJacob Faibussowitsch           MatMatMultSymbolic_RowMergeMacro(8);
1019d71ae5a4SJacob Faibussowitsch           break;
1020d71ae5a4SJacob Faibussowitsch         default:
1021d71ae5a4SJacob Faibussowitsch           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L1 work array!");
1022d7ed1a4dSandi selinger         }
1023d7ed1a4dSandi selinger         L2_nnz += outputi_nnz;
1024d7ed1a4dSandi selinger         worki_L2[++L2_nrows] = L2_nnz;
1025d7ed1a4dSandi selinger 
10267660c4dbSandi selinger         /************************ L E V E L  3 **********************/
10277660c4dbSandi selinger         /* Merge from L2 work array to either C or to L2 work array */
1028d7ed1a4dSandi selinger         if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) {
1029d7ed1a4dSandi selinger           inputi      = worki_L2;
1030d7ed1a4dSandi selinger           inputj      = workj_L2;
1031d7ed1a4dSandi selinger           inputcol    = workcol;
1032d7ed1a4dSandi selinger           outputi_nnz = 0;
1033f83700f2Sandi selinger           if (rowsleft) outputj = workj_L3;
1034d7ed1a4dSandi selinger           else outputj = cj + ci_nnz;
1035d7ed1a4dSandi selinger           switch (L2_nrows) {
10369371c9d4SSatish Balay           case 1:
10379371c9d4SSatish Balay             brow_ptr[0] = inputj + inputi[inputcol[0]];
1038d7ed1a4dSandi selinger             brow_end[0] = inputj + inputi[inputcol[0] + 1];
1039d7ed1a4dSandi selinger             for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
1040d7ed1a4dSandi selinger             break;
1041d71ae5a4SJacob Faibussowitsch           case 2:
1042d71ae5a4SJacob Faibussowitsch             MatMatMultSymbolic_RowMergeMacro(2);
1043d71ae5a4SJacob Faibussowitsch             break;
1044d71ae5a4SJacob Faibussowitsch           case 3:
1045d71ae5a4SJacob Faibussowitsch             MatMatMultSymbolic_RowMergeMacro(3);
1046d71ae5a4SJacob Faibussowitsch             break;
1047d71ae5a4SJacob Faibussowitsch           case 4:
1048d71ae5a4SJacob Faibussowitsch             MatMatMultSymbolic_RowMergeMacro(4);
1049d71ae5a4SJacob Faibussowitsch             break;
1050d71ae5a4SJacob Faibussowitsch           case 5:
1051d71ae5a4SJacob Faibussowitsch             MatMatMultSymbolic_RowMergeMacro(5);
1052d71ae5a4SJacob Faibussowitsch             break;
1053d71ae5a4SJacob Faibussowitsch           case 6:
1054d71ae5a4SJacob Faibussowitsch             MatMatMultSymbolic_RowMergeMacro(6);
1055d71ae5a4SJacob Faibussowitsch             break;
1056d71ae5a4SJacob Faibussowitsch           case 7:
1057d71ae5a4SJacob Faibussowitsch             MatMatMultSymbolic_RowMergeMacro(7);
1058d71ae5a4SJacob Faibussowitsch             break;
1059d71ae5a4SJacob Faibussowitsch           case 8:
1060d71ae5a4SJacob Faibussowitsch             MatMatMultSymbolic_RowMergeMacro(8);
1061d71ae5a4SJacob Faibussowitsch             break;
1062d71ae5a4SJacob Faibussowitsch           default:
1063d71ae5a4SJacob Faibussowitsch             SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L2 work array!");
1064d7ed1a4dSandi selinger           }
1065d7ed1a4dSandi selinger           L2_nrows    = 1;
10667660c4dbSandi selinger           L2_nnz      = outputi_nnz;
10677660c4dbSandi selinger           worki_L2[1] = outputi_nnz;
10687660c4dbSandi selinger           /* Copy to workj_L2 */
1069d7ed1a4dSandi selinger           if (rowsleft) {
10707660c4dbSandi selinger             for (k = 0; k < outputi_nnz; ++k) workj_L2[k] = outputj[k];
1071d7ed1a4dSandi selinger           }
1072d7ed1a4dSandi selinger         }
1073d7ed1a4dSandi selinger       }
1074d7ed1a4dSandi selinger     } /* while (rowsleft) */
1075d7ed1a4dSandi selinger #undef MatMatMultSymbolic_RowMergeMacro
1076d7ed1a4dSandi selinger 
1077d7ed1a4dSandi selinger     /* terminate current row */
1078d7ed1a4dSandi selinger     ci_nnz += outputi_nnz;
1079d7ed1a4dSandi selinger     ci[i + 1] = ci_nnz;
1080d7ed1a4dSandi selinger   }
1081d7ed1a4dSandi selinger 
1082d7ed1a4dSandi selinger   /* Step 3: Create the new symbolic matrix */
10839566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
10849566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
1085d7ed1a4dSandi selinger 
1086d7ed1a4dSandi selinger   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1087d7ed1a4dSandi selinger   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1088*f4f49eeaSPierre Jolivet   c          = (Mat_SeqAIJ *)C->data;
1089d7ed1a4dSandi selinger   c->free_a  = PETSC_TRUE;
1090d7ed1a4dSandi selinger   c->free_ij = PETSC_TRUE;
1091d7ed1a4dSandi selinger   c->nonew   = 0;
1092d7ed1a4dSandi selinger 
10934222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1094d7ed1a4dSandi selinger 
1095d7ed1a4dSandi selinger   /* set MatInfo */
1096d7ed1a4dSandi selinger   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
1097d7ed1a4dSandi selinger   if (afill < 1.0) afill = 1.0;
10984222ddf1SHong Zhang   C->info.mallocs           = ndouble;
10994222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
11004222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
1101d7ed1a4dSandi selinger 
1102d7ed1a4dSandi selinger #if defined(PETSC_USE_INFO)
1103d7ed1a4dSandi selinger   if (ci[am]) {
11049566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
11059566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
1106d7ed1a4dSandi selinger   } else {
11079566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
1108d7ed1a4dSandi selinger   }
1109d7ed1a4dSandi selinger #endif
1110d7ed1a4dSandi selinger 
1111d7ed1a4dSandi selinger   /* Step 4: Free temporary work areas */
11129566063dSJacob Faibussowitsch   PetscCall(PetscFree(workj_L1));
11139566063dSJacob Faibussowitsch   PetscCall(PetscFree(workj_L2));
11149566063dSJacob Faibussowitsch   PetscCall(PetscFree(workj_L3));
11153ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1116d7ed1a4dSandi selinger }
1117d7ed1a4dSandi selinger 
1118cd093f1dSJed Brown /* concatenate unique entries and then sort */
1119d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, PetscReal fill, Mat C)
1120d71ae5a4SJacob Faibussowitsch {
1121cd093f1dSJed Brown   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
1122cd093f1dSJed Brown   const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j;
11238c78258cSHong Zhang   PetscInt       *ci, *cj, bcol;
1124cd093f1dSJed Brown   PetscInt        am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
1125cd093f1dSJed Brown   PetscReal       afill;
1126cd093f1dSJed Brown   PetscInt        i, j, ndouble = 0;
1127cd093f1dSJed Brown   PetscSegBuffer  seg, segrow;
1128cd093f1dSJed Brown   char           *seen;
1129cd093f1dSJed Brown 
1130cd093f1dSJed Brown   PetscFunctionBegin;
11319566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 1, &ci));
1132cd093f1dSJed Brown   ci[0] = 0;
1133cd093f1dSJed Brown 
1134cd093f1dSJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
11359566063dSJacob Faibussowitsch   PetscCall(PetscSegBufferCreate(sizeof(PetscInt), (PetscInt)(fill * (ai[am] + bi[bm])), &seg));
11369566063dSJacob Faibussowitsch   PetscCall(PetscSegBufferCreate(sizeof(PetscInt), 100, &segrow));
11379566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(bn, &seen));
1138cd093f1dSJed Brown 
1139cd093f1dSJed Brown   /* Determine ci and cj */
1140cd093f1dSJed Brown   for (i = 0; i < am; i++) {
1141cd093f1dSJed Brown     const PetscInt  anzi = ai[i + 1] - ai[i];                     /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
11428e3a54c0SPierre Jolivet     const PetscInt *acol = PetscSafePointerPlusOffset(aj, ai[i]); /* column indices of nonzero entries in this row */
1143cd093f1dSJed Brown     PetscInt packlen     = 0, *PETSC_RESTRICT crow;
11448c78258cSHong Zhang 
1145cd093f1dSJed Brown     /* Pack segrow */
1146cd093f1dSJed Brown     for (j = 0; j < anzi; j++) {
1147cd093f1dSJed Brown       PetscInt brow = acol[j], bjstart = bi[brow], bjend = bi[brow + 1], k;
1148cd093f1dSJed Brown       for (k = bjstart; k < bjend; k++) {
11498c78258cSHong Zhang         bcol = bj[k];
1150cd093f1dSJed Brown         if (!seen[bcol]) { /* new entry */
1151cd093f1dSJed Brown           PetscInt *PETSC_RESTRICT slot;
11529566063dSJacob Faibussowitsch           PetscCall(PetscSegBufferGetInts(segrow, 1, &slot));
1153cd093f1dSJed Brown           *slot      = bcol;
1154cd093f1dSJed Brown           seen[bcol] = 1;
1155cd093f1dSJed Brown           packlen++;
1156cd093f1dSJed Brown         }
1157cd093f1dSJed Brown       }
1158cd093f1dSJed Brown     }
11598c78258cSHong Zhang 
11608c78258cSHong Zhang     /* Check i-th diagonal entry */
11618c78258cSHong Zhang     if (C->force_diagonals && !seen[i]) {
11628c78258cSHong Zhang       PetscInt *PETSC_RESTRICT slot;
11639566063dSJacob Faibussowitsch       PetscCall(PetscSegBufferGetInts(segrow, 1, &slot));
11648c78258cSHong Zhang       *slot   = i;
11658c78258cSHong Zhang       seen[i] = 1;
11668c78258cSHong Zhang       packlen++;
11678c78258cSHong Zhang     }
11688c78258cSHong Zhang 
11699566063dSJacob Faibussowitsch     PetscCall(PetscSegBufferGetInts(seg, packlen, &crow));
11709566063dSJacob Faibussowitsch     PetscCall(PetscSegBufferExtractTo(segrow, crow));
11719566063dSJacob Faibussowitsch     PetscCall(PetscSortInt(packlen, crow));
1172cd093f1dSJed Brown     ci[i + 1] = ci[i] + packlen;
1173cd093f1dSJed Brown     for (j = 0; j < packlen; j++) seen[crow[j]] = 0;
1174cd093f1dSJed Brown   }
11759566063dSJacob Faibussowitsch   PetscCall(PetscSegBufferDestroy(&segrow));
11769566063dSJacob Faibussowitsch   PetscCall(PetscFree(seen));
1177cd093f1dSJed Brown 
1178cd093f1dSJed Brown   /* Column indices are in the segmented buffer */
11799566063dSJacob Faibussowitsch   PetscCall(PetscSegBufferExtractAlloc(seg, &cj));
11809566063dSJacob Faibussowitsch   PetscCall(PetscSegBufferDestroy(&seg));
1181cd093f1dSJed Brown 
1182cd093f1dSJed Brown   /* put together the new symbolic matrix */
11839566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
11849566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
1185cd093f1dSJed Brown 
1186cd093f1dSJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1187cd093f1dSJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1188*f4f49eeaSPierre Jolivet   c          = (Mat_SeqAIJ *)C->data;
1189cd093f1dSJed Brown   c->free_a  = PETSC_TRUE;
1190cd093f1dSJed Brown   c->free_ij = PETSC_TRUE;
1191cd093f1dSJed Brown   c->nonew   = 0;
1192cd093f1dSJed Brown 
11934222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1194cd093f1dSJed Brown 
1195cd093f1dSJed Brown   /* set MatInfo */
11962a09556fSStefano Zampini   afill = (PetscReal)ci[am] / PetscMax(ai[am] + bi[bm], 1) + 1.e-5;
1197cd093f1dSJed Brown   if (afill < 1.0) afill = 1.0;
11984222ddf1SHong Zhang   C->info.mallocs           = ndouble;
11994222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
12004222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
1201cd093f1dSJed Brown 
1202cd093f1dSJed Brown #if defined(PETSC_USE_INFO)
1203cd093f1dSJed Brown   if (ci[am]) {
12049566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
12059566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
1206cd093f1dSJed Brown   } else {
12079566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
1208cd093f1dSJed Brown   }
1209cd093f1dSJed Brown #endif
12103ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1211cd093f1dSJed Brown }
1212cd093f1dSJed Brown 
121366976f2fSJacob Faibussowitsch static PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(void *data)
1214d71ae5a4SJacob Faibussowitsch {
12156718818eSStefano Zampini   Mat_MatMatTransMult *abt = (Mat_MatMatTransMult *)data;
12162128a86cSHong Zhang 
12172128a86cSHong Zhang   PetscFunctionBegin;
12189566063dSJacob Faibussowitsch   PetscCall(MatTransposeColoringDestroy(&abt->matcoloring));
12199566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&abt->Bt_den));
12209566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&abt->ABt_den));
12219566063dSJacob Faibussowitsch   PetscCall(PetscFree(abt));
12223ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
12232128a86cSHong Zhang }
12242128a86cSHong Zhang 
1225d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C)
1226d71ae5a4SJacob Faibussowitsch {
122781d82fe4SHong Zhang   Mat                  Bt;
122840192850SHong Zhang   Mat_MatMatTransMult *abt;
12294222ddf1SHong Zhang   Mat_Product         *product = C->product;
12306718818eSStefano Zampini   char                *alg;
1231d2853540SHong Zhang 
123281d82fe4SHong Zhang   PetscFunctionBegin;
123328b400f6SJacob Faibussowitsch   PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
123428b400f6SJacob Faibussowitsch   PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty");
12356718818eSStefano Zampini 
123681d82fe4SHong Zhang   /* create symbolic Bt */
12377fb60732SBarry Smith   PetscCall(MatTransposeSymbolic(B, &Bt));
12389566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizes(Bt, PetscAbs(A->cmap->bs), PetscAbs(B->cmap->bs)));
12399566063dSJacob Faibussowitsch   PetscCall(MatSetType(Bt, ((PetscObject)A)->type_name));
124081d82fe4SHong Zhang 
124181d82fe4SHong Zhang   /* get symbolic C=A*Bt */
12429566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(product->alg, &alg));
12439566063dSJacob Faibussowitsch   PetscCall(MatProductSetAlgorithm(C, "sorted")); /* set algorithm for C = A*Bt */
12449566063dSJacob Faibussowitsch   PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(A, Bt, fill, C));
12459566063dSJacob Faibussowitsch   PetscCall(MatProductSetAlgorithm(C, alg)); /* resume original algorithm for ABt product */
12469566063dSJacob Faibussowitsch   PetscCall(PetscFree(alg));
124781d82fe4SHong Zhang 
1248a5b23f4aSJose E. Roman   /* create a supporting struct for reuse intermediate dense matrices with matcoloring */
12499566063dSJacob Faibussowitsch   PetscCall(PetscNew(&abt));
12502128a86cSHong Zhang 
12516718818eSStefano Zampini   product->data    = abt;
12526718818eSStefano Zampini   product->destroy = MatDestroy_SeqAIJ_MatMatMultTrans;
12536718818eSStefano Zampini 
12544222ddf1SHong Zhang   C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ;
12552128a86cSHong Zhang 
12564222ddf1SHong Zhang   abt->usecoloring = PETSC_FALSE;
12579566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "color", &abt->usecoloring));
125840192850SHong Zhang   if (abt->usecoloring) {
1259b9af6bddSHong Zhang     /* Create MatTransposeColoring from symbolic C=A*B^T */
1260b9af6bddSHong Zhang     MatTransposeColoring matcoloring;
1261335efc43SPeter Brune     MatColoring          coloring;
12622128a86cSHong Zhang     ISColoring           iscoloring;
12632128a86cSHong Zhang     Mat                  Bt_dense, C_dense;
1264e8354b3eSHong Zhang 
12654222ddf1SHong Zhang     /* inode causes memory problem */
12669566063dSJacob Faibussowitsch     PetscCall(MatSetOption(C, MAT_USE_INODES, PETSC_FALSE));
12674222ddf1SHong Zhang 
12689566063dSJacob Faibussowitsch     PetscCall(MatColoringCreate(C, &coloring));
12699566063dSJacob Faibussowitsch     PetscCall(MatColoringSetDistance(coloring, 2));
12709566063dSJacob Faibussowitsch     PetscCall(MatColoringSetType(coloring, MATCOLORINGSL));
12719566063dSJacob Faibussowitsch     PetscCall(MatColoringSetFromOptions(coloring));
12729566063dSJacob Faibussowitsch     PetscCall(MatColoringApply(coloring, &iscoloring));
12739566063dSJacob Faibussowitsch     PetscCall(MatColoringDestroy(&coloring));
12749566063dSJacob Faibussowitsch     PetscCall(MatTransposeColoringCreate(C, iscoloring, &matcoloring));
12752205254eSKarl Rupp 
127640192850SHong Zhang     abt->matcoloring = matcoloring;
12772205254eSKarl Rupp 
12789566063dSJacob Faibussowitsch     PetscCall(ISColoringDestroy(&iscoloring));
12792128a86cSHong Zhang 
12802128a86cSHong Zhang     /* Create Bt_dense and C_dense = A*Bt_dense */
12819566063dSJacob Faibussowitsch     PetscCall(MatCreate(PETSC_COMM_SELF, &Bt_dense));
12829566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(Bt_dense, A->cmap->n, matcoloring->ncolors, A->cmap->n, matcoloring->ncolors));
12839566063dSJacob Faibussowitsch     PetscCall(MatSetType(Bt_dense, MATSEQDENSE));
12849566063dSJacob Faibussowitsch     PetscCall(MatSeqDenseSetPreallocation(Bt_dense, NULL));
12852205254eSKarl Rupp 
12862128a86cSHong Zhang     Bt_dense->assembled = PETSC_TRUE;
128740192850SHong Zhang     abt->Bt_den         = Bt_dense;
12882128a86cSHong Zhang 
12899566063dSJacob Faibussowitsch     PetscCall(MatCreate(PETSC_COMM_SELF, &C_dense));
12909566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(C_dense, A->rmap->n, matcoloring->ncolors, A->rmap->n, matcoloring->ncolors));
12919566063dSJacob Faibussowitsch     PetscCall(MatSetType(C_dense, MATSEQDENSE));
12929566063dSJacob Faibussowitsch     PetscCall(MatSeqDenseSetPreallocation(C_dense, NULL));
12932205254eSKarl Rupp 
12942128a86cSHong Zhang     Bt_dense->assembled = PETSC_TRUE;
129540192850SHong Zhang     abt->ABt_den        = C_dense;
1296f94ccd6cSHong Zhang 
1297f94ccd6cSHong Zhang #if defined(PETSC_USE_INFO)
12981ce71dffSSatish Balay     {
12994222ddf1SHong Zhang       Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data;
13009371c9d4SSatish Balay       PetscCall(PetscInfo(C, "Use coloring of C=A*B^T; B^T: %" PetscInt_FMT " %" PetscInt_FMT ", Bt_dense: %" PetscInt_FMT ",%" PetscInt_FMT "; Cnz %" PetscInt_FMT " / (cm*ncolors %" PetscInt_FMT ") = %g\n", B->cmap->n, B->rmap->n, Bt_dense->rmap->n,
1301*f4f49eeaSPierre Jolivet                           Bt_dense->cmap->n, c->nz, A->rmap->n * matcoloring->ncolors, (double)(((PetscReal)c->nz) / ((PetscReal)(A->rmap->n * matcoloring->ncolors)))));
13021ce71dffSSatish Balay     }
1303f94ccd6cSHong Zhang #endif
13042128a86cSHong Zhang   }
1305e99be685SHong Zhang   /* clean up */
13069566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&Bt));
13073ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
13085df89d91SHong Zhang }
13095df89d91SHong Zhang 
1310d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C)
1311d71ae5a4SJacob Faibussowitsch {
13125df89d91SHong Zhang   Mat_SeqAIJ          *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data;
1313e2cac8caSJed Brown   PetscInt            *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, anzi, bnzj, nexta, nextb, *acol, *bcol, brow;
131481d82fe4SHong Zhang   PetscInt             cm = C->rmap->n, *ci = c->i, *cj = c->j, i, j, cnzi, *ccol;
13155df89d91SHong Zhang   PetscLogDouble       flops = 0.0;
1316aa1aec99SHong Zhang   MatScalar           *aa = a->a, *aval, *ba = b->a, *bval, *ca, *cval;
13176718818eSStefano Zampini   Mat_MatMatTransMult *abt;
13186718818eSStefano Zampini   Mat_Product         *product = C->product;
13195df89d91SHong Zhang 
13205df89d91SHong Zhang   PetscFunctionBegin;
132128b400f6SJacob Faibussowitsch   PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
13226718818eSStefano Zampini   abt = (Mat_MatMatTransMult *)product->data;
132328b400f6SJacob Faibussowitsch   PetscCheck(abt, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
132458ed3ceaSHong Zhang   /* clear old values in C */
132558ed3ceaSHong Zhang   if (!c->a) {
13269566063dSJacob Faibussowitsch     PetscCall(PetscCalloc1(ci[cm] + 1, &ca));
132758ed3ceaSHong Zhang     c->a      = ca;
132858ed3ceaSHong Zhang     c->free_a = PETSC_TRUE;
132958ed3ceaSHong Zhang   } else {
133058ed3ceaSHong Zhang     ca = c->a;
13319566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(ca, ci[cm] + 1));
133258ed3ceaSHong Zhang   }
133358ed3ceaSHong Zhang 
133440192850SHong Zhang   if (abt->usecoloring) {
133540192850SHong Zhang     MatTransposeColoring matcoloring = abt->matcoloring;
133640192850SHong Zhang     Mat                  Bt_dense, C_dense = abt->ABt_den;
1337c8db22f6SHong Zhang 
1338b9af6bddSHong Zhang     /* Get Bt_dense by Apply MatTransposeColoring to B */
133940192850SHong Zhang     Bt_dense = abt->Bt_den;
13409566063dSJacob Faibussowitsch     PetscCall(MatTransColoringApplySpToDen(matcoloring, B, Bt_dense));
1341c8db22f6SHong Zhang 
1342c8db22f6SHong Zhang     /* C_dense = A*Bt_dense */
13439566063dSJacob Faibussowitsch     PetscCall(MatMatMultNumeric_SeqAIJ_SeqDense(A, Bt_dense, C_dense));
1344c8db22f6SHong Zhang 
13452128a86cSHong Zhang     /* Recover C from C_dense */
13469566063dSJacob Faibussowitsch     PetscCall(MatTransColoringApplyDenToSp(matcoloring, C_dense, C));
13473ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
1348c8db22f6SHong Zhang   }
1349c8db22f6SHong Zhang 
13501fa1209cSHong Zhang   for (i = 0; i < cm; i++) {
135181d82fe4SHong Zhang     anzi = ai[i + 1] - ai[i];
13528e3a54c0SPierre Jolivet     acol = PetscSafePointerPlusOffset(aj, ai[i]);
13538e3a54c0SPierre Jolivet     aval = PetscSafePointerPlusOffset(aa, ai[i]);
13541fa1209cSHong Zhang     cnzi = ci[i + 1] - ci[i];
13558e3a54c0SPierre Jolivet     ccol = PetscSafePointerPlusOffset(cj, ci[i]);
13566973769bSHong Zhang     cval = ca + ci[i];
13571fa1209cSHong Zhang     for (j = 0; j < cnzi; j++) {
135881d82fe4SHong Zhang       brow = ccol[j];
135981d82fe4SHong Zhang       bnzj = bi[brow + 1] - bi[brow];
136081d82fe4SHong Zhang       bcol = bj + bi[brow];
13616973769bSHong Zhang       bval = ba + bi[brow];
13626973769bSHong Zhang 
136381d82fe4SHong Zhang       /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
13649371c9d4SSatish Balay       nexta = 0;
13659371c9d4SSatish Balay       nextb = 0;
136681d82fe4SHong Zhang       while (nexta < anzi && nextb < bnzj) {
13677b6d5e96SMark Adams         while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++;
136881d82fe4SHong Zhang         if (nexta == anzi) break;
13697b6d5e96SMark Adams         while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++;
137081d82fe4SHong Zhang         if (nextb == bnzj) break;
137181d82fe4SHong Zhang         if (acol[nexta] == bcol[nextb]) {
13726973769bSHong Zhang           cval[j] += aval[nexta] * bval[nextb];
13739371c9d4SSatish Balay           nexta++;
13749371c9d4SSatish Balay           nextb++;
137581d82fe4SHong Zhang           flops += 2;
13761fa1209cSHong Zhang         }
13771fa1209cSHong Zhang       }
137881d82fe4SHong Zhang     }
137981d82fe4SHong Zhang   }
13809566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
13819566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
13829566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(flops));
13833ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
13845df89d91SHong Zhang }
13855df89d91SHong Zhang 
1386d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(void *data)
1387d71ae5a4SJacob Faibussowitsch {
13886718818eSStefano Zampini   Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)data;
13896d373c3eSHong Zhang 
13906d373c3eSHong Zhang   PetscFunctionBegin;
13919566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&atb->At));
13921baa6e33SBarry Smith   if (atb->destroy) PetscCall((*atb->destroy)(atb->data));
13939566063dSJacob Faibussowitsch   PetscCall(PetscFree(atb));
13943ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
13956d373c3eSHong Zhang }
13966d373c3eSHong Zhang 
1397d71ae5a4SJacob Faibussowitsch PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C)
1398d71ae5a4SJacob Faibussowitsch {
1399089a957eSStefano Zampini   Mat          At      = NULL;
14004222ddf1SHong Zhang   Mat_Product *product = C->product;
1401089a957eSStefano Zampini   PetscBool    flg, def, square;
1402bc011b1eSHong Zhang 
1403bc011b1eSHong Zhang   PetscFunctionBegin;
1404089a957eSStefano Zampini   MatCheckProduct(C, 4);
1405b94d7dedSBarry Smith   square = (PetscBool)(A == B && A->symmetric == PETSC_BOOL3_TRUE);
14064222ddf1SHong Zhang   /* outerproduct */
14079566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "outerproduct", &flg));
14084222ddf1SHong Zhang   if (flg) {
1409bc011b1eSHong Zhang     /* create symbolic At */
1410089a957eSStefano Zampini     if (!square) {
14117fb60732SBarry Smith       PetscCall(MatTransposeSymbolic(A, &At));
14129566063dSJacob Faibussowitsch       PetscCall(MatSetBlockSizes(At, PetscAbs(A->cmap->bs), PetscAbs(B->cmap->bs)));
14139566063dSJacob Faibussowitsch       PetscCall(MatSetType(At, ((PetscObject)A)->type_name));
1414089a957eSStefano Zampini     }
1415bc011b1eSHong Zhang     /* get symbolic C=At*B */
14169566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C, "sorted"));
14179566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C));
1418bc011b1eSHong Zhang 
1419bc011b1eSHong Zhang     /* clean up */
142048a46eb9SPierre Jolivet     if (!square) PetscCall(MatDestroy(&At));
14216d373c3eSHong Zhang 
14224222ddf1SHong Zhang     C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */
14239566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C, "outerproduct"));
14243ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
14254222ddf1SHong Zhang   }
14264222ddf1SHong Zhang 
14274222ddf1SHong Zhang   /* matmatmult */
14289566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "default", &def));
14299566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "at*b", &flg));
1430089a957eSStefano Zampini   if (flg || def) {
14314222ddf1SHong Zhang     Mat_MatTransMatMult *atb;
14324222ddf1SHong Zhang 
143328b400f6SJacob Faibussowitsch     PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty");
14349566063dSJacob Faibussowitsch     PetscCall(PetscNew(&atb));
143548a46eb9SPierre Jolivet     if (!square) PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &At));
14369566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C, "sorted"));
14379566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C));
14389566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C, "at*b"));
14396718818eSStefano Zampini     product->data    = atb;
14406718818eSStefano Zampini     product->destroy = MatDestroy_SeqAIJ_MatTransMatMult;
14414222ddf1SHong Zhang     atb->At          = At;
14424222ddf1SHong Zhang 
14434222ddf1SHong Zhang     C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */
14443ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
14454222ddf1SHong Zhang   }
14464222ddf1SHong Zhang 
14474222ddf1SHong Zhang   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported");
1448bc011b1eSHong Zhang }
1449bc011b1eSHong Zhang 
1450d71ae5a4SJacob Faibussowitsch PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C)
1451d71ae5a4SJacob Faibussowitsch {
14520fbc74f4SHong Zhang   Mat_SeqAIJ    *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data;
1453d0f46423SBarry Smith   PetscInt       am = A->rmap->n, anzi, *ai = a->i, *aj = a->j, *bi = b->i, *bj, bnzi, nextb;
1454d0f46423SBarry Smith   PetscInt       cm = C->rmap->n, *ci = c->i, *cj = c->j, crow, *cjj, i, j, k;
1455d13dce4bSSatish Balay   PetscLogDouble flops = 0.0;
1456aa1aec99SHong Zhang   MatScalar     *aa    = a->a, *ba, *ca, *caj;
1457bc011b1eSHong Zhang 
1458bc011b1eSHong Zhang   PetscFunctionBegin;
1459aa1aec99SHong Zhang   if (!c->a) {
14609566063dSJacob Faibussowitsch     PetscCall(PetscCalloc1(ci[cm] + 1, &ca));
14612205254eSKarl Rupp 
1462aa1aec99SHong Zhang     c->a      = ca;
1463aa1aec99SHong Zhang     c->free_a = PETSC_TRUE;
1464aa1aec99SHong Zhang   } else {
1465aa1aec99SHong Zhang     ca = c->a;
14669566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(ca, ci[cm]));
1467aa1aec99SHong Zhang   }
1468bc011b1eSHong Zhang 
1469bc011b1eSHong Zhang   /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
1470bc011b1eSHong Zhang   for (i = 0; i < am; i++) {
1471bc011b1eSHong Zhang     bj   = b->j + bi[i];
1472bc011b1eSHong Zhang     ba   = b->a + bi[i];
1473bc011b1eSHong Zhang     bnzi = bi[i + 1] - bi[i];
1474bc011b1eSHong Zhang     anzi = ai[i + 1] - ai[i];
1475bc011b1eSHong Zhang     for (j = 0; j < anzi; j++) {
1476bc011b1eSHong Zhang       nextb = 0;
14770fbc74f4SHong Zhang       crow  = *aj++;
1478bc011b1eSHong Zhang       cjj   = cj + ci[crow];
1479bc011b1eSHong Zhang       caj   = ca + ci[crow];
1480bc011b1eSHong Zhang       /* perform sparse axpy operation.  Note cjj includes bj. */
1481bc011b1eSHong Zhang       for (k = 0; nextb < bnzi; k++) {
14820fbc74f4SHong Zhang         if (cjj[k] == *(bj + nextb)) { /* ccol == bcol */
14830fbc74f4SHong Zhang           caj[k] += (*aa) * (*(ba + nextb));
1484bc011b1eSHong Zhang           nextb++;
1485bc011b1eSHong Zhang         }
1486bc011b1eSHong Zhang       }
1487bc011b1eSHong Zhang       flops += 2 * bnzi;
14880fbc74f4SHong Zhang       aa++;
1489bc011b1eSHong Zhang     }
1490bc011b1eSHong Zhang   }
1491bc011b1eSHong Zhang 
1492bc011b1eSHong Zhang   /* Assemble the final matrix and clean up */
14939566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
14949566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
14959566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(flops));
14963ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1497bc011b1eSHong Zhang }
14989123193aSHong Zhang 
1499d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
1500d71ae5a4SJacob Faibussowitsch {
15019123193aSHong Zhang   PetscFunctionBegin;
15029566063dSJacob Faibussowitsch   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
15034222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense;
15043ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
15059123193aSHong Zhang }
15069123193aSHong Zhang 
1507d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A, Mat B, Mat C, const PetscBool add)
1508d71ae5a4SJacob Faibussowitsch {
1509f32d5d43SBarry Smith   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
15101ca9667aSStefano Zampini   PetscScalar       *c, r1, r2, r3, r4, *c1, *c2, *c3, *c4;
1511a4af7ca8SStefano Zampini   const PetscScalar *aa, *b, *b1, *b2, *b3, *b4, *av;
1512daf57211SHong Zhang   const PetscInt    *aj;
151375f6d85dSStefano Zampini   PetscInt           cm = C->rmap->n, cn = B->cmap->n, bm, am = A->rmap->n;
151475f6d85dSStefano Zampini   PetscInt           clda;
151575f6d85dSStefano Zampini   PetscInt           am4, bm4, col, i, j, n;
15169123193aSHong Zhang 
15179123193aSHong Zhang   PetscFunctionBegin;
15183ba16761SJacob Faibussowitsch   if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS);
15199566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJGetArrayRead(A, &av));
152093aa15f2SStefano Zampini   if (add) {
15219566063dSJacob Faibussowitsch     PetscCall(MatDenseGetArray(C, &c));
152293aa15f2SStefano Zampini   } else {
15239566063dSJacob Faibussowitsch     PetscCall(MatDenseGetArrayWrite(C, &c));
152493aa15f2SStefano Zampini   }
15259566063dSJacob Faibussowitsch   PetscCall(MatDenseGetArrayRead(B, &b));
15269566063dSJacob Faibussowitsch   PetscCall(MatDenseGetLDA(B, &bm));
15279566063dSJacob Faibussowitsch   PetscCall(MatDenseGetLDA(C, &clda));
152875f6d85dSStefano Zampini   am4 = 4 * clda;
152975f6d85dSStefano Zampini   bm4 = 4 * bm;
15305c0db29aSPierre Jolivet   if (b) {
15319371c9d4SSatish Balay     b1 = b;
15329371c9d4SSatish Balay     b2 = b1 + bm;
15339371c9d4SSatish Balay     b3 = b2 + bm;
15349371c9d4SSatish Balay     b4 = b3 + bm;
15355c0db29aSPierre Jolivet   } else b1 = b2 = b3 = b4 = NULL;
15369371c9d4SSatish Balay   c1 = c;
15379371c9d4SSatish Balay   c2 = c1 + clda;
15389371c9d4SSatish Balay   c3 = c2 + clda;
15399371c9d4SSatish Balay   c4 = c3 + clda;
15401ca9667aSStefano Zampini   for (col = 0; col < (cn / 4) * 4; col += 4) { /* over columns of C */
15411ca9667aSStefano Zampini     for (i = 0; i < am; i++) {                  /* over rows of A in those columns */
1542f32d5d43SBarry Smith       r1 = r2 = r3 = r4 = 0.0;
1543f32d5d43SBarry Smith       n                 = a->i[i + 1] - a->i[i];
15448e3a54c0SPierre Jolivet       aj                = PetscSafePointerPlusOffset(a->j, a->i[i]);
15458e3a54c0SPierre Jolivet       aa                = PetscSafePointerPlusOffset(av, a->i[i]);
1546f32d5d43SBarry Smith       for (j = 0; j < n; j++) {
15471ca9667aSStefano Zampini         const PetscScalar aatmp = aa[j];
15481ca9667aSStefano Zampini         const PetscInt    ajtmp = aj[j];
1549730858b9SHong Zhang         r1 += aatmp * b1[ajtmp];
1550730858b9SHong Zhang         r2 += aatmp * b2[ajtmp];
1551730858b9SHong Zhang         r3 += aatmp * b3[ajtmp];
1552730858b9SHong Zhang         r4 += aatmp * b4[ajtmp];
15539123193aSHong Zhang       }
155493aa15f2SStefano Zampini       if (add) {
155587753ddeSHong Zhang         c1[i] += r1;
155687753ddeSHong Zhang         c2[i] += r2;
155787753ddeSHong Zhang         c3[i] += r3;
155887753ddeSHong Zhang         c4[i] += r4;
155993aa15f2SStefano Zampini       } else {
156093aa15f2SStefano Zampini         c1[i] = r1;
156193aa15f2SStefano Zampini         c2[i] = r2;
156293aa15f2SStefano Zampini         c3[i] = r3;
156393aa15f2SStefano Zampini         c4[i] = r4;
156493aa15f2SStefano Zampini       }
1565f32d5d43SBarry Smith     }
15665c0db29aSPierre Jolivet     if (b) {
15679371c9d4SSatish Balay       b1 += bm4;
15689371c9d4SSatish Balay       b2 += bm4;
15699371c9d4SSatish Balay       b3 += bm4;
15709371c9d4SSatish Balay       b4 += bm4;
15715c0db29aSPierre Jolivet     }
15729371c9d4SSatish Balay     c1 += am4;
15739371c9d4SSatish Balay     c2 += am4;
15749371c9d4SSatish Balay     c3 += am4;
15759371c9d4SSatish Balay     c4 += am4;
1576f32d5d43SBarry Smith   }
157793aa15f2SStefano Zampini   /* process remaining columns */
157893aa15f2SStefano Zampini   if (col != cn) {
157993aa15f2SStefano Zampini     PetscInt rc = cn - col;
158093aa15f2SStefano Zampini 
158193aa15f2SStefano Zampini     if (rc == 1) {
158293aa15f2SStefano Zampini       for (i = 0; i < am; i++) {
1583f32d5d43SBarry Smith         r1 = 0.0;
1584f32d5d43SBarry Smith         n  = a->i[i + 1] - a->i[i];
15858e3a54c0SPierre Jolivet         aj = PetscSafePointerPlusOffset(a->j, a->i[i]);
15868e3a54c0SPierre Jolivet         aa = PetscSafePointerPlusOffset(av, a->i[i]);
158793aa15f2SStefano Zampini         for (j = 0; j < n; j++) r1 += aa[j] * b1[aj[j]];
158893aa15f2SStefano Zampini         if (add) c1[i] += r1;
158993aa15f2SStefano Zampini         else c1[i] = r1;
159093aa15f2SStefano Zampini       }
159193aa15f2SStefano Zampini     } else if (rc == 2) {
159293aa15f2SStefano Zampini       for (i = 0; i < am; i++) {
159393aa15f2SStefano Zampini         r1 = r2 = 0.0;
159493aa15f2SStefano Zampini         n       = a->i[i + 1] - a->i[i];
15958e3a54c0SPierre Jolivet         aj      = PetscSafePointerPlusOffset(a->j, a->i[i]);
15968e3a54c0SPierre Jolivet         aa      = PetscSafePointerPlusOffset(av, a->i[i]);
1597f32d5d43SBarry Smith         for (j = 0; j < n; j++) {
159893aa15f2SStefano Zampini           const PetscScalar aatmp = aa[j];
159993aa15f2SStefano Zampini           const PetscInt    ajtmp = aj[j];
160093aa15f2SStefano Zampini           r1 += aatmp * b1[ajtmp];
160193aa15f2SStefano Zampini           r2 += aatmp * b2[ajtmp];
1602f32d5d43SBarry Smith         }
160393aa15f2SStefano Zampini         if (add) {
160487753ddeSHong Zhang           c1[i] += r1;
160593aa15f2SStefano Zampini           c2[i] += r2;
160693aa15f2SStefano Zampini         } else {
160793aa15f2SStefano Zampini           c1[i] = r1;
160893aa15f2SStefano Zampini           c2[i] = r2;
1609f32d5d43SBarry Smith         }
161093aa15f2SStefano Zampini       }
161193aa15f2SStefano Zampini     } else {
161293aa15f2SStefano Zampini       for (i = 0; i < am; i++) {
161393aa15f2SStefano Zampini         r1 = r2 = r3 = 0.0;
161493aa15f2SStefano Zampini         n            = a->i[i + 1] - a->i[i];
16158e3a54c0SPierre Jolivet         aj           = PetscSafePointerPlusOffset(a->j, a->i[i]);
16168e3a54c0SPierre Jolivet         aa           = PetscSafePointerPlusOffset(av, a->i[i]);
161793aa15f2SStefano Zampini         for (j = 0; j < n; j++) {
161893aa15f2SStefano Zampini           const PetscScalar aatmp = aa[j];
161993aa15f2SStefano Zampini           const PetscInt    ajtmp = aj[j];
162093aa15f2SStefano Zampini           r1 += aatmp * b1[ajtmp];
162193aa15f2SStefano Zampini           r2 += aatmp * b2[ajtmp];
162293aa15f2SStefano Zampini           r3 += aatmp * b3[ajtmp];
162393aa15f2SStefano Zampini         }
162493aa15f2SStefano Zampini         if (add) {
162593aa15f2SStefano Zampini           c1[i] += r1;
162693aa15f2SStefano Zampini           c2[i] += r2;
162793aa15f2SStefano Zampini           c3[i] += r3;
162893aa15f2SStefano Zampini         } else {
162993aa15f2SStefano Zampini           c1[i] = r1;
163093aa15f2SStefano Zampini           c2[i] = r2;
163193aa15f2SStefano Zampini           c3[i] = r3;
163293aa15f2SStefano Zampini         }
163393aa15f2SStefano Zampini       }
163493aa15f2SStefano Zampini     }
1635f32d5d43SBarry Smith   }
16369566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(cn * (2.0 * a->nz)));
163793aa15f2SStefano Zampini   if (add) {
16389566063dSJacob Faibussowitsch     PetscCall(MatDenseRestoreArray(C, &c));
163993aa15f2SStefano Zampini   } else {
16409566063dSJacob Faibussowitsch     PetscCall(MatDenseRestoreArrayWrite(C, &c));
164193aa15f2SStefano Zampini   }
16429566063dSJacob Faibussowitsch   PetscCall(MatDenseRestoreArrayRead(B, &b));
16439566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJRestoreArrayRead(A, &av));
16443ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1645f32d5d43SBarry Smith }
1646f32d5d43SBarry Smith 
1647d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A, Mat B, Mat C)
1648d71ae5a4SJacob Faibussowitsch {
1649f32d5d43SBarry Smith   PetscFunctionBegin;
165008401ef6SPierre Jolivet   PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
165108401ef6SPierre Jolivet   PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
165208401ef6SPierre Jolivet   PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
16534324174eSBarry Smith 
16549566063dSJacob Faibussowitsch   PetscCall(MatMatMultNumericAdd_SeqAIJ_SeqDense(A, B, C, PETSC_FALSE));
16553ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16569123193aSHong Zhang }
1657b1683b59SHong Zhang 
1658d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C)
1659d71ae5a4SJacob Faibussowitsch {
16604222ddf1SHong Zhang   PetscFunctionBegin;
16614222ddf1SHong Zhang   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense;
16624222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
16633ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16644222ddf1SHong Zhang }
16654222ddf1SHong Zhang 
16666718818eSStefano Zampini PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat, Mat, PetscReal, Mat);
16676718818eSStefano Zampini 
1668d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C)
1669d71ae5a4SJacob Faibussowitsch {
16704222ddf1SHong Zhang   PetscFunctionBegin;
16716718818eSStefano Zampini   C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
16724222ddf1SHong Zhang   C->ops->productsymbolic          = MatProductSymbolic_AtB;
16733ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16746718818eSStefano Zampini }
16756718818eSStefano Zampini 
1676d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C)
1677d71ae5a4SJacob Faibussowitsch {
16786718818eSStefano Zampini   PetscFunctionBegin;
16796718818eSStefano Zampini   C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
16806718818eSStefano Zampini   C->ops->productsymbolic          = MatProductSymbolic_ABt;
16813ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16824222ddf1SHong Zhang }
16834222ddf1SHong Zhang 
1684d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C)
1685d71ae5a4SJacob Faibussowitsch {
16864222ddf1SHong Zhang   Mat_Product *product = C->product;
16874222ddf1SHong Zhang 
16884222ddf1SHong Zhang   PetscFunctionBegin;
16894222ddf1SHong Zhang   switch (product->type) {
1690d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_AB:
1691d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C));
1692d71ae5a4SJacob Faibussowitsch     break;
1693d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_AtB:
1694d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C));
1695d71ae5a4SJacob Faibussowitsch     break;
1696d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_ABt:
1697d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C));
1698d71ae5a4SJacob Faibussowitsch     break;
1699d71ae5a4SJacob Faibussowitsch   default:
1700d71ae5a4SJacob Faibussowitsch     break;
17014222ddf1SHong Zhang   }
17023ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
17034222ddf1SHong Zhang }
17042ef1f0ffSBarry Smith 
1705d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C)
1706d71ae5a4SJacob Faibussowitsch {
17074222ddf1SHong Zhang   Mat_Product *product = C->product;
17084222ddf1SHong Zhang   Mat          A       = product->A;
17094222ddf1SHong Zhang   PetscBool    baij;
17104222ddf1SHong Zhang 
17114222ddf1SHong Zhang   PetscFunctionBegin;
17129566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQBAIJ, &baij));
17134222ddf1SHong Zhang   if (!baij) { /* A is seqsbaij */
17144222ddf1SHong Zhang     PetscBool sbaij;
17159566063dSJacob Faibussowitsch     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQSBAIJ, &sbaij));
171628b400f6SJacob Faibussowitsch     PetscCheck(sbaij, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "Mat must be either seqbaij or seqsbaij format");
17174222ddf1SHong Zhang 
17184222ddf1SHong Zhang     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense;
17194222ddf1SHong Zhang   } else { /* A is seqbaij */
17204222ddf1SHong Zhang     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense;
17214222ddf1SHong Zhang   }
17224222ddf1SHong Zhang 
17234222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
17243ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
17254222ddf1SHong Zhang }
17264222ddf1SHong Zhang 
1727d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C)
1728d71ae5a4SJacob Faibussowitsch {
17294222ddf1SHong Zhang   Mat_Product *product = C->product;
17304222ddf1SHong Zhang 
17314222ddf1SHong Zhang   PetscFunctionBegin;
17326718818eSStefano Zampini   MatCheckProduct(C, 1);
173328b400f6SJacob Faibussowitsch   PetscCheck(product->A, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing A");
1734b94d7dedSBarry Smith   if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric == PETSC_BOOL3_TRUE)) PetscCall(MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C));
17353ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
17364222ddf1SHong Zhang }
17376718818eSStefano Zampini 
1738d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C)
1739d71ae5a4SJacob Faibussowitsch {
17404222ddf1SHong Zhang   PetscFunctionBegin;
17414222ddf1SHong Zhang   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ;
17424222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
17433ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
17444222ddf1SHong Zhang }
17454222ddf1SHong Zhang 
1746d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C)
1747d71ae5a4SJacob Faibussowitsch {
17484222ddf1SHong Zhang   Mat_Product *product = C->product;
17494222ddf1SHong Zhang 
17504222ddf1SHong Zhang   PetscFunctionBegin;
175148a46eb9SPierre Jolivet   if (product->type == MATPRODUCT_AB) PetscCall(MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C));
17523ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
17534222ddf1SHong Zhang }
17544222ddf1SHong Zhang 
1755d71ae5a4SJacob Faibussowitsch PetscErrorCode MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring, Mat B, Mat Btdense)
1756d71ae5a4SJacob Faibussowitsch {
17572f5f1f90SHong Zhang   Mat_SeqAIJ   *b       = (Mat_SeqAIJ *)B->data;
17582f5f1f90SHong Zhang   Mat_SeqDense *btdense = (Mat_SeqDense *)Btdense->data;
17592f5f1f90SHong Zhang   PetscInt     *bi = b->i, *bj = b->j;
17602f5f1f90SHong Zhang   PetscInt      m = Btdense->rmap->n, n = Btdense->cmap->n, j, k, l, col, anz, *btcol, brow, ncolumns;
17612f5f1f90SHong Zhang   MatScalar    *btval, *btval_den, *ba = b->a;
17622f5f1f90SHong Zhang   PetscInt     *columns = coloring->columns, *colorforcol = coloring->colorforcol, ncolors = coloring->ncolors;
1763c8db22f6SHong Zhang 
1764c8db22f6SHong Zhang   PetscFunctionBegin;
17652f5f1f90SHong Zhang   btval_den = btdense->v;
17669566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(btval_den, m * n));
17672f5f1f90SHong Zhang   for (k = 0; k < ncolors; k++) {
17682f5f1f90SHong Zhang     ncolumns = coloring->ncolumns[k];
17692f5f1f90SHong Zhang     for (l = 0; l < ncolumns; l++) { /* insert a row of B to a column of Btdense */
1770d2853540SHong Zhang       col   = *(columns + colorforcol[k] + l);
17712f5f1f90SHong Zhang       btcol = bj + bi[col];
17722f5f1f90SHong Zhang       btval = ba + bi[col];
17732f5f1f90SHong Zhang       anz   = bi[col + 1] - bi[col];
1774d2853540SHong Zhang       for (j = 0; j < anz; j++) {
17752f5f1f90SHong Zhang         brow            = btcol[j];
17762f5f1f90SHong Zhang         btval_den[brow] = btval[j];
1777c8db22f6SHong Zhang       }
1778c8db22f6SHong Zhang     }
17792f5f1f90SHong Zhang     btval_den += m;
1780c8db22f6SHong Zhang   }
17813ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1782c8db22f6SHong Zhang }
1783c8db22f6SHong Zhang 
1784d71ae5a4SJacob Faibussowitsch PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring, Mat Cden, Mat Csp)
1785d71ae5a4SJacob Faibussowitsch {
1786b2d2b04fSHong Zhang   Mat_SeqAIJ        *csp = (Mat_SeqAIJ *)Csp->data;
17871683a169SBarry Smith   const PetscScalar *ca_den, *ca_den_ptr;
17881683a169SBarry Smith   PetscScalar       *ca = csp->a;
1789f99a636bSHong Zhang   PetscInt           k, l, m = Cden->rmap->n, ncolors = matcoloring->ncolors;
1790e88777f2SHong Zhang   PetscInt           brows = matcoloring->brows, *den2sp = matcoloring->den2sp;
1791077f23c2SHong Zhang   PetscInt           nrows, *row, *idx;
1792077f23c2SHong Zhang   PetscInt          *rows = matcoloring->rows, *colorforrow = matcoloring->colorforrow;
17938972f759SHong Zhang 
17948972f759SHong Zhang   PetscFunctionBegin;
17959566063dSJacob Faibussowitsch   PetscCall(MatDenseGetArrayRead(Cden, &ca_den));
1796f99a636bSHong Zhang 
1797077f23c2SHong Zhang   if (brows > 0) {
1798077f23c2SHong Zhang     PetscInt *lstart, row_end, row_start;
1799980ae229SHong Zhang     lstart = matcoloring->lstart;
18009566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(lstart, ncolors));
1801eeb4fd02SHong Zhang 
1802077f23c2SHong Zhang     row_end = brows;
1803eeb4fd02SHong Zhang     if (row_end > m) row_end = m;
1804077f23c2SHong Zhang     for (row_start = 0; row_start < m; row_start += brows) { /* loop over row blocks of Csp */
1805077f23c2SHong Zhang       ca_den_ptr = ca_den;
1806980ae229SHong Zhang       for (k = 0; k < ncolors; k++) { /* loop over colors (columns of Cden) */
1807eeb4fd02SHong Zhang         nrows = matcoloring->nrows[k];
1808eeb4fd02SHong Zhang         row   = rows + colorforrow[k];
1809eeb4fd02SHong Zhang         idx   = den2sp + colorforrow[k];
1810eeb4fd02SHong Zhang         for (l = lstart[k]; l < nrows; l++) {
1811eeb4fd02SHong Zhang           if (row[l] >= row_end) {
1812eeb4fd02SHong Zhang             lstart[k] = l;
1813eeb4fd02SHong Zhang             break;
1814eeb4fd02SHong Zhang           } else {
1815077f23c2SHong Zhang             ca[idx[l]] = ca_den_ptr[row[l]];
1816eeb4fd02SHong Zhang           }
1817eeb4fd02SHong Zhang         }
1818077f23c2SHong Zhang         ca_den_ptr += m;
1819eeb4fd02SHong Zhang       }
1820077f23c2SHong Zhang       row_end += brows;
1821eeb4fd02SHong Zhang       if (row_end > m) row_end = m;
1822eeb4fd02SHong Zhang     }
1823077f23c2SHong Zhang   } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */
1824077f23c2SHong Zhang     ca_den_ptr = ca_den;
1825077f23c2SHong Zhang     for (k = 0; k < ncolors; k++) {
1826077f23c2SHong Zhang       nrows = matcoloring->nrows[k];
1827077f23c2SHong Zhang       row   = rows + colorforrow[k];
1828077f23c2SHong Zhang       idx   = den2sp + colorforrow[k];
1829ad540459SPierre Jolivet       for (l = 0; l < nrows; l++) ca[idx[l]] = ca_den_ptr[row[l]];
1830077f23c2SHong Zhang       ca_den_ptr += m;
1831077f23c2SHong Zhang     }
1832f99a636bSHong Zhang   }
1833f99a636bSHong Zhang 
18349566063dSJacob Faibussowitsch   PetscCall(MatDenseRestoreArrayRead(Cden, &ca_den));
1835f99a636bSHong Zhang #if defined(PETSC_USE_INFO)
1836077f23c2SHong Zhang   if (matcoloring->brows > 0) {
18379566063dSJacob Faibussowitsch     PetscCall(PetscInfo(Csp, "Loop over %" PetscInt_FMT " row blocks for den2sp\n", brows));
1838e88777f2SHong Zhang   } else {
18399566063dSJacob Faibussowitsch     PetscCall(PetscInfo(Csp, "Loop over colors/columns of Cden, inefficient for large sparse matrix product \n"));
1840e88777f2SHong Zhang   }
1841f99a636bSHong Zhang #endif
18423ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
18438972f759SHong Zhang }
18448972f759SHong Zhang 
1845d71ae5a4SJacob Faibussowitsch PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat, ISColoring iscoloring, MatTransposeColoring c)
1846d71ae5a4SJacob Faibussowitsch {
1847e88777f2SHong Zhang   PetscInt        i, n, nrows, Nbs, j, k, m, ncols, col, cm;
18481a83f524SJed Brown   const PetscInt *is, *ci, *cj, *row_idx;
1849b28d80bdSHong Zhang   PetscInt        nis = iscoloring->n, *rowhit, bs = 1;
1850b1683b59SHong Zhang   IS             *isa;
1851b28d80bdSHong Zhang   Mat_SeqAIJ     *csp = (Mat_SeqAIJ *)mat->data;
1852e88777f2SHong Zhang   PetscInt       *colorforrow, *rows, *rows_i, *idxhit, *spidx, *den2sp, *den2sp_i;
1853e88777f2SHong Zhang   PetscInt       *colorforcol, *columns, *columns_i, brows;
1854e88777f2SHong Zhang   PetscBool       flg;
1855b1683b59SHong Zhang 
1856b1683b59SHong Zhang   PetscFunctionBegin;
18579566063dSJacob Faibussowitsch   PetscCall(ISColoringGetIS(iscoloring, PETSC_USE_POINTER, PETSC_IGNORE, &isa));
1858e99be685SHong Zhang 
18597ecccc15SHong Zhang   /* bs >1 is not being tested yet! */
1860e88777f2SHong Zhang   Nbs       = mat->cmap->N / bs;
1861b1683b59SHong Zhang   c->M      = mat->rmap->N / bs; /* set total rows, columns and local rows */
1862e88777f2SHong Zhang   c->N      = Nbs;
1863e88777f2SHong Zhang   c->m      = c->M;
1864b1683b59SHong Zhang   c->rstart = 0;
1865e88777f2SHong Zhang   c->brows  = 100;
1866b1683b59SHong Zhang 
1867b1683b59SHong Zhang   c->ncolors = nis;
18689566063dSJacob Faibussowitsch   PetscCall(PetscMalloc3(nis, &c->ncolumns, nis, &c->nrows, nis + 1, &colorforrow));
18699566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(csp->nz + 1, &rows));
18709566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(csp->nz + 1, &den2sp));
1871e88777f2SHong Zhang 
1872e88777f2SHong Zhang   brows = c->brows;
18739566063dSJacob Faibussowitsch   PetscCall(PetscOptionsGetInt(NULL, NULL, "-matden2sp_brows", &brows, &flg));
1874e88777f2SHong Zhang   if (flg) c->brows = brows;
187548a46eb9SPierre Jolivet   if (brows > 0) PetscCall(PetscMalloc1(nis + 1, &c->lstart));
18762205254eSKarl Rupp 
1877d2853540SHong Zhang   colorforrow[0] = 0;
1878d2853540SHong Zhang   rows_i         = rows;
1879f99a636bSHong Zhang   den2sp_i       = den2sp;
1880b1683b59SHong Zhang 
18819566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nis + 1, &colorforcol));
18829566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(Nbs + 1, &columns));
18832205254eSKarl Rupp 
1884d2853540SHong Zhang   colorforcol[0] = 0;
1885d2853540SHong Zhang   columns_i      = columns;
1886d2853540SHong Zhang 
1887077f23c2SHong Zhang   /* get column-wise storage of mat */
18889566063dSJacob Faibussowitsch   PetscCall(MatGetColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL));
1889b1683b59SHong Zhang 
1890b28d80bdSHong Zhang   cm = c->m;
18919566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(cm + 1, &rowhit));
18929566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(cm + 1, &idxhit));
1893f99a636bSHong Zhang   for (i = 0; i < nis; i++) { /* loop over color */
18949566063dSJacob Faibussowitsch     PetscCall(ISGetLocalSize(isa[i], &n));
18959566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(isa[i], &is));
18962205254eSKarl Rupp 
1897b1683b59SHong Zhang     c->ncolumns[i] = n;
18981baa6e33SBarry Smith     if (n) PetscCall(PetscArraycpy(columns_i, is, n));
1899d2853540SHong Zhang     colorforcol[i + 1] = colorforcol[i] + n;
1900d2853540SHong Zhang     columns_i += n;
1901b1683b59SHong Zhang 
1902b1683b59SHong Zhang     /* fast, crude version requires O(N*N) work */
19039566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(rowhit, cm));
1904e99be685SHong Zhang 
1905b7caf3d6SHong Zhang     for (j = 0; j < n; j++) { /* loop over columns*/
1906b1683b59SHong Zhang       col     = is[j];
1907d2853540SHong Zhang       row_idx = cj + ci[col];
1908b1683b59SHong Zhang       m       = ci[col + 1] - ci[col];
1909b7caf3d6SHong Zhang       for (k = 0; k < m; k++) { /* loop over columns marking them in rowhit */
1910e99be685SHong Zhang         idxhit[*row_idx]   = spidx[ci[col] + k];
1911d2853540SHong Zhang         rowhit[*row_idx++] = col + 1;
1912b1683b59SHong Zhang       }
1913b1683b59SHong Zhang     }
1914b1683b59SHong Zhang     /* count the number of hits */
1915b1683b59SHong Zhang     nrows = 0;
1916e8354b3eSHong Zhang     for (j = 0; j < cm; j++) {
1917b1683b59SHong Zhang       if (rowhit[j]) nrows++;
1918b1683b59SHong Zhang     }
1919b1683b59SHong Zhang     c->nrows[i]        = nrows;
1920d2853540SHong Zhang     colorforrow[i + 1] = colorforrow[i] + nrows;
1921d2853540SHong Zhang 
1922b1683b59SHong Zhang     nrows = 0;
1923b7caf3d6SHong Zhang     for (j = 0; j < cm; j++) { /* loop over rows */
1924b1683b59SHong Zhang       if (rowhit[j]) {
1925d2853540SHong Zhang         rows_i[nrows]   = j;
192612b89a43SHong Zhang         den2sp_i[nrows] = idxhit[j];
1927b1683b59SHong Zhang         nrows++;
1928b1683b59SHong Zhang       }
1929b1683b59SHong Zhang     }
1930e88777f2SHong Zhang     den2sp_i += nrows;
1931077f23c2SHong Zhang 
19329566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(isa[i], &is));
1933f99a636bSHong Zhang     rows_i += nrows;
1934b1683b59SHong Zhang   }
19359566063dSJacob Faibussowitsch   PetscCall(MatRestoreColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL));
19369566063dSJacob Faibussowitsch   PetscCall(PetscFree(rowhit));
19379566063dSJacob Faibussowitsch   PetscCall(ISColoringRestoreIS(iscoloring, PETSC_USE_POINTER, &isa));
19382c71b3e2SJacob Faibussowitsch   PetscCheck(csp->nz == colorforrow[nis], PETSC_COMM_SELF, PETSC_ERR_PLIB, "csp->nz %" PetscInt_FMT " != colorforrow[nis] %" PetscInt_FMT, csp->nz, colorforrow[nis]);
1939b28d80bdSHong Zhang 
1940d2853540SHong Zhang   c->colorforrow = colorforrow;
1941d2853540SHong Zhang   c->rows        = rows;
1942f99a636bSHong Zhang   c->den2sp      = den2sp;
1943d2853540SHong Zhang   c->colorforcol = colorforcol;
1944d2853540SHong Zhang   c->columns     = columns;
19452205254eSKarl Rupp 
19469566063dSJacob Faibussowitsch   PetscCall(PetscFree(idxhit));
19473ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1948b1683b59SHong Zhang }
1949d013fc79Sandi selinger 
1950d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C)
1951d71ae5a4SJacob Faibussowitsch {
19524222ddf1SHong Zhang   Mat_Product *product = C->product;
19534222ddf1SHong Zhang   Mat          A = product->A, B = product->B;
19544222ddf1SHong Zhang 
1955df97dc6dSFande Kong   PetscFunctionBegin;
19564222ddf1SHong Zhang   if (C->ops->mattransposemultnumeric) {
19574222ddf1SHong Zhang     /* Alg: "outerproduct" */
19589566063dSJacob Faibussowitsch     PetscCall((*C->ops->mattransposemultnumeric)(A, B, C));
19594222ddf1SHong Zhang   } else {
19604222ddf1SHong Zhang     /* Alg: "matmatmult" -- C = At*B */
19616718818eSStefano Zampini     Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)product->data;
19624222ddf1SHong Zhang 
196328b400f6SJacob Faibussowitsch     PetscCheck(atb, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
19641cdffd5eSHong Zhang     if (atb->At) {
19651cdffd5eSHong Zhang       /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ();
19661cdffd5eSHong Zhang          user may have called MatProductReplaceMats() to get this A=product->A */
19671cdffd5eSHong Zhang       PetscCall(MatTransposeSetPrecursor(A, atb->At));
19687fb60732SBarry Smith       PetscCall(MatTranspose(A, MAT_REUSE_MATRIX, &atb->At));
19694222ddf1SHong Zhang     }
19707fb60732SBarry Smith     PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ(atb->At ? atb->At : A, B, C));
19714222ddf1SHong Zhang   }
19723ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1973df97dc6dSFande Kong }
1974df97dc6dSFande Kong 
1975d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C)
1976d71ae5a4SJacob Faibussowitsch {
19774222ddf1SHong Zhang   Mat_Product *product = C->product;
19784222ddf1SHong Zhang   Mat          A = product->A, B = product->B;
19794222ddf1SHong Zhang   PetscReal    fill = product->fill;
1980d013fc79Sandi selinger 
1981d013fc79Sandi selinger   PetscFunctionBegin;
19829566063dSJacob Faibussowitsch   PetscCall(MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A, B, fill, C));
19832869b61bSandi selinger 
19844222ddf1SHong Zhang   C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ;
19853ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
19862869b61bSandi selinger }
1987d013fc79Sandi selinger 
1988d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C)
1989d71ae5a4SJacob Faibussowitsch {
19904222ddf1SHong Zhang   Mat_Product *product = C->product;
19914222ddf1SHong Zhang   PetscInt     alg     = 0; /* default algorithm */
19924222ddf1SHong Zhang   PetscBool    flg     = PETSC_FALSE;
19934222ddf1SHong Zhang #if !defined(PETSC_HAVE_HYPRE)
19944222ddf1SHong Zhang   const char *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"};
19954222ddf1SHong Zhang   PetscInt    nalg        = 7;
19964222ddf1SHong Zhang #else
19974222ddf1SHong Zhang   const char *algTypes[8] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge", "hypre"};
19984222ddf1SHong Zhang   PetscInt    nalg        = 8;
19994222ddf1SHong Zhang #endif
20004222ddf1SHong Zhang 
20014222ddf1SHong Zhang   PetscFunctionBegin;
20024222ddf1SHong Zhang   /* Set default algorithm */
20039566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(C->product->alg, "default", &flg));
200448a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
2005d013fc79Sandi selinger 
20064222ddf1SHong Zhang   /* Get runtime option */
20074222ddf1SHong Zhang   if (product->api_user) {
2008d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMult", "Mat");
20099566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-matmatmult_via", "Algorithmic approach", "MatMatMult", algTypes, nalg, algTypes[0], &alg, &flg));
2010d0609cedSBarry Smith     PetscOptionsEnd();
20114222ddf1SHong Zhang   } else {
2012d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AB", "Mat");
20139566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AB", algTypes, nalg, algTypes[0], &alg, &flg));
2014d0609cedSBarry Smith     PetscOptionsEnd();
2015d013fc79Sandi selinger   }
201648a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
2017d013fc79Sandi selinger 
20184222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
20194222ddf1SHong Zhang   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ;
20203ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
20214222ddf1SHong Zhang }
2022d013fc79Sandi selinger 
2023d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C)
2024d71ae5a4SJacob Faibussowitsch {
20254222ddf1SHong Zhang   Mat_Product *product     = C->product;
20264222ddf1SHong Zhang   PetscInt     alg         = 0; /* default algorithm */
20274222ddf1SHong Zhang   PetscBool    flg         = PETSC_FALSE;
2028089a957eSStefano Zampini   const char  *algTypes[3] = {"default", "at*b", "outerproduct"};
2029089a957eSStefano Zampini   PetscInt     nalg        = 3;
2030d013fc79Sandi selinger 
20314222ddf1SHong Zhang   PetscFunctionBegin;
20324222ddf1SHong Zhang   /* Get runtime option */
20334222ddf1SHong Zhang   if (product->api_user) {
2034d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatTransposeMatMult", "Mat");
20359566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mattransposematmult_via", "Algorithmic approach", "MatTransposeMatMult", algTypes, nalg, algTypes[alg], &alg, &flg));
2036d0609cedSBarry Smith     PetscOptionsEnd();
20374222ddf1SHong Zhang   } else {
2038d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AtB", "Mat");
20399566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AtB", algTypes, nalg, algTypes[alg], &alg, &flg));
2040d0609cedSBarry Smith     PetscOptionsEnd();
20414222ddf1SHong Zhang   }
204248a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
2043d013fc79Sandi selinger 
20444222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ;
20453ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
20464222ddf1SHong Zhang }
20474222ddf1SHong Zhang 
2048d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C)
2049d71ae5a4SJacob Faibussowitsch {
20504222ddf1SHong Zhang   Mat_Product *product     = C->product;
20514222ddf1SHong Zhang   PetscInt     alg         = 0; /* default algorithm */
20524222ddf1SHong Zhang   PetscBool    flg         = PETSC_FALSE;
20534222ddf1SHong Zhang   const char  *algTypes[2] = {"default", "color"};
20544222ddf1SHong Zhang   PetscInt     nalg        = 2;
20554222ddf1SHong Zhang 
20564222ddf1SHong Zhang   PetscFunctionBegin;
20574222ddf1SHong Zhang   /* Set default algorithm */
20589566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(C->product->alg, "default", &flg));
20594222ddf1SHong Zhang   if (!flg) {
20604222ddf1SHong Zhang     alg = 1;
20619566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
20624222ddf1SHong Zhang   }
20634222ddf1SHong Zhang 
20644222ddf1SHong Zhang   /* Get runtime option */
20654222ddf1SHong Zhang   if (product->api_user) {
2066d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatTransposeMult", "Mat");
20679566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-matmattransmult_via", "Algorithmic approach", "MatMatTransposeMult", algTypes, nalg, algTypes[alg], &alg, &flg));
2068d0609cedSBarry Smith     PetscOptionsEnd();
20694222ddf1SHong Zhang   } else {
2070d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABt", "Mat");
20719566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABt", algTypes, nalg, algTypes[alg], &alg, &flg));
2072d0609cedSBarry Smith     PetscOptionsEnd();
20734222ddf1SHong Zhang   }
207448a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
20754222ddf1SHong Zhang 
20764222ddf1SHong Zhang   C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ;
20774222ddf1SHong Zhang   C->ops->productsymbolic          = MatProductSymbolic_ABt;
20783ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
20794222ddf1SHong Zhang }
20804222ddf1SHong Zhang 
2081d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C)
2082d71ae5a4SJacob Faibussowitsch {
20834222ddf1SHong Zhang   Mat_Product *product = C->product;
20844222ddf1SHong Zhang   PetscBool    flg     = PETSC_FALSE;
20854222ddf1SHong Zhang   PetscInt     alg     = 0; /* default algorithm -- alg=1 should be default!!! */
20864222ddf1SHong Zhang #if !defined(PETSC_HAVE_HYPRE)
20874222ddf1SHong Zhang   const char *algTypes[2] = {"scalable", "rap"};
20884222ddf1SHong Zhang   PetscInt    nalg        = 2;
20894222ddf1SHong Zhang #else
20904222ddf1SHong Zhang   const char *algTypes[3] = {"scalable", "rap", "hypre"};
20914222ddf1SHong Zhang   PetscInt    nalg        = 3;
20924222ddf1SHong Zhang #endif
20934222ddf1SHong Zhang 
20944222ddf1SHong Zhang   PetscFunctionBegin;
20954222ddf1SHong Zhang   /* Set default algorithm */
20969566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "default", &flg));
209748a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
20984222ddf1SHong Zhang 
20994222ddf1SHong Zhang   /* Get runtime option */
21004222ddf1SHong Zhang   if (product->api_user) {
2101d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatPtAP", "Mat");
21029566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-matptap_via", "Algorithmic approach", "MatPtAP", algTypes, nalg, algTypes[0], &alg, &flg));
2103d0609cedSBarry Smith     PetscOptionsEnd();
21044222ddf1SHong Zhang   } else {
2105d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_PtAP", "Mat");
21069566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_PtAP", algTypes, nalg, algTypes[0], &alg, &flg));
2107d0609cedSBarry Smith     PetscOptionsEnd();
21084222ddf1SHong Zhang   }
210948a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
21104222ddf1SHong Zhang 
21114222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ;
21123ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
21134222ddf1SHong Zhang }
21144222ddf1SHong Zhang 
2115d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C)
2116d71ae5a4SJacob Faibussowitsch {
21174222ddf1SHong Zhang   Mat_Product *product     = C->product;
21184222ddf1SHong Zhang   PetscBool    flg         = PETSC_FALSE;
21194222ddf1SHong Zhang   PetscInt     alg         = 0; /* default algorithm */
21204222ddf1SHong Zhang   const char  *algTypes[3] = {"r*a*rt", "r*art", "coloring_rart"};
21214222ddf1SHong Zhang   PetscInt     nalg        = 3;
21224222ddf1SHong Zhang 
21234222ddf1SHong Zhang   PetscFunctionBegin;
21244222ddf1SHong Zhang   /* Set default algorithm */
21259566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "default", &flg));
212648a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
21274222ddf1SHong Zhang 
21284222ddf1SHong Zhang   /* Get runtime option */
21294222ddf1SHong Zhang   if (product->api_user) {
2130d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatRARt", "Mat");
21319566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-matrart_via", "Algorithmic approach", "MatRARt", algTypes, nalg, algTypes[0], &alg, &flg));
2132d0609cedSBarry Smith     PetscOptionsEnd();
21334222ddf1SHong Zhang   } else {
2134d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_RARt", "Mat");
21359566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_RARt", algTypes, nalg, algTypes[0], &alg, &flg));
2136d0609cedSBarry Smith     PetscOptionsEnd();
21374222ddf1SHong Zhang   }
213848a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
21394222ddf1SHong Zhang 
21404222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ;
21413ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
21424222ddf1SHong Zhang }
21434222ddf1SHong Zhang 
21444222ddf1SHong Zhang /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */
2145d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C)
2146d71ae5a4SJacob Faibussowitsch {
21474222ddf1SHong Zhang   Mat_Product *product     = C->product;
21484222ddf1SHong Zhang   PetscInt     alg         = 0; /* default algorithm */
21494222ddf1SHong Zhang   PetscBool    flg         = PETSC_FALSE;
21504222ddf1SHong Zhang   const char  *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"};
21514222ddf1SHong Zhang   PetscInt     nalg        = 7;
21524222ddf1SHong Zhang 
21534222ddf1SHong Zhang   PetscFunctionBegin;
21544222ddf1SHong Zhang   /* Set default algorithm */
21559566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "default", &flg));
215648a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
21574222ddf1SHong Zhang 
21584222ddf1SHong Zhang   /* Get runtime option */
21594222ddf1SHong Zhang   if (product->api_user) {
2160d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMatMult", "Mat");
21619566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-matmatmatmult_via", "Algorithmic approach", "MatMatMatMult", algTypes, nalg, algTypes[alg], &alg, &flg));
2162d0609cedSBarry Smith     PetscOptionsEnd();
21634222ddf1SHong Zhang   } else {
2164d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABC", "Mat");
21659566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABC", algTypes, nalg, algTypes[alg], &alg, &flg));
2166d0609cedSBarry Smith     PetscOptionsEnd();
21674222ddf1SHong Zhang   }
216848a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
21694222ddf1SHong Zhang 
21704222ddf1SHong Zhang   C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ;
21714222ddf1SHong Zhang   C->ops->productsymbolic    = MatProductSymbolic_ABC;
21723ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
21734222ddf1SHong Zhang }
21744222ddf1SHong Zhang 
2175d71ae5a4SJacob Faibussowitsch PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C)
2176d71ae5a4SJacob Faibussowitsch {
21774222ddf1SHong Zhang   Mat_Product *product = C->product;
21784222ddf1SHong Zhang 
21794222ddf1SHong Zhang   PetscFunctionBegin;
21804222ddf1SHong Zhang   switch (product->type) {
2181d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_AB:
2182d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_AB(C));
2183d71ae5a4SJacob Faibussowitsch     break;
2184d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_AtB:
2185d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_AtB(C));
2186d71ae5a4SJacob Faibussowitsch     break;
2187d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_ABt:
2188d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_ABt(C));
2189d71ae5a4SJacob Faibussowitsch     break;
2190d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_PtAP:
2191d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_PtAP(C));
2192d71ae5a4SJacob Faibussowitsch     break;
2193d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_RARt:
2194d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_RARt(C));
2195d71ae5a4SJacob Faibussowitsch     break;
2196d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_ABC:
2197d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_ABC(C));
2198d71ae5a4SJacob Faibussowitsch     break;
2199d71ae5a4SJacob Faibussowitsch   default:
2200d71ae5a4SJacob Faibussowitsch     break;
22014222ddf1SHong Zhang   }
22023ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2203d013fc79Sandi selinger }
2204