xref: /petsc/src/mat/impls/aij/seq/matmatmult.c (revision 2c71b3e237ead271e4f3aa1505f92bf476e3413d)
1be1d678aSKris Buschelman 
2d50806bdSBarry Smith /*
32499ec78SHong Zhang   Defines matrix-matrix product routines for pairs of SeqAIJ matrices
4d50806bdSBarry Smith           C = A * B
5d50806bdSBarry Smith */
6d50806bdSBarry Smith 
7c6db04a5SJed Brown #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
8c6db04a5SJed Brown #include <../src/mat/utils/freespace.h>
9c6db04a5SJed Brown #include <petscbt.h>
10af0996ceSBarry Smith #include <petsc/private/isimpl.h>
1107475bc1SBarry Smith #include <../src/mat/impls/dense/seq/dense.h>
127bab7c10SHong Zhang 
13df97dc6dSFande Kong PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
14df97dc6dSFande Kong {
15df97dc6dSFande Kong   PetscErrorCode ierr;
16df97dc6dSFande Kong 
17df97dc6dSFande Kong   PetscFunctionBegin;
18df97dc6dSFande Kong   if (C->ops->matmultnumeric) {
19*2c71b3e2SJacob Faibussowitsch     PetscCheckFalse(C->ops->matmultnumeric == MatMatMultNumeric_SeqAIJ_SeqAIJ,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Recursive call");
20df97dc6dSFande Kong     ierr = (*C->ops->matmultnumeric)(A,B,C);CHKERRQ(ierr);
21df97dc6dSFande Kong   } else {
22df97dc6dSFande Kong     ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A,B,C);CHKERRQ(ierr);
23df97dc6dSFande Kong   }
24df97dc6dSFande Kong   PetscFunctionReturn(0);
25df97dc6dSFande Kong }
26df97dc6dSFande Kong 
274222ddf1SHong Zhang /* Modified from MatCreateSeqAIJWithArrays() */
28e4e71118SStefano Zampini PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt i[],PetscInt j[],PetscScalar a[],MatType mtype,Mat mat)
29df97dc6dSFande Kong {
30df97dc6dSFande Kong   PetscErrorCode ierr;
314222ddf1SHong Zhang   PetscInt       ii;
324222ddf1SHong Zhang   Mat_SeqAIJ     *aij;
33e4e71118SStefano Zampini   PetscBool      isseqaij;
345c66b693SKris Buschelman 
355c66b693SKris Buschelman   PetscFunctionBegin;
36*2c71b3e2SJacob Faibussowitsch   PetscCheckFalse(m > 0 && i[0],PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"i (row indices) must start with 0");
374222ddf1SHong Zhang   ierr = MatSetSizes(mat,m,n,m,n);CHKERRQ(ierr);
384222ddf1SHong Zhang 
39e4e71118SStefano Zampini   if (!mtype) {
40e4e71118SStefano Zampini     ierr = PetscObjectBaseTypeCompare((PetscObject)mat,MATSEQAIJ,&isseqaij);CHKERRQ(ierr);
41e4e71118SStefano Zampini     if (!isseqaij) { ierr = MatSetType(mat,MATSEQAIJ);CHKERRQ(ierr); }
42e4e71118SStefano Zampini   } else {
43e4e71118SStefano Zampini     ierr = MatSetType(mat,mtype);CHKERRQ(ierr);
44e4e71118SStefano Zampini   }
45f4259b30SLisandro Dalcin   ierr = MatSeqAIJSetPreallocation_SeqAIJ(mat,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
464222ddf1SHong Zhang   aij  = (Mat_SeqAIJ*)(mat)->data;
474222ddf1SHong Zhang   ierr = PetscMalloc1(m,&aij->imax);CHKERRQ(ierr);
484222ddf1SHong Zhang   ierr = PetscMalloc1(m,&aij->ilen);CHKERRQ(ierr);
494222ddf1SHong Zhang 
504222ddf1SHong Zhang   aij->i            = i;
514222ddf1SHong Zhang   aij->j            = j;
524222ddf1SHong Zhang   aij->a            = a;
534222ddf1SHong Zhang   aij->singlemalloc = PETSC_FALSE;
544222ddf1SHong Zhang   aij->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
554222ddf1SHong Zhang   aij->free_a       = PETSC_FALSE;
564222ddf1SHong Zhang   aij->free_ij      = PETSC_FALSE;
574222ddf1SHong Zhang 
584222ddf1SHong Zhang   for (ii=0; ii<m; ii++) {
594222ddf1SHong Zhang     aij->ilen[ii] = aij->imax[ii] = i[ii+1] - i[ii];
6025023028SHong Zhang   }
615c66b693SKris Buschelman   PetscFunctionReturn(0);
625c66b693SKris Buschelman }
631c24bd37SHong Zhang 
644222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
654222ddf1SHong Zhang {
664222ddf1SHong Zhang   PetscErrorCode      ierr;
674222ddf1SHong Zhang   Mat_Product         *product = C->product;
684222ddf1SHong Zhang   MatProductAlgorithm alg;
694222ddf1SHong Zhang   PetscBool           flg;
704222ddf1SHong Zhang 
714222ddf1SHong Zhang   PetscFunctionBegin;
724222ddf1SHong Zhang   if (product) {
734222ddf1SHong Zhang     alg = product->alg;
744222ddf1SHong Zhang   } else {
754222ddf1SHong Zhang     alg = "sorted";
764222ddf1SHong Zhang   }
774222ddf1SHong Zhang   /* sorted */
784222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"sorted",&flg);CHKERRQ(ierr);
794222ddf1SHong Zhang   if (flg) {
804222ddf1SHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A,B,fill,C);CHKERRQ(ierr);
814222ddf1SHong Zhang     PetscFunctionReturn(0);
824222ddf1SHong Zhang   }
834222ddf1SHong Zhang 
844222ddf1SHong Zhang   /* scalable */
854222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"scalable",&flg);CHKERRQ(ierr);
864222ddf1SHong Zhang   if (flg) {
874222ddf1SHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A,B,fill,C);CHKERRQ(ierr);
884222ddf1SHong Zhang     PetscFunctionReturn(0);
894222ddf1SHong Zhang   }
904222ddf1SHong Zhang 
914222ddf1SHong Zhang   /* scalable_fast */
924222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"scalable_fast",&flg);CHKERRQ(ierr);
934222ddf1SHong Zhang   if (flg) {
944222ddf1SHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A,B,fill,C);CHKERRQ(ierr);
954222ddf1SHong Zhang     PetscFunctionReturn(0);
964222ddf1SHong Zhang   }
974222ddf1SHong Zhang 
984222ddf1SHong Zhang   /* heap */
994222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"heap",&flg);CHKERRQ(ierr);
1004222ddf1SHong Zhang   if (flg) {
1014222ddf1SHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A,B,fill,C);CHKERRQ(ierr);
1024222ddf1SHong Zhang     PetscFunctionReturn(0);
1034222ddf1SHong Zhang   }
1044222ddf1SHong Zhang 
1054222ddf1SHong Zhang   /* btheap */
1064222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"btheap",&flg);CHKERRQ(ierr);
1074222ddf1SHong Zhang   if (flg) {
1084222ddf1SHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A,B,fill,C);CHKERRQ(ierr);
1094222ddf1SHong Zhang     PetscFunctionReturn(0);
1104222ddf1SHong Zhang   }
1114222ddf1SHong Zhang 
1124222ddf1SHong Zhang   /* llcondensed */
1134222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"llcondensed",&flg);CHKERRQ(ierr);
1144222ddf1SHong Zhang   if (flg) {
1154222ddf1SHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A,B,fill,C);CHKERRQ(ierr);
1164222ddf1SHong Zhang     PetscFunctionReturn(0);
1174222ddf1SHong Zhang   }
1184222ddf1SHong Zhang 
1194222ddf1SHong Zhang   /* rowmerge */
1204222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"rowmerge",&flg);CHKERRQ(ierr);
1214222ddf1SHong Zhang   if (flg) {
1224222ddf1SHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A,B,fill,C);CHKERRQ(ierr);
1234222ddf1SHong Zhang     PetscFunctionReturn(0);
1244222ddf1SHong Zhang   }
1254222ddf1SHong Zhang 
1264222ddf1SHong Zhang #if defined(PETSC_HAVE_HYPRE)
1274222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"hypre",&flg);CHKERRQ(ierr);
1284222ddf1SHong Zhang   if (flg) {
1294222ddf1SHong Zhang     ierr = MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A,B,fill,C);CHKERRQ(ierr);
1304222ddf1SHong Zhang     PetscFunctionReturn(0);
1314222ddf1SHong Zhang   }
1324222ddf1SHong Zhang #endif
1334222ddf1SHong Zhang 
1344222ddf1SHong Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported");
1354222ddf1SHong Zhang }
1364222ddf1SHong Zhang 
1374222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A,Mat B,PetscReal fill,Mat C)
138b561aa9dSHong Zhang {
139b561aa9dSHong Zhang   PetscErrorCode     ierr;
140b561aa9dSHong Zhang   Mat_SeqAIJ         *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
141971236abSHong Zhang   PetscInt           *ai=a->i,*bi=b->i,*ci,*cj;
142c1ba5575SJed Brown   PetscInt           am =A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
143b561aa9dSHong Zhang   PetscReal          afill;
144eca6b86aSHong Zhang   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
145eca6b86aSHong Zhang   PetscTable         ta;
146fb908031SHong Zhang   PetscBT            lnkbt;
1470298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
148b561aa9dSHong Zhang 
149b561aa9dSHong Zhang   PetscFunctionBegin;
150bd958071SHong Zhang   /* Get ci and cj */
151bd958071SHong Zhang   /*---------------*/
152fb908031SHong Zhang   /* Allocate ci array, arrays for fill computation and */
153fb908031SHong Zhang   /* free space for accumulating nonzero column info */
154854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
155fb908031SHong Zhang   ci[0] = 0;
156fb908031SHong Zhang 
157fb908031SHong Zhang   /* create and initialize a linked list */
158c373ccc6SHong Zhang   ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr);
159c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b,bm,ta);
160eca6b86aSHong Zhang   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
161eca6b86aSHong Zhang   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
162eca6b86aSHong Zhang 
163eca6b86aSHong Zhang   ierr = PetscLLCondensedCreate(Crmax,bn,&lnk,&lnkbt);CHKERRQ(ierr);
164fb908031SHong Zhang 
165fb908031SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
166f91af8c7SBarry Smith   ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
1672205254eSKarl Rupp 
168fb908031SHong Zhang   current_space = free_space;
169fb908031SHong Zhang 
170fb908031SHong Zhang   /* Determine ci and cj */
171fb908031SHong Zhang   for (i=0; i<am; i++) {
172fb908031SHong Zhang     anzi = ai[i+1] - ai[i];
173fb908031SHong Zhang     aj   = a->j + ai[i];
174fb908031SHong Zhang     for (j=0; j<anzi; j++) {
175fb908031SHong Zhang       brow = aj[j];
176fb908031SHong Zhang       bnzj = bi[brow+1] - bi[brow];
177fb908031SHong Zhang       bj   = b->j + bi[brow];
178fb908031SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
179fb908031SHong Zhang       ierr = PetscLLCondensedAddSorted(bnzj,bj,lnk,lnkbt);CHKERRQ(ierr);
180fb908031SHong Zhang     }
1818c78258cSHong Zhang     /* add possible missing diagonal entry */
1828c78258cSHong Zhang     if (C->force_diagonals) {
1838c78258cSHong Zhang       ierr = PetscLLCondensedAddSorted(1,&i,lnk,lnkbt);CHKERRQ(ierr);
1848c78258cSHong Zhang     }
185fb908031SHong Zhang     cnzi = lnk[0];
186fb908031SHong Zhang 
187fb908031SHong Zhang     /* If free space is not available, make more free space */
188fb908031SHong Zhang     /* Double the amount of total space in the list */
189fb908031SHong Zhang     if (current_space->local_remaining<cnzi) {
190f91af8c7SBarry Smith       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);CHKERRQ(ierr);
191fb908031SHong Zhang       ndouble++;
192fb908031SHong Zhang     }
193fb908031SHong Zhang 
194fb908031SHong Zhang     /* Copy data into free space, then initialize lnk */
195fb908031SHong Zhang     ierr = PetscLLCondensedClean(bn,cnzi,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
1962205254eSKarl Rupp 
197fb908031SHong Zhang     current_space->array           += cnzi;
198fb908031SHong Zhang     current_space->local_used      += cnzi;
199fb908031SHong Zhang     current_space->local_remaining -= cnzi;
2002205254eSKarl Rupp 
201fb908031SHong Zhang     ci[i+1] = ci[i] + cnzi;
202fb908031SHong Zhang   }
203fb908031SHong Zhang 
204fb908031SHong Zhang   /* Column indices are in the list of free space */
205fb908031SHong Zhang   /* Allocate space for cj, initialize cj, and */
206fb908031SHong Zhang   /* destroy list of free space and other temporary array(s) */
207854ce69bSBarry Smith   ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr);
208fb908031SHong Zhang   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
209fb908031SHong Zhang   ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr);
210b561aa9dSHong Zhang 
21126be0446SHong Zhang   /* put together the new symbolic matrix */
212e4e71118SStefano Zampini   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
2134222ddf1SHong Zhang   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
21458c24d83SHong Zhang 
21558c24d83SHong Zhang   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
21658c24d83SHong Zhang   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
2174222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
218aa1aec99SHong Zhang   c->free_a  = PETSC_FALSE;
219e6b907acSBarry Smith   c->free_ij = PETSC_TRUE;
22058c24d83SHong Zhang   c->nonew   = 0;
2214222ddf1SHong Zhang 
2224222ddf1SHong Zhang   /* fast, needs non-scalable O(bn) array 'abdense' */
2234222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
2240b7e3e3dSHong Zhang 
2257212da7cSHong Zhang   /* set MatInfo */
2267212da7cSHong Zhang   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
227f2b054eeSHong Zhang   if (afill < 1.0) afill = 1.0;
2287212da7cSHong Zhang   c->maxnz                  = ci[am];
2297212da7cSHong Zhang   c->nz                     = ci[am];
2304222ddf1SHong Zhang   C->info.mallocs           = ndouble;
2314222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
2324222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
2337212da7cSHong Zhang 
2347212da7cSHong Zhang #if defined(PETSC_USE_INFO)
2357212da7cSHong Zhang   if (ci[am]) {
2367d3de750SJacob Faibussowitsch     ierr = PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
2377d3de750SJacob Faibussowitsch     ierr = PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
238f2b054eeSHong Zhang   } else {
2394222ddf1SHong Zhang     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
240be0fcf8dSHong Zhang   }
241f2b054eeSHong Zhang #endif
24258c24d83SHong Zhang   PetscFunctionReturn(0);
24358c24d83SHong Zhang }
244d50806bdSBarry Smith 
245df97dc6dSFande Kong PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,Mat C)
246d50806bdSBarry Smith {
247dfbe8321SBarry Smith   PetscErrorCode    ierr;
248d13dce4bSSatish Balay   PetscLogDouble    flops=0.0;
249d50806bdSBarry Smith   Mat_SeqAIJ        *a   = (Mat_SeqAIJ*)A->data;
250d50806bdSBarry Smith   Mat_SeqAIJ        *b   = (Mat_SeqAIJ*)B->data;
251d50806bdSBarry Smith   Mat_SeqAIJ        *c   = (Mat_SeqAIJ*)C->data;
25238baddfdSBarry Smith   PetscInt          *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
253c58ca2e3SHong Zhang   PetscInt          am   =A->rmap->n,cm=C->rmap->n;
254519eb980SHong Zhang   PetscInt          i,j,k,anzi,bnzi,cnzi,brow;
2552e5835c6SStefano Zampini   PetscScalar       *ca,valtmp;
256aa1aec99SHong Zhang   PetscScalar       *ab_dense;
2576718818eSStefano Zampini   PetscContainer    cab_dense;
2582e5835c6SStefano Zampini   const PetscScalar *aa,*ba,*baj;
259d50806bdSBarry Smith 
260d50806bdSBarry Smith   PetscFunctionBegin;
2612e5835c6SStefano Zampini   ierr = MatSeqAIJGetArrayRead(A,&aa);CHKERRQ(ierr);
2622e5835c6SStefano Zampini   ierr = MatSeqAIJGetArrayRead(B,&ba);CHKERRQ(ierr);
263aa1aec99SHong Zhang   if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
264854ce69bSBarry Smith     ierr      = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
265aa1aec99SHong Zhang     c->a      = ca;
266aa1aec99SHong Zhang     c->free_a = PETSC_TRUE;
2676718818eSStefano Zampini   } else ca = c->a;
2686718818eSStefano Zampini 
2696718818eSStefano Zampini   /* TODO this should be done in the symbolic phase */
2706718818eSStefano Zampini   /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers
2716718818eSStefano Zampini      that is hard to eradicate) */
2726718818eSStefano Zampini   ierr = PetscObjectQuery((PetscObject)C,"__PETSc__ab_dense",(PetscObject*)&cab_dense);CHKERRQ(ierr);
2736718818eSStefano Zampini   if (!cab_dense) {
2746718818eSStefano Zampini     ierr = PetscMalloc1(B->cmap->N,&ab_dense);CHKERRQ(ierr);
2756718818eSStefano Zampini     ierr = PetscContainerCreate(PETSC_COMM_SELF,&cab_dense);CHKERRQ(ierr);
2766718818eSStefano Zampini     ierr = PetscContainerSetPointer(cab_dense,ab_dense);CHKERRQ(ierr);
2776718818eSStefano Zampini     ierr = PetscContainerSetUserDestroy(cab_dense,PetscContainerUserDestroyDefault);CHKERRQ(ierr);
2786718818eSStefano Zampini     ierr = PetscObjectCompose((PetscObject)C,"__PETSc__ab_dense",(PetscObject)cab_dense);CHKERRQ(ierr);
2796718818eSStefano Zampini     ierr = PetscObjectDereference((PetscObject)cab_dense);CHKERRQ(ierr);
280d90d86d0SMatthew G. Knepley   }
2816718818eSStefano Zampini   ierr = PetscContainerGetPointer(cab_dense,(void**)&ab_dense);CHKERRQ(ierr);
2826718818eSStefano Zampini   ierr = PetscArrayzero(ab_dense,B->cmap->N);CHKERRQ(ierr);
283aa1aec99SHong Zhang 
284c124e916SHong Zhang   /* clean old values in C */
285580bdb30SBarry Smith   ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr);
286d50806bdSBarry Smith   /* Traverse A row-wise. */
287d50806bdSBarry Smith   /* Build the ith row in C by summing over nonzero columns in A, */
288d50806bdSBarry Smith   /* the rows of B corresponding to nonzeros of A. */
289d50806bdSBarry Smith   for (i=0; i<am; i++) {
290d50806bdSBarry Smith     anzi = ai[i+1] - ai[i];
291d50806bdSBarry Smith     for (j=0; j<anzi; j++) {
292519eb980SHong Zhang       brow = aj[j];
293d50806bdSBarry Smith       bnzi = bi[brow+1] - bi[brow];
294d50806bdSBarry Smith       bjj  = bj + bi[brow];
295d50806bdSBarry Smith       baj  = ba + bi[brow];
296519eb980SHong Zhang       /* perform dense axpy */
29736ec6d2dSHong Zhang       valtmp = aa[j];
298519eb980SHong Zhang       for (k=0; k<bnzi; k++) {
29936ec6d2dSHong Zhang         ab_dense[bjj[k]] += valtmp*baj[k];
300519eb980SHong Zhang       }
301519eb980SHong Zhang       flops += 2*bnzi;
302519eb980SHong Zhang     }
303c58ca2e3SHong Zhang     aj += anzi; aa += anzi;
304c58ca2e3SHong Zhang 
305c58ca2e3SHong Zhang     cnzi = ci[i+1] - ci[i];
306519eb980SHong Zhang     for (k=0; k<cnzi; k++) {
307519eb980SHong Zhang       ca[k]          += ab_dense[cj[k]];
308519eb980SHong Zhang       ab_dense[cj[k]] = 0.0; /* zero ab_dense */
309519eb980SHong Zhang     }
310519eb980SHong Zhang     flops += cnzi;
311519eb980SHong Zhang     cj    += cnzi; ca += cnzi;
312519eb980SHong Zhang   }
3132e5835c6SStefano Zampini #if defined(PETSC_HAVE_DEVICE)
3142e5835c6SStefano Zampini   if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
3152e5835c6SStefano Zampini #endif
316c58ca2e3SHong Zhang   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
317c58ca2e3SHong Zhang   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
318c58ca2e3SHong Zhang   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
3192e5835c6SStefano Zampini   ierr = MatSeqAIJRestoreArrayRead(A,&aa);CHKERRQ(ierr);
3202e5835c6SStefano Zampini   ierr = MatSeqAIJRestoreArrayRead(B,&ba);CHKERRQ(ierr);
321c58ca2e3SHong Zhang   PetscFunctionReturn(0);
322c58ca2e3SHong Zhang }
323c58ca2e3SHong Zhang 
32425023028SHong Zhang PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C)
325c58ca2e3SHong Zhang {
326c58ca2e3SHong Zhang   PetscErrorCode    ierr;
327c58ca2e3SHong Zhang   PetscLogDouble    flops=0.0;
328c58ca2e3SHong Zhang   Mat_SeqAIJ        *a   = (Mat_SeqAIJ*)A->data;
329c58ca2e3SHong Zhang   Mat_SeqAIJ        *b   = (Mat_SeqAIJ*)B->data;
330c58ca2e3SHong Zhang   Mat_SeqAIJ        *c   = (Mat_SeqAIJ*)C->data;
331c58ca2e3SHong Zhang   PetscInt          *ai  = a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
332c58ca2e3SHong Zhang   PetscInt          am   = A->rmap->N,cm=C->rmap->N;
333c58ca2e3SHong Zhang   PetscInt          i,j,k,anzi,bnzi,cnzi,brow;
3342e5835c6SStefano Zampini   PetscScalar       *ca=c->a,valtmp;
3352e5835c6SStefano Zampini   const PetscScalar *aa,*ba,*baj;
336c58ca2e3SHong Zhang   PetscInt          nextb;
337c58ca2e3SHong Zhang 
338c58ca2e3SHong Zhang   PetscFunctionBegin;
3392e5835c6SStefano Zampini   ierr = MatSeqAIJGetArrayRead(A,&aa);CHKERRQ(ierr);
3402e5835c6SStefano Zampini   ierr = MatSeqAIJGetArrayRead(B,&ba);CHKERRQ(ierr);
341cf742fccSHong Zhang   if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
342cf742fccSHong Zhang     ierr      = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
343cf742fccSHong Zhang     c->a      = ca;
344cf742fccSHong Zhang     c->free_a = PETSC_TRUE;
345cf742fccSHong Zhang   }
346cf742fccSHong Zhang 
347c58ca2e3SHong Zhang   /* clean old values in C */
348580bdb30SBarry Smith   ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr);
349c58ca2e3SHong Zhang   /* Traverse A row-wise. */
350c58ca2e3SHong Zhang   /* Build the ith row in C by summing over nonzero columns in A, */
351c58ca2e3SHong Zhang   /* the rows of B corresponding to nonzeros of A. */
352519eb980SHong Zhang   for (i=0; i<am; i++) {
353519eb980SHong Zhang     anzi = ai[i+1] - ai[i];
354519eb980SHong Zhang     cnzi = ci[i+1] - ci[i];
355519eb980SHong Zhang     for (j=0; j<anzi; j++) {
356519eb980SHong Zhang       brow = aj[j];
357519eb980SHong Zhang       bnzi = bi[brow+1] - bi[brow];
358519eb980SHong Zhang       bjj  = bj + bi[brow];
359519eb980SHong Zhang       baj  = ba + bi[brow];
360519eb980SHong Zhang       /* perform sparse axpy */
36136ec6d2dSHong Zhang       valtmp = aa[j];
362c124e916SHong Zhang       nextb  = 0;
363c124e916SHong Zhang       for (k=0; nextb<bnzi; k++) {
364c124e916SHong Zhang         if (cj[k] == bjj[nextb]) { /* ccol == bcol */
36536ec6d2dSHong Zhang           ca[k] += valtmp*baj[nextb++];
366c124e916SHong Zhang         }
367d50806bdSBarry Smith       }
368d50806bdSBarry Smith       flops += 2*bnzi;
369d50806bdSBarry Smith     }
370519eb980SHong Zhang     aj += anzi; aa += anzi;
371519eb980SHong Zhang     cj += cnzi; ca += cnzi;
372d50806bdSBarry Smith   }
3732e5835c6SStefano Zampini #if defined(PETSC_HAVE_DEVICE)
3742e5835c6SStefano Zampini   if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
3752e5835c6SStefano Zampini #endif
376716bacf3SKris Buschelman   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
377716bacf3SKris Buschelman   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
378d50806bdSBarry Smith   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
3792e5835c6SStefano Zampini   ierr = MatSeqAIJRestoreArrayRead(A,&aa);CHKERRQ(ierr);
3802e5835c6SStefano Zampini   ierr = MatSeqAIJRestoreArrayRead(B,&ba);CHKERRQ(ierr);
381d50806bdSBarry Smith   PetscFunctionReturn(0);
382d50806bdSBarry Smith }
383bc011b1eSHong Zhang 
3844222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A,Mat B,PetscReal fill,Mat C)
38525296bd5SBarry Smith {
38625296bd5SBarry Smith   PetscErrorCode     ierr;
38725296bd5SBarry Smith   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
38825296bd5SBarry Smith   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
3893c50cad2SHong Zhang   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
39025296bd5SBarry Smith   MatScalar          *ca;
39125296bd5SBarry Smith   PetscReal          afill;
392eca6b86aSHong Zhang   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
393eca6b86aSHong Zhang   PetscTable         ta;
3940298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
39525296bd5SBarry Smith 
39625296bd5SBarry Smith   PetscFunctionBegin;
3973c50cad2SHong Zhang   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
3983c50cad2SHong Zhang   /*-----------------------------------------------------------------------------------------*/
3993c50cad2SHong Zhang   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
400854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
4013c50cad2SHong Zhang   ci[0] = 0;
4023c50cad2SHong Zhang 
4033c50cad2SHong Zhang   /* create and initialize a linked list */
404c373ccc6SHong Zhang   ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr);
405c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b,bm,ta);
406eca6b86aSHong Zhang   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
407eca6b86aSHong Zhang   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
408eca6b86aSHong Zhang 
409eca6b86aSHong Zhang   ierr = PetscLLCondensedCreate_fast(Crmax,&lnk);CHKERRQ(ierr);
4103c50cad2SHong Zhang 
4113c50cad2SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
412f91af8c7SBarry Smith   ierr          = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
4133c50cad2SHong Zhang   current_space = free_space;
4143c50cad2SHong Zhang 
4153c50cad2SHong Zhang   /* Determine ci and cj */
4163c50cad2SHong Zhang   for (i=0; i<am; i++) {
4173c50cad2SHong Zhang     anzi = ai[i+1] - ai[i];
4183c50cad2SHong Zhang     aj   = a->j + ai[i];
4193c50cad2SHong Zhang     for (j=0; j<anzi; j++) {
4203c50cad2SHong Zhang       brow = aj[j];
4213c50cad2SHong Zhang       bnzj = bi[brow+1] - bi[brow];
4223c50cad2SHong Zhang       bj   = b->j + bi[brow];
4233c50cad2SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
4243c50cad2SHong Zhang       ierr = PetscLLCondensedAddSorted_fast(bnzj,bj,lnk);CHKERRQ(ierr);
4253c50cad2SHong Zhang     }
4268c78258cSHong Zhang     /* add possible missing diagonal entry */
4278c78258cSHong Zhang     if (C->force_diagonals) {
4288c78258cSHong Zhang       ierr = PetscLLCondensedAddSorted_fast(1,&i,lnk);CHKERRQ(ierr);
4298c78258cSHong Zhang     }
4303c50cad2SHong Zhang     cnzi = lnk[1];
4313c50cad2SHong Zhang 
4323c50cad2SHong Zhang     /* If free space is not available, make more free space */
4333c50cad2SHong Zhang     /* Double the amount of total space in the list */
4343c50cad2SHong Zhang     if (current_space->local_remaining<cnzi) {
435f91af8c7SBarry Smith       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);CHKERRQ(ierr);
4363c50cad2SHong Zhang       ndouble++;
4373c50cad2SHong Zhang     }
4383c50cad2SHong Zhang 
4393c50cad2SHong Zhang     /* Copy data into free space, then initialize lnk */
4403c50cad2SHong Zhang     ierr = PetscLLCondensedClean_fast(cnzi,current_space->array,lnk);CHKERRQ(ierr);
4412205254eSKarl Rupp 
4423c50cad2SHong Zhang     current_space->array           += cnzi;
4433c50cad2SHong Zhang     current_space->local_used      += cnzi;
4443c50cad2SHong Zhang     current_space->local_remaining -= cnzi;
4452205254eSKarl Rupp 
4463c50cad2SHong Zhang     ci[i+1] = ci[i] + cnzi;
4473c50cad2SHong Zhang   }
4483c50cad2SHong Zhang 
4493c50cad2SHong Zhang   /* Column indices are in the list of free space */
4503c50cad2SHong Zhang   /* Allocate space for cj, initialize cj, and */
4513c50cad2SHong Zhang   /* destroy list of free space and other temporary array(s) */
452854ce69bSBarry Smith   ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr);
4533c50cad2SHong Zhang   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
4543c50cad2SHong Zhang   ierr = PetscLLCondensedDestroy_fast(lnk);CHKERRQ(ierr);
45525296bd5SBarry Smith 
45625296bd5SBarry Smith   /* Allocate space for ca */
457580bdb30SBarry Smith   ierr = PetscCalloc1(ci[am]+1,&ca);CHKERRQ(ierr);
45825296bd5SBarry Smith 
45925296bd5SBarry Smith   /* put together the new symbolic matrix */
460e4e71118SStefano Zampini   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
4614222ddf1SHong Zhang   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
46225296bd5SBarry Smith 
46325296bd5SBarry Smith   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
46425296bd5SBarry Smith   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
4654222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
46625296bd5SBarry Smith   c->free_a  = PETSC_TRUE;
46725296bd5SBarry Smith   c->free_ij = PETSC_TRUE;
46825296bd5SBarry Smith   c->nonew   = 0;
4692205254eSKarl Rupp 
4704222ddf1SHong Zhang   /* slower, less memory */
4714222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
47225296bd5SBarry Smith 
47325296bd5SBarry Smith   /* set MatInfo */
47425296bd5SBarry Smith   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
47525296bd5SBarry Smith   if (afill < 1.0) afill = 1.0;
47625296bd5SBarry Smith   c->maxnz                     = ci[am];
47725296bd5SBarry Smith   c->nz                        = ci[am];
4784222ddf1SHong Zhang   C->info.mallocs           = ndouble;
4794222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
4804222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
48125296bd5SBarry Smith 
48225296bd5SBarry Smith #if defined(PETSC_USE_INFO)
48325296bd5SBarry Smith   if (ci[am]) {
4847d3de750SJacob Faibussowitsch     ierr = PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
4857d3de750SJacob Faibussowitsch     ierr = PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
48625296bd5SBarry Smith   } else {
4874222ddf1SHong Zhang     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
48825296bd5SBarry Smith   }
48925296bd5SBarry Smith #endif
49025296bd5SBarry Smith   PetscFunctionReturn(0);
49125296bd5SBarry Smith }
49225296bd5SBarry Smith 
4934222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat C)
494e9e4536cSHong Zhang {
495e9e4536cSHong Zhang   PetscErrorCode     ierr;
496e9e4536cSHong Zhang   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
497bf9555e6SHong Zhang   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
49825c41797SHong Zhang   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
499e9e4536cSHong Zhang   MatScalar          *ca;
500e9e4536cSHong Zhang   PetscReal          afill;
501eca6b86aSHong Zhang   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
502eca6b86aSHong Zhang   PetscTable         ta;
5030298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
504e9e4536cSHong Zhang 
505e9e4536cSHong Zhang   PetscFunctionBegin;
506bd958071SHong Zhang   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
507bd958071SHong Zhang   /*---------------------------------------------------------------------------------------------*/
508bd958071SHong Zhang   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
509854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
510bd958071SHong Zhang   ci[0] = 0;
511bd958071SHong Zhang 
512bd958071SHong Zhang   /* create and initialize a linked list */
513c373ccc6SHong Zhang   ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr);
514c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b,bm,ta);
515eca6b86aSHong Zhang   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
516eca6b86aSHong Zhang   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
517eca6b86aSHong Zhang   ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);
518bd958071SHong Zhang 
519bd958071SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
520f91af8c7SBarry Smith   ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
521bd958071SHong Zhang   current_space = free_space;
522bd958071SHong Zhang 
523bd958071SHong Zhang   /* Determine ci and cj */
524bd958071SHong Zhang   for (i=0; i<am; i++) {
525bd958071SHong Zhang     anzi = ai[i+1] - ai[i];
526bd958071SHong Zhang     aj   = a->j + ai[i];
527bd958071SHong Zhang     for (j=0; j<anzi; j++) {
528bd958071SHong Zhang       brow = aj[j];
529bd958071SHong Zhang       bnzj = bi[brow+1] - bi[brow];
530bd958071SHong Zhang       bj   = b->j + bi[brow];
531bd958071SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
532bd958071SHong Zhang       ierr = PetscLLCondensedAddSorted_Scalable(bnzj,bj,lnk);CHKERRQ(ierr);
533bd958071SHong Zhang     }
5348c78258cSHong Zhang     /* add possible missing diagonal entry */
5358c78258cSHong Zhang     if (C->force_diagonals) {
5368c78258cSHong Zhang       ierr = PetscLLCondensedAddSorted_Scalable(1,&i,lnk);CHKERRQ(ierr);
5378c78258cSHong Zhang     }
5388c78258cSHong Zhang 
539bd958071SHong Zhang     cnzi = lnk[0];
540bd958071SHong Zhang 
541bd958071SHong Zhang     /* If free space is not available, make more free space */
542bd958071SHong Zhang     /* Double the amount of total space in the list */
543bd958071SHong Zhang     if (current_space->local_remaining<cnzi) {
544f91af8c7SBarry Smith       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);CHKERRQ(ierr);
545bd958071SHong Zhang       ndouble++;
546bd958071SHong Zhang     }
547bd958071SHong Zhang 
548bd958071SHong Zhang     /* Copy data into free space, then initialize lnk */
549bd958071SHong Zhang     ierr = PetscLLCondensedClean_Scalable(cnzi,current_space->array,lnk);CHKERRQ(ierr);
5502205254eSKarl Rupp 
551bd958071SHong Zhang     current_space->array           += cnzi;
552bd958071SHong Zhang     current_space->local_used      += cnzi;
553bd958071SHong Zhang     current_space->local_remaining -= cnzi;
5542205254eSKarl Rupp 
555bd958071SHong Zhang     ci[i+1] = ci[i] + cnzi;
556bd958071SHong Zhang   }
557bd958071SHong Zhang 
558bd958071SHong Zhang   /* Column indices are in the list of free space */
559bd958071SHong Zhang   /* Allocate space for cj, initialize cj, and */
560bd958071SHong Zhang   /* destroy list of free space and other temporary array(s) */
561854ce69bSBarry Smith   ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr);
562bd958071SHong Zhang   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
563bd958071SHong Zhang   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
564e9e4536cSHong Zhang 
565e9e4536cSHong Zhang   /* Allocate space for ca */
566bd958071SHong Zhang   /*-----------------------*/
567580bdb30SBarry Smith   ierr = PetscCalloc1(ci[am]+1,&ca);CHKERRQ(ierr);
568e9e4536cSHong Zhang 
569e9e4536cSHong Zhang   /* put together the new symbolic matrix */
570e4e71118SStefano Zampini   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
5714222ddf1SHong Zhang   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
572e9e4536cSHong Zhang 
573e9e4536cSHong Zhang   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
574e9e4536cSHong Zhang   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
5754222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
576e9e4536cSHong Zhang   c->free_a  = PETSC_TRUE;
577e9e4536cSHong Zhang   c->free_ij = PETSC_TRUE;
578e9e4536cSHong Zhang   c->nonew   = 0;
5792205254eSKarl Rupp 
5804222ddf1SHong Zhang   /* slower, less memory */
5814222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
582e9e4536cSHong Zhang 
583e9e4536cSHong Zhang   /* set MatInfo */
584e9e4536cSHong Zhang   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
585e9e4536cSHong Zhang   if (afill < 1.0) afill = 1.0;
586e9e4536cSHong Zhang   c->maxnz                     = ci[am];
587e9e4536cSHong Zhang   c->nz                        = ci[am];
5884222ddf1SHong Zhang   C->info.mallocs           = ndouble;
5894222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
5904222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
591e9e4536cSHong Zhang 
592e9e4536cSHong Zhang #if defined(PETSC_USE_INFO)
593e9e4536cSHong Zhang   if (ci[am]) {
5947d3de750SJacob Faibussowitsch     ierr = PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
5957d3de750SJacob Faibussowitsch     ierr = PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
596e9e4536cSHong Zhang   } else {
5974222ddf1SHong Zhang     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
598e9e4536cSHong Zhang   }
599e9e4536cSHong Zhang #endif
600e9e4536cSHong Zhang   PetscFunctionReturn(0);
601e9e4536cSHong Zhang }
602e9e4536cSHong Zhang 
6034222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A,Mat B,PetscReal fill,Mat C)
6040ced3a2bSJed Brown {
6050ced3a2bSJed Brown   PetscErrorCode     ierr;
6060ced3a2bSJed Brown   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
6070ced3a2bSJed Brown   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j;
6080ced3a2bSJed Brown   PetscInt           *ci,*cj,*bb;
6090ced3a2bSJed Brown   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
6100ced3a2bSJed Brown   PetscReal          afill;
6110ced3a2bSJed Brown   PetscInt           i,j,col,ndouble = 0;
6120298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
6130ced3a2bSJed Brown   PetscHeap          h;
6140ced3a2bSJed Brown 
6150ced3a2bSJed Brown   PetscFunctionBegin;
616cd093f1dSJed Brown   /* Get ci and cj - by merging sorted rows using a heap */
6170ced3a2bSJed Brown   /*---------------------------------------------------------------------------------------------*/
6180ced3a2bSJed Brown   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
619854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
6200ced3a2bSJed Brown   ci[0] = 0;
6210ced3a2bSJed Brown 
6220ced3a2bSJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
623f91af8c7SBarry Smith   ierr          = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
6240ced3a2bSJed Brown   current_space = free_space;
6250ced3a2bSJed Brown 
6260ced3a2bSJed Brown   ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr);
627785e854fSJed Brown   ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr);
6280ced3a2bSJed Brown 
6290ced3a2bSJed Brown   /* Determine ci and cj */
6300ced3a2bSJed Brown   for (i=0; i<am; i++) {
6310ced3a2bSJed Brown     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
6320ced3a2bSJed Brown     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
6330ced3a2bSJed Brown     ci[i+1] = ci[i];
6340ced3a2bSJed Brown     /* Populate the min heap */
6350ced3a2bSJed Brown     for (j=0; j<anzi; j++) {
6360ced3a2bSJed Brown       bb[j] = bi[acol[j]];         /* bb points at the start of the row */
6370ced3a2bSJed Brown       if (bb[j] < bi[acol[j]+1]) { /* Add if row is nonempty */
6380ced3a2bSJed Brown         ierr = PetscHeapAdd(h,j,bj[bb[j]++]);CHKERRQ(ierr);
6390ced3a2bSJed Brown       }
6400ced3a2bSJed Brown     }
6410ced3a2bSJed Brown     /* Pick off the min element, adding it to free space */
6420ced3a2bSJed Brown     ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
6430ced3a2bSJed Brown     while (j >= 0) {
6440ced3a2bSJed Brown       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
645f91af8c7SBarry Smith         ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space);CHKERRQ(ierr);
6460ced3a2bSJed Brown         ndouble++;
6470ced3a2bSJed Brown       }
6480ced3a2bSJed Brown       *(current_space->array++) = col;
6490ced3a2bSJed Brown       current_space->local_used++;
6500ced3a2bSJed Brown       current_space->local_remaining--;
6510ced3a2bSJed Brown       ci[i+1]++;
6520ced3a2bSJed Brown 
6530ced3a2bSJed Brown       /* stash if anything else remains in this row of B */
6540ced3a2bSJed Brown       if (bb[j] < bi[acol[j]+1]) {ierr = PetscHeapStash(h,j,bj[bb[j]++]);CHKERRQ(ierr);}
6550ced3a2bSJed Brown       while (1) {               /* pop and stash any other rows of B that also had an entry in this column */
6560ced3a2bSJed Brown         PetscInt j2,col2;
6570ced3a2bSJed Brown         ierr = PetscHeapPeek(h,&j2,&col2);CHKERRQ(ierr);
6580ced3a2bSJed Brown         if (col2 != col) break;
6590ced3a2bSJed Brown         ierr = PetscHeapPop(h,&j2,&col2);CHKERRQ(ierr);
6600ced3a2bSJed Brown         if (bb[j2] < bi[acol[j2]+1]) {ierr = PetscHeapStash(h,j2,bj[bb[j2]++]);CHKERRQ(ierr);}
6610ced3a2bSJed Brown       }
6620ced3a2bSJed Brown       /* Put any stashed elements back into the min heap */
6630ced3a2bSJed Brown       ierr = PetscHeapUnstash(h);CHKERRQ(ierr);
6640ced3a2bSJed Brown       ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
6650ced3a2bSJed Brown     }
6660ced3a2bSJed Brown   }
6670ced3a2bSJed Brown   ierr = PetscFree(bb);CHKERRQ(ierr);
6680ced3a2bSJed Brown   ierr = PetscHeapDestroy(&h);CHKERRQ(ierr);
6690ced3a2bSJed Brown 
6700ced3a2bSJed Brown   /* Column indices are in the list of free space */
6710ced3a2bSJed Brown   /* Allocate space for cj, initialize cj, and */
6720ced3a2bSJed Brown   /* destroy list of free space and other temporary array(s) */
673785e854fSJed Brown   ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr);
6740ced3a2bSJed Brown   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
6750ced3a2bSJed Brown 
6760ced3a2bSJed Brown   /* put together the new symbolic matrix */
677e4e71118SStefano Zampini   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
6784222ddf1SHong Zhang   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
6790ced3a2bSJed Brown 
6800ced3a2bSJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
6810ced3a2bSJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
6824222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
6830ced3a2bSJed Brown   c->free_a  = PETSC_TRUE;
6840ced3a2bSJed Brown   c->free_ij = PETSC_TRUE;
6850ced3a2bSJed Brown   c->nonew   = 0;
68626fbe8dcSKarl Rupp 
6874222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
6880ced3a2bSJed Brown 
6890ced3a2bSJed Brown   /* set MatInfo */
6900ced3a2bSJed Brown   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
6910ced3a2bSJed Brown   if (afill < 1.0) afill = 1.0;
6920ced3a2bSJed Brown   c->maxnz                     = ci[am];
6930ced3a2bSJed Brown   c->nz                        = ci[am];
6944222ddf1SHong Zhang   C->info.mallocs           = ndouble;
6954222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
6964222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
6970ced3a2bSJed Brown 
6980ced3a2bSJed Brown #if defined(PETSC_USE_INFO)
6990ced3a2bSJed Brown   if (ci[am]) {
7007d3de750SJacob Faibussowitsch     ierr = PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
7017d3de750SJacob Faibussowitsch     ierr = PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
7020ced3a2bSJed Brown   } else {
7034222ddf1SHong Zhang     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
7040ced3a2bSJed Brown   }
7050ced3a2bSJed Brown #endif
7060ced3a2bSJed Brown   PetscFunctionReturn(0);
7070ced3a2bSJed Brown }
708e9e4536cSHong Zhang 
7094222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A,Mat B,PetscReal fill,Mat C)
7108a07c6f1SJed Brown {
7118a07c6f1SJed Brown   PetscErrorCode     ierr;
7128a07c6f1SJed Brown   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
7138a07c6f1SJed Brown   const PetscInt     *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
7148a07c6f1SJed Brown   PetscInt           *ci,*cj,*bb;
7158a07c6f1SJed Brown   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
7168a07c6f1SJed Brown   PetscReal          afill;
7178a07c6f1SJed Brown   PetscInt           i,j,col,ndouble = 0;
7180298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
7198a07c6f1SJed Brown   PetscHeap          h;
7208a07c6f1SJed Brown   PetscBT            bt;
7218a07c6f1SJed Brown 
7228a07c6f1SJed Brown   PetscFunctionBegin;
723cd093f1dSJed Brown   /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */
7248a07c6f1SJed Brown   /*---------------------------------------------------------------------------------------------*/
7258a07c6f1SJed Brown   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
726854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
7278a07c6f1SJed Brown   ci[0] = 0;
7288a07c6f1SJed Brown 
7298a07c6f1SJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
730f91af8c7SBarry Smith   ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
7312205254eSKarl Rupp 
7328a07c6f1SJed Brown   current_space = free_space;
7338a07c6f1SJed Brown 
7348a07c6f1SJed Brown   ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr);
735785e854fSJed Brown   ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr);
7368a07c6f1SJed Brown   ierr = PetscBTCreate(bn,&bt);CHKERRQ(ierr);
7378a07c6f1SJed Brown 
7388a07c6f1SJed Brown   /* Determine ci and cj */
7398a07c6f1SJed Brown   for (i=0; i<am; i++) {
7408a07c6f1SJed Brown     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
7418a07c6f1SJed Brown     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
7428a07c6f1SJed Brown     const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */
7438a07c6f1SJed Brown     ci[i+1] = ci[i];
7448a07c6f1SJed Brown     /* Populate the min heap */
7458a07c6f1SJed Brown     for (j=0; j<anzi; j++) {
7468a07c6f1SJed Brown       PetscInt brow = acol[j];
7478a07c6f1SJed Brown       for (bb[j] = bi[brow]; bb[j] < bi[brow+1]; bb[j]++) {
7488a07c6f1SJed Brown         PetscInt bcol = bj[bb[j]];
7498a07c6f1SJed Brown         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
7508a07c6f1SJed Brown           ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr);
7518a07c6f1SJed Brown           bb[j]++;
7528a07c6f1SJed Brown           break;
7538a07c6f1SJed Brown         }
7548a07c6f1SJed Brown       }
7558a07c6f1SJed Brown     }
7568a07c6f1SJed Brown     /* Pick off the min element, adding it to free space */
7578a07c6f1SJed Brown     ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
7588a07c6f1SJed Brown     while (j >= 0) {
7598a07c6f1SJed Brown       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
7600298fd71SBarry Smith         fptr = NULL;                      /* need PetscBTMemzero */
761f91af8c7SBarry Smith         ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space);CHKERRQ(ierr);
7628a07c6f1SJed Brown         ndouble++;
7638a07c6f1SJed Brown       }
7648a07c6f1SJed Brown       *(current_space->array++) = col;
7658a07c6f1SJed Brown       current_space->local_used++;
7668a07c6f1SJed Brown       current_space->local_remaining--;
7678a07c6f1SJed Brown       ci[i+1]++;
7688a07c6f1SJed Brown 
7698a07c6f1SJed Brown       /* stash if anything else remains in this row of B */
7708a07c6f1SJed Brown       for (; bb[j] < bi[acol[j]+1]; bb[j]++) {
7718a07c6f1SJed Brown         PetscInt bcol = bj[bb[j]];
7728a07c6f1SJed Brown         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
7738a07c6f1SJed Brown           ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr);
7748a07c6f1SJed Brown           bb[j]++;
7758a07c6f1SJed Brown           break;
7768a07c6f1SJed Brown         }
7778a07c6f1SJed Brown       }
7788a07c6f1SJed Brown       ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
7798a07c6f1SJed Brown     }
7808a07c6f1SJed Brown     if (fptr) {                 /* Clear the bits for this row */
7818a07c6f1SJed Brown       for (; fptr<current_space->array; fptr++) {ierr = PetscBTClear(bt,*fptr);CHKERRQ(ierr);}
7828a07c6f1SJed Brown     } else {                    /* We reallocated so we don't remember (easily) how to clear only the bits we changed */
7838a07c6f1SJed Brown       ierr = PetscBTMemzero(bn,bt);CHKERRQ(ierr);
7848a07c6f1SJed Brown     }
7858a07c6f1SJed Brown   }
7868a07c6f1SJed Brown   ierr = PetscFree(bb);CHKERRQ(ierr);
7878a07c6f1SJed Brown   ierr = PetscHeapDestroy(&h);CHKERRQ(ierr);
7888a07c6f1SJed Brown   ierr = PetscBTDestroy(&bt);CHKERRQ(ierr);
7898a07c6f1SJed Brown 
7908a07c6f1SJed Brown   /* Column indices are in the list of free space */
7918a07c6f1SJed Brown   /* Allocate space for cj, initialize cj, and */
7928a07c6f1SJed Brown   /* destroy list of free space and other temporary array(s) */
793785e854fSJed Brown   ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr);
7948a07c6f1SJed Brown   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
7958a07c6f1SJed Brown 
7968a07c6f1SJed Brown   /* put together the new symbolic matrix */
797e4e71118SStefano Zampini   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
7984222ddf1SHong Zhang   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
7998a07c6f1SJed Brown 
8008a07c6f1SJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
8018a07c6f1SJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
8024222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
8038a07c6f1SJed Brown   c->free_a  = PETSC_TRUE;
8048a07c6f1SJed Brown   c->free_ij = PETSC_TRUE;
8058a07c6f1SJed Brown   c->nonew   = 0;
80626fbe8dcSKarl Rupp 
8074222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
8088a07c6f1SJed Brown 
8098a07c6f1SJed Brown   /* set MatInfo */
8108a07c6f1SJed Brown   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
8118a07c6f1SJed Brown   if (afill < 1.0) afill = 1.0;
8128a07c6f1SJed Brown   c->maxnz                     = ci[am];
8138a07c6f1SJed Brown   c->nz                        = ci[am];
8144222ddf1SHong Zhang   C->info.mallocs           = ndouble;
8154222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
8164222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
8178a07c6f1SJed Brown 
8188a07c6f1SJed Brown #if defined(PETSC_USE_INFO)
8198a07c6f1SJed Brown   if (ci[am]) {
8207d3de750SJacob Faibussowitsch     ierr = PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
8217d3de750SJacob Faibussowitsch     ierr = PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
8228a07c6f1SJed Brown   } else {
8234222ddf1SHong Zhang     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
8248a07c6f1SJed Brown   }
8258a07c6f1SJed Brown #endif
8268a07c6f1SJed Brown   PetscFunctionReturn(0);
8278a07c6f1SJed Brown }
8288a07c6f1SJed Brown 
8294222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A,Mat B,PetscReal fill,Mat C)
830d7ed1a4dSandi selinger {
831d7ed1a4dSandi selinger   PetscErrorCode     ierr;
832d7ed1a4dSandi selinger   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
833d7ed1a4dSandi selinger   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j,*inputi,*inputj,*inputcol,*inputcol_L1;
834d7ed1a4dSandi selinger   PetscInt           *ci,*cj,*outputj,worki_L1[9],worki_L2[9];
835d7ed1a4dSandi selinger   PetscInt           c_maxmem,a_maxrownnz=0,a_rownnz;
836d7ed1a4dSandi selinger   const PetscInt     workcol[8]={0,1,2,3,4,5,6,7};
837d7ed1a4dSandi selinger   const PetscInt     am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
838d7ed1a4dSandi selinger   const PetscInt     *brow_ptr[8],*brow_end[8];
839d7ed1a4dSandi selinger   PetscInt           window[8];
840d7ed1a4dSandi selinger   PetscInt           window_min,old_window_min,ci_nnz,outputi_nnz=0,L1_nrows,L2_nrows;
841d7ed1a4dSandi selinger   PetscInt           i,k,ndouble=0,L1_rowsleft,rowsleft;
842d7ed1a4dSandi selinger   PetscReal          afill;
843f83700f2Sandi selinger   PetscInt           *workj_L1,*workj_L2,*workj_L3;
8447660c4dbSandi selinger   PetscInt           L1_nnz,L2_nnz;
845d7ed1a4dSandi selinger 
846d7ed1a4dSandi selinger   /* Step 1: Get upper bound on memory required for allocation.
847d7ed1a4dSandi selinger              Because of the way virtual memory works,
848d7ed1a4dSandi selinger              only the memory pages that are actually needed will be physically allocated. */
849d7ed1a4dSandi selinger   PetscFunctionBegin;
850d7ed1a4dSandi selinger   ierr = PetscMalloc1(am+1,&ci);CHKERRQ(ierr);
851d7ed1a4dSandi selinger   for (i=0; i<am; i++) {
852d7ed1a4dSandi selinger     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
853d7ed1a4dSandi selinger     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
854d7ed1a4dSandi selinger     a_rownnz = 0;
855d7ed1a4dSandi selinger     for (k=0; k<anzi; ++k) {
856d7ed1a4dSandi selinger       a_rownnz += bi[acol[k]+1] - bi[acol[k]];
857d7ed1a4dSandi selinger       if (a_rownnz > bn) {
858d7ed1a4dSandi selinger         a_rownnz = bn;
859d7ed1a4dSandi selinger         break;
860d7ed1a4dSandi selinger       }
861d7ed1a4dSandi selinger     }
862d7ed1a4dSandi selinger     a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz);
863d7ed1a4dSandi selinger   }
864d7ed1a4dSandi selinger   /* temporary work areas for merging rows */
865d7ed1a4dSandi selinger   ierr = PetscMalloc1(a_maxrownnz*8,&workj_L1);CHKERRQ(ierr);
866f83700f2Sandi selinger   ierr = PetscMalloc1(a_maxrownnz*8,&workj_L2);CHKERRQ(ierr);
867f83700f2Sandi selinger   ierr = PetscMalloc1(a_maxrownnz,&workj_L3);CHKERRQ(ierr);
868d7ed1a4dSandi selinger 
8697660c4dbSandi selinger   /* This should be enough for almost all matrices. If not, memory is reallocated later. */
8707660c4dbSandi selinger   c_maxmem = 8*(ai[am]+bi[bm]);
871d7ed1a4dSandi selinger   /* Step 2: Populate pattern for C */
872d7ed1a4dSandi selinger   ierr  = PetscMalloc1(c_maxmem,&cj);CHKERRQ(ierr);
873d7ed1a4dSandi selinger 
874d7ed1a4dSandi selinger   ci_nnz       = 0;
875d7ed1a4dSandi selinger   ci[0]        = 0;
876d7ed1a4dSandi selinger   worki_L1[0]  = 0;
877d7ed1a4dSandi selinger   worki_L2[0]  = 0;
878d7ed1a4dSandi selinger   for (i=0; i<am; i++) {
879d7ed1a4dSandi selinger     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
880d7ed1a4dSandi selinger     const PetscInt *acol = aj + ai[i];      /* column indices of nonzero entries in this row */
881d7ed1a4dSandi selinger     rowsleft             = anzi;
882d7ed1a4dSandi selinger     inputcol_L1          = acol;
883d7ed1a4dSandi selinger     L2_nnz               = 0;
8847660c4dbSandi selinger     L2_nrows             = 1;  /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1   */
8857660c4dbSandi selinger     worki_L2[1]          = 0;
88608fe1b3cSKarl Rupp     outputi_nnz          = 0;
887d7ed1a4dSandi selinger 
888d7ed1a4dSandi selinger     /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem  -> allocate more memory */
889d7ed1a4dSandi selinger     while (ci_nnz+a_maxrownnz > c_maxmem) {
890d7ed1a4dSandi selinger       c_maxmem *= 2;
891d7ed1a4dSandi selinger       ndouble++;
892d7ed1a4dSandi selinger       ierr = PetscRealloc(sizeof(PetscInt)*c_maxmem,&cj);CHKERRQ(ierr);
893d7ed1a4dSandi selinger     }
894d7ed1a4dSandi selinger 
895d7ed1a4dSandi selinger     while (rowsleft) {
896d7ed1a4dSandi selinger       L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */
897d7ed1a4dSandi selinger       L1_nrows    = 0;
8987660c4dbSandi selinger       L1_nnz      = 0;
899d7ed1a4dSandi selinger       inputcol    = inputcol_L1;
900d7ed1a4dSandi selinger       inputi      = bi;
901d7ed1a4dSandi selinger       inputj      = bj;
902d7ed1a4dSandi selinger 
903d7ed1a4dSandi selinger       /* The following macro is used to specialize for small rows in A.
904d7ed1a4dSandi selinger          This helps with compiler unrolling, improving performance substantially.
905f83700f2Sandi selinger           Input:  inputj   inputi  inputcol  bn
906d7ed1a4dSandi selinger           Output: outputj  outputi_nnz                       */
907d7ed1a4dSandi selinger        #define MatMatMultSymbolic_RowMergeMacro(ANNZ)                        \
908d7ed1a4dSandi selinger          window_min  = bn;                                                   \
9097660c4dbSandi selinger          outputi_nnz = 0;                                                    \
9107660c4dbSandi selinger          for (k=0; k<ANNZ; ++k) {                                            \
911d7ed1a4dSandi selinger            brow_ptr[k] = inputj + inputi[inputcol[k]];                       \
912d7ed1a4dSandi selinger            brow_end[k] = inputj + inputi[inputcol[k]+1];                     \
913d7ed1a4dSandi selinger            window[k]   = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn;   \
914d7ed1a4dSandi selinger            window_min  = PetscMin(window[k], window_min);                    \
915d7ed1a4dSandi selinger          }                                                                   \
916d7ed1a4dSandi selinger          while (window_min < bn) {                                           \
917d7ed1a4dSandi selinger            outputj[outputi_nnz++] = window_min;                              \
918d7ed1a4dSandi selinger            /* advance front and compute new minimum */                       \
919d7ed1a4dSandi selinger            old_window_min = window_min;                                      \
920d7ed1a4dSandi selinger            window_min = bn;                                                  \
921d7ed1a4dSandi selinger            for (k=0; k<ANNZ; ++k) {                                          \
922d7ed1a4dSandi selinger              if (window[k] == old_window_min) {                              \
923d7ed1a4dSandi selinger                brow_ptr[k]++;                                                \
924d7ed1a4dSandi selinger                window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
925d7ed1a4dSandi selinger              }                                                               \
926d7ed1a4dSandi selinger              window_min = PetscMin(window[k], window_min);                   \
927d7ed1a4dSandi selinger            }                                                                 \
928d7ed1a4dSandi selinger          }
929d7ed1a4dSandi selinger 
930d7ed1a4dSandi selinger       /************** L E V E L  1 ***************/
931d7ed1a4dSandi selinger       /* Merge up to 8 rows of B to L1 work array*/
932d7ed1a4dSandi selinger       while (L1_rowsleft) {
9337660c4dbSandi selinger         outputi_nnz = 0;
9347660c4dbSandi selinger         if (anzi > 8)  outputj = workj_L1 + L1_nnz;     /* Level 1 rowmerge*/
9357660c4dbSandi selinger         else           outputj = cj + ci_nnz;           /* Merge directly to C */
9367660c4dbSandi selinger 
937d7ed1a4dSandi selinger         switch (L1_rowsleft) {
938d7ed1a4dSandi selinger         case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
939d7ed1a4dSandi selinger                  brow_end[0] = inputj + inputi[inputcol[0]+1];
940d7ed1a4dSandi selinger                  for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
941d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
942d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
943d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
944d7ed1a4dSandi selinger                  break;
945d7ed1a4dSandi selinger         case 2:  MatMatMultSymbolic_RowMergeMacro(2);
946d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
947d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
948d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
949d7ed1a4dSandi selinger                  break;
950d7ed1a4dSandi selinger         case 3: MatMatMultSymbolic_RowMergeMacro(3);
951d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
952d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
953d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
954d7ed1a4dSandi selinger                  break;
955d7ed1a4dSandi selinger         case 4:  MatMatMultSymbolic_RowMergeMacro(4);
956d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
957d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
958d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
959d7ed1a4dSandi selinger                  break;
960d7ed1a4dSandi selinger         case 5:  MatMatMultSymbolic_RowMergeMacro(5);
961d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
962d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
963d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
964d7ed1a4dSandi selinger                  break;
965d7ed1a4dSandi selinger         case 6:  MatMatMultSymbolic_RowMergeMacro(6);
966d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
967d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
968d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
969d7ed1a4dSandi selinger                  break;
970d7ed1a4dSandi selinger         case 7:  MatMatMultSymbolic_RowMergeMacro(7);
971d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
972d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
973d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
974d7ed1a4dSandi selinger                  break;
975d7ed1a4dSandi selinger         default: MatMatMultSymbolic_RowMergeMacro(8);
976d7ed1a4dSandi selinger                  inputcol    += 8;
977d7ed1a4dSandi selinger                  rowsleft    -= 8;
978d7ed1a4dSandi selinger                  L1_rowsleft -= 8;
979d7ed1a4dSandi selinger                  break;
980d7ed1a4dSandi selinger         }
981d7ed1a4dSandi selinger         inputcol_L1           = inputcol;
9827660c4dbSandi selinger         L1_nnz               += outputi_nnz;
9837660c4dbSandi selinger         worki_L1[++L1_nrows]  = L1_nnz;
984d7ed1a4dSandi selinger       }
985d7ed1a4dSandi selinger 
986d7ed1a4dSandi selinger       /********************** L E V E L  2 ************************/
987d7ed1a4dSandi selinger       /* Merge from L1 work array to either C or to L2 work array */
988d7ed1a4dSandi selinger       if (anzi > 8) {
989d7ed1a4dSandi selinger         inputi      = worki_L1;
990d7ed1a4dSandi selinger         inputj      = workj_L1;
991d7ed1a4dSandi selinger         inputcol    = workcol;
992d7ed1a4dSandi selinger         outputi_nnz = 0;
993d7ed1a4dSandi selinger 
994d7ed1a4dSandi selinger         if (anzi <= 64) outputj = cj + ci_nnz;        /* Merge from L1 work array to C */
995d7ed1a4dSandi selinger         else            outputj = workj_L2 + L2_nnz;  /* Merge from L1 work array to L2 work array */
996d7ed1a4dSandi selinger 
997d7ed1a4dSandi selinger         switch (L1_nrows) {
998d7ed1a4dSandi selinger         case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
999d7ed1a4dSandi selinger                  brow_end[0] = inputj + inputi[inputcol[0]+1];
1000d7ed1a4dSandi selinger                  for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
1001d7ed1a4dSandi selinger                  break;
1002d7ed1a4dSandi selinger         case 2:  MatMatMultSymbolic_RowMergeMacro(2); break;
1003d7ed1a4dSandi selinger         case 3:  MatMatMultSymbolic_RowMergeMacro(3); break;
1004d7ed1a4dSandi selinger         case 4:  MatMatMultSymbolic_RowMergeMacro(4); break;
1005d7ed1a4dSandi selinger         case 5:  MatMatMultSymbolic_RowMergeMacro(5); break;
1006d7ed1a4dSandi selinger         case 6:  MatMatMultSymbolic_RowMergeMacro(6); break;
1007d7ed1a4dSandi selinger         case 7:  MatMatMultSymbolic_RowMergeMacro(7); break;
1008d7ed1a4dSandi selinger         case 8:  MatMatMultSymbolic_RowMergeMacro(8); break;
1009d7ed1a4dSandi selinger         default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L1 work array!");
1010d7ed1a4dSandi selinger         }
1011d7ed1a4dSandi selinger         L2_nnz               += outputi_nnz;
1012d7ed1a4dSandi selinger         worki_L2[++L2_nrows]  = L2_nnz;
1013d7ed1a4dSandi selinger 
10147660c4dbSandi selinger         /************************ L E V E L  3 **********************/
10157660c4dbSandi selinger         /* Merge from L2 work array to either C or to L2 work array */
1016d7ed1a4dSandi selinger         if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) {
1017d7ed1a4dSandi selinger           inputi      = worki_L2;
1018d7ed1a4dSandi selinger           inputj      = workj_L2;
1019d7ed1a4dSandi selinger           inputcol    = workcol;
1020d7ed1a4dSandi selinger           outputi_nnz = 0;
1021f83700f2Sandi selinger           if (rowsleft) outputj = workj_L3;
1022d7ed1a4dSandi selinger           else          outputj = cj + ci_nnz;
1023d7ed1a4dSandi selinger           switch (L2_nrows) {
1024d7ed1a4dSandi selinger           case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
1025d7ed1a4dSandi selinger                    brow_end[0] = inputj + inputi[inputcol[0]+1];
1026d7ed1a4dSandi selinger                    for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
1027d7ed1a4dSandi selinger                    break;
1028d7ed1a4dSandi selinger           case 2:  MatMatMultSymbolic_RowMergeMacro(2); break;
1029d7ed1a4dSandi selinger           case 3:  MatMatMultSymbolic_RowMergeMacro(3); break;
1030d7ed1a4dSandi selinger           case 4:  MatMatMultSymbolic_RowMergeMacro(4); break;
1031d7ed1a4dSandi selinger           case 5:  MatMatMultSymbolic_RowMergeMacro(5); break;
1032d7ed1a4dSandi selinger           case 6:  MatMatMultSymbolic_RowMergeMacro(6); break;
1033d7ed1a4dSandi selinger           case 7:  MatMatMultSymbolic_RowMergeMacro(7); break;
1034d7ed1a4dSandi selinger           case 8:  MatMatMultSymbolic_RowMergeMacro(8); break;
1035d7ed1a4dSandi selinger           default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L2 work array!");
1036d7ed1a4dSandi selinger           }
1037d7ed1a4dSandi selinger           L2_nrows    = 1;
10387660c4dbSandi selinger           L2_nnz      = outputi_nnz;
10397660c4dbSandi selinger           worki_L2[1] = outputi_nnz;
10407660c4dbSandi selinger           /* Copy to workj_L2 */
1041d7ed1a4dSandi selinger           if (rowsleft) {
10427660c4dbSandi selinger             for (k=0; k<outputi_nnz; ++k)  workj_L2[k] = outputj[k];
1043d7ed1a4dSandi selinger           }
1044d7ed1a4dSandi selinger         }
1045d7ed1a4dSandi selinger       }
1046d7ed1a4dSandi selinger     }  /* while (rowsleft) */
1047d7ed1a4dSandi selinger #undef MatMatMultSymbolic_RowMergeMacro
1048d7ed1a4dSandi selinger 
1049d7ed1a4dSandi selinger     /* terminate current row */
1050d7ed1a4dSandi selinger     ci_nnz += outputi_nnz;
1051d7ed1a4dSandi selinger     ci[i+1] = ci_nnz;
1052d7ed1a4dSandi selinger   }
1053d7ed1a4dSandi selinger 
1054d7ed1a4dSandi selinger   /* Step 3: Create the new symbolic matrix */
1055e4e71118SStefano Zampini   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
10564222ddf1SHong Zhang   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
1057d7ed1a4dSandi selinger 
1058d7ed1a4dSandi selinger   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1059d7ed1a4dSandi selinger   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
10604222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
1061d7ed1a4dSandi selinger   c->free_a  = PETSC_TRUE;
1062d7ed1a4dSandi selinger   c->free_ij = PETSC_TRUE;
1063d7ed1a4dSandi selinger   c->nonew   = 0;
1064d7ed1a4dSandi selinger 
10654222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1066d7ed1a4dSandi selinger 
1067d7ed1a4dSandi selinger   /* set MatInfo */
1068d7ed1a4dSandi selinger   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
1069d7ed1a4dSandi selinger   if (afill < 1.0) afill = 1.0;
1070d7ed1a4dSandi selinger   c->maxnz                  = ci[am];
1071d7ed1a4dSandi selinger   c->nz                     = ci[am];
10724222ddf1SHong Zhang   C->info.mallocs           = ndouble;
10734222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
10744222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
1075d7ed1a4dSandi selinger 
1076d7ed1a4dSandi selinger #if defined(PETSC_USE_INFO)
1077d7ed1a4dSandi selinger   if (ci[am]) {
10787d3de750SJacob Faibussowitsch     ierr = PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
10797d3de750SJacob Faibussowitsch     ierr = PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
1080d7ed1a4dSandi selinger   } else {
10814222ddf1SHong Zhang     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
1082d7ed1a4dSandi selinger   }
1083d7ed1a4dSandi selinger #endif
1084d7ed1a4dSandi selinger 
1085d7ed1a4dSandi selinger   /* Step 4: Free temporary work areas */
1086d7ed1a4dSandi selinger   ierr = PetscFree(workj_L1);CHKERRQ(ierr);
1087d7ed1a4dSandi selinger   ierr = PetscFree(workj_L2);CHKERRQ(ierr);
1088f83700f2Sandi selinger   ierr = PetscFree(workj_L3);CHKERRQ(ierr);
1089d7ed1a4dSandi selinger   PetscFunctionReturn(0);
1090d7ed1a4dSandi selinger }
1091d7ed1a4dSandi selinger 
1092cd093f1dSJed Brown /* concatenate unique entries and then sort */
10934222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,PetscReal fill,Mat C)
1094cd093f1dSJed Brown {
1095cd093f1dSJed Brown   PetscErrorCode ierr;
1096cd093f1dSJed Brown   Mat_SeqAIJ     *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
1097cd093f1dSJed Brown   const PetscInt *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
10988c78258cSHong Zhang   PetscInt       *ci,*cj,bcol;
1099cd093f1dSJed Brown   PetscInt       am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
1100cd093f1dSJed Brown   PetscReal      afill;
1101cd093f1dSJed Brown   PetscInt       i,j,ndouble = 0;
1102cd093f1dSJed Brown   PetscSegBuffer seg,segrow;
1103cd093f1dSJed Brown   char           *seen;
1104cd093f1dSJed Brown 
1105cd093f1dSJed Brown   PetscFunctionBegin;
1106854ce69bSBarry Smith   ierr  = PetscMalloc1(am+1,&ci);CHKERRQ(ierr);
1107cd093f1dSJed Brown   ci[0] = 0;
1108cd093f1dSJed Brown 
1109cd093f1dSJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
1110cd093f1dSJed Brown   ierr = PetscSegBufferCreate(sizeof(PetscInt),(PetscInt)(fill*(ai[am]+bi[bm])),&seg);CHKERRQ(ierr);
1111cd093f1dSJed Brown   ierr = PetscSegBufferCreate(sizeof(PetscInt),100,&segrow);CHKERRQ(ierr);
1112580bdb30SBarry Smith   ierr = PetscCalloc1(bn,&seen);CHKERRQ(ierr);
1113cd093f1dSJed Brown 
1114cd093f1dSJed Brown   /* Determine ci and cj */
1115cd093f1dSJed Brown   for (i=0; i<am; i++) {
1116cd093f1dSJed Brown     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
1117cd093f1dSJed Brown     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
1118cd093f1dSJed Brown     PetscInt packlen = 0,*PETSC_RESTRICT crow;
11198c78258cSHong Zhang 
1120cd093f1dSJed Brown     /* Pack segrow */
1121cd093f1dSJed Brown     for (j=0; j<anzi; j++) {
1122cd093f1dSJed Brown       PetscInt brow = acol[j],bjstart = bi[brow],bjend = bi[brow+1],k;
1123cd093f1dSJed Brown       for (k=bjstart; k<bjend; k++) {
11248c78258cSHong Zhang         bcol = bj[k];
1125cd093f1dSJed Brown         if (!seen[bcol]) { /* new entry */
1126cd093f1dSJed Brown           PetscInt *PETSC_RESTRICT slot;
1127cd093f1dSJed Brown           ierr = PetscSegBufferGetInts(segrow,1,&slot);CHKERRQ(ierr);
1128cd093f1dSJed Brown           *slot = bcol;
1129cd093f1dSJed Brown           seen[bcol] = 1;
1130cd093f1dSJed Brown           packlen++;
1131cd093f1dSJed Brown         }
1132cd093f1dSJed Brown       }
1133cd093f1dSJed Brown     }
11348c78258cSHong Zhang 
11358c78258cSHong Zhang     /* Check i-th diagonal entry */
11368c78258cSHong Zhang     if (C->force_diagonals && !seen[i]) {
11378c78258cSHong Zhang       PetscInt *PETSC_RESTRICT slot;
11388c78258cSHong Zhang       ierr = PetscSegBufferGetInts(segrow,1,&slot);CHKERRQ(ierr);
11398c78258cSHong Zhang       *slot   = i;
11408c78258cSHong Zhang       seen[i] = 1;
11418c78258cSHong Zhang       packlen++;
11428c78258cSHong Zhang     }
11438c78258cSHong Zhang 
1144cd093f1dSJed Brown     ierr = PetscSegBufferGetInts(seg,packlen,&crow);CHKERRQ(ierr);
1145cd093f1dSJed Brown     ierr = PetscSegBufferExtractTo(segrow,crow);CHKERRQ(ierr);
1146cd093f1dSJed Brown     ierr = PetscSortInt(packlen,crow);CHKERRQ(ierr);
1147cd093f1dSJed Brown     ci[i+1] = ci[i] + packlen;
1148cd093f1dSJed Brown     for (j=0; j<packlen; j++) seen[crow[j]] = 0;
1149cd093f1dSJed Brown   }
1150cd093f1dSJed Brown   ierr = PetscSegBufferDestroy(&segrow);CHKERRQ(ierr);
1151cd093f1dSJed Brown   ierr = PetscFree(seen);CHKERRQ(ierr);
1152cd093f1dSJed Brown 
1153cd093f1dSJed Brown   /* Column indices are in the segmented buffer */
1154cd093f1dSJed Brown   ierr = PetscSegBufferExtractAlloc(seg,&cj);CHKERRQ(ierr);
1155cd093f1dSJed Brown   ierr = PetscSegBufferDestroy(&seg);CHKERRQ(ierr);
1156cd093f1dSJed Brown 
1157cd093f1dSJed Brown   /* put together the new symbolic matrix */
1158e4e71118SStefano Zampini   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
11594222ddf1SHong Zhang   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
1160cd093f1dSJed Brown 
1161cd093f1dSJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1162cd093f1dSJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
11634222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
1164cd093f1dSJed Brown   c->free_a  = PETSC_TRUE;
1165cd093f1dSJed Brown   c->free_ij = PETSC_TRUE;
1166cd093f1dSJed Brown   c->nonew   = 0;
1167cd093f1dSJed Brown 
11684222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1169cd093f1dSJed Brown 
1170cd093f1dSJed Brown   /* set MatInfo */
11712a09556fSStefano Zampini   afill = (PetscReal)ci[am]/PetscMax(ai[am]+bi[bm],1) + 1.e-5;
1172cd093f1dSJed Brown   if (afill < 1.0) afill = 1.0;
1173cd093f1dSJed Brown   c->maxnz                  = ci[am];
1174cd093f1dSJed Brown   c->nz                     = ci[am];
11754222ddf1SHong Zhang   C->info.mallocs           = ndouble;
11764222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
11774222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
1178cd093f1dSJed Brown 
1179cd093f1dSJed Brown #if defined(PETSC_USE_INFO)
1180cd093f1dSJed Brown   if (ci[am]) {
11817d3de750SJacob Faibussowitsch     ierr = PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
11827d3de750SJacob Faibussowitsch     ierr = PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
1183cd093f1dSJed Brown   } else {
11844222ddf1SHong Zhang     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
1185cd093f1dSJed Brown   }
1186cd093f1dSJed Brown #endif
1187cd093f1dSJed Brown   PetscFunctionReturn(0);
1188cd093f1dSJed Brown }
1189cd093f1dSJed Brown 
11906718818eSStefano Zampini PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(void *data)
11912128a86cSHong Zhang {
11922128a86cSHong Zhang   PetscErrorCode      ierr;
11936718818eSStefano Zampini   Mat_MatMatTransMult *abt=(Mat_MatMatTransMult *)data;
11942128a86cSHong Zhang 
11952128a86cSHong Zhang   PetscFunctionBegin;
119640192850SHong Zhang   ierr = MatTransposeColoringDestroy(&abt->matcoloring);CHKERRQ(ierr);
119740192850SHong Zhang   ierr = MatDestroy(&abt->Bt_den);CHKERRQ(ierr);
119840192850SHong Zhang   ierr = MatDestroy(&abt->ABt_den);CHKERRQ(ierr);
119940192850SHong Zhang   ierr = PetscFree(abt);CHKERRQ(ierr);
12002128a86cSHong Zhang   PetscFunctionReturn(0);
12012128a86cSHong Zhang }
12022128a86cSHong Zhang 
12034222ddf1SHong Zhang PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
1204bc011b1eSHong Zhang {
12055df89d91SHong Zhang   PetscErrorCode      ierr;
120681d82fe4SHong Zhang   Mat                 Bt;
120781d82fe4SHong Zhang   PetscInt            *bti,*btj;
120840192850SHong Zhang   Mat_MatMatTransMult *abt;
12094222ddf1SHong Zhang   Mat_Product         *product = C->product;
12106718818eSStefano Zampini   char                *alg;
1211d2853540SHong Zhang 
121281d82fe4SHong Zhang   PetscFunctionBegin;
1213*2c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!product,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
1214*2c71b3e2SJacob Faibussowitsch   PetscCheckFalse(product->data,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Extra product struct not empty");
12156718818eSStefano Zampini 
121681d82fe4SHong Zhang   /* create symbolic Bt */
121781d82fe4SHong Zhang   ierr = MatGetSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
12180298fd71SBarry Smith   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,B->cmap->n,B->rmap->n,bti,btj,NULL,&Bt);CHKERRQ(ierr);
121933d57670SJed Brown   ierr = MatSetBlockSizes(Bt,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr);
122004b858e0SBarry Smith   ierr = MatSetType(Bt,((PetscObject)A)->type_name);CHKERRQ(ierr);
122181d82fe4SHong Zhang 
122281d82fe4SHong Zhang   /* get symbolic C=A*Bt */
12236718818eSStefano Zampini   ierr = PetscStrallocpy(product->alg,&alg);CHKERRQ(ierr);
12244222ddf1SHong Zhang   ierr = MatProductSetAlgorithm(C,"sorted");CHKERRQ(ierr); /* set algorithm for C = A*Bt */
122581d82fe4SHong Zhang   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C);CHKERRQ(ierr);
12264222ddf1SHong Zhang   ierr = MatProductSetAlgorithm(C,alg);CHKERRQ(ierr); /* resume original algorithm for ABt product */
12276718818eSStefano Zampini   ierr = PetscFree(alg);CHKERRQ(ierr);
122881d82fe4SHong Zhang 
1229a5b23f4aSJose E. Roman   /* create a supporting struct for reuse intermediate dense matrices with matcoloring */
1230b00a9115SJed Brown   ierr = PetscNew(&abt);CHKERRQ(ierr);
12312128a86cSHong Zhang 
12326718818eSStefano Zampini   product->data    = abt;
12336718818eSStefano Zampini   product->destroy = MatDestroy_SeqAIJ_MatMatMultTrans;
12346718818eSStefano Zampini 
12354222ddf1SHong Zhang   C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ;
12362128a86cSHong Zhang 
12374222ddf1SHong Zhang   abt->usecoloring = PETSC_FALSE;
12384222ddf1SHong Zhang   ierr = PetscStrcmp(product->alg,"color",&abt->usecoloring);CHKERRQ(ierr);
123940192850SHong Zhang   if (abt->usecoloring) {
1240b9af6bddSHong Zhang     /* Create MatTransposeColoring from symbolic C=A*B^T */
1241b9af6bddSHong Zhang     MatTransposeColoring matcoloring;
1242335efc43SPeter Brune     MatColoring          coloring;
12432128a86cSHong Zhang     ISColoring           iscoloring;
12442128a86cSHong Zhang     Mat                  Bt_dense,C_dense;
1245e8354b3eSHong Zhang 
12464222ddf1SHong Zhang     /* inode causes memory problem */
12474222ddf1SHong Zhang     ierr = MatSetOption(C,MAT_USE_INODES,PETSC_FALSE);CHKERRQ(ierr);
12484222ddf1SHong Zhang 
12494222ddf1SHong Zhang     ierr = MatColoringCreate(C,&coloring);CHKERRQ(ierr);
1250335efc43SPeter Brune     ierr = MatColoringSetDistance(coloring,2);CHKERRQ(ierr);
1251335efc43SPeter Brune     ierr = MatColoringSetType(coloring,MATCOLORINGSL);CHKERRQ(ierr);
1252335efc43SPeter Brune     ierr = MatColoringSetFromOptions(coloring);CHKERRQ(ierr);
1253335efc43SPeter Brune     ierr = MatColoringApply(coloring,&iscoloring);CHKERRQ(ierr);
1254335efc43SPeter Brune     ierr = MatColoringDestroy(&coloring);CHKERRQ(ierr);
12554222ddf1SHong Zhang     ierr = MatTransposeColoringCreate(C,iscoloring,&matcoloring);CHKERRQ(ierr);
12562205254eSKarl Rupp 
125740192850SHong Zhang     abt->matcoloring = matcoloring;
12582205254eSKarl Rupp 
12592128a86cSHong Zhang     ierr = ISColoringDestroy(&iscoloring);CHKERRQ(ierr);
12602128a86cSHong Zhang 
12612128a86cSHong Zhang     /* Create Bt_dense and C_dense = A*Bt_dense */
12622128a86cSHong Zhang     ierr = MatCreate(PETSC_COMM_SELF,&Bt_dense);CHKERRQ(ierr);
12632128a86cSHong Zhang     ierr = MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors);CHKERRQ(ierr);
12642128a86cSHong Zhang     ierr = MatSetType(Bt_dense,MATSEQDENSE);CHKERRQ(ierr);
12650298fd71SBarry Smith     ierr = MatSeqDenseSetPreallocation(Bt_dense,NULL);CHKERRQ(ierr);
12662205254eSKarl Rupp 
12672128a86cSHong Zhang     Bt_dense->assembled = PETSC_TRUE;
126840192850SHong Zhang     abt->Bt_den         = Bt_dense;
12692128a86cSHong Zhang 
12702128a86cSHong Zhang     ierr = MatCreate(PETSC_COMM_SELF,&C_dense);CHKERRQ(ierr);
12712128a86cSHong Zhang     ierr = MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors);CHKERRQ(ierr);
12722128a86cSHong Zhang     ierr = MatSetType(C_dense,MATSEQDENSE);CHKERRQ(ierr);
12730298fd71SBarry Smith     ierr = MatSeqDenseSetPreallocation(C_dense,NULL);CHKERRQ(ierr);
12742205254eSKarl Rupp 
12752128a86cSHong Zhang     Bt_dense->assembled = PETSC_TRUE;
127640192850SHong Zhang     abt->ABt_den  = C_dense;
1277f94ccd6cSHong Zhang 
1278f94ccd6cSHong Zhang #if defined(PETSC_USE_INFO)
12791ce71dffSSatish Balay     {
12804222ddf1SHong Zhang       Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data;
12817d3de750SJacob Faibussowitsch       ierr = PetscInfo(C,"Use coloring of C=A*B^T; B^T: %" PetscInt_FMT " %" PetscInt_FMT ", Bt_dense: %" PetscInt_FMT ",%" PetscInt_FMT "; Cnz %" PetscInt_FMT " / (cm*ncolors %" PetscInt_FMT ") = %g\n",B->cmap->n,B->rmap->n,Bt_dense->rmap->n,Bt_dense->cmap->n,c->nz,A->rmap->n*matcoloring->ncolors,(double)(((PetscReal)(c->nz))/((PetscReal)(A->rmap->n*matcoloring->ncolors))));CHKERRQ(ierr);
12821ce71dffSSatish Balay     }
1283f94ccd6cSHong Zhang #endif
12842128a86cSHong Zhang   }
1285e99be685SHong Zhang   /* clean up */
1286e99be685SHong Zhang   ierr = MatDestroy(&Bt);CHKERRQ(ierr);
1287e99be685SHong Zhang   ierr = MatRestoreSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
12885df89d91SHong Zhang   PetscFunctionReturn(0);
12895df89d91SHong Zhang }
12905df89d91SHong Zhang 
12916fc122caSHong Zhang PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
12925df89d91SHong Zhang {
12935df89d91SHong Zhang   PetscErrorCode      ierr;
12945df89d91SHong Zhang   Mat_SeqAIJ          *a   =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1295e2cac8caSJed Brown   PetscInt            *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow;
129681d82fe4SHong Zhang   PetscInt            cm   =C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol;
12975df89d91SHong Zhang   PetscLogDouble      flops=0.0;
1298aa1aec99SHong Zhang   MatScalar           *aa  =a->a,*aval,*ba=b->a,*bval,*ca,*cval;
12996718818eSStefano Zampini   Mat_MatMatTransMult *abt;
13006718818eSStefano Zampini   Mat_Product         *product = C->product;
13015df89d91SHong Zhang 
13025df89d91SHong Zhang   PetscFunctionBegin;
1303*2c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!product,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
13046718818eSStefano Zampini   abt = (Mat_MatMatTransMult *)product->data;
1305*2c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!abt,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
130658ed3ceaSHong Zhang   /* clear old values in C */
130758ed3ceaSHong Zhang   if (!c->a) {
1308580bdb30SBarry Smith     ierr      = PetscCalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
130958ed3ceaSHong Zhang     c->a      = ca;
131058ed3ceaSHong Zhang     c->free_a = PETSC_TRUE;
131158ed3ceaSHong Zhang   } else {
131258ed3ceaSHong Zhang     ca =  c->a;
1313580bdb30SBarry Smith     ierr = PetscArrayzero(ca,ci[cm]+1);CHKERRQ(ierr);
131458ed3ceaSHong Zhang   }
131558ed3ceaSHong Zhang 
131640192850SHong Zhang   if (abt->usecoloring) {
131740192850SHong Zhang     MatTransposeColoring matcoloring = abt->matcoloring;
131840192850SHong Zhang     Mat                  Bt_dense,C_dense = abt->ABt_den;
1319c8db22f6SHong Zhang 
1320b9af6bddSHong Zhang     /* Get Bt_dense by Apply MatTransposeColoring to B */
132140192850SHong Zhang     Bt_dense = abt->Bt_den;
1322b9af6bddSHong Zhang     ierr = MatTransColoringApplySpToDen(matcoloring,B,Bt_dense);CHKERRQ(ierr);
1323c8db22f6SHong Zhang 
1324c8db22f6SHong Zhang     /* C_dense = A*Bt_dense */
13252128a86cSHong Zhang     ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense);CHKERRQ(ierr);
1326c8db22f6SHong Zhang 
13272128a86cSHong Zhang     /* Recover C from C_dense */
1328b9af6bddSHong Zhang     ierr = MatTransColoringApplyDenToSp(matcoloring,C_dense,C);CHKERRQ(ierr);
1329c8db22f6SHong Zhang     PetscFunctionReturn(0);
1330c8db22f6SHong Zhang   }
1331c8db22f6SHong Zhang 
13321fa1209cSHong Zhang   for (i=0; i<cm; i++) {
133381d82fe4SHong Zhang     anzi = ai[i+1] - ai[i];
133481d82fe4SHong Zhang     acol = aj + ai[i];
13356973769bSHong Zhang     aval = aa + ai[i];
13361fa1209cSHong Zhang     cnzi = ci[i+1] - ci[i];
13371fa1209cSHong Zhang     ccol = cj + ci[i];
13386973769bSHong Zhang     cval = ca + ci[i];
13391fa1209cSHong Zhang     for (j=0; j<cnzi; j++) {
134081d82fe4SHong Zhang       brow = ccol[j];
134181d82fe4SHong Zhang       bnzj = bi[brow+1] - bi[brow];
134281d82fe4SHong Zhang       bcol = bj + bi[brow];
13436973769bSHong Zhang       bval = ba + bi[brow];
13446973769bSHong Zhang 
134581d82fe4SHong Zhang       /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
134681d82fe4SHong Zhang       nexta = 0; nextb = 0;
134781d82fe4SHong Zhang       while (nexta<anzi && nextb<bnzj) {
13487b6d5e96SMark Adams         while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++;
134981d82fe4SHong Zhang         if (nexta == anzi) break;
13507b6d5e96SMark Adams         while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++;
135181d82fe4SHong Zhang         if (nextb == bnzj) break;
135281d82fe4SHong Zhang         if (acol[nexta] == bcol[nextb]) {
13536973769bSHong Zhang           cval[j] += aval[nexta]*bval[nextb];
135481d82fe4SHong Zhang           nexta++; nextb++;
135581d82fe4SHong Zhang           flops += 2;
13561fa1209cSHong Zhang         }
13571fa1209cSHong Zhang       }
135881d82fe4SHong Zhang     }
135981d82fe4SHong Zhang   }
136081d82fe4SHong Zhang   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
136181d82fe4SHong Zhang   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
136281d82fe4SHong Zhang   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
13635df89d91SHong Zhang   PetscFunctionReturn(0);
13645df89d91SHong Zhang }
13655df89d91SHong Zhang 
13666718818eSStefano Zampini PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(void *data)
13676d373c3eSHong Zhang {
13686d373c3eSHong Zhang   PetscErrorCode      ierr;
13696718818eSStefano Zampini   Mat_MatTransMatMult *atb = (Mat_MatTransMatMult*)data;
13706d373c3eSHong Zhang 
13716d373c3eSHong Zhang   PetscFunctionBegin;
13726d373c3eSHong Zhang   ierr = MatDestroy(&atb->At);CHKERRQ(ierr);
13736718818eSStefano Zampini   if (atb->destroy) {
13746718818eSStefano Zampini     ierr = (*atb->destroy)(atb->data);CHKERRQ(ierr);
13756473ade5SStefano Zampini   }
13766d373c3eSHong Zhang   ierr = PetscFree(atb);CHKERRQ(ierr);
13776d373c3eSHong Zhang   PetscFunctionReturn(0);
13786d373c3eSHong Zhang }
13796d373c3eSHong Zhang 
13804222ddf1SHong Zhang PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
13815df89d91SHong Zhang {
1382bc011b1eSHong Zhang   PetscErrorCode ierr;
1383089a957eSStefano Zampini   Mat            At = NULL;
138438baddfdSBarry Smith   PetscInt       *ati,*atj;
13854222ddf1SHong Zhang   Mat_Product    *product = C->product;
1386089a957eSStefano Zampini   PetscBool      flg,def,square;
1387bc011b1eSHong Zhang 
1388bc011b1eSHong Zhang   PetscFunctionBegin;
1389089a957eSStefano Zampini   MatCheckProduct(C,4);
1390089a957eSStefano Zampini   square = (PetscBool)(A == B && A->symmetric && A->symmetric_set);
13914222ddf1SHong Zhang   /* outerproduct */
1392089a957eSStefano Zampini   ierr = PetscStrcmp(product->alg,"outerproduct",&flg);CHKERRQ(ierr);
13934222ddf1SHong Zhang   if (flg) {
1394bc011b1eSHong Zhang     /* create symbolic At */
1395089a957eSStefano Zampini     if (!square) {
1396bc011b1eSHong Zhang       ierr = MatGetSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
13970298fd71SBarry Smith       ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,A->cmap->n,A->rmap->n,ati,atj,NULL,&At);CHKERRQ(ierr);
139833d57670SJed Brown       ierr = MatSetBlockSizes(At,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr);
139904b858e0SBarry Smith       ierr = MatSetType(At,((PetscObject)A)->type_name);CHKERRQ(ierr);
1400089a957eSStefano Zampini     }
1401bc011b1eSHong Zhang     /* get symbolic C=At*B */
14027a3c3d58SStefano Zampini     ierr = MatProductSetAlgorithm(C,"sorted");CHKERRQ(ierr);
1403089a957eSStefano Zampini     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At,B,fill,C);CHKERRQ(ierr);
1404bc011b1eSHong Zhang 
1405bc011b1eSHong Zhang     /* clean up */
1406089a957eSStefano Zampini     if (!square) {
14076bf464f9SBarry Smith       ierr = MatDestroy(&At);CHKERRQ(ierr);
1408bc011b1eSHong Zhang       ierr = MatRestoreSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
1409089a957eSStefano Zampini     }
14106d373c3eSHong Zhang 
14114222ddf1SHong Zhang     C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */
14127a3c3d58SStefano Zampini     ierr = MatProductSetAlgorithm(C,"outerproduct");CHKERRQ(ierr);
14134222ddf1SHong Zhang     PetscFunctionReturn(0);
14144222ddf1SHong Zhang   }
14154222ddf1SHong Zhang 
14164222ddf1SHong Zhang   /* matmatmult */
1417089a957eSStefano Zampini   ierr = PetscStrcmp(product->alg,"default",&def);CHKERRQ(ierr);
1418089a957eSStefano Zampini   ierr = PetscStrcmp(product->alg,"at*b",&flg);CHKERRQ(ierr);
1419089a957eSStefano Zampini   if (flg || def) {
14204222ddf1SHong Zhang     Mat_MatTransMatMult *atb;
14214222ddf1SHong Zhang 
1422*2c71b3e2SJacob Faibussowitsch     PetscCheckFalse(product->data,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Extra product struct not empty");
14234222ddf1SHong Zhang     ierr = PetscNew(&atb);CHKERRQ(ierr);
1424089a957eSStefano Zampini     if (!square) {
14254222ddf1SHong Zhang       ierr = MatTranspose_SeqAIJ(A,MAT_INITIAL_MATRIX,&At);CHKERRQ(ierr);
1426089a957eSStefano Zampini     }
14277a3c3d58SStefano Zampini     ierr = MatProductSetAlgorithm(C,"sorted");CHKERRQ(ierr);
1428089a957eSStefano Zampini     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At,B,fill,C);CHKERRQ(ierr);
14296718818eSStefano Zampini     ierr = MatProductSetAlgorithm(C,"at*b");CHKERRQ(ierr);
14306718818eSStefano Zampini     product->data    = atb;
14316718818eSStefano Zampini     product->destroy = MatDestroy_SeqAIJ_MatTransMatMult;
14324222ddf1SHong Zhang     atb->At          = At;
14334222ddf1SHong Zhang     atb->updateAt    = PETSC_FALSE; /* because At is computed here */
14344222ddf1SHong Zhang 
14354222ddf1SHong Zhang     C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */
14364222ddf1SHong Zhang     PetscFunctionReturn(0);
14374222ddf1SHong Zhang   }
14384222ddf1SHong Zhang 
14394222ddf1SHong Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported");
1440bc011b1eSHong Zhang }
1441bc011b1eSHong Zhang 
144275648e8dSHong Zhang PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
1443bc011b1eSHong Zhang {
1444bc011b1eSHong Zhang   PetscErrorCode ierr;
14450fbc74f4SHong Zhang   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1446d0f46423SBarry Smith   PetscInt       am=A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb;
1447d0f46423SBarry Smith   PetscInt       cm=C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k;
1448d13dce4bSSatish Balay   PetscLogDouble flops=0.0;
1449aa1aec99SHong Zhang   MatScalar      *aa=a->a,*ba,*ca,*caj;
1450bc011b1eSHong Zhang 
1451bc011b1eSHong Zhang   PetscFunctionBegin;
1452aa1aec99SHong Zhang   if (!c->a) {
1453580bdb30SBarry Smith     ierr = PetscCalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
14542205254eSKarl Rupp 
1455aa1aec99SHong Zhang     c->a      = ca;
1456aa1aec99SHong Zhang     c->free_a = PETSC_TRUE;
1457aa1aec99SHong Zhang   } else {
1458aa1aec99SHong Zhang     ca   = c->a;
1459580bdb30SBarry Smith     ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr);
1460aa1aec99SHong Zhang   }
1461bc011b1eSHong Zhang 
1462bc011b1eSHong Zhang   /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
1463bc011b1eSHong Zhang   for (i=0; i<am; i++) {
1464bc011b1eSHong Zhang     bj   = b->j + bi[i];
1465bc011b1eSHong Zhang     ba   = b->a + bi[i];
1466bc011b1eSHong Zhang     bnzi = bi[i+1] - bi[i];
1467bc011b1eSHong Zhang     anzi = ai[i+1] - ai[i];
1468bc011b1eSHong Zhang     for (j=0; j<anzi; j++) {
1469bc011b1eSHong Zhang       nextb = 0;
14700fbc74f4SHong Zhang       crow  = *aj++;
1471bc011b1eSHong Zhang       cjj   = cj + ci[crow];
1472bc011b1eSHong Zhang       caj   = ca + ci[crow];
1473bc011b1eSHong Zhang       /* perform sparse axpy operation.  Note cjj includes bj. */
1474bc011b1eSHong Zhang       for (k=0; nextb<bnzi; k++) {
14750fbc74f4SHong Zhang         if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */
14760fbc74f4SHong Zhang           caj[k] += (*aa)*(*(ba+nextb));
1477bc011b1eSHong Zhang           nextb++;
1478bc011b1eSHong Zhang         }
1479bc011b1eSHong Zhang       }
1480bc011b1eSHong Zhang       flops += 2*bnzi;
14810fbc74f4SHong Zhang       aa++;
1482bc011b1eSHong Zhang     }
1483bc011b1eSHong Zhang   }
1484bc011b1eSHong Zhang 
1485bc011b1eSHong Zhang   /* Assemble the final matrix and clean up */
1486bc011b1eSHong Zhang   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1487bc011b1eSHong Zhang   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1488bc011b1eSHong Zhang   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
1489bc011b1eSHong Zhang   PetscFunctionReturn(0);
1490bc011b1eSHong Zhang }
14919123193aSHong Zhang 
14924222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
14939123193aSHong Zhang {
14949123193aSHong Zhang   PetscErrorCode ierr;
14959123193aSHong Zhang 
14969123193aSHong Zhang   PetscFunctionBegin;
14975a586d82SBarry Smith   ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr);
14984222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense;
14999123193aSHong Zhang   PetscFunctionReturn(0);
15009123193aSHong Zhang }
15019123193aSHong Zhang 
150293aa15f2SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C,const PetscBool add)
15039123193aSHong Zhang {
1504f32d5d43SBarry Smith   Mat_SeqAIJ        *a=(Mat_SeqAIJ*)A->data;
1505612438f1SStefano Zampini   Mat_SeqDense      *bd=(Mat_SeqDense*)B->data;
15061ca9667aSStefano Zampini   Mat_SeqDense      *cd=(Mat_SeqDense*)C->data;
15079123193aSHong Zhang   PetscErrorCode    ierr;
15081ca9667aSStefano Zampini   PetscScalar       *c,r1,r2,r3,r4,*c1,*c2,*c3,*c4;
1509a4af7ca8SStefano Zampini   const PetscScalar *aa,*b,*b1,*b2,*b3,*b4,*av;
1510daf57211SHong Zhang   const PetscInt    *aj;
1511612438f1SStefano Zampini   PetscInt          cm=C->rmap->n,cn=B->cmap->n,bm=bd->lda,am=A->rmap->n;
15121ca9667aSStefano Zampini   PetscInt          clda=cd->lda;
15131ca9667aSStefano Zampini   PetscInt          am4=4*clda,bm4=4*bm,col,i,j,n;
15149123193aSHong Zhang 
15159123193aSHong Zhang   PetscFunctionBegin;
1516f32d5d43SBarry Smith   if (!cm || !cn) PetscFunctionReturn(0);
1517a4af7ca8SStefano Zampini   ierr = MatSeqAIJGetArrayRead(A,&av);CHKERRQ(ierr);
151893aa15f2SStefano Zampini   if (add) {
15198c778c55SBarry Smith     ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr);
152093aa15f2SStefano Zampini   } else {
152193aa15f2SStefano Zampini     ierr = MatDenseGetArrayWrite(C,&c);CHKERRQ(ierr);
152293aa15f2SStefano Zampini   }
1523a4af7ca8SStefano Zampini   ierr = MatDenseGetArrayRead(B,&b);CHKERRQ(ierr);
1524f32d5d43SBarry Smith   b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
15251ca9667aSStefano Zampini   c1 = c; c2 = c1 + clda; c3 = c2 + clda; c4 = c3 + clda;
15261ca9667aSStefano Zampini   for (col=0; col<(cn/4)*4; col += 4) {  /* over columns of C */
15271ca9667aSStefano Zampini     for (i=0; i<am; i++) {        /* over rows of A in those columns */
1528f32d5d43SBarry Smith       r1 = r2 = r3 = r4 = 0.0;
1529f32d5d43SBarry Smith       n  = a->i[i+1] - a->i[i];
1530f32d5d43SBarry Smith       aj = a->j + a->i[i];
1531a4af7ca8SStefano Zampini       aa = av + a->i[i];
1532f32d5d43SBarry Smith       for (j=0; j<n; j++) {
15331ca9667aSStefano Zampini         const PetscScalar aatmp = aa[j];
15341ca9667aSStefano Zampini         const PetscInt    ajtmp = aj[j];
1535730858b9SHong Zhang         r1 += aatmp*b1[ajtmp];
1536730858b9SHong Zhang         r2 += aatmp*b2[ajtmp];
1537730858b9SHong Zhang         r3 += aatmp*b3[ajtmp];
1538730858b9SHong Zhang         r4 += aatmp*b4[ajtmp];
15399123193aSHong Zhang       }
154093aa15f2SStefano Zampini       if (add) {
154187753ddeSHong Zhang         c1[i] += r1;
154287753ddeSHong Zhang         c2[i] += r2;
154387753ddeSHong Zhang         c3[i] += r3;
154487753ddeSHong Zhang         c4[i] += r4;
154593aa15f2SStefano Zampini       } else {
154693aa15f2SStefano Zampini         c1[i] = r1;
154793aa15f2SStefano Zampini         c2[i] = r2;
154893aa15f2SStefano Zampini         c3[i] = r3;
154993aa15f2SStefano Zampini         c4[i] = r4;
155093aa15f2SStefano Zampini       }
1551f32d5d43SBarry Smith     }
1552730858b9SHong Zhang     b1 += bm4; b2 += bm4; b3 += bm4; b4 += bm4;
1553730858b9SHong Zhang     c1 += am4; c2 += am4; c3 += am4; c4 += am4;
1554f32d5d43SBarry Smith   }
155593aa15f2SStefano Zampini   /* process remaining columns */
155693aa15f2SStefano Zampini   if (col != cn) {
155793aa15f2SStefano Zampini     PetscInt rc = cn-col;
155893aa15f2SStefano Zampini 
155993aa15f2SStefano Zampini     if (rc == 1) {
156093aa15f2SStefano Zampini       for (i=0; i<am; i++) {
1561f32d5d43SBarry Smith         r1 = 0.0;
1562f32d5d43SBarry Smith         n  = a->i[i+1] - a->i[i];
1563f32d5d43SBarry Smith         aj = a->j + a->i[i];
1564a4af7ca8SStefano Zampini         aa = av + a->i[i];
156593aa15f2SStefano Zampini         for (j=0; j<n; j++) r1 += aa[j]*b1[aj[j]];
156693aa15f2SStefano Zampini         if (add) c1[i] += r1;
156793aa15f2SStefano Zampini         else c1[i] = r1;
156893aa15f2SStefano Zampini       }
156993aa15f2SStefano Zampini     } else if (rc == 2) {
157093aa15f2SStefano Zampini       for (i=0; i<am; i++) {
157193aa15f2SStefano Zampini         r1 = r2 = 0.0;
157293aa15f2SStefano Zampini         n  = a->i[i+1] - a->i[i];
157393aa15f2SStefano Zampini         aj = a->j + a->i[i];
157493aa15f2SStefano Zampini         aa = av + a->i[i];
1575f32d5d43SBarry Smith         for (j=0; j<n; j++) {
157693aa15f2SStefano Zampini           const PetscScalar aatmp = aa[j];
157793aa15f2SStefano Zampini           const PetscInt    ajtmp = aj[j];
157893aa15f2SStefano Zampini           r1 += aatmp*b1[ajtmp];
157993aa15f2SStefano Zampini           r2 += aatmp*b2[ajtmp];
1580f32d5d43SBarry Smith         }
158193aa15f2SStefano Zampini         if (add) {
158287753ddeSHong Zhang           c1[i] += r1;
158393aa15f2SStefano Zampini           c2[i] += r2;
158493aa15f2SStefano Zampini         } else {
158593aa15f2SStefano Zampini           c1[i] = r1;
158693aa15f2SStefano Zampini           c2[i] = r2;
1587f32d5d43SBarry Smith         }
158893aa15f2SStefano Zampini       }
158993aa15f2SStefano Zampini     } else {
159093aa15f2SStefano Zampini       for (i=0; i<am; i++) {
159193aa15f2SStefano Zampini         r1 = r2 = r3 = 0.0;
159293aa15f2SStefano Zampini         n  = a->i[i+1] - a->i[i];
159393aa15f2SStefano Zampini         aj = a->j + a->i[i];
159493aa15f2SStefano Zampini         aa = av + a->i[i];
159593aa15f2SStefano Zampini         for (j=0; j<n; j++) {
159693aa15f2SStefano Zampini           const PetscScalar aatmp = aa[j];
159793aa15f2SStefano Zampini           const PetscInt    ajtmp = aj[j];
159893aa15f2SStefano Zampini           r1 += aatmp*b1[ajtmp];
159993aa15f2SStefano Zampini           r2 += aatmp*b2[ajtmp];
160093aa15f2SStefano Zampini           r3 += aatmp*b3[ajtmp];
160193aa15f2SStefano Zampini         }
160293aa15f2SStefano Zampini         if (add) {
160393aa15f2SStefano Zampini           c1[i] += r1;
160493aa15f2SStefano Zampini           c2[i] += r2;
160593aa15f2SStefano Zampini           c3[i] += r3;
160693aa15f2SStefano Zampini         } else {
160793aa15f2SStefano Zampini           c1[i] = r1;
160893aa15f2SStefano Zampini           c2[i] = r2;
160993aa15f2SStefano Zampini           c3[i] = r3;
161093aa15f2SStefano Zampini         }
161193aa15f2SStefano Zampini       }
161293aa15f2SStefano Zampini     }
1613f32d5d43SBarry Smith   }
1614dc0b31edSSatish Balay   ierr = PetscLogFlops(cn*(2.0*a->nz));CHKERRQ(ierr);
161593aa15f2SStefano Zampini   if (add) {
16168c778c55SBarry Smith     ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr);
161793aa15f2SStefano Zampini   } else {
161893aa15f2SStefano Zampini     ierr = MatDenseRestoreArrayWrite(C,&c);CHKERRQ(ierr);
161993aa15f2SStefano Zampini   }
1620a4af7ca8SStefano Zampini   ierr = MatDenseRestoreArrayRead(B,&b);CHKERRQ(ierr);
1621a4af7ca8SStefano Zampini   ierr = MatSeqAIJRestoreArrayRead(A,&av);CHKERRQ(ierr);
1622f32d5d43SBarry Smith   PetscFunctionReturn(0);
1623f32d5d43SBarry Smith }
1624f32d5d43SBarry Smith 
162587753ddeSHong Zhang PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1626f32d5d43SBarry Smith {
1627f32d5d43SBarry Smith   PetscErrorCode ierr;
1628f32d5d43SBarry Smith 
1629f32d5d43SBarry Smith   PetscFunctionBegin;
1630*2c71b3e2SJacob Faibussowitsch   PetscCheckFalse(B->rmap->n != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT,A->cmap->n,B->rmap->n);
1631*2c71b3e2SJacob Faibussowitsch   PetscCheckFalse(A->rmap->n != C->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT,C->rmap->n,A->rmap->n);
1632*2c71b3e2SJacob Faibussowitsch   PetscCheckFalse(B->cmap->n != C->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT,B->cmap->n,C->cmap->n);
16334324174eSBarry Smith 
163493aa15f2SStefano Zampini   ierr = MatMatMultNumericAdd_SeqAIJ_SeqDense(A,B,C,PETSC_FALSE);CHKERRQ(ierr);
16359123193aSHong Zhang   PetscFunctionReturn(0);
16369123193aSHong Zhang }
1637b1683b59SHong Zhang 
16384222ddf1SHong Zhang /* ------------------------------------------------------- */
16394222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C)
16404222ddf1SHong Zhang {
16414222ddf1SHong Zhang   PetscFunctionBegin;
16424222ddf1SHong Zhang   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense;
16434222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
16444222ddf1SHong Zhang   PetscFunctionReturn(0);
16454222ddf1SHong Zhang }
16464222ddf1SHong Zhang 
16476718818eSStefano Zampini PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat,Mat,PetscReal,Mat);
16486718818eSStefano Zampini 
16494222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C)
16504222ddf1SHong Zhang {
16514222ddf1SHong Zhang   PetscFunctionBegin;
16526718818eSStefano Zampini   C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
16534222ddf1SHong Zhang   C->ops->productsymbolic          = MatProductSymbolic_AtB;
16546718818eSStefano Zampini   PetscFunctionReturn(0);
16556718818eSStefano Zampini }
16566718818eSStefano Zampini 
16576718818eSStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C)
16586718818eSStefano Zampini {
16596718818eSStefano Zampini   PetscFunctionBegin;
16606718818eSStefano Zampini   C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
16616718818eSStefano Zampini   C->ops->productsymbolic          = MatProductSymbolic_ABt;
16624222ddf1SHong Zhang   PetscFunctionReturn(0);
16634222ddf1SHong Zhang }
16644222ddf1SHong Zhang 
16654222ddf1SHong Zhang PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C)
16664222ddf1SHong Zhang {
16674222ddf1SHong Zhang   PetscErrorCode ierr;
16684222ddf1SHong Zhang   Mat_Product    *product = C->product;
16694222ddf1SHong Zhang 
16704222ddf1SHong Zhang   PetscFunctionBegin;
16714222ddf1SHong Zhang   switch (product->type) {
16724222ddf1SHong Zhang   case MATPRODUCT_AB:
16734222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C);CHKERRQ(ierr);
16744222ddf1SHong Zhang     break;
16754222ddf1SHong Zhang   case MATPRODUCT_AtB:
16764222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C);CHKERRQ(ierr);
16774222ddf1SHong Zhang     break;
16786718818eSStefano Zampini   case MATPRODUCT_ABt:
16796718818eSStefano Zampini     ierr = MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C);CHKERRQ(ierr);
16804222ddf1SHong Zhang     break;
16814222ddf1SHong Zhang   default:
16826718818eSStefano Zampini     break;
16834222ddf1SHong Zhang   }
16844222ddf1SHong Zhang   PetscFunctionReturn(0);
16854222ddf1SHong Zhang }
16864222ddf1SHong Zhang /* ------------------------------------------------------- */
16874222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C)
16884222ddf1SHong Zhang {
16894222ddf1SHong Zhang   PetscErrorCode ierr;
16904222ddf1SHong Zhang   Mat_Product    *product = C->product;
16914222ddf1SHong Zhang   Mat            A = product->A;
16924222ddf1SHong Zhang   PetscBool      baij;
16934222ddf1SHong Zhang 
16944222ddf1SHong Zhang   PetscFunctionBegin;
16954222ddf1SHong Zhang   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQBAIJ,&baij);CHKERRQ(ierr);
16964222ddf1SHong Zhang   if (!baij) { /* A is seqsbaij */
16974222ddf1SHong Zhang     PetscBool sbaij;
16984222ddf1SHong Zhang     ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQSBAIJ,&sbaij);CHKERRQ(ierr);
1699*2c71b3e2SJacob Faibussowitsch     PetscCheckFalse(!sbaij,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"Mat must be either seqbaij or seqsbaij format");
17004222ddf1SHong Zhang 
17014222ddf1SHong Zhang     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense;
17024222ddf1SHong Zhang   } else { /* A is seqbaij */
17034222ddf1SHong Zhang     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense;
17044222ddf1SHong Zhang   }
17054222ddf1SHong Zhang 
17064222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
17074222ddf1SHong Zhang   PetscFunctionReturn(0);
17084222ddf1SHong Zhang }
17094222ddf1SHong Zhang 
17104222ddf1SHong Zhang PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C)
17114222ddf1SHong Zhang {
17124222ddf1SHong Zhang   PetscErrorCode ierr;
17134222ddf1SHong Zhang   Mat_Product    *product = C->product;
17144222ddf1SHong Zhang 
17154222ddf1SHong Zhang   PetscFunctionBegin;
17166718818eSStefano Zampini   MatCheckProduct(C,1);
1717*2c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!product->A,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
17186718818eSStefano Zampini   if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric)) {
17194222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C);CHKERRQ(ierr);
17206718818eSStefano Zampini   }
17214222ddf1SHong Zhang   PetscFunctionReturn(0);
17224222ddf1SHong Zhang }
17236718818eSStefano Zampini 
17244222ddf1SHong Zhang /* ------------------------------------------------------- */
17254222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C)
17264222ddf1SHong Zhang {
17274222ddf1SHong Zhang   PetscFunctionBegin;
17284222ddf1SHong Zhang   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ;
17294222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
17304222ddf1SHong Zhang   PetscFunctionReturn(0);
17314222ddf1SHong Zhang }
17324222ddf1SHong Zhang 
17334222ddf1SHong Zhang PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C)
17344222ddf1SHong Zhang {
17354222ddf1SHong Zhang   PetscErrorCode ierr;
17364222ddf1SHong Zhang   Mat_Product    *product = C->product;
17374222ddf1SHong Zhang 
17384222ddf1SHong Zhang   PetscFunctionBegin;
17394222ddf1SHong Zhang   if (product->type == MATPRODUCT_AB) {
17404222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C);CHKERRQ(ierr);
17416718818eSStefano Zampini   }
17424222ddf1SHong Zhang   PetscFunctionReturn(0);
17434222ddf1SHong Zhang }
17444222ddf1SHong Zhang /* ------------------------------------------------------- */
17454222ddf1SHong Zhang 
1746b9af6bddSHong Zhang PetscErrorCode  MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense)
1747c8db22f6SHong Zhang {
1748c8db22f6SHong Zhang   PetscErrorCode ierr;
17492f5f1f90SHong Zhang   Mat_SeqAIJ     *b       = (Mat_SeqAIJ*)B->data;
17502f5f1f90SHong Zhang   Mat_SeqDense   *btdense = (Mat_SeqDense*)Btdense->data;
17512f5f1f90SHong Zhang   PetscInt       *bi      = b->i,*bj=b->j;
17522f5f1f90SHong Zhang   PetscInt       m        = Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns;
17532f5f1f90SHong Zhang   MatScalar      *btval,*btval_den,*ba=b->a;
17542f5f1f90SHong Zhang   PetscInt       *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors;
1755c8db22f6SHong Zhang 
1756c8db22f6SHong Zhang   PetscFunctionBegin;
17572f5f1f90SHong Zhang   btval_den=btdense->v;
1758580bdb30SBarry Smith   ierr     = PetscArrayzero(btval_den,m*n);CHKERRQ(ierr);
17592f5f1f90SHong Zhang   for (k=0; k<ncolors; k++) {
17602f5f1f90SHong Zhang     ncolumns = coloring->ncolumns[k];
17612f5f1f90SHong Zhang     for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */
1762d2853540SHong Zhang       col   = *(columns + colorforcol[k] + l);
17632f5f1f90SHong Zhang       btcol = bj + bi[col];
17642f5f1f90SHong Zhang       btval = ba + bi[col];
17652f5f1f90SHong Zhang       anz   = bi[col+1] - bi[col];
1766d2853540SHong Zhang       for (j=0; j<anz; j++) {
17672f5f1f90SHong Zhang         brow            = btcol[j];
17682f5f1f90SHong Zhang         btval_den[brow] = btval[j];
1769c8db22f6SHong Zhang       }
1770c8db22f6SHong Zhang     }
17712f5f1f90SHong Zhang     btval_den += m;
1772c8db22f6SHong Zhang   }
1773c8db22f6SHong Zhang   PetscFunctionReturn(0);
1774c8db22f6SHong Zhang }
1775c8db22f6SHong Zhang 
1776b9af6bddSHong Zhang PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp)
17778972f759SHong Zhang {
17788972f759SHong Zhang   PetscErrorCode    ierr;
1779b2d2b04fSHong Zhang   Mat_SeqAIJ        *csp = (Mat_SeqAIJ*)Csp->data;
17801683a169SBarry Smith   const PetscScalar *ca_den,*ca_den_ptr;
17811683a169SBarry Smith   PetscScalar       *ca=csp->a;
1782f99a636bSHong Zhang   PetscInt          k,l,m=Cden->rmap->n,ncolors=matcoloring->ncolors;
1783e88777f2SHong Zhang   PetscInt          brows=matcoloring->brows,*den2sp=matcoloring->den2sp;
1784077f23c2SHong Zhang   PetscInt          nrows,*row,*idx;
1785077f23c2SHong Zhang   PetscInt          *rows=matcoloring->rows,*colorforrow=matcoloring->colorforrow;
17868972f759SHong Zhang 
17878972f759SHong Zhang   PetscFunctionBegin;
17881683a169SBarry Smith   ierr   = MatDenseGetArrayRead(Cden,&ca_den);CHKERRQ(ierr);
1789f99a636bSHong Zhang 
1790077f23c2SHong Zhang   if (brows > 0) {
1791077f23c2SHong Zhang     PetscInt *lstart,row_end,row_start;
1792980ae229SHong Zhang     lstart = matcoloring->lstart;
1793580bdb30SBarry Smith     ierr = PetscArrayzero(lstart,ncolors);CHKERRQ(ierr);
1794eeb4fd02SHong Zhang 
1795077f23c2SHong Zhang     row_end = brows;
1796eeb4fd02SHong Zhang     if (row_end > m) row_end = m;
1797077f23c2SHong Zhang     for (row_start=0; row_start<m; row_start+=brows) { /* loop over row blocks of Csp */
1798077f23c2SHong Zhang       ca_den_ptr = ca_den;
1799980ae229SHong Zhang       for (k=0; k<ncolors; k++) { /* loop over colors (columns of Cden) */
1800eeb4fd02SHong Zhang         nrows = matcoloring->nrows[k];
1801eeb4fd02SHong Zhang         row   = rows  + colorforrow[k];
1802eeb4fd02SHong Zhang         idx   = den2sp + colorforrow[k];
1803eeb4fd02SHong Zhang         for (l=lstart[k]; l<nrows; l++) {
1804eeb4fd02SHong Zhang           if (row[l] >= row_end) {
1805eeb4fd02SHong Zhang             lstart[k] = l;
1806eeb4fd02SHong Zhang             break;
1807eeb4fd02SHong Zhang           } else {
1808077f23c2SHong Zhang             ca[idx[l]] = ca_den_ptr[row[l]];
1809eeb4fd02SHong Zhang           }
1810eeb4fd02SHong Zhang         }
1811077f23c2SHong Zhang         ca_den_ptr += m;
1812eeb4fd02SHong Zhang       }
1813077f23c2SHong Zhang       row_end += brows;
1814eeb4fd02SHong Zhang       if (row_end > m) row_end = m;
1815eeb4fd02SHong Zhang     }
1816077f23c2SHong Zhang   } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */
1817077f23c2SHong Zhang     ca_den_ptr = ca_den;
1818077f23c2SHong Zhang     for (k=0; k<ncolors; k++) {
1819077f23c2SHong Zhang       nrows = matcoloring->nrows[k];
1820077f23c2SHong Zhang       row   = rows  + colorforrow[k];
1821077f23c2SHong Zhang       idx   = den2sp + colorforrow[k];
1822077f23c2SHong Zhang       for (l=0; l<nrows; l++) {
1823077f23c2SHong Zhang         ca[idx[l]] = ca_den_ptr[row[l]];
1824077f23c2SHong Zhang       }
1825077f23c2SHong Zhang       ca_den_ptr += m;
1826077f23c2SHong Zhang     }
1827f99a636bSHong Zhang   }
1828f99a636bSHong Zhang 
18291683a169SBarry Smith   ierr = MatDenseRestoreArrayRead(Cden,&ca_den);CHKERRQ(ierr);
1830f99a636bSHong Zhang #if defined(PETSC_USE_INFO)
1831077f23c2SHong Zhang   if (matcoloring->brows > 0) {
18327d3de750SJacob Faibussowitsch     ierr = PetscInfo(Csp,"Loop over %" PetscInt_FMT " row blocks for den2sp\n",brows);CHKERRQ(ierr);
1833e88777f2SHong Zhang   } else {
1834077f23c2SHong Zhang     ierr = PetscInfo(Csp,"Loop over colors/columns of Cden, inefficient for large sparse matrix product \n");CHKERRQ(ierr);
1835e88777f2SHong Zhang   }
1836f99a636bSHong Zhang #endif
18378972f759SHong Zhang   PetscFunctionReturn(0);
18388972f759SHong Zhang }
18398972f759SHong Zhang 
1840b9af6bddSHong Zhang PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c)
1841b1683b59SHong Zhang {
1842b1683b59SHong Zhang   PetscErrorCode ierr;
1843e88777f2SHong Zhang   PetscInt       i,n,nrows,Nbs,j,k,m,ncols,col,cm;
18441a83f524SJed Brown   const PetscInt *is,*ci,*cj,*row_idx;
1845b28d80bdSHong Zhang   PetscInt       nis = iscoloring->n,*rowhit,bs = 1;
1846b1683b59SHong Zhang   IS             *isa;
1847b28d80bdSHong Zhang   Mat_SeqAIJ     *csp = (Mat_SeqAIJ*)mat->data;
1848e88777f2SHong Zhang   PetscInt       *colorforrow,*rows,*rows_i,*idxhit,*spidx,*den2sp,*den2sp_i;
1849e88777f2SHong Zhang   PetscInt       *colorforcol,*columns,*columns_i,brows;
1850e88777f2SHong Zhang   PetscBool      flg;
1851b1683b59SHong Zhang 
1852b1683b59SHong Zhang   PetscFunctionBegin;
1853071fcb05SBarry Smith   ierr = ISColoringGetIS(iscoloring,PETSC_USE_POINTER,PETSC_IGNORE,&isa);CHKERRQ(ierr);
1854e99be685SHong Zhang 
18557ecccc15SHong Zhang   /* bs >1 is not being tested yet! */
1856e88777f2SHong Zhang   Nbs       = mat->cmap->N/bs;
1857b1683b59SHong Zhang   c->M      = mat->rmap->N/bs;  /* set total rows, columns and local rows */
1858e88777f2SHong Zhang   c->N      = Nbs;
1859e88777f2SHong Zhang   c->m      = c->M;
1860b1683b59SHong Zhang   c->rstart = 0;
1861e88777f2SHong Zhang   c->brows  = 100;
1862b1683b59SHong Zhang 
1863b1683b59SHong Zhang   c->ncolors = nis;
1864dcca6d9dSJed Brown   ierr = PetscMalloc3(nis,&c->ncolumns,nis,&c->nrows,nis+1,&colorforrow);CHKERRQ(ierr);
1865854ce69bSBarry Smith   ierr = PetscMalloc1(csp->nz+1,&rows);CHKERRQ(ierr);
1866854ce69bSBarry Smith   ierr = PetscMalloc1(csp->nz+1,&den2sp);CHKERRQ(ierr);
1867e88777f2SHong Zhang 
1868e88777f2SHong Zhang   brows = c->brows;
1869c5929fdfSBarry Smith   ierr = PetscOptionsGetInt(NULL,NULL,"-matden2sp_brows",&brows,&flg);CHKERRQ(ierr);
1870e88777f2SHong Zhang   if (flg) c->brows = brows;
1871eeb4fd02SHong Zhang   if (brows > 0) {
1872854ce69bSBarry Smith     ierr = PetscMalloc1(nis+1,&c->lstart);CHKERRQ(ierr);
1873e88777f2SHong Zhang   }
18742205254eSKarl Rupp 
1875d2853540SHong Zhang   colorforrow[0] = 0;
1876d2853540SHong Zhang   rows_i         = rows;
1877f99a636bSHong Zhang   den2sp_i       = den2sp;
1878b1683b59SHong Zhang 
1879854ce69bSBarry Smith   ierr = PetscMalloc1(nis+1,&colorforcol);CHKERRQ(ierr);
1880854ce69bSBarry Smith   ierr = PetscMalloc1(Nbs+1,&columns);CHKERRQ(ierr);
18812205254eSKarl Rupp 
1882d2853540SHong Zhang   colorforcol[0] = 0;
1883d2853540SHong Zhang   columns_i      = columns;
1884d2853540SHong Zhang 
1885077f23c2SHong Zhang   /* get column-wise storage of mat */
1886077f23c2SHong Zhang   ierr = MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr);
1887b1683b59SHong Zhang 
1888b28d80bdSHong Zhang   cm   = c->m;
1889854ce69bSBarry Smith   ierr = PetscMalloc1(cm+1,&rowhit);CHKERRQ(ierr);
1890854ce69bSBarry Smith   ierr = PetscMalloc1(cm+1,&idxhit);CHKERRQ(ierr);
1891f99a636bSHong Zhang   for (i=0; i<nis; i++) { /* loop over color */
1892b1683b59SHong Zhang     ierr = ISGetLocalSize(isa[i],&n);CHKERRQ(ierr);
1893b1683b59SHong Zhang     ierr = ISGetIndices(isa[i],&is);CHKERRQ(ierr);
18942205254eSKarl Rupp 
1895b1683b59SHong Zhang     c->ncolumns[i] = n;
1896b1683b59SHong Zhang     if (n) {
1897580bdb30SBarry Smith       ierr = PetscArraycpy(columns_i,is,n);CHKERRQ(ierr);
1898b1683b59SHong Zhang     }
1899d2853540SHong Zhang     colorforcol[i+1] = colorforcol[i] + n;
1900d2853540SHong Zhang     columns_i       += n;
1901b1683b59SHong Zhang 
1902b1683b59SHong Zhang     /* fast, crude version requires O(N*N) work */
1903580bdb30SBarry Smith     ierr = PetscArrayzero(rowhit,cm);CHKERRQ(ierr);
1904e99be685SHong Zhang 
1905b7caf3d6SHong Zhang     for (j=0; j<n; j++) { /* loop over columns*/
1906b1683b59SHong Zhang       col     = is[j];
1907d2853540SHong Zhang       row_idx = cj + ci[col];
1908b1683b59SHong Zhang       m       = ci[col+1] - ci[col];
1909b7caf3d6SHong Zhang       for (k=0; k<m; k++) { /* loop over columns marking them in rowhit */
1910e99be685SHong Zhang         idxhit[*row_idx]   = spidx[ci[col] + k];
1911d2853540SHong Zhang         rowhit[*row_idx++] = col + 1;
1912b1683b59SHong Zhang       }
1913b1683b59SHong Zhang     }
1914b1683b59SHong Zhang     /* count the number of hits */
1915b1683b59SHong Zhang     nrows = 0;
1916e8354b3eSHong Zhang     for (j=0; j<cm; j++) {
1917b1683b59SHong Zhang       if (rowhit[j]) nrows++;
1918b1683b59SHong Zhang     }
1919b1683b59SHong Zhang     c->nrows[i]      = nrows;
1920d2853540SHong Zhang     colorforrow[i+1] = colorforrow[i] + nrows;
1921d2853540SHong Zhang 
1922b1683b59SHong Zhang     nrows = 0;
1923b7caf3d6SHong Zhang     for (j=0; j<cm; j++) { /* loop over rows */
1924b1683b59SHong Zhang       if (rowhit[j]) {
1925d2853540SHong Zhang         rows_i[nrows]   = j;
192612b89a43SHong Zhang         den2sp_i[nrows] = idxhit[j];
1927b1683b59SHong Zhang         nrows++;
1928b1683b59SHong Zhang       }
1929b1683b59SHong Zhang     }
1930e88777f2SHong Zhang     den2sp_i += nrows;
1931077f23c2SHong Zhang 
1932b1683b59SHong Zhang     ierr    = ISRestoreIndices(isa[i],&is);CHKERRQ(ierr);
1933f99a636bSHong Zhang     rows_i += nrows;
1934b1683b59SHong Zhang   }
19350298fd71SBarry Smith   ierr = MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr);
1936b28d80bdSHong Zhang   ierr = PetscFree(rowhit);CHKERRQ(ierr);
1937071fcb05SBarry Smith   ierr = ISColoringRestoreIS(iscoloring,PETSC_USE_POINTER,&isa);CHKERRQ(ierr);
1938*2c71b3e2SJacob Faibussowitsch   PetscCheck(csp->nz == colorforrow[nis],PETSC_COMM_SELF,PETSC_ERR_PLIB,"csp->nz %" PetscInt_FMT " != colorforrow[nis] %" PetscInt_FMT,csp->nz,colorforrow[nis]);
1939b28d80bdSHong Zhang 
1940d2853540SHong Zhang   c->colorforrow = colorforrow;
1941d2853540SHong Zhang   c->rows        = rows;
1942f99a636bSHong Zhang   c->den2sp      = den2sp;
1943d2853540SHong Zhang   c->colorforcol = colorforcol;
1944d2853540SHong Zhang   c->columns     = columns;
19452205254eSKarl Rupp 
1946f94ccd6cSHong Zhang   ierr = PetscFree(idxhit);CHKERRQ(ierr);
1947b1683b59SHong Zhang   PetscFunctionReturn(0);
1948b1683b59SHong Zhang }
1949d013fc79Sandi selinger 
19504222ddf1SHong Zhang /* --------------------------------------------------------------- */
19514222ddf1SHong Zhang static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C)
1952df97dc6dSFande Kong {
19534222ddf1SHong Zhang   PetscErrorCode ierr;
19544222ddf1SHong Zhang   Mat_Product    *product = C->product;
19554222ddf1SHong Zhang   Mat            A=product->A,B=product->B;
19564222ddf1SHong Zhang 
1957df97dc6dSFande Kong   PetscFunctionBegin;
19584222ddf1SHong Zhang   if (C->ops->mattransposemultnumeric) {
19594222ddf1SHong Zhang     /* Alg: "outerproduct" */
19606718818eSStefano Zampini     ierr = (*C->ops->mattransposemultnumeric)(A,B,C);CHKERRQ(ierr);
19614222ddf1SHong Zhang   } else {
19624222ddf1SHong Zhang     /* Alg: "matmatmult" -- C = At*B */
19636718818eSStefano Zampini     Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)product->data;
19646718818eSStefano Zampini     Mat                 At;
19654222ddf1SHong Zhang 
1966*2c71b3e2SJacob Faibussowitsch     PetscCheckFalse(!atb,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
19676718818eSStefano Zampini     At = atb->At;
1968089a957eSStefano Zampini     if (atb->updateAt && At) { /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ() */
19694222ddf1SHong Zhang       ierr = MatTranspose_SeqAIJ(A,MAT_REUSE_MATRIX,&At);CHKERRQ(ierr);
19704222ddf1SHong Zhang     }
1971089a957eSStefano Zampini     ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ(At ? At : A,B,C);CHKERRQ(ierr);
19724222ddf1SHong Zhang     atb->updateAt = PETSC_TRUE;
19734222ddf1SHong Zhang   }
1974df97dc6dSFande Kong   PetscFunctionReturn(0);
1975df97dc6dSFande Kong }
1976df97dc6dSFande Kong 
19774222ddf1SHong Zhang static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C)
1978d013fc79Sandi selinger {
1979d013fc79Sandi selinger   PetscErrorCode ierr;
19804222ddf1SHong Zhang   Mat_Product    *product = C->product;
19814222ddf1SHong Zhang   Mat            A=product->A,B=product->B;
19824222ddf1SHong Zhang   PetscReal      fill=product->fill;
1983d013fc79Sandi selinger 
1984d013fc79Sandi selinger   PetscFunctionBegin;
19854222ddf1SHong Zhang   ierr = MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
19862869b61bSandi selinger 
19874222ddf1SHong Zhang   C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ;
19884222ddf1SHong Zhang   PetscFunctionReturn(0);
19892869b61bSandi selinger }
1990d013fc79Sandi selinger 
19914222ddf1SHong Zhang /* --------------------------------------------------------------- */
19924222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C)
19934222ddf1SHong Zhang {
19944222ddf1SHong Zhang   PetscErrorCode ierr;
19954222ddf1SHong Zhang   Mat_Product    *product = C->product;
19964222ddf1SHong Zhang   PetscInt       alg = 0; /* default algorithm */
19974222ddf1SHong Zhang   PetscBool      flg = PETSC_FALSE;
19984222ddf1SHong Zhang #if !defined(PETSC_HAVE_HYPRE)
19994222ddf1SHong Zhang   const char     *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"};
20004222ddf1SHong Zhang   PetscInt       nalg = 7;
20014222ddf1SHong Zhang #else
20024222ddf1SHong Zhang   const char     *algTypes[8] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge","hypre"};
20034222ddf1SHong Zhang   PetscInt       nalg = 8;
20044222ddf1SHong Zhang #endif
20054222ddf1SHong Zhang 
20064222ddf1SHong Zhang   PetscFunctionBegin;
20074222ddf1SHong Zhang   /* Set default algorithm */
20084222ddf1SHong Zhang   ierr = PetscStrcmp(C->product->alg,"default",&flg);CHKERRQ(ierr);
20094222ddf1SHong Zhang   if (flg) {
20104222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
2011d013fc79Sandi selinger   }
2012d013fc79Sandi selinger 
20134222ddf1SHong Zhang   /* Get runtime option */
20144222ddf1SHong Zhang   if (product->api_user) {
20154222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
20164222ddf1SHong Zhang     ierr = PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
20174222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
20184222ddf1SHong Zhang   } else {
20194222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr);
20203e662e0bSHong Zhang     ierr = PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_AB",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
20214222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
2022d013fc79Sandi selinger   }
20234222ddf1SHong Zhang   if (flg) {
20244222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
2025d013fc79Sandi selinger   }
2026d013fc79Sandi selinger 
20274222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
20284222ddf1SHong Zhang   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ;
20294222ddf1SHong Zhang   PetscFunctionReturn(0);
20304222ddf1SHong Zhang }
2031d013fc79Sandi selinger 
20324222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C)
20334222ddf1SHong Zhang {
20344222ddf1SHong Zhang   PetscErrorCode ierr;
20354222ddf1SHong Zhang   Mat_Product    *product = C->product;
20364222ddf1SHong Zhang   PetscInt       alg = 0; /* default algorithm */
20374222ddf1SHong Zhang   PetscBool      flg = PETSC_FALSE;
2038089a957eSStefano Zampini   const char     *algTypes[3] = {"default","at*b","outerproduct"};
2039089a957eSStefano Zampini   PetscInt       nalg = 3;
2040d013fc79Sandi selinger 
20414222ddf1SHong Zhang   PetscFunctionBegin;
20424222ddf1SHong Zhang   /* Get runtime option */
20434222ddf1SHong Zhang   if (product->api_user) {
20444222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
20454222ddf1SHong Zhang     ierr = PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
20464222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
20474222ddf1SHong Zhang   } else {
20484222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr);
20493e662e0bSHong Zhang     ierr = PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_AtB",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
20504222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
20514222ddf1SHong Zhang   }
20524222ddf1SHong Zhang   if (flg) {
20534222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
20544222ddf1SHong Zhang   }
2055d013fc79Sandi selinger 
20564222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ;
20574222ddf1SHong Zhang   PetscFunctionReturn(0);
20584222ddf1SHong Zhang }
20594222ddf1SHong Zhang 
20604222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C)
20614222ddf1SHong Zhang {
20624222ddf1SHong Zhang   PetscErrorCode ierr;
20634222ddf1SHong Zhang   Mat_Product    *product = C->product;
20644222ddf1SHong Zhang   PetscInt       alg = 0; /* default algorithm */
20654222ddf1SHong Zhang   PetscBool      flg = PETSC_FALSE;
20664222ddf1SHong Zhang   const char     *algTypes[2] = {"default","color"};
20674222ddf1SHong Zhang   PetscInt       nalg = 2;
20684222ddf1SHong Zhang 
20694222ddf1SHong Zhang   PetscFunctionBegin;
20704222ddf1SHong Zhang   /* Set default algorithm */
20714222ddf1SHong Zhang   ierr = PetscStrcmp(C->product->alg,"default",&flg);CHKERRQ(ierr);
20724222ddf1SHong Zhang   if (!flg) {
20734222ddf1SHong Zhang     alg = 1;
20744222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
20754222ddf1SHong Zhang   }
20764222ddf1SHong Zhang 
20774222ddf1SHong Zhang   /* Get runtime option */
20784222ddf1SHong Zhang   if (product->api_user) {
20794222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatTransposeMult","Mat");CHKERRQ(ierr);
20804222ddf1SHong Zhang     ierr = PetscOptionsEList("-matmattransmult_via","Algorithmic approach","MatMatTransposeMult",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
20814222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
20824222ddf1SHong Zhang   } else {
20834222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABt","Mat");CHKERRQ(ierr);
20843e662e0bSHong Zhang     ierr = PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_ABt",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
20854222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
20864222ddf1SHong Zhang   }
20874222ddf1SHong Zhang   if (flg) {
20884222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
20894222ddf1SHong Zhang   }
20904222ddf1SHong Zhang 
20914222ddf1SHong Zhang   C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ;
20924222ddf1SHong Zhang   C->ops->productsymbolic          = MatProductSymbolic_ABt;
20934222ddf1SHong Zhang   PetscFunctionReturn(0);
20944222ddf1SHong Zhang }
20954222ddf1SHong Zhang 
20964222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C)
20974222ddf1SHong Zhang {
20984222ddf1SHong Zhang   PetscErrorCode ierr;
20994222ddf1SHong Zhang   Mat_Product    *product = C->product;
21004222ddf1SHong Zhang   PetscBool      flg = PETSC_FALSE;
21014222ddf1SHong Zhang   PetscInt       alg = 0; /* default algorithm -- alg=1 should be default!!! */
21024222ddf1SHong Zhang #if !defined(PETSC_HAVE_HYPRE)
21034222ddf1SHong Zhang   const char     *algTypes[2] = {"scalable","rap"};
21044222ddf1SHong Zhang   PetscInt       nalg = 2;
21054222ddf1SHong Zhang #else
21064222ddf1SHong Zhang   const char     *algTypes[3] = {"scalable","rap","hypre"};
21074222ddf1SHong Zhang   PetscInt       nalg = 3;
21084222ddf1SHong Zhang #endif
21094222ddf1SHong Zhang 
21104222ddf1SHong Zhang   PetscFunctionBegin;
21114222ddf1SHong Zhang   /* Set default algorithm */
21124222ddf1SHong Zhang   ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr);
21134222ddf1SHong Zhang   if (flg) {
21144222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
21154222ddf1SHong Zhang   }
21164222ddf1SHong Zhang 
21174222ddf1SHong Zhang   /* Get runtime option */
21184222ddf1SHong Zhang   if (product->api_user) {
21194222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
21204222ddf1SHong Zhang     ierr = PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
21214222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
21224222ddf1SHong Zhang   } else {
21234222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr);
21243e662e0bSHong Zhang     ierr = PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_PtAP",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
21254222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
21264222ddf1SHong Zhang   }
21274222ddf1SHong Zhang   if (flg) {
21284222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
21294222ddf1SHong Zhang   }
21304222ddf1SHong Zhang 
21314222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ;
21324222ddf1SHong Zhang   PetscFunctionReturn(0);
21334222ddf1SHong Zhang }
21344222ddf1SHong Zhang 
21354222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C)
21364222ddf1SHong Zhang {
21374222ddf1SHong Zhang   PetscErrorCode ierr;
21384222ddf1SHong Zhang   Mat_Product    *product = C->product;
21394222ddf1SHong Zhang   PetscBool      flg = PETSC_FALSE;
21404222ddf1SHong Zhang   PetscInt       alg = 0; /* default algorithm */
21414222ddf1SHong Zhang   const char     *algTypes[3] = {"r*a*rt","r*art","coloring_rart"};
21424222ddf1SHong Zhang   PetscInt        nalg = 3;
21434222ddf1SHong Zhang 
21444222ddf1SHong Zhang   PetscFunctionBegin;
21454222ddf1SHong Zhang   /* Set default algorithm */
21464222ddf1SHong Zhang   ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr);
21474222ddf1SHong Zhang   if (flg) {
21484222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
21494222ddf1SHong Zhang   }
21504222ddf1SHong Zhang 
21514222ddf1SHong Zhang   /* Get runtime option */
21524222ddf1SHong Zhang   if (product->api_user) {
21534222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatRARt","Mat");CHKERRQ(ierr);
21544222ddf1SHong Zhang     ierr = PetscOptionsEList("-matrart_via","Algorithmic approach","MatRARt",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
21554222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
21564222ddf1SHong Zhang   } else {
21574222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr);
21583e662e0bSHong Zhang     ierr = PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_RARt",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
21594222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
21604222ddf1SHong Zhang   }
21614222ddf1SHong Zhang   if (flg) {
21624222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
21634222ddf1SHong Zhang   }
21644222ddf1SHong Zhang 
21654222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ;
21664222ddf1SHong Zhang   PetscFunctionReturn(0);
21674222ddf1SHong Zhang }
21684222ddf1SHong Zhang 
21694222ddf1SHong Zhang /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */
21704222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C)
21714222ddf1SHong Zhang {
21724222ddf1SHong Zhang   PetscErrorCode ierr;
21734222ddf1SHong Zhang   Mat_Product    *product = C->product;
21744222ddf1SHong Zhang   PetscInt       alg = 0; /* default algorithm */
21754222ddf1SHong Zhang   PetscBool      flg = PETSC_FALSE;
21764222ddf1SHong Zhang   const char     *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"};
21774222ddf1SHong Zhang   PetscInt       nalg = 7;
21784222ddf1SHong Zhang 
21794222ddf1SHong Zhang   PetscFunctionBegin;
21804222ddf1SHong Zhang   /* Set default algorithm */
21814222ddf1SHong Zhang   ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr);
21824222ddf1SHong Zhang   if (flg) {
21834222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
21844222ddf1SHong Zhang   }
21854222ddf1SHong Zhang 
21864222ddf1SHong Zhang   /* Get runtime option */
21874222ddf1SHong Zhang   if (product->api_user) {
21884222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr);
21894222ddf1SHong Zhang     ierr = PetscOptionsEList("-matmatmatmult_via","Algorithmic approach","MatMatMatMult",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
21904222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
21914222ddf1SHong Zhang   } else {
21924222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr);
21933e662e0bSHong Zhang     ierr = PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_ABC",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
21944222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
21954222ddf1SHong Zhang   }
21964222ddf1SHong Zhang   if (flg) {
21974222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
21984222ddf1SHong Zhang   }
21994222ddf1SHong Zhang 
22004222ddf1SHong Zhang   C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ;
22014222ddf1SHong Zhang   C->ops->productsymbolic    = MatProductSymbolic_ABC;
22024222ddf1SHong Zhang   PetscFunctionReturn(0);
22034222ddf1SHong Zhang }
22044222ddf1SHong Zhang 
22054222ddf1SHong Zhang PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C)
22064222ddf1SHong Zhang {
22074222ddf1SHong Zhang   PetscErrorCode ierr;
22084222ddf1SHong Zhang   Mat_Product    *product = C->product;
22094222ddf1SHong Zhang 
22104222ddf1SHong Zhang   PetscFunctionBegin;
22114222ddf1SHong Zhang   switch (product->type) {
22124222ddf1SHong Zhang   case MATPRODUCT_AB:
22134222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_AB(C);CHKERRQ(ierr);
22144222ddf1SHong Zhang     break;
22154222ddf1SHong Zhang   case MATPRODUCT_AtB:
22164222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_AtB(C);CHKERRQ(ierr);
22174222ddf1SHong Zhang     break;
22184222ddf1SHong Zhang   case MATPRODUCT_ABt:
22194222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_ABt(C);CHKERRQ(ierr);
22204222ddf1SHong Zhang     break;
22214222ddf1SHong Zhang   case MATPRODUCT_PtAP:
22224222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_PtAP(C);CHKERRQ(ierr);
22234222ddf1SHong Zhang     break;
22244222ddf1SHong Zhang   case MATPRODUCT_RARt:
22254222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_RARt(C);CHKERRQ(ierr);
22264222ddf1SHong Zhang     break;
22274222ddf1SHong Zhang   case MATPRODUCT_ABC:
22284222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_ABC(C);CHKERRQ(ierr);
22294222ddf1SHong Zhang     break;
22306718818eSStefano Zampini   default:
22316718818eSStefano Zampini     break;
22324222ddf1SHong Zhang   }
2233d013fc79Sandi selinger   PetscFunctionReturn(0);
2234d013fc79Sandi selinger }
2235