xref: /petsc/src/mat/impls/aij/seq/matmatmult.c (revision 8c78258c41e3973cbc851ab31b007f29808f1954)
1be1d678aSKris Buschelman 
2d50806bdSBarry Smith /*
32499ec78SHong Zhang   Defines matrix-matrix product routines for pairs of SeqAIJ matrices
4d50806bdSBarry Smith           C = A * B
5d50806bdSBarry Smith */
6d50806bdSBarry Smith 
7c6db04a5SJed Brown #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
8c6db04a5SJed Brown #include <../src/mat/utils/freespace.h>
9c6db04a5SJed Brown #include <petscbt.h>
10af0996ceSBarry Smith #include <petsc/private/isimpl.h>
1107475bc1SBarry Smith #include <../src/mat/impls/dense/seq/dense.h>
127bab7c10SHong Zhang 
13df97dc6dSFande Kong PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
14df97dc6dSFande Kong {
15df97dc6dSFande Kong   PetscErrorCode ierr;
16df97dc6dSFande Kong 
17df97dc6dSFande Kong   PetscFunctionBegin;
18df97dc6dSFande Kong   if (C->ops->matmultnumeric) {
196718818eSStefano Zampini     if (C->ops->matmultnumeric == MatMatMultNumeric_SeqAIJ_SeqAIJ) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Recursive call");
20df97dc6dSFande Kong     ierr = (*C->ops->matmultnumeric)(A,B,C);CHKERRQ(ierr);
21df97dc6dSFande Kong   } else {
22df97dc6dSFande Kong     ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A,B,C);CHKERRQ(ierr);
23df97dc6dSFande Kong   }
24df97dc6dSFande Kong   PetscFunctionReturn(0);
25df97dc6dSFande Kong }
26df97dc6dSFande Kong 
274222ddf1SHong Zhang /* Modified from MatCreateSeqAIJWithArrays() */
28e4e71118SStefano Zampini PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt i[],PetscInt j[],PetscScalar a[],MatType mtype,Mat mat)
29df97dc6dSFande Kong {
30df97dc6dSFande Kong   PetscErrorCode ierr;
314222ddf1SHong Zhang   PetscInt       ii;
324222ddf1SHong Zhang   Mat_SeqAIJ     *aij;
33e4e71118SStefano Zampini   PetscBool      isseqaij;
345c66b693SKris Buschelman 
355c66b693SKris Buschelman   PetscFunctionBegin;
364222ddf1SHong Zhang   if (m > 0 && i[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"i (row indices) must start with 0");
374222ddf1SHong Zhang   ierr = MatSetSizes(mat,m,n,m,n);CHKERRQ(ierr);
384222ddf1SHong Zhang 
39e4e71118SStefano Zampini   if (!mtype) {
40e4e71118SStefano Zampini     ierr = PetscObjectBaseTypeCompare((PetscObject)mat,MATSEQAIJ,&isseqaij);CHKERRQ(ierr);
41e4e71118SStefano Zampini     if (!isseqaij) { ierr = MatSetType(mat,MATSEQAIJ);CHKERRQ(ierr); }
42e4e71118SStefano Zampini   } else {
43e4e71118SStefano Zampini     ierr = MatSetType(mat,mtype);CHKERRQ(ierr);
44e4e71118SStefano Zampini   }
45f4259b30SLisandro Dalcin   ierr = MatSeqAIJSetPreallocation_SeqAIJ(mat,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
464222ddf1SHong Zhang   aij  = (Mat_SeqAIJ*)(mat)->data;
474222ddf1SHong Zhang   ierr = PetscMalloc1(m,&aij->imax);CHKERRQ(ierr);
484222ddf1SHong Zhang   ierr = PetscMalloc1(m,&aij->ilen);CHKERRQ(ierr);
494222ddf1SHong Zhang 
504222ddf1SHong Zhang   aij->i            = i;
514222ddf1SHong Zhang   aij->j            = j;
524222ddf1SHong Zhang   aij->a            = a;
534222ddf1SHong Zhang   aij->singlemalloc = PETSC_FALSE;
544222ddf1SHong Zhang   aij->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
554222ddf1SHong Zhang   aij->free_a       = PETSC_FALSE;
564222ddf1SHong Zhang   aij->free_ij      = PETSC_FALSE;
574222ddf1SHong Zhang 
584222ddf1SHong Zhang   for (ii=0; ii<m; ii++) {
594222ddf1SHong Zhang     aij->ilen[ii] = aij->imax[ii] = i[ii+1] - i[ii];
6025023028SHong Zhang   }
614222ddf1SHong Zhang 
625c66b693SKris Buschelman   PetscFunctionReturn(0);
635c66b693SKris Buschelman }
641c24bd37SHong Zhang 
654222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
664222ddf1SHong Zhang {
674222ddf1SHong Zhang   PetscErrorCode      ierr;
684222ddf1SHong Zhang   Mat_Product         *product = C->product;
694222ddf1SHong Zhang   MatProductAlgorithm alg;
704222ddf1SHong Zhang   PetscBool           flg;
714222ddf1SHong Zhang 
724222ddf1SHong Zhang   PetscFunctionBegin;
734222ddf1SHong Zhang   if (product) {
744222ddf1SHong Zhang     alg = product->alg;
754222ddf1SHong Zhang   } else {
764222ddf1SHong Zhang     alg = "sorted";
774222ddf1SHong Zhang   }
784222ddf1SHong Zhang   /* sorted */
794222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"sorted",&flg);CHKERRQ(ierr);
804222ddf1SHong Zhang   if (flg) {
814222ddf1SHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A,B,fill,C);CHKERRQ(ierr);
824222ddf1SHong Zhang     PetscFunctionReturn(0);
834222ddf1SHong Zhang   }
844222ddf1SHong Zhang 
854222ddf1SHong Zhang   /* scalable */
864222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"scalable",&flg);CHKERRQ(ierr);
874222ddf1SHong Zhang   if (flg) {
884222ddf1SHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A,B,fill,C);CHKERRQ(ierr);
894222ddf1SHong Zhang     PetscFunctionReturn(0);
904222ddf1SHong Zhang   }
914222ddf1SHong Zhang 
924222ddf1SHong Zhang   /* scalable_fast */
934222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"scalable_fast",&flg);CHKERRQ(ierr);
944222ddf1SHong Zhang   if (flg) {
954222ddf1SHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A,B,fill,C);CHKERRQ(ierr);
964222ddf1SHong Zhang     PetscFunctionReturn(0);
974222ddf1SHong Zhang   }
984222ddf1SHong Zhang 
994222ddf1SHong Zhang   /* heap */
1004222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"heap",&flg);CHKERRQ(ierr);
1014222ddf1SHong Zhang   if (flg) {
1024222ddf1SHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A,B,fill,C);CHKERRQ(ierr);
1034222ddf1SHong Zhang     PetscFunctionReturn(0);
1044222ddf1SHong Zhang   }
1054222ddf1SHong Zhang 
1064222ddf1SHong Zhang   /* btheap */
1074222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"btheap",&flg);CHKERRQ(ierr);
1084222ddf1SHong Zhang   if (flg) {
1094222ddf1SHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A,B,fill,C);CHKERRQ(ierr);
1104222ddf1SHong Zhang     PetscFunctionReturn(0);
1114222ddf1SHong Zhang   }
1124222ddf1SHong Zhang 
1134222ddf1SHong Zhang   /* llcondensed */
1144222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"llcondensed",&flg);CHKERRQ(ierr);
1154222ddf1SHong Zhang   if (flg) {
1164222ddf1SHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A,B,fill,C);CHKERRQ(ierr);
1174222ddf1SHong Zhang     PetscFunctionReturn(0);
1184222ddf1SHong Zhang   }
1194222ddf1SHong Zhang 
1204222ddf1SHong Zhang   /* rowmerge */
1214222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"rowmerge",&flg);CHKERRQ(ierr);
1224222ddf1SHong Zhang   if (flg) {
1234222ddf1SHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A,B,fill,C);CHKERRQ(ierr);
1244222ddf1SHong Zhang     PetscFunctionReturn(0);
1254222ddf1SHong Zhang   }
1264222ddf1SHong Zhang 
1274222ddf1SHong Zhang #if defined(PETSC_HAVE_HYPRE)
1284222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"hypre",&flg);CHKERRQ(ierr);
1294222ddf1SHong Zhang   if (flg) {
1304222ddf1SHong Zhang     ierr = MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A,B,fill,C);CHKERRQ(ierr);
1314222ddf1SHong Zhang     PetscFunctionReturn(0);
1324222ddf1SHong Zhang   }
1334222ddf1SHong Zhang #endif
1344222ddf1SHong Zhang 
1354222ddf1SHong Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported");
1364222ddf1SHong Zhang }
1374222ddf1SHong Zhang 
1384222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A,Mat B,PetscReal fill,Mat C)
139b561aa9dSHong Zhang {
140b561aa9dSHong Zhang   PetscErrorCode     ierr;
141b561aa9dSHong Zhang   Mat_SeqAIJ         *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
142971236abSHong Zhang   PetscInt           *ai=a->i,*bi=b->i,*ci,*cj;
143c1ba5575SJed Brown   PetscInt           am =A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
144b561aa9dSHong Zhang   PetscReal          afill;
145eca6b86aSHong Zhang   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
146eca6b86aSHong Zhang   PetscTable         ta;
147fb908031SHong Zhang   PetscBT            lnkbt;
1480298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
149b561aa9dSHong Zhang 
150b561aa9dSHong Zhang   PetscFunctionBegin;
151bd958071SHong Zhang   /* Get ci and cj */
152bd958071SHong Zhang   /*---------------*/
153fb908031SHong Zhang   /* Allocate ci array, arrays for fill computation and */
154fb908031SHong Zhang   /* free space for accumulating nonzero column info */
155854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
156fb908031SHong Zhang   ci[0] = 0;
157fb908031SHong Zhang 
158fb908031SHong Zhang   /* create and initialize a linked list */
159c373ccc6SHong Zhang   ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr);
160c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b,bm,ta);
161eca6b86aSHong Zhang   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
162eca6b86aSHong Zhang   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
163eca6b86aSHong Zhang 
164eca6b86aSHong Zhang   ierr = PetscLLCondensedCreate(Crmax,bn,&lnk,&lnkbt);CHKERRQ(ierr);
165fb908031SHong Zhang 
166fb908031SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
167f91af8c7SBarry Smith   ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
1682205254eSKarl Rupp 
169fb908031SHong Zhang   current_space = free_space;
170fb908031SHong Zhang 
171fb908031SHong Zhang   /* Determine ci and cj */
172fb908031SHong Zhang   for (i=0; i<am; i++) {
173fb908031SHong Zhang     anzi = ai[i+1] - ai[i];
174fb908031SHong Zhang     aj   = a->j + ai[i];
175fb908031SHong Zhang     for (j=0; j<anzi; j++) {
176fb908031SHong Zhang       brow = aj[j];
177fb908031SHong Zhang       bnzj = bi[brow+1] - bi[brow];
178fb908031SHong Zhang       bj   = b->j + bi[brow];
179fb908031SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
180fb908031SHong Zhang       ierr = PetscLLCondensedAddSorted(bnzj,bj,lnk,lnkbt);CHKERRQ(ierr);
181fb908031SHong Zhang     }
182*8c78258cSHong Zhang     /* add possible missing diagonal entry */
183*8c78258cSHong Zhang     if (C->force_diagonals) {
184*8c78258cSHong Zhang       ierr = PetscLLCondensedAddSorted(1,&i,lnk,lnkbt);CHKERRQ(ierr);
185*8c78258cSHong Zhang     }
186fb908031SHong Zhang     cnzi = lnk[0];
187fb908031SHong Zhang 
188fb908031SHong Zhang     /* If free space is not available, make more free space */
189fb908031SHong Zhang     /* Double the amount of total space in the list */
190fb908031SHong Zhang     if (current_space->local_remaining<cnzi) {
191f91af8c7SBarry Smith       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);CHKERRQ(ierr);
192fb908031SHong Zhang       ndouble++;
193fb908031SHong Zhang     }
194fb908031SHong Zhang 
195fb908031SHong Zhang     /* Copy data into free space, then initialize lnk */
196fb908031SHong Zhang     ierr = PetscLLCondensedClean(bn,cnzi,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
1972205254eSKarl Rupp 
198fb908031SHong Zhang     current_space->array           += cnzi;
199fb908031SHong Zhang     current_space->local_used      += cnzi;
200fb908031SHong Zhang     current_space->local_remaining -= cnzi;
2012205254eSKarl Rupp 
202fb908031SHong Zhang     ci[i+1] = ci[i] + cnzi;
203fb908031SHong Zhang   }
204fb908031SHong Zhang 
205fb908031SHong Zhang   /* Column indices are in the list of free space */
206fb908031SHong Zhang   /* Allocate space for cj, initialize cj, and */
207fb908031SHong Zhang   /* destroy list of free space and other temporary array(s) */
208854ce69bSBarry Smith   ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr);
209fb908031SHong Zhang   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
210fb908031SHong Zhang   ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr);
211b561aa9dSHong Zhang 
21226be0446SHong Zhang   /* put together the new symbolic matrix */
213e4e71118SStefano Zampini   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
2144222ddf1SHong Zhang   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
21558c24d83SHong Zhang 
21658c24d83SHong Zhang   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
21758c24d83SHong Zhang   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
2184222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
219aa1aec99SHong Zhang   c->free_a  = PETSC_FALSE;
220e6b907acSBarry Smith   c->free_ij = PETSC_TRUE;
22158c24d83SHong Zhang   c->nonew   = 0;
2224222ddf1SHong Zhang 
2234222ddf1SHong Zhang   /* fast, needs non-scalable O(bn) array 'abdense' */
2244222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
2250b7e3e3dSHong Zhang 
2267212da7cSHong Zhang   /* set MatInfo */
2277212da7cSHong Zhang   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
228f2b054eeSHong Zhang   if (afill < 1.0) afill = 1.0;
2297212da7cSHong Zhang   c->maxnz                  = ci[am];
2307212da7cSHong Zhang   c->nz                     = ci[am];
2314222ddf1SHong Zhang   C->info.mallocs           = ndouble;
2324222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
2334222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
2347212da7cSHong Zhang 
2357212da7cSHong Zhang #if defined(PETSC_USE_INFO)
2367212da7cSHong Zhang   if (ci[am]) {
2374222ddf1SHong Zhang     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
2384222ddf1SHong Zhang     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
239f2b054eeSHong Zhang   } else {
2404222ddf1SHong Zhang     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
241be0fcf8dSHong Zhang   }
242f2b054eeSHong Zhang #endif
24358c24d83SHong Zhang   PetscFunctionReturn(0);
24458c24d83SHong Zhang }
245d50806bdSBarry Smith 
246df97dc6dSFande Kong PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,Mat C)
247d50806bdSBarry Smith {
248dfbe8321SBarry Smith   PetscErrorCode ierr;
249d13dce4bSSatish Balay   PetscLogDouble flops=0.0;
250d50806bdSBarry Smith   Mat_SeqAIJ     *a   = (Mat_SeqAIJ*)A->data;
251d50806bdSBarry Smith   Mat_SeqAIJ     *b   = (Mat_SeqAIJ*)B->data;
252d50806bdSBarry Smith   Mat_SeqAIJ     *c   = (Mat_SeqAIJ*)C->data;
25338baddfdSBarry Smith   PetscInt       *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
254c58ca2e3SHong Zhang   PetscInt       am   =A->rmap->n,cm=C->rmap->n;
255519eb980SHong Zhang   PetscInt       i,j,k,anzi,bnzi,cnzi,brow;
256aa1aec99SHong Zhang   PetscScalar    *aa=a->a,*ba=b->a,*baj,*ca,valtmp;
257aa1aec99SHong Zhang   PetscScalar    *ab_dense;
2586718818eSStefano Zampini   PetscContainer cab_dense;
259d50806bdSBarry Smith 
260d50806bdSBarry Smith   PetscFunctionBegin;
261aa1aec99SHong Zhang   if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
262854ce69bSBarry Smith     ierr      = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
263aa1aec99SHong Zhang     c->a      = ca;
264aa1aec99SHong Zhang     c->free_a = PETSC_TRUE;
2656718818eSStefano Zampini   } else ca = c->a;
2666718818eSStefano Zampini 
2676718818eSStefano Zampini   /* TODO this should be done in the symbolic phase */
2686718818eSStefano Zampini   /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers
2696718818eSStefano Zampini      that is hard to eradicate) */
2706718818eSStefano Zampini   ierr = PetscObjectQuery((PetscObject)C,"__PETSc__ab_dense",(PetscObject*)&cab_dense);CHKERRQ(ierr);
2716718818eSStefano Zampini   if (!cab_dense) {
2726718818eSStefano Zampini     ierr = PetscMalloc1(B->cmap->N,&ab_dense);CHKERRQ(ierr);
2736718818eSStefano Zampini     ierr = PetscContainerCreate(PETSC_COMM_SELF,&cab_dense);CHKERRQ(ierr);
2746718818eSStefano Zampini     ierr = PetscContainerSetPointer(cab_dense,ab_dense);CHKERRQ(ierr);
2756718818eSStefano Zampini     ierr = PetscContainerSetUserDestroy(cab_dense,PetscContainerUserDestroyDefault);CHKERRQ(ierr);
2766718818eSStefano Zampini     ierr = PetscObjectCompose((PetscObject)C,"__PETSc__ab_dense",(PetscObject)cab_dense);CHKERRQ(ierr);
2776718818eSStefano Zampini     ierr = PetscObjectDereference((PetscObject)cab_dense);CHKERRQ(ierr);
278d90d86d0SMatthew G. Knepley   }
2796718818eSStefano Zampini   ierr = PetscContainerGetPointer(cab_dense,(void**)&ab_dense);CHKERRQ(ierr);
2806718818eSStefano Zampini   ierr = PetscArrayzero(ab_dense,B->cmap->N);CHKERRQ(ierr);
281aa1aec99SHong Zhang 
282c124e916SHong Zhang   /* clean old values in C */
283580bdb30SBarry Smith   ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr);
284d50806bdSBarry Smith   /* Traverse A row-wise. */
285d50806bdSBarry Smith   /* Build the ith row in C by summing over nonzero columns in A, */
286d50806bdSBarry Smith   /* the rows of B corresponding to nonzeros of A. */
287d50806bdSBarry Smith   for (i=0; i<am; i++) {
288d50806bdSBarry Smith     anzi = ai[i+1] - ai[i];
289d50806bdSBarry Smith     for (j=0; j<anzi; j++) {
290519eb980SHong Zhang       brow = aj[j];
291d50806bdSBarry Smith       bnzi = bi[brow+1] - bi[brow];
292d50806bdSBarry Smith       bjj  = bj + bi[brow];
293d50806bdSBarry Smith       baj  = ba + bi[brow];
294519eb980SHong Zhang       /* perform dense axpy */
29536ec6d2dSHong Zhang       valtmp = aa[j];
296519eb980SHong Zhang       for (k=0; k<bnzi; k++) {
29736ec6d2dSHong Zhang         ab_dense[bjj[k]] += valtmp*baj[k];
298519eb980SHong Zhang       }
299519eb980SHong Zhang       flops += 2*bnzi;
300519eb980SHong Zhang     }
301c58ca2e3SHong Zhang     aj += anzi; aa += anzi;
302c58ca2e3SHong Zhang 
303c58ca2e3SHong Zhang     cnzi = ci[i+1] - ci[i];
304519eb980SHong Zhang     for (k=0; k<cnzi; k++) {
305519eb980SHong Zhang       ca[k]          += ab_dense[cj[k]];
306519eb980SHong Zhang       ab_dense[cj[k]] = 0.0; /* zero ab_dense */
307519eb980SHong Zhang     }
308519eb980SHong Zhang     flops += cnzi;
309519eb980SHong Zhang     cj    += cnzi; ca += cnzi;
310519eb980SHong Zhang   }
311c58ca2e3SHong Zhang   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
312c58ca2e3SHong Zhang   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
313c58ca2e3SHong Zhang   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
314c58ca2e3SHong Zhang   PetscFunctionReturn(0);
315c58ca2e3SHong Zhang }
316c58ca2e3SHong Zhang 
31725023028SHong Zhang PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C)
318c58ca2e3SHong Zhang {
319c58ca2e3SHong Zhang   PetscErrorCode ierr;
320c58ca2e3SHong Zhang   PetscLogDouble flops=0.0;
321c58ca2e3SHong Zhang   Mat_SeqAIJ     *a   = (Mat_SeqAIJ*)A->data;
322c58ca2e3SHong Zhang   Mat_SeqAIJ     *b   = (Mat_SeqAIJ*)B->data;
323c58ca2e3SHong Zhang   Mat_SeqAIJ     *c   = (Mat_SeqAIJ*)C->data;
324c58ca2e3SHong Zhang   PetscInt       *ai  = a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
325c58ca2e3SHong Zhang   PetscInt       am   = A->rmap->N,cm=C->rmap->N;
326c58ca2e3SHong Zhang   PetscInt       i,j,k,anzi,bnzi,cnzi,brow;
32736ec6d2dSHong Zhang   PetscScalar    *aa=a->a,*ba=b->a,*baj,*ca=c->a,valtmp;
328c58ca2e3SHong Zhang   PetscInt       nextb;
329c58ca2e3SHong Zhang 
330c58ca2e3SHong Zhang   PetscFunctionBegin;
331cf742fccSHong Zhang   if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
332cf742fccSHong Zhang     ierr      = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
333cf742fccSHong Zhang     c->a      = ca;
334cf742fccSHong Zhang     c->free_a = PETSC_TRUE;
335cf742fccSHong Zhang   }
336cf742fccSHong Zhang 
337c58ca2e3SHong Zhang   /* clean old values in C */
338580bdb30SBarry Smith   ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr);
339c58ca2e3SHong Zhang   /* Traverse A row-wise. */
340c58ca2e3SHong Zhang   /* Build the ith row in C by summing over nonzero columns in A, */
341c58ca2e3SHong Zhang   /* the rows of B corresponding to nonzeros of A. */
342519eb980SHong Zhang   for (i=0; i<am; i++) {
343519eb980SHong Zhang     anzi = ai[i+1] - ai[i];
344519eb980SHong Zhang     cnzi = ci[i+1] - ci[i];
345519eb980SHong Zhang     for (j=0; j<anzi; j++) {
346519eb980SHong Zhang       brow = aj[j];
347519eb980SHong Zhang       bnzi = bi[brow+1] - bi[brow];
348519eb980SHong Zhang       bjj  = bj + bi[brow];
349519eb980SHong Zhang       baj  = ba + bi[brow];
350519eb980SHong Zhang       /* perform sparse axpy */
35136ec6d2dSHong Zhang       valtmp = aa[j];
352c124e916SHong Zhang       nextb  = 0;
353c124e916SHong Zhang       for (k=0; nextb<bnzi; k++) {
354c124e916SHong Zhang         if (cj[k] == bjj[nextb]) { /* ccol == bcol */
35536ec6d2dSHong Zhang           ca[k] += valtmp*baj[nextb++];
356c124e916SHong Zhang         }
357d50806bdSBarry Smith       }
358d50806bdSBarry Smith       flops += 2*bnzi;
359d50806bdSBarry Smith     }
360519eb980SHong Zhang     aj += anzi; aa += anzi;
361519eb980SHong Zhang     cj += cnzi; ca += cnzi;
362d50806bdSBarry Smith   }
363716bacf3SKris Buschelman   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
364716bacf3SKris Buschelman   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
365d50806bdSBarry Smith   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
366d50806bdSBarry Smith   PetscFunctionReturn(0);
367d50806bdSBarry Smith }
368bc011b1eSHong Zhang 
3694222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A,Mat B,PetscReal fill,Mat C)
37025296bd5SBarry Smith {
37125296bd5SBarry Smith   PetscErrorCode     ierr;
37225296bd5SBarry Smith   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
37325296bd5SBarry Smith   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
3743c50cad2SHong Zhang   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
37525296bd5SBarry Smith   MatScalar          *ca;
37625296bd5SBarry Smith   PetscReal          afill;
377eca6b86aSHong Zhang   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
378eca6b86aSHong Zhang   PetscTable         ta;
3790298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
38025296bd5SBarry Smith 
38125296bd5SBarry Smith   PetscFunctionBegin;
3823c50cad2SHong Zhang   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
3833c50cad2SHong Zhang   /*-----------------------------------------------------------------------------------------*/
3843c50cad2SHong Zhang   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
385854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
3863c50cad2SHong Zhang   ci[0] = 0;
3873c50cad2SHong Zhang 
3883c50cad2SHong Zhang   /* create and initialize a linked list */
389c373ccc6SHong Zhang   ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr);
390c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b,bm,ta);
391eca6b86aSHong Zhang   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
392eca6b86aSHong Zhang   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
393eca6b86aSHong Zhang 
394eca6b86aSHong Zhang   ierr = PetscLLCondensedCreate_fast(Crmax,&lnk);CHKERRQ(ierr);
3953c50cad2SHong Zhang 
3963c50cad2SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
397f91af8c7SBarry Smith   ierr          = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
3983c50cad2SHong Zhang   current_space = free_space;
3993c50cad2SHong Zhang 
4003c50cad2SHong Zhang   /* Determine ci and cj */
4013c50cad2SHong Zhang   for (i=0; i<am; i++) {
4023c50cad2SHong Zhang     anzi = ai[i+1] - ai[i];
4033c50cad2SHong Zhang     aj   = a->j + ai[i];
4043c50cad2SHong Zhang     for (j=0; j<anzi; j++) {
4053c50cad2SHong Zhang       brow = aj[j];
4063c50cad2SHong Zhang       bnzj = bi[brow+1] - bi[brow];
4073c50cad2SHong Zhang       bj   = b->j + bi[brow];
4083c50cad2SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
4093c50cad2SHong Zhang       ierr = PetscLLCondensedAddSorted_fast(bnzj,bj,lnk);CHKERRQ(ierr);
4103c50cad2SHong Zhang     }
411*8c78258cSHong Zhang     /* add possible missing diagonal entry */
412*8c78258cSHong Zhang     if (C->force_diagonals) {
413*8c78258cSHong Zhang       ierr = PetscLLCondensedAddSorted_fast(1,&i,lnk);CHKERRQ(ierr);
414*8c78258cSHong Zhang     }
4153c50cad2SHong Zhang     cnzi = lnk[1];
4163c50cad2SHong Zhang 
4173c50cad2SHong Zhang     /* If free space is not available, make more free space */
4183c50cad2SHong Zhang     /* Double the amount of total space in the list */
4193c50cad2SHong Zhang     if (current_space->local_remaining<cnzi) {
420f91af8c7SBarry Smith       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);CHKERRQ(ierr);
4213c50cad2SHong Zhang       ndouble++;
4223c50cad2SHong Zhang     }
4233c50cad2SHong Zhang 
4243c50cad2SHong Zhang     /* Copy data into free space, then initialize lnk */
4253c50cad2SHong Zhang     ierr = PetscLLCondensedClean_fast(cnzi,current_space->array,lnk);CHKERRQ(ierr);
4262205254eSKarl Rupp 
4273c50cad2SHong Zhang     current_space->array           += cnzi;
4283c50cad2SHong Zhang     current_space->local_used      += cnzi;
4293c50cad2SHong Zhang     current_space->local_remaining -= cnzi;
4302205254eSKarl Rupp 
4313c50cad2SHong Zhang     ci[i+1] = ci[i] + cnzi;
4323c50cad2SHong Zhang   }
4333c50cad2SHong Zhang 
4343c50cad2SHong Zhang   /* Column indices are in the list of free space */
4353c50cad2SHong Zhang   /* Allocate space for cj, initialize cj, and */
4363c50cad2SHong Zhang   /* destroy list of free space and other temporary array(s) */
437854ce69bSBarry Smith   ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr);
4383c50cad2SHong Zhang   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
4393c50cad2SHong Zhang   ierr = PetscLLCondensedDestroy_fast(lnk);CHKERRQ(ierr);
44025296bd5SBarry Smith 
44125296bd5SBarry Smith   /* Allocate space for ca */
442580bdb30SBarry Smith   ierr = PetscCalloc1(ci[am]+1,&ca);CHKERRQ(ierr);
44325296bd5SBarry Smith 
44425296bd5SBarry Smith   /* put together the new symbolic matrix */
445e4e71118SStefano Zampini   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
4464222ddf1SHong Zhang   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
44725296bd5SBarry Smith 
44825296bd5SBarry Smith   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
44925296bd5SBarry Smith   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
4504222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
45125296bd5SBarry Smith   c->free_a  = PETSC_TRUE;
45225296bd5SBarry Smith   c->free_ij = PETSC_TRUE;
45325296bd5SBarry Smith   c->nonew   = 0;
4542205254eSKarl Rupp 
4554222ddf1SHong Zhang   /* slower, less memory */
4564222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
45725296bd5SBarry Smith 
45825296bd5SBarry Smith   /* set MatInfo */
45925296bd5SBarry Smith   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
46025296bd5SBarry Smith   if (afill < 1.0) afill = 1.0;
46125296bd5SBarry Smith   c->maxnz                     = ci[am];
46225296bd5SBarry Smith   c->nz                        = ci[am];
4634222ddf1SHong Zhang   C->info.mallocs           = ndouble;
4644222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
4654222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
46625296bd5SBarry Smith 
46725296bd5SBarry Smith #if defined(PETSC_USE_INFO)
46825296bd5SBarry Smith   if (ci[am]) {
4694222ddf1SHong Zhang     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
4704222ddf1SHong Zhang     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
47125296bd5SBarry Smith   } else {
4724222ddf1SHong Zhang     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
47325296bd5SBarry Smith   }
47425296bd5SBarry Smith #endif
47525296bd5SBarry Smith   PetscFunctionReturn(0);
47625296bd5SBarry Smith }
47725296bd5SBarry Smith 
4784222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat C)
479e9e4536cSHong Zhang {
480e9e4536cSHong Zhang   PetscErrorCode     ierr;
481e9e4536cSHong Zhang   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
482bf9555e6SHong Zhang   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
48325c41797SHong Zhang   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
484e9e4536cSHong Zhang   MatScalar          *ca;
485e9e4536cSHong Zhang   PetscReal          afill;
486eca6b86aSHong Zhang   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
487eca6b86aSHong Zhang   PetscTable         ta;
4880298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
489e9e4536cSHong Zhang 
490e9e4536cSHong Zhang   PetscFunctionBegin;
491bd958071SHong Zhang   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
492bd958071SHong Zhang   /*---------------------------------------------------------------------------------------------*/
493bd958071SHong Zhang   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
494854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
495bd958071SHong Zhang   ci[0] = 0;
496bd958071SHong Zhang 
497bd958071SHong Zhang   /* create and initialize a linked list */
498c373ccc6SHong Zhang   ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr);
499c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b,bm,ta);
500eca6b86aSHong Zhang   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
501eca6b86aSHong Zhang   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
502eca6b86aSHong Zhang   ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);
503bd958071SHong Zhang 
504bd958071SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
505f91af8c7SBarry Smith   ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
506bd958071SHong Zhang   current_space = free_space;
507bd958071SHong Zhang 
508bd958071SHong Zhang   /* Determine ci and cj */
509bd958071SHong Zhang   for (i=0; i<am; i++) {
510bd958071SHong Zhang     anzi = ai[i+1] - ai[i];
511bd958071SHong Zhang     aj   = a->j + ai[i];
512bd958071SHong Zhang     for (j=0; j<anzi; j++) {
513bd958071SHong Zhang       brow = aj[j];
514bd958071SHong Zhang       bnzj = bi[brow+1] - bi[brow];
515bd958071SHong Zhang       bj   = b->j + bi[brow];
516bd958071SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
517bd958071SHong Zhang       ierr = PetscLLCondensedAddSorted_Scalable(bnzj,bj,lnk);CHKERRQ(ierr);
518bd958071SHong Zhang     }
519*8c78258cSHong Zhang     /* add possible missing diagonal entry */
520*8c78258cSHong Zhang     if (C->force_diagonals) {
521*8c78258cSHong Zhang       ierr = PetscLLCondensedAddSorted_Scalable(1,&i,lnk);CHKERRQ(ierr);
522*8c78258cSHong Zhang     }
523*8c78258cSHong Zhang 
524bd958071SHong Zhang     cnzi = lnk[0];
525bd958071SHong Zhang 
526bd958071SHong Zhang     /* If free space is not available, make more free space */
527bd958071SHong Zhang     /* Double the amount of total space in the list */
528bd958071SHong Zhang     if (current_space->local_remaining<cnzi) {
529f91af8c7SBarry Smith       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);CHKERRQ(ierr);
530bd958071SHong Zhang       ndouble++;
531bd958071SHong Zhang     }
532bd958071SHong Zhang 
533bd958071SHong Zhang     /* Copy data into free space, then initialize lnk */
534bd958071SHong Zhang     ierr = PetscLLCondensedClean_Scalable(cnzi,current_space->array,lnk);CHKERRQ(ierr);
5352205254eSKarl Rupp 
536bd958071SHong Zhang     current_space->array           += cnzi;
537bd958071SHong Zhang     current_space->local_used      += cnzi;
538bd958071SHong Zhang     current_space->local_remaining -= cnzi;
5392205254eSKarl Rupp 
540bd958071SHong Zhang     ci[i+1] = ci[i] + cnzi;
541bd958071SHong Zhang   }
542bd958071SHong Zhang 
543bd958071SHong Zhang   /* Column indices are in the list of free space */
544bd958071SHong Zhang   /* Allocate space for cj, initialize cj, and */
545bd958071SHong Zhang   /* destroy list of free space and other temporary array(s) */
546854ce69bSBarry Smith   ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr);
547bd958071SHong Zhang   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
548bd958071SHong Zhang   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
549e9e4536cSHong Zhang 
550e9e4536cSHong Zhang   /* Allocate space for ca */
551bd958071SHong Zhang   /*-----------------------*/
552580bdb30SBarry Smith   ierr = PetscCalloc1(ci[am]+1,&ca);CHKERRQ(ierr);
553e9e4536cSHong Zhang 
554e9e4536cSHong Zhang   /* put together the new symbolic matrix */
555e4e71118SStefano Zampini   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
5564222ddf1SHong Zhang   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
557e9e4536cSHong Zhang 
558e9e4536cSHong Zhang   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
559e9e4536cSHong Zhang   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
5604222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
561e9e4536cSHong Zhang   c->free_a  = PETSC_TRUE;
562e9e4536cSHong Zhang   c->free_ij = PETSC_TRUE;
563e9e4536cSHong Zhang   c->nonew   = 0;
5642205254eSKarl Rupp 
5654222ddf1SHong Zhang   /* slower, less memory */
5664222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
567e9e4536cSHong Zhang 
568e9e4536cSHong Zhang   /* set MatInfo */
569e9e4536cSHong Zhang   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
570e9e4536cSHong Zhang   if (afill < 1.0) afill = 1.0;
571e9e4536cSHong Zhang   c->maxnz                     = ci[am];
572e9e4536cSHong Zhang   c->nz                        = ci[am];
5734222ddf1SHong Zhang   C->info.mallocs           = ndouble;
5744222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
5754222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
576e9e4536cSHong Zhang 
577e9e4536cSHong Zhang #if defined(PETSC_USE_INFO)
578e9e4536cSHong Zhang   if (ci[am]) {
5794222ddf1SHong Zhang     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
5804222ddf1SHong Zhang     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
581e9e4536cSHong Zhang   } else {
5824222ddf1SHong Zhang     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
583e9e4536cSHong Zhang   }
584e9e4536cSHong Zhang #endif
585e9e4536cSHong Zhang   PetscFunctionReturn(0);
586e9e4536cSHong Zhang }
587e9e4536cSHong Zhang 
5884222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A,Mat B,PetscReal fill,Mat C)
5890ced3a2bSJed Brown {
5900ced3a2bSJed Brown   PetscErrorCode     ierr;
5910ced3a2bSJed Brown   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
5920ced3a2bSJed Brown   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j;
5930ced3a2bSJed Brown   PetscInt           *ci,*cj,*bb;
5940ced3a2bSJed Brown   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
5950ced3a2bSJed Brown   PetscReal          afill;
5960ced3a2bSJed Brown   PetscInt           i,j,col,ndouble = 0;
5970298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
5980ced3a2bSJed Brown   PetscHeap          h;
5990ced3a2bSJed Brown 
6000ced3a2bSJed Brown   PetscFunctionBegin;
601cd093f1dSJed Brown   /* Get ci and cj - by merging sorted rows using a heap */
6020ced3a2bSJed Brown   /*---------------------------------------------------------------------------------------------*/
6030ced3a2bSJed Brown   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
604854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
6050ced3a2bSJed Brown   ci[0] = 0;
6060ced3a2bSJed Brown 
6070ced3a2bSJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
608f91af8c7SBarry Smith   ierr          = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
6090ced3a2bSJed Brown   current_space = free_space;
6100ced3a2bSJed Brown 
6110ced3a2bSJed Brown   ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr);
612785e854fSJed Brown   ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr);
6130ced3a2bSJed Brown 
6140ced3a2bSJed Brown   /* Determine ci and cj */
6150ced3a2bSJed Brown   for (i=0; i<am; i++) {
6160ced3a2bSJed Brown     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
6170ced3a2bSJed Brown     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
6180ced3a2bSJed Brown     ci[i+1] = ci[i];
6190ced3a2bSJed Brown     /* Populate the min heap */
6200ced3a2bSJed Brown     for (j=0; j<anzi; j++) {
6210ced3a2bSJed Brown       bb[j] = bi[acol[j]];         /* bb points at the start of the row */
6220ced3a2bSJed Brown       if (bb[j] < bi[acol[j]+1]) { /* Add if row is nonempty */
6230ced3a2bSJed Brown         ierr = PetscHeapAdd(h,j,bj[bb[j]++]);CHKERRQ(ierr);
6240ced3a2bSJed Brown       }
6250ced3a2bSJed Brown     }
6260ced3a2bSJed Brown     /* Pick off the min element, adding it to free space */
6270ced3a2bSJed Brown     ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
6280ced3a2bSJed Brown     while (j >= 0) {
6290ced3a2bSJed Brown       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
630f91af8c7SBarry Smith         ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space);CHKERRQ(ierr);
6310ced3a2bSJed Brown         ndouble++;
6320ced3a2bSJed Brown       }
6330ced3a2bSJed Brown       *(current_space->array++) = col;
6340ced3a2bSJed Brown       current_space->local_used++;
6350ced3a2bSJed Brown       current_space->local_remaining--;
6360ced3a2bSJed Brown       ci[i+1]++;
6370ced3a2bSJed Brown 
6380ced3a2bSJed Brown       /* stash if anything else remains in this row of B */
6390ced3a2bSJed Brown       if (bb[j] < bi[acol[j]+1]) {ierr = PetscHeapStash(h,j,bj[bb[j]++]);CHKERRQ(ierr);}
6400ced3a2bSJed Brown       while (1) {               /* pop and stash any other rows of B that also had an entry in this column */
6410ced3a2bSJed Brown         PetscInt j2,col2;
6420ced3a2bSJed Brown         ierr = PetscHeapPeek(h,&j2,&col2);CHKERRQ(ierr);
6430ced3a2bSJed Brown         if (col2 != col) break;
6440ced3a2bSJed Brown         ierr = PetscHeapPop(h,&j2,&col2);CHKERRQ(ierr);
6450ced3a2bSJed Brown         if (bb[j2] < bi[acol[j2]+1]) {ierr = PetscHeapStash(h,j2,bj[bb[j2]++]);CHKERRQ(ierr);}
6460ced3a2bSJed Brown       }
6470ced3a2bSJed Brown       /* Put any stashed elements back into the min heap */
6480ced3a2bSJed Brown       ierr = PetscHeapUnstash(h);CHKERRQ(ierr);
6490ced3a2bSJed Brown       ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
6500ced3a2bSJed Brown     }
6510ced3a2bSJed Brown   }
6520ced3a2bSJed Brown   ierr = PetscFree(bb);CHKERRQ(ierr);
6530ced3a2bSJed Brown   ierr = PetscHeapDestroy(&h);CHKERRQ(ierr);
6540ced3a2bSJed Brown 
6550ced3a2bSJed Brown   /* Column indices are in the list of free space */
6560ced3a2bSJed Brown   /* Allocate space for cj, initialize cj, and */
6570ced3a2bSJed Brown   /* destroy list of free space and other temporary array(s) */
658785e854fSJed Brown   ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr);
6590ced3a2bSJed Brown   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
6600ced3a2bSJed Brown 
6610ced3a2bSJed Brown   /* put together the new symbolic matrix */
662e4e71118SStefano Zampini   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
6634222ddf1SHong Zhang   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
6640ced3a2bSJed Brown 
6650ced3a2bSJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
6660ced3a2bSJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
6674222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
6680ced3a2bSJed Brown   c->free_a  = PETSC_TRUE;
6690ced3a2bSJed Brown   c->free_ij = PETSC_TRUE;
6700ced3a2bSJed Brown   c->nonew   = 0;
67126fbe8dcSKarl Rupp 
6724222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
6730ced3a2bSJed Brown 
6740ced3a2bSJed Brown   /* set MatInfo */
6750ced3a2bSJed Brown   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
6760ced3a2bSJed Brown   if (afill < 1.0) afill = 1.0;
6770ced3a2bSJed Brown   c->maxnz                     = ci[am];
6780ced3a2bSJed Brown   c->nz                        = ci[am];
6794222ddf1SHong Zhang   C->info.mallocs           = ndouble;
6804222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
6814222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
6820ced3a2bSJed Brown 
6830ced3a2bSJed Brown #if defined(PETSC_USE_INFO)
6840ced3a2bSJed Brown   if (ci[am]) {
6854222ddf1SHong Zhang     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
6864222ddf1SHong Zhang     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
6870ced3a2bSJed Brown   } else {
6884222ddf1SHong Zhang     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
6890ced3a2bSJed Brown   }
6900ced3a2bSJed Brown #endif
6910ced3a2bSJed Brown   PetscFunctionReturn(0);
6920ced3a2bSJed Brown }
693e9e4536cSHong Zhang 
6944222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A,Mat B,PetscReal fill,Mat C)
6958a07c6f1SJed Brown {
6968a07c6f1SJed Brown   PetscErrorCode     ierr;
6978a07c6f1SJed Brown   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
6988a07c6f1SJed Brown   const PetscInt     *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
6998a07c6f1SJed Brown   PetscInt           *ci,*cj,*bb;
7008a07c6f1SJed Brown   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
7018a07c6f1SJed Brown   PetscReal          afill;
7028a07c6f1SJed Brown   PetscInt           i,j,col,ndouble = 0;
7030298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
7048a07c6f1SJed Brown   PetscHeap          h;
7058a07c6f1SJed Brown   PetscBT            bt;
7068a07c6f1SJed Brown 
7078a07c6f1SJed Brown   PetscFunctionBegin;
708cd093f1dSJed Brown   /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */
7098a07c6f1SJed Brown   /*---------------------------------------------------------------------------------------------*/
7108a07c6f1SJed Brown   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
711854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
7128a07c6f1SJed Brown   ci[0] = 0;
7138a07c6f1SJed Brown 
7148a07c6f1SJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
715f91af8c7SBarry Smith   ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
7162205254eSKarl Rupp 
7178a07c6f1SJed Brown   current_space = free_space;
7188a07c6f1SJed Brown 
7198a07c6f1SJed Brown   ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr);
720785e854fSJed Brown   ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr);
7218a07c6f1SJed Brown   ierr = PetscBTCreate(bn,&bt);CHKERRQ(ierr);
7228a07c6f1SJed Brown 
7238a07c6f1SJed Brown   /* Determine ci and cj */
7248a07c6f1SJed Brown   for (i=0; i<am; i++) {
7258a07c6f1SJed Brown     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
7268a07c6f1SJed Brown     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
7278a07c6f1SJed Brown     const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */
7288a07c6f1SJed Brown     ci[i+1] = ci[i];
7298a07c6f1SJed Brown     /* Populate the min heap */
7308a07c6f1SJed Brown     for (j=0; j<anzi; j++) {
7318a07c6f1SJed Brown       PetscInt brow = acol[j];
7328a07c6f1SJed Brown       for (bb[j] = bi[brow]; bb[j] < bi[brow+1]; bb[j]++) {
7338a07c6f1SJed Brown         PetscInt bcol = bj[bb[j]];
7348a07c6f1SJed Brown         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
7358a07c6f1SJed Brown           ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr);
7368a07c6f1SJed Brown           bb[j]++;
7378a07c6f1SJed Brown           break;
7388a07c6f1SJed Brown         }
7398a07c6f1SJed Brown       }
7408a07c6f1SJed Brown     }
7418a07c6f1SJed Brown     /* Pick off the min element, adding it to free space */
7428a07c6f1SJed Brown     ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
7438a07c6f1SJed Brown     while (j >= 0) {
7448a07c6f1SJed Brown       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
7450298fd71SBarry Smith         fptr = NULL;                      /* need PetscBTMemzero */
746f91af8c7SBarry Smith         ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space);CHKERRQ(ierr);
7478a07c6f1SJed Brown         ndouble++;
7488a07c6f1SJed Brown       }
7498a07c6f1SJed Brown       *(current_space->array++) = col;
7508a07c6f1SJed Brown       current_space->local_used++;
7518a07c6f1SJed Brown       current_space->local_remaining--;
7528a07c6f1SJed Brown       ci[i+1]++;
7538a07c6f1SJed Brown 
7548a07c6f1SJed Brown       /* stash if anything else remains in this row of B */
7558a07c6f1SJed Brown       for (; bb[j] < bi[acol[j]+1]; bb[j]++) {
7568a07c6f1SJed Brown         PetscInt bcol = bj[bb[j]];
7578a07c6f1SJed Brown         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
7588a07c6f1SJed Brown           ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr);
7598a07c6f1SJed Brown           bb[j]++;
7608a07c6f1SJed Brown           break;
7618a07c6f1SJed Brown         }
7628a07c6f1SJed Brown       }
7638a07c6f1SJed Brown       ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
7648a07c6f1SJed Brown     }
7658a07c6f1SJed Brown     if (fptr) {                 /* Clear the bits for this row */
7668a07c6f1SJed Brown       for (; fptr<current_space->array; fptr++) {ierr = PetscBTClear(bt,*fptr);CHKERRQ(ierr);}
7678a07c6f1SJed Brown     } else {                    /* We reallocated so we don't remember (easily) how to clear only the bits we changed */
7688a07c6f1SJed Brown       ierr = PetscBTMemzero(bn,bt);CHKERRQ(ierr);
7698a07c6f1SJed Brown     }
7708a07c6f1SJed Brown   }
7718a07c6f1SJed Brown   ierr = PetscFree(bb);CHKERRQ(ierr);
7728a07c6f1SJed Brown   ierr = PetscHeapDestroy(&h);CHKERRQ(ierr);
7738a07c6f1SJed Brown   ierr = PetscBTDestroy(&bt);CHKERRQ(ierr);
7748a07c6f1SJed Brown 
7758a07c6f1SJed Brown   /* Column indices are in the list of free space */
7768a07c6f1SJed Brown   /* Allocate space for cj, initialize cj, and */
7778a07c6f1SJed Brown   /* destroy list of free space and other temporary array(s) */
778785e854fSJed Brown   ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr);
7798a07c6f1SJed Brown   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
7808a07c6f1SJed Brown 
7818a07c6f1SJed Brown   /* put together the new symbolic matrix */
782e4e71118SStefano Zampini   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
7834222ddf1SHong Zhang   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
7848a07c6f1SJed Brown 
7858a07c6f1SJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
7868a07c6f1SJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
7874222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
7888a07c6f1SJed Brown   c->free_a  = PETSC_TRUE;
7898a07c6f1SJed Brown   c->free_ij = PETSC_TRUE;
7908a07c6f1SJed Brown   c->nonew   = 0;
79126fbe8dcSKarl Rupp 
7924222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
7938a07c6f1SJed Brown 
7948a07c6f1SJed Brown   /* set MatInfo */
7958a07c6f1SJed Brown   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
7968a07c6f1SJed Brown   if (afill < 1.0) afill = 1.0;
7978a07c6f1SJed Brown   c->maxnz                     = ci[am];
7988a07c6f1SJed Brown   c->nz                        = ci[am];
7994222ddf1SHong Zhang   C->info.mallocs           = ndouble;
8004222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
8014222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
8028a07c6f1SJed Brown 
8038a07c6f1SJed Brown #if defined(PETSC_USE_INFO)
8048a07c6f1SJed Brown   if (ci[am]) {
8054222ddf1SHong Zhang     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
8064222ddf1SHong Zhang     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
8078a07c6f1SJed Brown   } else {
8084222ddf1SHong Zhang     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
8098a07c6f1SJed Brown   }
8108a07c6f1SJed Brown #endif
8118a07c6f1SJed Brown   PetscFunctionReturn(0);
8128a07c6f1SJed Brown }
8138a07c6f1SJed Brown 
814d7ed1a4dSandi selinger 
8154222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A,Mat B,PetscReal fill,Mat C)
816d7ed1a4dSandi selinger {
817d7ed1a4dSandi selinger   PetscErrorCode     ierr;
818d7ed1a4dSandi selinger   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
819d7ed1a4dSandi selinger   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j,*inputi,*inputj,*inputcol,*inputcol_L1;
820d7ed1a4dSandi selinger   PetscInt           *ci,*cj,*outputj,worki_L1[9],worki_L2[9];
821d7ed1a4dSandi selinger   PetscInt           c_maxmem,a_maxrownnz=0,a_rownnz;
822d7ed1a4dSandi selinger   const PetscInt     workcol[8]={0,1,2,3,4,5,6,7};
823d7ed1a4dSandi selinger   const PetscInt     am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
824d7ed1a4dSandi selinger   const PetscInt     *brow_ptr[8],*brow_end[8];
825d7ed1a4dSandi selinger   PetscInt           window[8];
826d7ed1a4dSandi selinger   PetscInt           window_min,old_window_min,ci_nnz,outputi_nnz=0,L1_nrows,L2_nrows;
827d7ed1a4dSandi selinger   PetscInt           i,k,ndouble=0,L1_rowsleft,rowsleft;
828d7ed1a4dSandi selinger   PetscReal          afill;
829f83700f2Sandi selinger   PetscInt           *workj_L1,*workj_L2,*workj_L3;
8307660c4dbSandi selinger   PetscInt           L1_nnz,L2_nnz;
831d7ed1a4dSandi selinger 
832d7ed1a4dSandi selinger   /* Step 1: Get upper bound on memory required for allocation.
833d7ed1a4dSandi selinger              Because of the way virtual memory works,
834d7ed1a4dSandi selinger              only the memory pages that are actually needed will be physically allocated. */
835d7ed1a4dSandi selinger   PetscFunctionBegin;
836d7ed1a4dSandi selinger   ierr = PetscMalloc1(am+1,&ci);CHKERRQ(ierr);
837d7ed1a4dSandi selinger   for (i=0; i<am; i++) {
838d7ed1a4dSandi selinger     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
839d7ed1a4dSandi selinger     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
840d7ed1a4dSandi selinger     a_rownnz = 0;
841d7ed1a4dSandi selinger     for (k=0; k<anzi; ++k) {
842d7ed1a4dSandi selinger       a_rownnz += bi[acol[k]+1] - bi[acol[k]];
843d7ed1a4dSandi selinger       if (a_rownnz > bn) {
844d7ed1a4dSandi selinger         a_rownnz = bn;
845d7ed1a4dSandi selinger         break;
846d7ed1a4dSandi selinger       }
847d7ed1a4dSandi selinger     }
848d7ed1a4dSandi selinger     a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz);
849d7ed1a4dSandi selinger   }
850d7ed1a4dSandi selinger   /* temporary work areas for merging rows */
851d7ed1a4dSandi selinger   ierr = PetscMalloc1(a_maxrownnz*8,&workj_L1);CHKERRQ(ierr);
852f83700f2Sandi selinger   ierr = PetscMalloc1(a_maxrownnz*8,&workj_L2);CHKERRQ(ierr);
853f83700f2Sandi selinger   ierr = PetscMalloc1(a_maxrownnz,&workj_L3);CHKERRQ(ierr);
854d7ed1a4dSandi selinger 
8557660c4dbSandi selinger   /* This should be enough for almost all matrices. If not, memory is reallocated later. */
8567660c4dbSandi selinger   c_maxmem = 8*(ai[am]+bi[bm]);
857d7ed1a4dSandi selinger   /* Step 2: Populate pattern for C */
858d7ed1a4dSandi selinger   ierr  = PetscMalloc1(c_maxmem,&cj);CHKERRQ(ierr);
859d7ed1a4dSandi selinger 
860d7ed1a4dSandi selinger   ci_nnz       = 0;
861d7ed1a4dSandi selinger   ci[0]        = 0;
862d7ed1a4dSandi selinger   worki_L1[0]  = 0;
863d7ed1a4dSandi selinger   worki_L2[0]  = 0;
864d7ed1a4dSandi selinger   for (i=0; i<am; i++) {
865d7ed1a4dSandi selinger     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
866d7ed1a4dSandi selinger     const PetscInt *acol = aj + ai[i];      /* column indices of nonzero entries in this row */
867d7ed1a4dSandi selinger     rowsleft             = anzi;
868d7ed1a4dSandi selinger     inputcol_L1          = acol;
869d7ed1a4dSandi selinger     L2_nnz               = 0;
8707660c4dbSandi selinger     L2_nrows             = 1;  /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1   */
8717660c4dbSandi selinger     worki_L2[1]          = 0;
87208fe1b3cSKarl Rupp     outputi_nnz          = 0;
873d7ed1a4dSandi selinger 
874d7ed1a4dSandi selinger     /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem  -> allocate more memory */
875d7ed1a4dSandi selinger     while (ci_nnz+a_maxrownnz > c_maxmem) {
876d7ed1a4dSandi selinger       c_maxmem *= 2;
877d7ed1a4dSandi selinger       ndouble++;
878d7ed1a4dSandi selinger       ierr = PetscRealloc(sizeof(PetscInt)*c_maxmem,&cj);CHKERRQ(ierr);
879d7ed1a4dSandi selinger     }
880d7ed1a4dSandi selinger 
881d7ed1a4dSandi selinger     while (rowsleft) {
882d7ed1a4dSandi selinger       L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */
883d7ed1a4dSandi selinger       L1_nrows    = 0;
8847660c4dbSandi selinger       L1_nnz      = 0;
885d7ed1a4dSandi selinger       inputcol    = inputcol_L1;
886d7ed1a4dSandi selinger       inputi      = bi;
887d7ed1a4dSandi selinger       inputj      = bj;
888d7ed1a4dSandi selinger 
889d7ed1a4dSandi selinger       /* The following macro is used to specialize for small rows in A.
890d7ed1a4dSandi selinger          This helps with compiler unrolling, improving performance substantially.
891f83700f2Sandi selinger           Input:  inputj   inputi  inputcol  bn
892d7ed1a4dSandi selinger           Output: outputj  outputi_nnz                       */
893d7ed1a4dSandi selinger        #define MatMatMultSymbolic_RowMergeMacro(ANNZ)                        \
894d7ed1a4dSandi selinger          window_min  = bn;                                                   \
8957660c4dbSandi selinger          outputi_nnz = 0;                                                    \
8967660c4dbSandi selinger          for (k=0; k<ANNZ; ++k) {                                            \
897d7ed1a4dSandi selinger            brow_ptr[k] = inputj + inputi[inputcol[k]];                       \
898d7ed1a4dSandi selinger            brow_end[k] = inputj + inputi[inputcol[k]+1];                     \
899d7ed1a4dSandi selinger            window[k]   = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn;   \
900d7ed1a4dSandi selinger            window_min  = PetscMin(window[k], window_min);                    \
901d7ed1a4dSandi selinger          }                                                                   \
902d7ed1a4dSandi selinger          while (window_min < bn) {                                           \
903d7ed1a4dSandi selinger            outputj[outputi_nnz++] = window_min;                              \
904d7ed1a4dSandi selinger            /* advance front and compute new minimum */                       \
905d7ed1a4dSandi selinger            old_window_min = window_min;                                      \
906d7ed1a4dSandi selinger            window_min = bn;                                                  \
907d7ed1a4dSandi selinger            for (k=0; k<ANNZ; ++k) {                                          \
908d7ed1a4dSandi selinger              if (window[k] == old_window_min) {                              \
909d7ed1a4dSandi selinger                brow_ptr[k]++;                                                \
910d7ed1a4dSandi selinger                window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
911d7ed1a4dSandi selinger              }                                                               \
912d7ed1a4dSandi selinger              window_min = PetscMin(window[k], window_min);                   \
913d7ed1a4dSandi selinger            }                                                                 \
914d7ed1a4dSandi selinger          }
915d7ed1a4dSandi selinger 
916d7ed1a4dSandi selinger       /************** L E V E L  1 ***************/
917d7ed1a4dSandi selinger       /* Merge up to 8 rows of B to L1 work array*/
918d7ed1a4dSandi selinger       while (L1_rowsleft) {
9197660c4dbSandi selinger         outputi_nnz = 0;
9207660c4dbSandi selinger         if (anzi > 8)  outputj = workj_L1 + L1_nnz;     /* Level 1 rowmerge*/
9217660c4dbSandi selinger         else           outputj = cj + ci_nnz;           /* Merge directly to C */
9227660c4dbSandi selinger 
923d7ed1a4dSandi selinger         switch (L1_rowsleft) {
924d7ed1a4dSandi selinger         case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
925d7ed1a4dSandi selinger                  brow_end[0] = inputj + inputi[inputcol[0]+1];
926d7ed1a4dSandi selinger                  for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
927d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
928d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
929d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
930d7ed1a4dSandi selinger                  break;
931d7ed1a4dSandi selinger         case 2:  MatMatMultSymbolic_RowMergeMacro(2);
932d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
933d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
934d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
935d7ed1a4dSandi selinger                  break;
936d7ed1a4dSandi selinger         case 3: MatMatMultSymbolic_RowMergeMacro(3);
937d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
938d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
939d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
940d7ed1a4dSandi selinger                  break;
941d7ed1a4dSandi selinger         case 4:  MatMatMultSymbolic_RowMergeMacro(4);
942d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
943d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
944d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
945d7ed1a4dSandi selinger                  break;
946d7ed1a4dSandi selinger         case 5:  MatMatMultSymbolic_RowMergeMacro(5);
947d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
948d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
949d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
950d7ed1a4dSandi selinger                  break;
951d7ed1a4dSandi selinger         case 6:  MatMatMultSymbolic_RowMergeMacro(6);
952d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
953d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
954d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
955d7ed1a4dSandi selinger                  break;
956d7ed1a4dSandi selinger         case 7:  MatMatMultSymbolic_RowMergeMacro(7);
957d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
958d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
959d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
960d7ed1a4dSandi selinger                  break;
961d7ed1a4dSandi selinger         default: MatMatMultSymbolic_RowMergeMacro(8);
962d7ed1a4dSandi selinger                  inputcol    += 8;
963d7ed1a4dSandi selinger                  rowsleft    -= 8;
964d7ed1a4dSandi selinger                  L1_rowsleft -= 8;
965d7ed1a4dSandi selinger                  break;
966d7ed1a4dSandi selinger         }
967d7ed1a4dSandi selinger         inputcol_L1           = inputcol;
9687660c4dbSandi selinger         L1_nnz               += outputi_nnz;
9697660c4dbSandi selinger         worki_L1[++L1_nrows]  = L1_nnz;
970d7ed1a4dSandi selinger       }
971d7ed1a4dSandi selinger 
972d7ed1a4dSandi selinger       /********************** L E V E L  2 ************************/
973d7ed1a4dSandi selinger       /* Merge from L1 work array to either C or to L2 work array */
974d7ed1a4dSandi selinger       if (anzi > 8) {
975d7ed1a4dSandi selinger         inputi      = worki_L1;
976d7ed1a4dSandi selinger         inputj      = workj_L1;
977d7ed1a4dSandi selinger         inputcol    = workcol;
978d7ed1a4dSandi selinger         outputi_nnz = 0;
979d7ed1a4dSandi selinger 
980d7ed1a4dSandi selinger         if (anzi <= 64) outputj = cj + ci_nnz;        /* Merge from L1 work array to C */
981d7ed1a4dSandi selinger         else            outputj = workj_L2 + L2_nnz;  /* Merge from L1 work array to L2 work array */
982d7ed1a4dSandi selinger 
983d7ed1a4dSandi selinger         switch (L1_nrows) {
984d7ed1a4dSandi selinger         case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
985d7ed1a4dSandi selinger                  brow_end[0] = inputj + inputi[inputcol[0]+1];
986d7ed1a4dSandi selinger                  for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
987d7ed1a4dSandi selinger                  break;
988d7ed1a4dSandi selinger         case 2:  MatMatMultSymbolic_RowMergeMacro(2); break;
989d7ed1a4dSandi selinger         case 3:  MatMatMultSymbolic_RowMergeMacro(3); break;
990d7ed1a4dSandi selinger         case 4:  MatMatMultSymbolic_RowMergeMacro(4); break;
991d7ed1a4dSandi selinger         case 5:  MatMatMultSymbolic_RowMergeMacro(5); break;
992d7ed1a4dSandi selinger         case 6:  MatMatMultSymbolic_RowMergeMacro(6); break;
993d7ed1a4dSandi selinger         case 7:  MatMatMultSymbolic_RowMergeMacro(7); break;
994d7ed1a4dSandi selinger         case 8:  MatMatMultSymbolic_RowMergeMacro(8); break;
995d7ed1a4dSandi selinger         default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L1 work array!");
996d7ed1a4dSandi selinger         }
997d7ed1a4dSandi selinger         L2_nnz               += outputi_nnz;
998d7ed1a4dSandi selinger         worki_L2[++L2_nrows]  = L2_nnz;
999d7ed1a4dSandi selinger 
10007660c4dbSandi selinger         /************************ L E V E L  3 **********************/
10017660c4dbSandi selinger         /* Merge from L2 work array to either C or to L2 work array */
1002d7ed1a4dSandi selinger         if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) {
1003d7ed1a4dSandi selinger           inputi      = worki_L2;
1004d7ed1a4dSandi selinger           inputj      = workj_L2;
1005d7ed1a4dSandi selinger           inputcol    = workcol;
1006d7ed1a4dSandi selinger           outputi_nnz = 0;
1007f83700f2Sandi selinger           if (rowsleft) outputj = workj_L3;
1008d7ed1a4dSandi selinger           else          outputj = cj + ci_nnz;
1009d7ed1a4dSandi selinger           switch (L2_nrows) {
1010d7ed1a4dSandi selinger           case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
1011d7ed1a4dSandi selinger                    brow_end[0] = inputj + inputi[inputcol[0]+1];
1012d7ed1a4dSandi selinger                    for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
1013d7ed1a4dSandi selinger                    break;
1014d7ed1a4dSandi selinger           case 2:  MatMatMultSymbolic_RowMergeMacro(2); break;
1015d7ed1a4dSandi selinger           case 3:  MatMatMultSymbolic_RowMergeMacro(3); break;
1016d7ed1a4dSandi selinger           case 4:  MatMatMultSymbolic_RowMergeMacro(4); break;
1017d7ed1a4dSandi selinger           case 5:  MatMatMultSymbolic_RowMergeMacro(5); break;
1018d7ed1a4dSandi selinger           case 6:  MatMatMultSymbolic_RowMergeMacro(6); break;
1019d7ed1a4dSandi selinger           case 7:  MatMatMultSymbolic_RowMergeMacro(7); break;
1020d7ed1a4dSandi selinger           case 8:  MatMatMultSymbolic_RowMergeMacro(8); break;
1021d7ed1a4dSandi selinger           default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L2 work array!");
1022d7ed1a4dSandi selinger           }
1023d7ed1a4dSandi selinger           L2_nrows    = 1;
10247660c4dbSandi selinger           L2_nnz      = outputi_nnz;
10257660c4dbSandi selinger           worki_L2[1] = outputi_nnz;
10267660c4dbSandi selinger           /* Copy to workj_L2 */
1027d7ed1a4dSandi selinger           if (rowsleft) {
10287660c4dbSandi selinger             for (k=0; k<outputi_nnz; ++k)  workj_L2[k] = outputj[k];
1029d7ed1a4dSandi selinger           }
1030d7ed1a4dSandi selinger         }
1031d7ed1a4dSandi selinger       }
1032d7ed1a4dSandi selinger     }  /* while (rowsleft) */
1033d7ed1a4dSandi selinger #undef MatMatMultSymbolic_RowMergeMacro
1034d7ed1a4dSandi selinger 
1035d7ed1a4dSandi selinger     /* terminate current row */
1036d7ed1a4dSandi selinger     ci_nnz += outputi_nnz;
1037d7ed1a4dSandi selinger     ci[i+1] = ci_nnz;
1038d7ed1a4dSandi selinger   }
1039d7ed1a4dSandi selinger 
1040d7ed1a4dSandi selinger   /* Step 3: Create the new symbolic matrix */
1041e4e71118SStefano Zampini   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
10424222ddf1SHong Zhang   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
1043d7ed1a4dSandi selinger 
1044d7ed1a4dSandi selinger   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1045d7ed1a4dSandi selinger   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
10464222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
1047d7ed1a4dSandi selinger   c->free_a  = PETSC_TRUE;
1048d7ed1a4dSandi selinger   c->free_ij = PETSC_TRUE;
1049d7ed1a4dSandi selinger   c->nonew   = 0;
1050d7ed1a4dSandi selinger 
10514222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1052d7ed1a4dSandi selinger 
1053d7ed1a4dSandi selinger   /* set MatInfo */
1054d7ed1a4dSandi selinger   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
1055d7ed1a4dSandi selinger   if (afill < 1.0) afill = 1.0;
1056d7ed1a4dSandi selinger   c->maxnz                     = ci[am];
1057d7ed1a4dSandi selinger   c->nz                        = ci[am];
10584222ddf1SHong Zhang   C->info.mallocs           = ndouble;
10594222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
10604222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
1061d7ed1a4dSandi selinger 
1062d7ed1a4dSandi selinger #if defined(PETSC_USE_INFO)
1063d7ed1a4dSandi selinger   if (ci[am]) {
10644222ddf1SHong Zhang     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
10654222ddf1SHong Zhang     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
1066d7ed1a4dSandi selinger   } else {
10674222ddf1SHong Zhang     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
1068d7ed1a4dSandi selinger   }
1069d7ed1a4dSandi selinger #endif
1070d7ed1a4dSandi selinger 
1071d7ed1a4dSandi selinger   /* Step 4: Free temporary work areas */
1072d7ed1a4dSandi selinger   ierr = PetscFree(workj_L1);CHKERRQ(ierr);
1073d7ed1a4dSandi selinger   ierr = PetscFree(workj_L2);CHKERRQ(ierr);
1074f83700f2Sandi selinger   ierr = PetscFree(workj_L3);CHKERRQ(ierr);
1075d7ed1a4dSandi selinger   PetscFunctionReturn(0);
1076d7ed1a4dSandi selinger }
1077d7ed1a4dSandi selinger 
1078cd093f1dSJed Brown /* concatenate unique entries and then sort */
10794222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,PetscReal fill,Mat C)
1080cd093f1dSJed Brown {
1081cd093f1dSJed Brown   PetscErrorCode ierr;
1082cd093f1dSJed Brown   Mat_SeqAIJ     *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
1083cd093f1dSJed Brown   const PetscInt *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
1084*8c78258cSHong Zhang   PetscInt       *ci,*cj,bcol;
1085cd093f1dSJed Brown   PetscInt       am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
1086cd093f1dSJed Brown   PetscReal      afill;
1087cd093f1dSJed Brown   PetscInt       i,j,ndouble = 0;
1088cd093f1dSJed Brown   PetscSegBuffer seg,segrow;
1089cd093f1dSJed Brown   char           *seen;
1090cd093f1dSJed Brown 
1091cd093f1dSJed Brown   PetscFunctionBegin;
1092854ce69bSBarry Smith   ierr  = PetscMalloc1(am+1,&ci);CHKERRQ(ierr);
1093cd093f1dSJed Brown   ci[0] = 0;
1094cd093f1dSJed Brown 
1095cd093f1dSJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
1096cd093f1dSJed Brown   ierr = PetscSegBufferCreate(sizeof(PetscInt),(PetscInt)(fill*(ai[am]+bi[bm])),&seg);CHKERRQ(ierr);
1097cd093f1dSJed Brown   ierr = PetscSegBufferCreate(sizeof(PetscInt),100,&segrow);CHKERRQ(ierr);
1098580bdb30SBarry Smith   ierr = PetscCalloc1(bn,&seen);CHKERRQ(ierr);
1099cd093f1dSJed Brown 
1100cd093f1dSJed Brown   /* Determine ci and cj */
1101cd093f1dSJed Brown   for (i=0; i<am; i++) {
1102cd093f1dSJed Brown     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
1103cd093f1dSJed Brown     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
1104cd093f1dSJed Brown     PetscInt packlen = 0,*PETSC_RESTRICT crow;
1105*8c78258cSHong Zhang 
1106cd093f1dSJed Brown     /* Pack segrow */
1107cd093f1dSJed Brown     for (j=0; j<anzi; j++) {
1108cd093f1dSJed Brown       PetscInt brow = acol[j],bjstart = bi[brow],bjend = bi[brow+1],k;
1109cd093f1dSJed Brown       for (k=bjstart; k<bjend; k++) {
1110*8c78258cSHong Zhang         bcol = bj[k];
1111cd093f1dSJed Brown         if (!seen[bcol]) { /* new entry */
1112cd093f1dSJed Brown           PetscInt *PETSC_RESTRICT slot;
1113cd093f1dSJed Brown           ierr = PetscSegBufferGetInts(segrow,1,&slot);CHKERRQ(ierr);
1114cd093f1dSJed Brown           *slot = bcol;
1115cd093f1dSJed Brown           seen[bcol] = 1;
1116cd093f1dSJed Brown           packlen++;
1117cd093f1dSJed Brown         }
1118cd093f1dSJed Brown       }
1119cd093f1dSJed Brown     }
1120*8c78258cSHong Zhang 
1121*8c78258cSHong Zhang     /* Check i-th diagonal entry */
1122*8c78258cSHong Zhang     if (C->force_diagonals && !seen[i]) {
1123*8c78258cSHong Zhang       PetscInt *PETSC_RESTRICT slot;
1124*8c78258cSHong Zhang       ierr = PetscSegBufferGetInts(segrow,1,&slot);CHKERRQ(ierr);
1125*8c78258cSHong Zhang       *slot   = i;
1126*8c78258cSHong Zhang       seen[i] = 1;
1127*8c78258cSHong Zhang       packlen++;
1128*8c78258cSHong Zhang     }
1129*8c78258cSHong Zhang 
1130cd093f1dSJed Brown     ierr = PetscSegBufferGetInts(seg,packlen,&crow);CHKERRQ(ierr);
1131cd093f1dSJed Brown     ierr = PetscSegBufferExtractTo(segrow,crow);CHKERRQ(ierr);
1132cd093f1dSJed Brown     ierr = PetscSortInt(packlen,crow);CHKERRQ(ierr);
1133cd093f1dSJed Brown     ci[i+1] = ci[i] + packlen;
1134cd093f1dSJed Brown     for (j=0; j<packlen; j++) seen[crow[j]] = 0;
1135cd093f1dSJed Brown   }
1136cd093f1dSJed Brown   ierr = PetscSegBufferDestroy(&segrow);CHKERRQ(ierr);
1137cd093f1dSJed Brown   ierr = PetscFree(seen);CHKERRQ(ierr);
1138cd093f1dSJed Brown 
1139cd093f1dSJed Brown   /* Column indices are in the segmented buffer */
1140cd093f1dSJed Brown   ierr = PetscSegBufferExtractAlloc(seg,&cj);CHKERRQ(ierr);
1141cd093f1dSJed Brown   ierr = PetscSegBufferDestroy(&seg);CHKERRQ(ierr);
1142cd093f1dSJed Brown 
1143cd093f1dSJed Brown   /* put together the new symbolic matrix */
1144e4e71118SStefano Zampini   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
11454222ddf1SHong Zhang   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
1146cd093f1dSJed Brown 
1147cd093f1dSJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1148cd093f1dSJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
11494222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
1150cd093f1dSJed Brown   c->free_a  = PETSC_TRUE;
1151cd093f1dSJed Brown   c->free_ij = PETSC_TRUE;
1152cd093f1dSJed Brown   c->nonew   = 0;
1153cd093f1dSJed Brown 
11544222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1155cd093f1dSJed Brown 
1156cd093f1dSJed Brown   /* set MatInfo */
11572a09556fSStefano Zampini   afill = (PetscReal)ci[am]/PetscMax(ai[am]+bi[bm],1) + 1.e-5;
1158cd093f1dSJed Brown   if (afill < 1.0) afill = 1.0;
1159cd093f1dSJed Brown   c->maxnz                  = ci[am];
1160cd093f1dSJed Brown   c->nz                     = ci[am];
11614222ddf1SHong Zhang   C->info.mallocs           = ndouble;
11624222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
11634222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
1164cd093f1dSJed Brown 
1165cd093f1dSJed Brown #if defined(PETSC_USE_INFO)
1166cd093f1dSJed Brown   if (ci[am]) {
11674222ddf1SHong Zhang     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
11684222ddf1SHong Zhang     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
1169cd093f1dSJed Brown   } else {
11704222ddf1SHong Zhang     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
1171cd093f1dSJed Brown   }
1172cd093f1dSJed Brown #endif
1173cd093f1dSJed Brown   PetscFunctionReturn(0);
1174cd093f1dSJed Brown }
1175cd093f1dSJed Brown 
11766718818eSStefano Zampini PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(void *data)
11772128a86cSHong Zhang {
11782128a86cSHong Zhang   PetscErrorCode      ierr;
11796718818eSStefano Zampini   Mat_MatMatTransMult *abt=(Mat_MatMatTransMult *)data;
11802128a86cSHong Zhang 
11812128a86cSHong Zhang   PetscFunctionBegin;
118240192850SHong Zhang   ierr = MatTransposeColoringDestroy(&abt->matcoloring);CHKERRQ(ierr);
118340192850SHong Zhang   ierr = MatDestroy(&abt->Bt_den);CHKERRQ(ierr);
118440192850SHong Zhang   ierr = MatDestroy(&abt->ABt_den);CHKERRQ(ierr);
118540192850SHong Zhang   ierr = PetscFree(abt);CHKERRQ(ierr);
11862128a86cSHong Zhang   PetscFunctionReturn(0);
11872128a86cSHong Zhang }
11882128a86cSHong Zhang 
11894222ddf1SHong Zhang PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
1190bc011b1eSHong Zhang {
11915df89d91SHong Zhang   PetscErrorCode      ierr;
119281d82fe4SHong Zhang   Mat                 Bt;
119381d82fe4SHong Zhang   PetscInt            *bti,*btj;
119440192850SHong Zhang   Mat_MatMatTransMult *abt;
11954222ddf1SHong Zhang   Mat_Product         *product = C->product;
11966718818eSStefano Zampini   char                *alg;
1197d2853540SHong Zhang 
119881d82fe4SHong Zhang   PetscFunctionBegin;
11996718818eSStefano Zampini   if (!product) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
12006718818eSStefano Zampini   if (product->data) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Extra product struct not empty");
12016718818eSStefano Zampini 
120281d82fe4SHong Zhang   /* create symbolic Bt */
120381d82fe4SHong Zhang   ierr = MatGetSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
12040298fd71SBarry Smith   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,B->cmap->n,B->rmap->n,bti,btj,NULL,&Bt);CHKERRQ(ierr);
120533d57670SJed Brown   ierr = MatSetBlockSizes(Bt,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr);
120604b858e0SBarry Smith   ierr = MatSetType(Bt,((PetscObject)A)->type_name);CHKERRQ(ierr);
120781d82fe4SHong Zhang 
120881d82fe4SHong Zhang   /* get symbolic C=A*Bt */
12096718818eSStefano Zampini   ierr = PetscStrallocpy(product->alg,&alg);CHKERRQ(ierr);
12104222ddf1SHong Zhang   ierr = MatProductSetAlgorithm(C,"sorted");CHKERRQ(ierr); /* set algorithm for C = A*Bt */
121181d82fe4SHong Zhang   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C);CHKERRQ(ierr);
12124222ddf1SHong Zhang   ierr = MatProductSetAlgorithm(C,alg);CHKERRQ(ierr); /* resume original algorithm for ABt product */
12136718818eSStefano Zampini   ierr = PetscFree(alg);CHKERRQ(ierr);
121481d82fe4SHong Zhang 
12152128a86cSHong Zhang   /* create a supporting struct for reuse intermidiate dense matrices with matcoloring */
1216b00a9115SJed Brown   ierr = PetscNew(&abt);CHKERRQ(ierr);
12172128a86cSHong Zhang 
12186718818eSStefano Zampini   product->data    = abt;
12196718818eSStefano Zampini   product->destroy = MatDestroy_SeqAIJ_MatMatMultTrans;
12206718818eSStefano Zampini 
12214222ddf1SHong Zhang   C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ;
12222128a86cSHong Zhang 
12234222ddf1SHong Zhang   abt->usecoloring = PETSC_FALSE;
12244222ddf1SHong Zhang   ierr = PetscStrcmp(product->alg,"color",&abt->usecoloring);CHKERRQ(ierr);
122540192850SHong Zhang   if (abt->usecoloring) {
1226b9af6bddSHong Zhang     /* Create MatTransposeColoring from symbolic C=A*B^T */
1227b9af6bddSHong Zhang     MatTransposeColoring matcoloring;
1228335efc43SPeter Brune     MatColoring          coloring;
12292128a86cSHong Zhang     ISColoring           iscoloring;
12302128a86cSHong Zhang     Mat                  Bt_dense,C_dense;
1231e8354b3eSHong Zhang 
12324222ddf1SHong Zhang     /* inode causes memory problem */
12334222ddf1SHong Zhang     ierr = MatSetOption(C,MAT_USE_INODES,PETSC_FALSE);CHKERRQ(ierr);
12344222ddf1SHong Zhang 
12354222ddf1SHong Zhang     ierr = MatColoringCreate(C,&coloring);CHKERRQ(ierr);
1236335efc43SPeter Brune     ierr = MatColoringSetDistance(coloring,2);CHKERRQ(ierr);
1237335efc43SPeter Brune     ierr = MatColoringSetType(coloring,MATCOLORINGSL);CHKERRQ(ierr);
1238335efc43SPeter Brune     ierr = MatColoringSetFromOptions(coloring);CHKERRQ(ierr);
1239335efc43SPeter Brune     ierr = MatColoringApply(coloring,&iscoloring);CHKERRQ(ierr);
1240335efc43SPeter Brune     ierr = MatColoringDestroy(&coloring);CHKERRQ(ierr);
12414222ddf1SHong Zhang     ierr = MatTransposeColoringCreate(C,iscoloring,&matcoloring);CHKERRQ(ierr);
12422205254eSKarl Rupp 
124340192850SHong Zhang     abt->matcoloring = matcoloring;
12442205254eSKarl Rupp 
12452128a86cSHong Zhang     ierr = ISColoringDestroy(&iscoloring);CHKERRQ(ierr);
12462128a86cSHong Zhang 
12472128a86cSHong Zhang     /* Create Bt_dense and C_dense = A*Bt_dense */
12482128a86cSHong Zhang     ierr = MatCreate(PETSC_COMM_SELF,&Bt_dense);CHKERRQ(ierr);
12492128a86cSHong Zhang     ierr = MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors);CHKERRQ(ierr);
12502128a86cSHong Zhang     ierr = MatSetType(Bt_dense,MATSEQDENSE);CHKERRQ(ierr);
12510298fd71SBarry Smith     ierr = MatSeqDenseSetPreallocation(Bt_dense,NULL);CHKERRQ(ierr);
12522205254eSKarl Rupp 
12532128a86cSHong Zhang     Bt_dense->assembled = PETSC_TRUE;
125440192850SHong Zhang     abt->Bt_den         = Bt_dense;
12552128a86cSHong Zhang 
12562128a86cSHong Zhang     ierr = MatCreate(PETSC_COMM_SELF,&C_dense);CHKERRQ(ierr);
12572128a86cSHong Zhang     ierr = MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors);CHKERRQ(ierr);
12582128a86cSHong Zhang     ierr = MatSetType(C_dense,MATSEQDENSE);CHKERRQ(ierr);
12590298fd71SBarry Smith     ierr = MatSeqDenseSetPreallocation(C_dense,NULL);CHKERRQ(ierr);
12602205254eSKarl Rupp 
12612128a86cSHong Zhang     Bt_dense->assembled = PETSC_TRUE;
126240192850SHong Zhang     abt->ABt_den  = C_dense;
1263f94ccd6cSHong Zhang 
1264f94ccd6cSHong Zhang #if defined(PETSC_USE_INFO)
12651ce71dffSSatish Balay     {
12664222ddf1SHong Zhang       Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data;
12674222ddf1SHong Zhang       ierr = PetscInfo7(C,"Use coloring of C=A*B^T; B^T: %D %D, Bt_dense: %D,%D; Cnz %D / (cm*ncolors %D) = %g\n",B->cmap->n,B->rmap->n,Bt_dense->rmap->n,Bt_dense->cmap->n,c->nz,A->rmap->n*matcoloring->ncolors,(PetscReal)(c->nz)/(A->rmap->n*matcoloring->ncolors));CHKERRQ(ierr);
12681ce71dffSSatish Balay     }
1269f94ccd6cSHong Zhang #endif
12702128a86cSHong Zhang   }
1271e99be685SHong Zhang   /* clean up */
1272e99be685SHong Zhang   ierr = MatDestroy(&Bt);CHKERRQ(ierr);
1273e99be685SHong Zhang   ierr = MatRestoreSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
12745df89d91SHong Zhang   PetscFunctionReturn(0);
12755df89d91SHong Zhang }
12765df89d91SHong Zhang 
12776fc122caSHong Zhang PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
12785df89d91SHong Zhang {
12795df89d91SHong Zhang   PetscErrorCode      ierr;
12805df89d91SHong Zhang   Mat_SeqAIJ          *a   =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1281e2cac8caSJed Brown   PetscInt            *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow;
128281d82fe4SHong Zhang   PetscInt            cm   =C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol;
12835df89d91SHong Zhang   PetscLogDouble      flops=0.0;
1284aa1aec99SHong Zhang   MatScalar           *aa  =a->a,*aval,*ba=b->a,*bval,*ca,*cval;
12856718818eSStefano Zampini   Mat_MatMatTransMult *abt;
12866718818eSStefano Zampini   Mat_Product         *product = C->product;
12875df89d91SHong Zhang 
12885df89d91SHong Zhang   PetscFunctionBegin;
12896718818eSStefano Zampini   if (!product) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
12906718818eSStefano Zampini   abt = (Mat_MatMatTransMult *)product->data;
12916718818eSStefano Zampini   if (!abt) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
129258ed3ceaSHong Zhang   /* clear old values in C */
129358ed3ceaSHong Zhang   if (!c->a) {
1294580bdb30SBarry Smith     ierr      = PetscCalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
129558ed3ceaSHong Zhang     c->a      = ca;
129658ed3ceaSHong Zhang     c->free_a = PETSC_TRUE;
129758ed3ceaSHong Zhang   } else {
129858ed3ceaSHong Zhang     ca =  c->a;
1299580bdb30SBarry Smith     ierr = PetscArrayzero(ca,ci[cm]+1);CHKERRQ(ierr);
130058ed3ceaSHong Zhang   }
130158ed3ceaSHong Zhang 
130240192850SHong Zhang   if (abt->usecoloring) {
130340192850SHong Zhang     MatTransposeColoring matcoloring = abt->matcoloring;
130440192850SHong Zhang     Mat                  Bt_dense,C_dense = abt->ABt_den;
1305c8db22f6SHong Zhang 
1306b9af6bddSHong Zhang     /* Get Bt_dense by Apply MatTransposeColoring to B */
130740192850SHong Zhang     Bt_dense = abt->Bt_den;
1308b9af6bddSHong Zhang     ierr = MatTransColoringApplySpToDen(matcoloring,B,Bt_dense);CHKERRQ(ierr);
1309c8db22f6SHong Zhang 
1310c8db22f6SHong Zhang     /* C_dense = A*Bt_dense */
13112128a86cSHong Zhang     ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense);CHKERRQ(ierr);
1312c8db22f6SHong Zhang 
13132128a86cSHong Zhang     /* Recover C from C_dense */
1314b9af6bddSHong Zhang     ierr = MatTransColoringApplyDenToSp(matcoloring,C_dense,C);CHKERRQ(ierr);
1315c8db22f6SHong Zhang     PetscFunctionReturn(0);
1316c8db22f6SHong Zhang   }
1317c8db22f6SHong Zhang 
13181fa1209cSHong Zhang   for (i=0; i<cm; i++) {
131981d82fe4SHong Zhang     anzi = ai[i+1] - ai[i];
132081d82fe4SHong Zhang     acol = aj + ai[i];
13216973769bSHong Zhang     aval = aa + ai[i];
13221fa1209cSHong Zhang     cnzi = ci[i+1] - ci[i];
13231fa1209cSHong Zhang     ccol = cj + ci[i];
13246973769bSHong Zhang     cval = ca + ci[i];
13251fa1209cSHong Zhang     for (j=0; j<cnzi; j++) {
132681d82fe4SHong Zhang       brow = ccol[j];
132781d82fe4SHong Zhang       bnzj = bi[brow+1] - bi[brow];
132881d82fe4SHong Zhang       bcol = bj + bi[brow];
13296973769bSHong Zhang       bval = ba + bi[brow];
13306973769bSHong Zhang 
133181d82fe4SHong Zhang       /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
133281d82fe4SHong Zhang       nexta = 0; nextb = 0;
133381d82fe4SHong Zhang       while (nexta<anzi && nextb<bnzj) {
13347b6d5e96SMark Adams         while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++;
133581d82fe4SHong Zhang         if (nexta == anzi) break;
13367b6d5e96SMark Adams         while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++;
133781d82fe4SHong Zhang         if (nextb == bnzj) break;
133881d82fe4SHong Zhang         if (acol[nexta] == bcol[nextb]) {
13396973769bSHong Zhang           cval[j] += aval[nexta]*bval[nextb];
134081d82fe4SHong Zhang           nexta++; nextb++;
134181d82fe4SHong Zhang           flops += 2;
13421fa1209cSHong Zhang         }
13431fa1209cSHong Zhang       }
134481d82fe4SHong Zhang     }
134581d82fe4SHong Zhang   }
134681d82fe4SHong Zhang   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
134781d82fe4SHong Zhang   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
134881d82fe4SHong Zhang   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
13495df89d91SHong Zhang   PetscFunctionReturn(0);
13505df89d91SHong Zhang }
13515df89d91SHong Zhang 
13526718818eSStefano Zampini PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(void *data)
13536d373c3eSHong Zhang {
13546d373c3eSHong Zhang   PetscErrorCode      ierr;
13556718818eSStefano Zampini   Mat_MatTransMatMult *atb = (Mat_MatTransMatMult*)data;
13566d373c3eSHong Zhang 
13576d373c3eSHong Zhang   PetscFunctionBegin;
13586d373c3eSHong Zhang   ierr = MatDestroy(&atb->At);CHKERRQ(ierr);
13596718818eSStefano Zampini   if (atb->destroy) {
13606718818eSStefano Zampini     ierr = (*atb->destroy)(atb->data);CHKERRQ(ierr);
13616473ade5SStefano Zampini   }
13626d373c3eSHong Zhang   ierr = PetscFree(atb);CHKERRQ(ierr);
13636d373c3eSHong Zhang   PetscFunctionReturn(0);
13646d373c3eSHong Zhang }
13656d373c3eSHong Zhang 
13664222ddf1SHong Zhang PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
13675df89d91SHong Zhang {
1368bc011b1eSHong Zhang   PetscErrorCode ierr;
1369089a957eSStefano Zampini   Mat            At = NULL;
137038baddfdSBarry Smith   PetscInt       *ati,*atj;
13714222ddf1SHong Zhang   Mat_Product    *product = C->product;
1372089a957eSStefano Zampini   PetscBool      flg,def,square;
1373bc011b1eSHong Zhang 
1374bc011b1eSHong Zhang   PetscFunctionBegin;
1375089a957eSStefano Zampini   MatCheckProduct(C,4);
1376089a957eSStefano Zampini   square = (PetscBool)(A == B && A->symmetric && A->symmetric_set);
13774222ddf1SHong Zhang   /* outerproduct */
1378089a957eSStefano Zampini   ierr = PetscStrcmp(product->alg,"outerproduct",&flg);CHKERRQ(ierr);
13794222ddf1SHong Zhang   if (flg) {
1380bc011b1eSHong Zhang     /* create symbolic At */
1381089a957eSStefano Zampini     if (!square) {
1382bc011b1eSHong Zhang       ierr = MatGetSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
13830298fd71SBarry Smith       ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,A->cmap->n,A->rmap->n,ati,atj,NULL,&At);CHKERRQ(ierr);
138433d57670SJed Brown       ierr = MatSetBlockSizes(At,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr);
138504b858e0SBarry Smith       ierr = MatSetType(At,((PetscObject)A)->type_name);CHKERRQ(ierr);
1386089a957eSStefano Zampini     }
1387bc011b1eSHong Zhang     /* get symbolic C=At*B */
13887a3c3d58SStefano Zampini     ierr = MatProductSetAlgorithm(C,"sorted");CHKERRQ(ierr);
1389089a957eSStefano Zampini     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At,B,fill,C);CHKERRQ(ierr);
1390bc011b1eSHong Zhang 
1391bc011b1eSHong Zhang     /* clean up */
1392089a957eSStefano Zampini     if (!square) {
13936bf464f9SBarry Smith       ierr = MatDestroy(&At);CHKERRQ(ierr);
1394bc011b1eSHong Zhang       ierr = MatRestoreSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
1395089a957eSStefano Zampini     }
13966d373c3eSHong Zhang 
13974222ddf1SHong Zhang     C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */
13987a3c3d58SStefano Zampini     ierr = MatProductSetAlgorithm(C,"outerproduct");CHKERRQ(ierr);
13994222ddf1SHong Zhang     PetscFunctionReturn(0);
14004222ddf1SHong Zhang   }
14014222ddf1SHong Zhang 
14024222ddf1SHong Zhang   /* matmatmult */
1403089a957eSStefano Zampini   ierr = PetscStrcmp(product->alg,"default",&def);CHKERRQ(ierr);
1404089a957eSStefano Zampini   ierr = PetscStrcmp(product->alg,"at*b",&flg);CHKERRQ(ierr);
1405089a957eSStefano Zampini   if (flg || def) {
14064222ddf1SHong Zhang     Mat_MatTransMatMult *atb;
14074222ddf1SHong Zhang 
14086718818eSStefano Zampini     if (product->data) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Extra product struct not empty");
14094222ddf1SHong Zhang     ierr = PetscNew(&atb);CHKERRQ(ierr);
1410089a957eSStefano Zampini     if (!square) {
14114222ddf1SHong Zhang       ierr = MatTranspose_SeqAIJ(A,MAT_INITIAL_MATRIX,&At);CHKERRQ(ierr);
1412089a957eSStefano Zampini     }
14137a3c3d58SStefano Zampini     ierr = MatProductSetAlgorithm(C,"sorted");CHKERRQ(ierr);
1414089a957eSStefano Zampini     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At,B,fill,C);CHKERRQ(ierr);
14156718818eSStefano Zampini     ierr = MatProductSetAlgorithm(C,"at*b");CHKERRQ(ierr);
14166718818eSStefano Zampini     product->data    = atb;
14176718818eSStefano Zampini     product->destroy = MatDestroy_SeqAIJ_MatTransMatMult;
14184222ddf1SHong Zhang     atb->At          = At;
14194222ddf1SHong Zhang     atb->updateAt    = PETSC_FALSE; /* because At is computed here */
14204222ddf1SHong Zhang 
14214222ddf1SHong Zhang     C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */
14224222ddf1SHong Zhang     PetscFunctionReturn(0);
14234222ddf1SHong Zhang   }
14244222ddf1SHong Zhang 
14254222ddf1SHong Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported");
1426bc011b1eSHong Zhang }
1427bc011b1eSHong Zhang 
142875648e8dSHong Zhang PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
1429bc011b1eSHong Zhang {
1430bc011b1eSHong Zhang   PetscErrorCode ierr;
14310fbc74f4SHong Zhang   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1432d0f46423SBarry Smith   PetscInt       am=A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb;
1433d0f46423SBarry Smith   PetscInt       cm=C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k;
1434d13dce4bSSatish Balay   PetscLogDouble flops=0.0;
1435aa1aec99SHong Zhang   MatScalar      *aa=a->a,*ba,*ca,*caj;
1436bc011b1eSHong Zhang 
1437bc011b1eSHong Zhang   PetscFunctionBegin;
1438aa1aec99SHong Zhang   if (!c->a) {
1439580bdb30SBarry Smith     ierr = PetscCalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
14402205254eSKarl Rupp 
1441aa1aec99SHong Zhang     c->a      = ca;
1442aa1aec99SHong Zhang     c->free_a = PETSC_TRUE;
1443aa1aec99SHong Zhang   } else {
1444aa1aec99SHong Zhang     ca   = c->a;
1445580bdb30SBarry Smith     ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr);
1446aa1aec99SHong Zhang   }
1447bc011b1eSHong Zhang 
1448bc011b1eSHong Zhang   /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
1449bc011b1eSHong Zhang   for (i=0; i<am; i++) {
1450bc011b1eSHong Zhang     bj   = b->j + bi[i];
1451bc011b1eSHong Zhang     ba   = b->a + bi[i];
1452bc011b1eSHong Zhang     bnzi = bi[i+1] - bi[i];
1453bc011b1eSHong Zhang     anzi = ai[i+1] - ai[i];
1454bc011b1eSHong Zhang     for (j=0; j<anzi; j++) {
1455bc011b1eSHong Zhang       nextb = 0;
14560fbc74f4SHong Zhang       crow  = *aj++;
1457bc011b1eSHong Zhang       cjj   = cj + ci[crow];
1458bc011b1eSHong Zhang       caj   = ca + ci[crow];
1459bc011b1eSHong Zhang       /* perform sparse axpy operation.  Note cjj includes bj. */
1460bc011b1eSHong Zhang       for (k=0; nextb<bnzi; k++) {
14610fbc74f4SHong Zhang         if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */
14620fbc74f4SHong Zhang           caj[k] += (*aa)*(*(ba+nextb));
1463bc011b1eSHong Zhang           nextb++;
1464bc011b1eSHong Zhang         }
1465bc011b1eSHong Zhang       }
1466bc011b1eSHong Zhang       flops += 2*bnzi;
14670fbc74f4SHong Zhang       aa++;
1468bc011b1eSHong Zhang     }
1469bc011b1eSHong Zhang   }
1470bc011b1eSHong Zhang 
1471bc011b1eSHong Zhang   /* Assemble the final matrix and clean up */
1472bc011b1eSHong Zhang   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1473bc011b1eSHong Zhang   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1474bc011b1eSHong Zhang   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
1475bc011b1eSHong Zhang   PetscFunctionReturn(0);
1476bc011b1eSHong Zhang }
14779123193aSHong Zhang 
14784222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
14799123193aSHong Zhang {
14809123193aSHong Zhang   PetscErrorCode ierr;
14819123193aSHong Zhang 
14829123193aSHong Zhang   PetscFunctionBegin;
14835a586d82SBarry Smith   ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr);
14844222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense;
14859123193aSHong Zhang   PetscFunctionReturn(0);
14869123193aSHong Zhang }
14879123193aSHong Zhang 
148893aa15f2SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C,const PetscBool add)
14899123193aSHong Zhang {
1490f32d5d43SBarry Smith   Mat_SeqAIJ        *a=(Mat_SeqAIJ*)A->data;
1491612438f1SStefano Zampini   Mat_SeqDense      *bd=(Mat_SeqDense*)B->data;
14921ca9667aSStefano Zampini   Mat_SeqDense      *cd=(Mat_SeqDense*)C->data;
14939123193aSHong Zhang   PetscErrorCode    ierr;
14941ca9667aSStefano Zampini   PetscScalar       *c,r1,r2,r3,r4,*c1,*c2,*c3,*c4;
1495a4af7ca8SStefano Zampini   const PetscScalar *aa,*b,*b1,*b2,*b3,*b4,*av;
1496daf57211SHong Zhang   const PetscInt    *aj;
1497612438f1SStefano Zampini   PetscInt          cm=C->rmap->n,cn=B->cmap->n,bm=bd->lda,am=A->rmap->n;
14981ca9667aSStefano Zampini   PetscInt          clda=cd->lda;
14991ca9667aSStefano Zampini   PetscInt          am4=4*clda,bm4=4*bm,col,i,j,n;
15009123193aSHong Zhang 
15019123193aSHong Zhang   PetscFunctionBegin;
1502f32d5d43SBarry Smith   if (!cm || !cn) PetscFunctionReturn(0);
1503a4af7ca8SStefano Zampini   ierr = MatSeqAIJGetArrayRead(A,&av);CHKERRQ(ierr);
150493aa15f2SStefano Zampini   if (add) {
15058c778c55SBarry Smith     ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr);
150693aa15f2SStefano Zampini   } else {
150793aa15f2SStefano Zampini     ierr = MatDenseGetArrayWrite(C,&c);CHKERRQ(ierr);
150893aa15f2SStefano Zampini   }
1509a4af7ca8SStefano Zampini   ierr = MatDenseGetArrayRead(B,&b);CHKERRQ(ierr);
1510f32d5d43SBarry Smith   b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
15111ca9667aSStefano Zampini   c1 = c; c2 = c1 + clda; c3 = c2 + clda; c4 = c3 + clda;
15121ca9667aSStefano Zampini   for (col=0; col<(cn/4)*4; col += 4) {  /* over columns of C */
15131ca9667aSStefano Zampini     for (i=0; i<am; i++) {        /* over rows of A in those columns */
1514f32d5d43SBarry Smith       r1 = r2 = r3 = r4 = 0.0;
1515f32d5d43SBarry Smith       n  = a->i[i+1] - a->i[i];
1516f32d5d43SBarry Smith       aj = a->j + a->i[i];
1517a4af7ca8SStefano Zampini       aa = av + a->i[i];
1518f32d5d43SBarry Smith       for (j=0; j<n; j++) {
15191ca9667aSStefano Zampini         const PetscScalar aatmp = aa[j];
15201ca9667aSStefano Zampini         const PetscInt    ajtmp = aj[j];
1521730858b9SHong Zhang         r1 += aatmp*b1[ajtmp];
1522730858b9SHong Zhang         r2 += aatmp*b2[ajtmp];
1523730858b9SHong Zhang         r3 += aatmp*b3[ajtmp];
1524730858b9SHong Zhang         r4 += aatmp*b4[ajtmp];
15259123193aSHong Zhang       }
152693aa15f2SStefano Zampini       if (add) {
152787753ddeSHong Zhang         c1[i] += r1;
152887753ddeSHong Zhang         c2[i] += r2;
152987753ddeSHong Zhang         c3[i] += r3;
153087753ddeSHong Zhang         c4[i] += r4;
153193aa15f2SStefano Zampini       } else {
153293aa15f2SStefano Zampini         c1[i] = r1;
153393aa15f2SStefano Zampini         c2[i] = r2;
153493aa15f2SStefano Zampini         c3[i] = r3;
153593aa15f2SStefano Zampini         c4[i] = r4;
153693aa15f2SStefano Zampini       }
1537f32d5d43SBarry Smith     }
1538730858b9SHong Zhang     b1 += bm4; b2 += bm4; b3 += bm4; b4 += bm4;
1539730858b9SHong Zhang     c1 += am4; c2 += am4; c3 += am4; c4 += am4;
1540f32d5d43SBarry Smith   }
154193aa15f2SStefano Zampini   /* process remaining columns */
154293aa15f2SStefano Zampini   if (col != cn) {
154393aa15f2SStefano Zampini     PetscInt rc = cn-col;
154493aa15f2SStefano Zampini 
154593aa15f2SStefano Zampini     if (rc == 1) {
154693aa15f2SStefano Zampini       for (i=0; i<am; i++) {
1547f32d5d43SBarry Smith         r1 = 0.0;
1548f32d5d43SBarry Smith         n  = a->i[i+1] - a->i[i];
1549f32d5d43SBarry Smith         aj = a->j + a->i[i];
1550a4af7ca8SStefano Zampini         aa = av + a->i[i];
155193aa15f2SStefano Zampini         for (j=0; j<n; j++) r1 += aa[j]*b1[aj[j]];
155293aa15f2SStefano Zampini         if (add) c1[i] += r1;
155393aa15f2SStefano Zampini         else c1[i] = r1;
155493aa15f2SStefano Zampini       }
155593aa15f2SStefano Zampini     } else if (rc == 2) {
155693aa15f2SStefano Zampini       for (i=0; i<am; i++) {
155793aa15f2SStefano Zampini         r1 = r2 = 0.0;
155893aa15f2SStefano Zampini         n  = a->i[i+1] - a->i[i];
155993aa15f2SStefano Zampini         aj = a->j + a->i[i];
156093aa15f2SStefano Zampini         aa = av + a->i[i];
1561f32d5d43SBarry Smith         for (j=0; j<n; j++) {
156293aa15f2SStefano Zampini           const PetscScalar aatmp = aa[j];
156393aa15f2SStefano Zampini           const PetscInt    ajtmp = aj[j];
156493aa15f2SStefano Zampini           r1 += aatmp*b1[ajtmp];
156593aa15f2SStefano Zampini           r2 += aatmp*b2[ajtmp];
1566f32d5d43SBarry Smith         }
156793aa15f2SStefano Zampini         if (add) {
156887753ddeSHong Zhang           c1[i] += r1;
156993aa15f2SStefano Zampini           c2[i] += r2;
157093aa15f2SStefano Zampini         } else {
157193aa15f2SStefano Zampini           c1[i] = r1;
157293aa15f2SStefano Zampini           c2[i] = r2;
1573f32d5d43SBarry Smith         }
157493aa15f2SStefano Zampini       }
157593aa15f2SStefano Zampini     } else {
157693aa15f2SStefano Zampini       for (i=0; i<am; i++) {
157793aa15f2SStefano Zampini         r1 = r2 = r3 = 0.0;
157893aa15f2SStefano Zampini         n  = a->i[i+1] - a->i[i];
157993aa15f2SStefano Zampini         aj = a->j + a->i[i];
158093aa15f2SStefano Zampini         aa = av + a->i[i];
158193aa15f2SStefano Zampini         for (j=0; j<n; j++) {
158293aa15f2SStefano Zampini           const PetscScalar aatmp = aa[j];
158393aa15f2SStefano Zampini           const PetscInt    ajtmp = aj[j];
158493aa15f2SStefano Zampini           r1 += aatmp*b1[ajtmp];
158593aa15f2SStefano Zampini           r2 += aatmp*b2[ajtmp];
158693aa15f2SStefano Zampini           r3 += aatmp*b3[ajtmp];
158793aa15f2SStefano Zampini         }
158893aa15f2SStefano Zampini         if (add) {
158993aa15f2SStefano Zampini           c1[i] += r1;
159093aa15f2SStefano Zampini           c2[i] += r2;
159193aa15f2SStefano Zampini           c3[i] += r3;
159293aa15f2SStefano Zampini         } else {
159393aa15f2SStefano Zampini           c1[i] = r1;
159493aa15f2SStefano Zampini           c2[i] = r2;
159593aa15f2SStefano Zampini           c3[i] = r3;
159693aa15f2SStefano Zampini         }
159793aa15f2SStefano Zampini       }
159893aa15f2SStefano Zampini     }
1599f32d5d43SBarry Smith   }
1600dc0b31edSSatish Balay   ierr = PetscLogFlops(cn*(2.0*a->nz));CHKERRQ(ierr);
160193aa15f2SStefano Zampini   if (add) {
16028c778c55SBarry Smith     ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr);
160393aa15f2SStefano Zampini   } else {
160493aa15f2SStefano Zampini     ierr = MatDenseRestoreArrayWrite(C,&c);CHKERRQ(ierr);
160593aa15f2SStefano Zampini   }
1606a4af7ca8SStefano Zampini   ierr = MatDenseRestoreArrayRead(B,&b);CHKERRQ(ierr);
1607a4af7ca8SStefano Zampini   ierr = MatSeqAIJRestoreArrayRead(A,&av);CHKERRQ(ierr);
1608f32d5d43SBarry Smith   PetscFunctionReturn(0);
1609f32d5d43SBarry Smith }
1610f32d5d43SBarry Smith 
161187753ddeSHong Zhang PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1612f32d5d43SBarry Smith {
1613f32d5d43SBarry Smith   PetscErrorCode ierr;
1614f32d5d43SBarry Smith 
1615f32d5d43SBarry Smith   PetscFunctionBegin;
161687753ddeSHong Zhang   if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n);
161787753ddeSHong Zhang   if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
161887753ddeSHong Zhang   if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
16194324174eSBarry Smith 
162093aa15f2SStefano Zampini   ierr = MatMatMultNumericAdd_SeqAIJ_SeqDense(A,B,C,PETSC_FALSE);CHKERRQ(ierr);
16219123193aSHong Zhang   PetscFunctionReturn(0);
16229123193aSHong Zhang }
1623b1683b59SHong Zhang 
16244222ddf1SHong Zhang /* ------------------------------------------------------- */
16254222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C)
16264222ddf1SHong Zhang {
16274222ddf1SHong Zhang   PetscFunctionBegin;
16284222ddf1SHong Zhang   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense;
16294222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
16304222ddf1SHong Zhang   PetscFunctionReturn(0);
16314222ddf1SHong Zhang }
16324222ddf1SHong Zhang 
16336718818eSStefano Zampini PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat,Mat,PetscReal,Mat);
16346718818eSStefano Zampini 
16354222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C)
16364222ddf1SHong Zhang {
16374222ddf1SHong Zhang   PetscFunctionBegin;
16386718818eSStefano Zampini   C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
16394222ddf1SHong Zhang   C->ops->productsymbolic          = MatProductSymbolic_AtB;
16406718818eSStefano Zampini   PetscFunctionReturn(0);
16416718818eSStefano Zampini }
16426718818eSStefano Zampini 
16436718818eSStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C)
16446718818eSStefano Zampini {
16456718818eSStefano Zampini   PetscFunctionBegin;
16466718818eSStefano Zampini   C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
16476718818eSStefano Zampini   C->ops->productsymbolic          = MatProductSymbolic_ABt;
16484222ddf1SHong Zhang   PetscFunctionReturn(0);
16494222ddf1SHong Zhang }
16504222ddf1SHong Zhang 
16514222ddf1SHong Zhang PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C)
16524222ddf1SHong Zhang {
16534222ddf1SHong Zhang   PetscErrorCode ierr;
16544222ddf1SHong Zhang   Mat_Product    *product = C->product;
16554222ddf1SHong Zhang 
16564222ddf1SHong Zhang   PetscFunctionBegin;
16574222ddf1SHong Zhang   switch (product->type) {
16584222ddf1SHong Zhang   case MATPRODUCT_AB:
16594222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C);CHKERRQ(ierr);
16604222ddf1SHong Zhang     break;
16614222ddf1SHong Zhang   case MATPRODUCT_AtB:
16624222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C);CHKERRQ(ierr);
16634222ddf1SHong Zhang     break;
16646718818eSStefano Zampini   case MATPRODUCT_ABt:
16656718818eSStefano Zampini     ierr = MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C);CHKERRQ(ierr);
16664222ddf1SHong Zhang     break;
16674222ddf1SHong Zhang   default:
16686718818eSStefano Zampini     break;
16694222ddf1SHong Zhang   }
16704222ddf1SHong Zhang   PetscFunctionReturn(0);
16714222ddf1SHong Zhang }
16724222ddf1SHong Zhang /* ------------------------------------------------------- */
16734222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C)
16744222ddf1SHong Zhang {
16754222ddf1SHong Zhang   PetscErrorCode ierr;
16764222ddf1SHong Zhang   Mat_Product    *product = C->product;
16774222ddf1SHong Zhang   Mat            A = product->A;
16784222ddf1SHong Zhang   PetscBool      baij;
16794222ddf1SHong Zhang 
16804222ddf1SHong Zhang   PetscFunctionBegin;
16814222ddf1SHong Zhang   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQBAIJ,&baij);CHKERRQ(ierr);
16824222ddf1SHong Zhang   if (!baij) { /* A is seqsbaij */
16834222ddf1SHong Zhang     PetscBool sbaij;
16844222ddf1SHong Zhang     ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQSBAIJ,&sbaij);CHKERRQ(ierr);
16854222ddf1SHong Zhang     if (!sbaij) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"Mat must be either seqbaij or seqsbaij format");
16864222ddf1SHong Zhang 
16874222ddf1SHong Zhang     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense;
16884222ddf1SHong Zhang   } else { /* A is seqbaij */
16894222ddf1SHong Zhang     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense;
16904222ddf1SHong Zhang   }
16914222ddf1SHong Zhang 
16924222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
16934222ddf1SHong Zhang   PetscFunctionReturn(0);
16944222ddf1SHong Zhang }
16954222ddf1SHong Zhang 
16964222ddf1SHong Zhang PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C)
16974222ddf1SHong Zhang {
16984222ddf1SHong Zhang   PetscErrorCode ierr;
16994222ddf1SHong Zhang   Mat_Product    *product = C->product;
17004222ddf1SHong Zhang 
17014222ddf1SHong Zhang   PetscFunctionBegin;
17026718818eSStefano Zampini   MatCheckProduct(C,1);
17036718818eSStefano Zampini   if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
17046718818eSStefano Zampini   if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric)) {
17054222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C);CHKERRQ(ierr);
17066718818eSStefano Zampini   }
17074222ddf1SHong Zhang   PetscFunctionReturn(0);
17084222ddf1SHong Zhang }
17096718818eSStefano Zampini 
17104222ddf1SHong Zhang /* ------------------------------------------------------- */
17114222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C)
17124222ddf1SHong Zhang {
17134222ddf1SHong Zhang   PetscFunctionBegin;
17144222ddf1SHong Zhang   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ;
17154222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
17164222ddf1SHong Zhang   PetscFunctionReturn(0);
17174222ddf1SHong Zhang }
17184222ddf1SHong Zhang 
17194222ddf1SHong Zhang PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C)
17204222ddf1SHong Zhang {
17214222ddf1SHong Zhang   PetscErrorCode ierr;
17224222ddf1SHong Zhang   Mat_Product    *product = C->product;
17234222ddf1SHong Zhang 
17244222ddf1SHong Zhang   PetscFunctionBegin;
17254222ddf1SHong Zhang   if (product->type == MATPRODUCT_AB) {
17264222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C);CHKERRQ(ierr);
17276718818eSStefano Zampini   }
17284222ddf1SHong Zhang   PetscFunctionReturn(0);
17294222ddf1SHong Zhang }
17304222ddf1SHong Zhang /* ------------------------------------------------------- */
17314222ddf1SHong Zhang 
1732b9af6bddSHong Zhang PetscErrorCode  MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense)
1733c8db22f6SHong Zhang {
1734c8db22f6SHong Zhang   PetscErrorCode ierr;
17352f5f1f90SHong Zhang   Mat_SeqAIJ     *b       = (Mat_SeqAIJ*)B->data;
17362f5f1f90SHong Zhang   Mat_SeqDense   *btdense = (Mat_SeqDense*)Btdense->data;
17372f5f1f90SHong Zhang   PetscInt       *bi      = b->i,*bj=b->j;
17382f5f1f90SHong Zhang   PetscInt       m        = Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns;
17392f5f1f90SHong Zhang   MatScalar      *btval,*btval_den,*ba=b->a;
17402f5f1f90SHong Zhang   PetscInt       *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors;
1741c8db22f6SHong Zhang 
1742c8db22f6SHong Zhang   PetscFunctionBegin;
17432f5f1f90SHong Zhang   btval_den=btdense->v;
1744580bdb30SBarry Smith   ierr     = PetscArrayzero(btval_den,m*n);CHKERRQ(ierr);
17452f5f1f90SHong Zhang   for (k=0; k<ncolors; k++) {
17462f5f1f90SHong Zhang     ncolumns = coloring->ncolumns[k];
17472f5f1f90SHong Zhang     for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */
1748d2853540SHong Zhang       col   = *(columns + colorforcol[k] + l);
17492f5f1f90SHong Zhang       btcol = bj + bi[col];
17502f5f1f90SHong Zhang       btval = ba + bi[col];
17512f5f1f90SHong Zhang       anz   = bi[col+1] - bi[col];
1752d2853540SHong Zhang       for (j=0; j<anz; j++) {
17532f5f1f90SHong Zhang         brow            = btcol[j];
17542f5f1f90SHong Zhang         btval_den[brow] = btval[j];
1755c8db22f6SHong Zhang       }
1756c8db22f6SHong Zhang     }
17572f5f1f90SHong Zhang     btval_den += m;
1758c8db22f6SHong Zhang   }
1759c8db22f6SHong Zhang   PetscFunctionReturn(0);
1760c8db22f6SHong Zhang }
1761c8db22f6SHong Zhang 
1762b9af6bddSHong Zhang PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp)
17638972f759SHong Zhang {
17648972f759SHong Zhang   PetscErrorCode    ierr;
1765b2d2b04fSHong Zhang   Mat_SeqAIJ        *csp = (Mat_SeqAIJ*)Csp->data;
17661683a169SBarry Smith   const PetscScalar *ca_den,*ca_den_ptr;
17671683a169SBarry Smith   PetscScalar       *ca=csp->a;
1768f99a636bSHong Zhang   PetscInt          k,l,m=Cden->rmap->n,ncolors=matcoloring->ncolors;
1769e88777f2SHong Zhang   PetscInt          brows=matcoloring->brows,*den2sp=matcoloring->den2sp;
1770077f23c2SHong Zhang   PetscInt          nrows,*row,*idx;
1771077f23c2SHong Zhang   PetscInt          *rows=matcoloring->rows,*colorforrow=matcoloring->colorforrow;
17728972f759SHong Zhang 
17738972f759SHong Zhang   PetscFunctionBegin;
17741683a169SBarry Smith   ierr   = MatDenseGetArrayRead(Cden,&ca_den);CHKERRQ(ierr);
1775f99a636bSHong Zhang 
1776077f23c2SHong Zhang   if (brows > 0) {
1777077f23c2SHong Zhang     PetscInt *lstart,row_end,row_start;
1778980ae229SHong Zhang     lstart = matcoloring->lstart;
1779580bdb30SBarry Smith     ierr = PetscArrayzero(lstart,ncolors);CHKERRQ(ierr);
1780eeb4fd02SHong Zhang 
1781077f23c2SHong Zhang     row_end = brows;
1782eeb4fd02SHong Zhang     if (row_end > m) row_end = m;
1783077f23c2SHong Zhang     for (row_start=0; row_start<m; row_start+=brows) { /* loop over row blocks of Csp */
1784077f23c2SHong Zhang       ca_den_ptr = ca_den;
1785980ae229SHong Zhang       for (k=0; k<ncolors; k++) { /* loop over colors (columns of Cden) */
1786eeb4fd02SHong Zhang         nrows = matcoloring->nrows[k];
1787eeb4fd02SHong Zhang         row   = rows  + colorforrow[k];
1788eeb4fd02SHong Zhang         idx   = den2sp + colorforrow[k];
1789eeb4fd02SHong Zhang         for (l=lstart[k]; l<nrows; l++) {
1790eeb4fd02SHong Zhang           if (row[l] >= row_end) {
1791eeb4fd02SHong Zhang             lstart[k] = l;
1792eeb4fd02SHong Zhang             break;
1793eeb4fd02SHong Zhang           } else {
1794077f23c2SHong Zhang             ca[idx[l]] = ca_den_ptr[row[l]];
1795eeb4fd02SHong Zhang           }
1796eeb4fd02SHong Zhang         }
1797077f23c2SHong Zhang         ca_den_ptr += m;
1798eeb4fd02SHong Zhang       }
1799077f23c2SHong Zhang       row_end += brows;
1800eeb4fd02SHong Zhang       if (row_end > m) row_end = m;
1801eeb4fd02SHong Zhang     }
1802077f23c2SHong Zhang   } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */
1803077f23c2SHong Zhang     ca_den_ptr = ca_den;
1804077f23c2SHong Zhang     for (k=0; k<ncolors; k++) {
1805077f23c2SHong Zhang       nrows = matcoloring->nrows[k];
1806077f23c2SHong Zhang       row   = rows  + colorforrow[k];
1807077f23c2SHong Zhang       idx   = den2sp + colorforrow[k];
1808077f23c2SHong Zhang       for (l=0; l<nrows; l++) {
1809077f23c2SHong Zhang         ca[idx[l]] = ca_den_ptr[row[l]];
1810077f23c2SHong Zhang       }
1811077f23c2SHong Zhang       ca_den_ptr += m;
1812077f23c2SHong Zhang     }
1813f99a636bSHong Zhang   }
1814f99a636bSHong Zhang 
18151683a169SBarry Smith   ierr = MatDenseRestoreArrayRead(Cden,&ca_den);CHKERRQ(ierr);
1816f99a636bSHong Zhang #if defined(PETSC_USE_INFO)
1817077f23c2SHong Zhang   if (matcoloring->brows > 0) {
1818f99a636bSHong Zhang     ierr = PetscInfo1(Csp,"Loop over %D row blocks for den2sp\n",brows);CHKERRQ(ierr);
1819e88777f2SHong Zhang   } else {
1820077f23c2SHong Zhang     ierr = PetscInfo(Csp,"Loop over colors/columns of Cden, inefficient for large sparse matrix product \n");CHKERRQ(ierr);
1821e88777f2SHong Zhang   }
1822f99a636bSHong Zhang #endif
18238972f759SHong Zhang   PetscFunctionReturn(0);
18248972f759SHong Zhang }
18258972f759SHong Zhang 
1826b9af6bddSHong Zhang PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c)
1827b1683b59SHong Zhang {
1828b1683b59SHong Zhang   PetscErrorCode ierr;
1829e88777f2SHong Zhang   PetscInt       i,n,nrows,Nbs,j,k,m,ncols,col,cm;
18301a83f524SJed Brown   const PetscInt *is,*ci,*cj,*row_idx;
1831b28d80bdSHong Zhang   PetscInt       nis = iscoloring->n,*rowhit,bs = 1;
1832b1683b59SHong Zhang   IS             *isa;
1833b28d80bdSHong Zhang   Mat_SeqAIJ     *csp = (Mat_SeqAIJ*)mat->data;
1834e88777f2SHong Zhang   PetscInt       *colorforrow,*rows,*rows_i,*idxhit,*spidx,*den2sp,*den2sp_i;
1835e88777f2SHong Zhang   PetscInt       *colorforcol,*columns,*columns_i,brows;
1836e88777f2SHong Zhang   PetscBool      flg;
1837b1683b59SHong Zhang 
1838b1683b59SHong Zhang   PetscFunctionBegin;
1839071fcb05SBarry Smith   ierr = ISColoringGetIS(iscoloring,PETSC_USE_POINTER,PETSC_IGNORE,&isa);CHKERRQ(ierr);
1840e99be685SHong Zhang 
18417ecccc15SHong Zhang   /* bs >1 is not being tested yet! */
1842e88777f2SHong Zhang   Nbs       = mat->cmap->N/bs;
1843b1683b59SHong Zhang   c->M      = mat->rmap->N/bs;  /* set total rows, columns and local rows */
1844e88777f2SHong Zhang   c->N      = Nbs;
1845e88777f2SHong Zhang   c->m      = c->M;
1846b1683b59SHong Zhang   c->rstart = 0;
1847e88777f2SHong Zhang   c->brows  = 100;
1848b1683b59SHong Zhang 
1849b1683b59SHong Zhang   c->ncolors = nis;
1850dcca6d9dSJed Brown   ierr = PetscMalloc3(nis,&c->ncolumns,nis,&c->nrows,nis+1,&colorforrow);CHKERRQ(ierr);
1851854ce69bSBarry Smith   ierr = PetscMalloc1(csp->nz+1,&rows);CHKERRQ(ierr);
1852854ce69bSBarry Smith   ierr = PetscMalloc1(csp->nz+1,&den2sp);CHKERRQ(ierr);
1853e88777f2SHong Zhang 
1854e88777f2SHong Zhang   brows = c->brows;
1855c5929fdfSBarry Smith   ierr = PetscOptionsGetInt(NULL,NULL,"-matden2sp_brows",&brows,&flg);CHKERRQ(ierr);
1856e88777f2SHong Zhang   if (flg) c->brows = brows;
1857eeb4fd02SHong Zhang   if (brows > 0) {
1858854ce69bSBarry Smith     ierr = PetscMalloc1(nis+1,&c->lstart);CHKERRQ(ierr);
1859e88777f2SHong Zhang   }
18602205254eSKarl Rupp 
1861d2853540SHong Zhang   colorforrow[0] = 0;
1862d2853540SHong Zhang   rows_i         = rows;
1863f99a636bSHong Zhang   den2sp_i       = den2sp;
1864b1683b59SHong Zhang 
1865854ce69bSBarry Smith   ierr = PetscMalloc1(nis+1,&colorforcol);CHKERRQ(ierr);
1866854ce69bSBarry Smith   ierr = PetscMalloc1(Nbs+1,&columns);CHKERRQ(ierr);
18672205254eSKarl Rupp 
1868d2853540SHong Zhang   colorforcol[0] = 0;
1869d2853540SHong Zhang   columns_i      = columns;
1870d2853540SHong Zhang 
1871077f23c2SHong Zhang   /* get column-wise storage of mat */
1872077f23c2SHong Zhang   ierr = MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr);
1873b1683b59SHong Zhang 
1874b28d80bdSHong Zhang   cm   = c->m;
1875854ce69bSBarry Smith   ierr = PetscMalloc1(cm+1,&rowhit);CHKERRQ(ierr);
1876854ce69bSBarry Smith   ierr = PetscMalloc1(cm+1,&idxhit);CHKERRQ(ierr);
1877f99a636bSHong Zhang   for (i=0; i<nis; i++) { /* loop over color */
1878b1683b59SHong Zhang     ierr = ISGetLocalSize(isa[i],&n);CHKERRQ(ierr);
1879b1683b59SHong Zhang     ierr = ISGetIndices(isa[i],&is);CHKERRQ(ierr);
18802205254eSKarl Rupp 
1881b1683b59SHong Zhang     c->ncolumns[i] = n;
1882b1683b59SHong Zhang     if (n) {
1883580bdb30SBarry Smith       ierr = PetscArraycpy(columns_i,is,n);CHKERRQ(ierr);
1884b1683b59SHong Zhang     }
1885d2853540SHong Zhang     colorforcol[i+1] = colorforcol[i] + n;
1886d2853540SHong Zhang     columns_i       += n;
1887b1683b59SHong Zhang 
1888b1683b59SHong Zhang     /* fast, crude version requires O(N*N) work */
1889580bdb30SBarry Smith     ierr = PetscArrayzero(rowhit,cm);CHKERRQ(ierr);
1890e99be685SHong Zhang 
1891b7caf3d6SHong Zhang     for (j=0; j<n; j++) { /* loop over columns*/
1892b1683b59SHong Zhang       col     = is[j];
1893d2853540SHong Zhang       row_idx = cj + ci[col];
1894b1683b59SHong Zhang       m       = ci[col+1] - ci[col];
1895b7caf3d6SHong Zhang       for (k=0; k<m; k++) { /* loop over columns marking them in rowhit */
1896e99be685SHong Zhang         idxhit[*row_idx]   = spidx[ci[col] + k];
1897d2853540SHong Zhang         rowhit[*row_idx++] = col + 1;
1898b1683b59SHong Zhang       }
1899b1683b59SHong Zhang     }
1900b1683b59SHong Zhang     /* count the number of hits */
1901b1683b59SHong Zhang     nrows = 0;
1902e8354b3eSHong Zhang     for (j=0; j<cm; j++) {
1903b1683b59SHong Zhang       if (rowhit[j]) nrows++;
1904b1683b59SHong Zhang     }
1905b1683b59SHong Zhang     c->nrows[i]      = nrows;
1906d2853540SHong Zhang     colorforrow[i+1] = colorforrow[i] + nrows;
1907d2853540SHong Zhang 
1908b1683b59SHong Zhang     nrows = 0;
1909b7caf3d6SHong Zhang     for (j=0; j<cm; j++) { /* loop over rows */
1910b1683b59SHong Zhang       if (rowhit[j]) {
1911d2853540SHong Zhang         rows_i[nrows]   = j;
191212b89a43SHong Zhang         den2sp_i[nrows] = idxhit[j];
1913b1683b59SHong Zhang         nrows++;
1914b1683b59SHong Zhang       }
1915b1683b59SHong Zhang     }
1916e88777f2SHong Zhang     den2sp_i += nrows;
1917077f23c2SHong Zhang 
1918b1683b59SHong Zhang     ierr    = ISRestoreIndices(isa[i],&is);CHKERRQ(ierr);
1919f99a636bSHong Zhang     rows_i += nrows;
1920b1683b59SHong Zhang   }
19210298fd71SBarry Smith   ierr = MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr);
1922b28d80bdSHong Zhang   ierr = PetscFree(rowhit);CHKERRQ(ierr);
1923071fcb05SBarry Smith   ierr = ISColoringRestoreIS(iscoloring,PETSC_USE_POINTER,&isa);CHKERRQ(ierr);
1924d2853540SHong Zhang   if (csp->nz != colorforrow[nis]) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"csp->nz %d != colorforrow[nis] %d",csp->nz,colorforrow[nis]);
1925b28d80bdSHong Zhang 
1926d2853540SHong Zhang   c->colorforrow = colorforrow;
1927d2853540SHong Zhang   c->rows        = rows;
1928f99a636bSHong Zhang   c->den2sp      = den2sp;
1929d2853540SHong Zhang   c->colorforcol = colorforcol;
1930d2853540SHong Zhang   c->columns     = columns;
19312205254eSKarl Rupp 
1932f94ccd6cSHong Zhang   ierr = PetscFree(idxhit);CHKERRQ(ierr);
1933b1683b59SHong Zhang   PetscFunctionReturn(0);
1934b1683b59SHong Zhang }
1935d013fc79Sandi selinger 
19364222ddf1SHong Zhang /* --------------------------------------------------------------- */
19374222ddf1SHong Zhang static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C)
1938df97dc6dSFande Kong {
19394222ddf1SHong Zhang   PetscErrorCode ierr;
19404222ddf1SHong Zhang   Mat_Product    *product = C->product;
19414222ddf1SHong Zhang   Mat            A=product->A,B=product->B;
19424222ddf1SHong Zhang 
1943df97dc6dSFande Kong   PetscFunctionBegin;
19444222ddf1SHong Zhang   if (C->ops->mattransposemultnumeric) {
19454222ddf1SHong Zhang     /* Alg: "outerproduct" */
19466718818eSStefano Zampini     ierr = (*C->ops->mattransposemultnumeric)(A,B,C);CHKERRQ(ierr);
19474222ddf1SHong Zhang   } else {
19484222ddf1SHong Zhang     /* Alg: "matmatmult" -- C = At*B */
19496718818eSStefano Zampini     Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)product->data;
19506718818eSStefano Zampini     Mat                 At;
19514222ddf1SHong Zhang 
19526718818eSStefano Zampini     if (!atb) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
19536718818eSStefano Zampini     At = atb->At;
1954089a957eSStefano Zampini     if (atb->updateAt && At) { /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ() */
19554222ddf1SHong Zhang       ierr = MatTranspose_SeqAIJ(A,MAT_REUSE_MATRIX,&At);CHKERRQ(ierr);
19564222ddf1SHong Zhang     }
1957089a957eSStefano Zampini     ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ(At ? At : A,B,C);CHKERRQ(ierr);
19584222ddf1SHong Zhang     atb->updateAt = PETSC_TRUE;
19594222ddf1SHong Zhang   }
1960df97dc6dSFande Kong   PetscFunctionReturn(0);
1961df97dc6dSFande Kong }
1962df97dc6dSFande Kong 
19634222ddf1SHong Zhang static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C)
1964d013fc79Sandi selinger {
1965d013fc79Sandi selinger   PetscErrorCode ierr;
19664222ddf1SHong Zhang   Mat_Product    *product = C->product;
19674222ddf1SHong Zhang   Mat            A=product->A,B=product->B;
19684222ddf1SHong Zhang   PetscReal      fill=product->fill;
1969d013fc79Sandi selinger 
1970d013fc79Sandi selinger   PetscFunctionBegin;
19714222ddf1SHong Zhang   ierr = MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
19722869b61bSandi selinger 
19734222ddf1SHong Zhang   C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ;
19744222ddf1SHong Zhang   PetscFunctionReturn(0);
19752869b61bSandi selinger }
1976d013fc79Sandi selinger 
19774222ddf1SHong Zhang /* --------------------------------------------------------------- */
19784222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C)
19794222ddf1SHong Zhang {
19804222ddf1SHong Zhang   PetscErrorCode ierr;
19814222ddf1SHong Zhang   Mat_Product    *product = C->product;
19824222ddf1SHong Zhang   PetscInt       alg = 0; /* default algorithm */
19834222ddf1SHong Zhang   PetscBool      flg = PETSC_FALSE;
19844222ddf1SHong Zhang #if !defined(PETSC_HAVE_HYPRE)
19854222ddf1SHong Zhang   const char     *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"};
19864222ddf1SHong Zhang   PetscInt       nalg = 7;
19874222ddf1SHong Zhang #else
19884222ddf1SHong Zhang   const char     *algTypes[8] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge","hypre"};
19894222ddf1SHong Zhang   PetscInt       nalg = 8;
19904222ddf1SHong Zhang #endif
19914222ddf1SHong Zhang 
19924222ddf1SHong Zhang   PetscFunctionBegin;
19934222ddf1SHong Zhang   /* Set default algorithm */
19944222ddf1SHong Zhang   ierr = PetscStrcmp(C->product->alg,"default",&flg);CHKERRQ(ierr);
19954222ddf1SHong Zhang   if (flg) {
19964222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
1997d013fc79Sandi selinger   }
1998d013fc79Sandi selinger 
19994222ddf1SHong Zhang   /* Get runtime option */
20004222ddf1SHong Zhang   if (product->api_user) {
20014222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
20024222ddf1SHong Zhang     ierr = PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
20034222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
20044222ddf1SHong Zhang   } else {
20054222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr);
20064222ddf1SHong Zhang     ierr = PetscOptionsEList("-matproduct_ab_via","Algorithmic approach","MatProduct_AB",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
20074222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
2008d013fc79Sandi selinger   }
20094222ddf1SHong Zhang   if (flg) {
20104222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
2011d013fc79Sandi selinger   }
2012d013fc79Sandi selinger 
20134222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
20144222ddf1SHong Zhang   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ;
20154222ddf1SHong Zhang   PetscFunctionReturn(0);
20164222ddf1SHong Zhang }
2017d013fc79Sandi selinger 
20184222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C)
20194222ddf1SHong Zhang {
20204222ddf1SHong Zhang   PetscErrorCode ierr;
20214222ddf1SHong Zhang   Mat_Product    *product = C->product;
20224222ddf1SHong Zhang   PetscInt       alg = 0; /* default algorithm */
20234222ddf1SHong Zhang   PetscBool      flg = PETSC_FALSE;
2024089a957eSStefano Zampini   const char     *algTypes[3] = {"default","at*b","outerproduct"};
2025089a957eSStefano Zampini   PetscInt       nalg = 3;
2026d013fc79Sandi selinger 
20274222ddf1SHong Zhang   PetscFunctionBegin;
20284222ddf1SHong Zhang   /* Get runtime option */
20294222ddf1SHong Zhang   if (product->api_user) {
20304222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
20314222ddf1SHong Zhang     ierr = PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
20324222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
20334222ddf1SHong Zhang   } else {
20344222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr);
20354222ddf1SHong Zhang     ierr = PetscOptionsEList("-matproduct_atb_via","Algorithmic approach","MatProduct_AtB",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
20364222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
20374222ddf1SHong Zhang   }
20384222ddf1SHong Zhang   if (flg) {
20394222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
20404222ddf1SHong Zhang   }
2041d013fc79Sandi selinger 
20424222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ;
20434222ddf1SHong Zhang   PetscFunctionReturn(0);
20444222ddf1SHong Zhang }
20454222ddf1SHong Zhang 
20464222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C)
20474222ddf1SHong Zhang {
20484222ddf1SHong Zhang   PetscErrorCode ierr;
20494222ddf1SHong Zhang   Mat_Product    *product = C->product;
20504222ddf1SHong Zhang   PetscInt       alg = 0; /* default algorithm */
20514222ddf1SHong Zhang   PetscBool      flg = PETSC_FALSE;
20524222ddf1SHong Zhang   const char     *algTypes[2] = {"default","color"};
20534222ddf1SHong Zhang   PetscInt       nalg = 2;
20544222ddf1SHong Zhang 
20554222ddf1SHong Zhang   PetscFunctionBegin;
20564222ddf1SHong Zhang   /* Set default algorithm */
20574222ddf1SHong Zhang   ierr = PetscStrcmp(C->product->alg,"default",&flg);CHKERRQ(ierr);
20584222ddf1SHong Zhang   if (!flg) {
20594222ddf1SHong Zhang     alg = 1;
20604222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
20614222ddf1SHong Zhang   }
20624222ddf1SHong Zhang 
20634222ddf1SHong Zhang   /* Get runtime option */
20644222ddf1SHong Zhang   if (product->api_user) {
20654222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatTransposeMult","Mat");CHKERRQ(ierr);
20664222ddf1SHong Zhang     ierr = PetscOptionsEList("-matmattransmult_via","Algorithmic approach","MatMatTransposeMult",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
20674222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
20684222ddf1SHong Zhang   } else {
20694222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABt","Mat");CHKERRQ(ierr);
20704222ddf1SHong Zhang     ierr = PetscOptionsEList("-matproduct_abt_via","Algorithmic approach","MatProduct_ABt",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
20714222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
20724222ddf1SHong Zhang   }
20734222ddf1SHong Zhang   if (flg) {
20744222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
20754222ddf1SHong Zhang   }
20764222ddf1SHong Zhang 
20774222ddf1SHong Zhang   C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ;
20784222ddf1SHong Zhang   C->ops->productsymbolic          = MatProductSymbolic_ABt;
20794222ddf1SHong Zhang   PetscFunctionReturn(0);
20804222ddf1SHong Zhang }
20814222ddf1SHong Zhang 
20824222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C)
20834222ddf1SHong Zhang {
20844222ddf1SHong Zhang   PetscErrorCode ierr;
20854222ddf1SHong Zhang   Mat_Product    *product = C->product;
20864222ddf1SHong Zhang   PetscBool      flg = PETSC_FALSE;
20874222ddf1SHong Zhang   PetscInt       alg = 0; /* default algorithm -- alg=1 should be default!!! */
20884222ddf1SHong Zhang #if !defined(PETSC_HAVE_HYPRE)
20894222ddf1SHong Zhang   const char      *algTypes[2] = {"scalable","rap"};
20904222ddf1SHong Zhang   PetscInt        nalg = 2;
20914222ddf1SHong Zhang #else
20924222ddf1SHong Zhang   const char      *algTypes[3] = {"scalable","rap","hypre"};
20934222ddf1SHong Zhang   PetscInt        nalg = 3;
20944222ddf1SHong Zhang #endif
20954222ddf1SHong Zhang 
20964222ddf1SHong Zhang   PetscFunctionBegin;
20974222ddf1SHong Zhang   /* Set default algorithm */
20984222ddf1SHong Zhang   ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr);
20994222ddf1SHong Zhang   if (flg) {
21004222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
21014222ddf1SHong Zhang   }
21024222ddf1SHong Zhang 
21034222ddf1SHong Zhang   /* Get runtime option */
21044222ddf1SHong Zhang   if (product->api_user) {
21054222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
21064222ddf1SHong Zhang     ierr = PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
21074222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
21084222ddf1SHong Zhang   } else {
21094222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr);
21104222ddf1SHong Zhang     ierr = PetscOptionsEList("-matproduct_ptap_via","Algorithmic approach","MatProduct_PtAP",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
21114222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
21124222ddf1SHong Zhang   }
21134222ddf1SHong Zhang   if (flg) {
21144222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
21154222ddf1SHong Zhang   }
21164222ddf1SHong Zhang 
21174222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ;
21184222ddf1SHong Zhang   PetscFunctionReturn(0);
21194222ddf1SHong Zhang }
21204222ddf1SHong Zhang 
21214222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C)
21224222ddf1SHong Zhang {
21234222ddf1SHong Zhang   PetscErrorCode ierr;
21244222ddf1SHong Zhang   Mat_Product    *product = C->product;
21254222ddf1SHong Zhang   PetscBool      flg = PETSC_FALSE;
21264222ddf1SHong Zhang   PetscInt       alg = 0; /* default algorithm */
21274222ddf1SHong Zhang   const char     *algTypes[3] = {"r*a*rt","r*art","coloring_rart"};
21284222ddf1SHong Zhang   PetscInt        nalg = 3;
21294222ddf1SHong Zhang 
21304222ddf1SHong Zhang   PetscFunctionBegin;
21314222ddf1SHong Zhang   /* Set default algorithm */
21324222ddf1SHong Zhang   ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr);
21334222ddf1SHong Zhang   if (flg) {
21344222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
21354222ddf1SHong Zhang   }
21364222ddf1SHong Zhang 
21374222ddf1SHong Zhang   /* Get runtime option */
21384222ddf1SHong Zhang   if (product->api_user) {
21394222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatRARt","Mat");CHKERRQ(ierr);
21404222ddf1SHong Zhang     ierr = PetscOptionsEList("-matrart_via","Algorithmic approach","MatRARt",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
21414222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
21424222ddf1SHong Zhang   } else {
21434222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr);
21444222ddf1SHong Zhang     ierr = PetscOptionsEList("-matproduct_rart_via","Algorithmic approach","MatProduct_RARt",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
21454222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
21464222ddf1SHong Zhang   }
21474222ddf1SHong Zhang   if (flg) {
21484222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
21494222ddf1SHong Zhang   }
21504222ddf1SHong Zhang 
21514222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ;
21524222ddf1SHong Zhang   PetscFunctionReturn(0);
21534222ddf1SHong Zhang }
21544222ddf1SHong Zhang 
21554222ddf1SHong Zhang /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */
21564222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C)
21574222ddf1SHong Zhang {
21584222ddf1SHong Zhang   PetscErrorCode ierr;
21594222ddf1SHong Zhang   Mat_Product    *product = C->product;
21604222ddf1SHong Zhang   PetscInt       alg = 0; /* default algorithm */
21614222ddf1SHong Zhang   PetscBool      flg = PETSC_FALSE;
21624222ddf1SHong Zhang   const char     *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"};
21634222ddf1SHong Zhang   PetscInt       nalg = 7;
21644222ddf1SHong Zhang 
21654222ddf1SHong Zhang   PetscFunctionBegin;
21664222ddf1SHong Zhang   /* Set default algorithm */
21674222ddf1SHong Zhang   ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr);
21684222ddf1SHong Zhang   if (flg) {
21694222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
21704222ddf1SHong Zhang   }
21714222ddf1SHong Zhang 
21724222ddf1SHong Zhang   /* Get runtime option */
21734222ddf1SHong Zhang   if (product->api_user) {
21744222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr);
21754222ddf1SHong Zhang     ierr = PetscOptionsEList("-matmatmatmult_via","Algorithmic approach","MatMatMatMult",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
21764222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
21774222ddf1SHong Zhang   } else {
21784222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr);
21794222ddf1SHong Zhang     ierr = PetscOptionsEList("-matproduct_abc_via","Algorithmic approach","MatProduct_ABC",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
21804222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
21814222ddf1SHong Zhang   }
21824222ddf1SHong Zhang   if (flg) {
21834222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
21844222ddf1SHong Zhang   }
21854222ddf1SHong Zhang 
21864222ddf1SHong Zhang   C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ;
21874222ddf1SHong Zhang   C->ops->productsymbolic    = MatProductSymbolic_ABC;
21884222ddf1SHong Zhang   PetscFunctionReturn(0);
21894222ddf1SHong Zhang }
21904222ddf1SHong Zhang 
21914222ddf1SHong Zhang PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C)
21924222ddf1SHong Zhang {
21934222ddf1SHong Zhang   PetscErrorCode ierr;
21944222ddf1SHong Zhang   Mat_Product    *product = C->product;
21954222ddf1SHong Zhang 
21964222ddf1SHong Zhang   PetscFunctionBegin;
21974222ddf1SHong Zhang   switch (product->type) {
21984222ddf1SHong Zhang   case MATPRODUCT_AB:
21994222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_AB(C);CHKERRQ(ierr);
22004222ddf1SHong Zhang     break;
22014222ddf1SHong Zhang   case MATPRODUCT_AtB:
22024222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_AtB(C);CHKERRQ(ierr);
22034222ddf1SHong Zhang     break;
22044222ddf1SHong Zhang   case MATPRODUCT_ABt:
22054222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_ABt(C);CHKERRQ(ierr);
22064222ddf1SHong Zhang     break;
22074222ddf1SHong Zhang   case MATPRODUCT_PtAP:
22084222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_PtAP(C);CHKERRQ(ierr);
22094222ddf1SHong Zhang     break;
22104222ddf1SHong Zhang   case MATPRODUCT_RARt:
22114222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_RARt(C);CHKERRQ(ierr);
22124222ddf1SHong Zhang     break;
22134222ddf1SHong Zhang   case MATPRODUCT_ABC:
22144222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_ABC(C);CHKERRQ(ierr);
22154222ddf1SHong Zhang     break;
22166718818eSStefano Zampini   default:
22176718818eSStefano Zampini     break;
22184222ddf1SHong Zhang   }
2219d013fc79Sandi selinger   PetscFunctionReturn(0);
2220d013fc79Sandi selinger }
2221