xref: /petsc/src/mat/impls/aij/seq/matmatmult.c (revision f4259b3095b7e7c300c2b3c3729548b071684bd7)
1be1d678aSKris Buschelman 
2d50806bdSBarry Smith /*
32499ec78SHong Zhang   Defines matrix-matrix product routines for pairs of SeqAIJ matrices
4d50806bdSBarry Smith           C = A * B
5d50806bdSBarry Smith */
6d50806bdSBarry Smith 
7c6db04a5SJed Brown #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
8c6db04a5SJed Brown #include <../src/mat/utils/freespace.h>
9c6db04a5SJed Brown #include <petscbt.h>
10af0996ceSBarry Smith #include <petsc/private/isimpl.h>
1107475bc1SBarry Smith #include <../src/mat/impls/dense/seq/dense.h>
127bab7c10SHong Zhang 
13df97dc6dSFande Kong PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
14df97dc6dSFande Kong {
15df97dc6dSFande Kong   PetscErrorCode ierr;
16df97dc6dSFande Kong 
17df97dc6dSFande Kong   PetscFunctionBegin;
18df97dc6dSFande Kong   if (C->ops->matmultnumeric) {
196718818eSStefano Zampini     if (C->ops->matmultnumeric == MatMatMultNumeric_SeqAIJ_SeqAIJ) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Recursive call");
20df97dc6dSFande Kong     ierr = (*C->ops->matmultnumeric)(A,B,C);CHKERRQ(ierr);
21df97dc6dSFande Kong   } else {
22df97dc6dSFande Kong     ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A,B,C);CHKERRQ(ierr);
23df97dc6dSFande Kong   }
24df97dc6dSFande Kong   PetscFunctionReturn(0);
25df97dc6dSFande Kong }
26df97dc6dSFande Kong 
274222ddf1SHong Zhang /* Modified from MatCreateSeqAIJWithArrays() */
28e4e71118SStefano Zampini PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt i[],PetscInt j[],PetscScalar a[],MatType mtype,Mat mat)
29df97dc6dSFande Kong {
30df97dc6dSFande Kong   PetscErrorCode ierr;
314222ddf1SHong Zhang   PetscInt       ii;
324222ddf1SHong Zhang   Mat_SeqAIJ     *aij;
33e4e71118SStefano Zampini   PetscBool      isseqaij;
345c66b693SKris Buschelman 
355c66b693SKris Buschelman   PetscFunctionBegin;
364222ddf1SHong Zhang   if (m > 0 && i[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"i (row indices) must start with 0");
374222ddf1SHong Zhang   ierr = MatSetSizes(mat,m,n,m,n);CHKERRQ(ierr);
384222ddf1SHong Zhang 
39e4e71118SStefano Zampini   if (!mtype) {
40e4e71118SStefano Zampini     ierr = PetscObjectBaseTypeCompare((PetscObject)mat,MATSEQAIJ,&isseqaij);CHKERRQ(ierr);
41e4e71118SStefano Zampini     if (!isseqaij) { ierr = MatSetType(mat,MATSEQAIJ);CHKERRQ(ierr); }
42e4e71118SStefano Zampini   } else {
43e4e71118SStefano Zampini     ierr = MatSetType(mat,mtype);CHKERRQ(ierr);
44e4e71118SStefano Zampini   }
45*f4259b30SLisandro Dalcin   ierr = MatSeqAIJSetPreallocation_SeqAIJ(mat,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
464222ddf1SHong Zhang   aij  = (Mat_SeqAIJ*)(mat)->data;
474222ddf1SHong Zhang   ierr = PetscMalloc1(m,&aij->imax);CHKERRQ(ierr);
484222ddf1SHong Zhang   ierr = PetscMalloc1(m,&aij->ilen);CHKERRQ(ierr);
494222ddf1SHong Zhang 
504222ddf1SHong Zhang   aij->i            = i;
514222ddf1SHong Zhang   aij->j            = j;
524222ddf1SHong Zhang   aij->a            = a;
534222ddf1SHong Zhang   aij->singlemalloc = PETSC_FALSE;
544222ddf1SHong Zhang   aij->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
554222ddf1SHong Zhang   aij->free_a       = PETSC_FALSE;
564222ddf1SHong Zhang   aij->free_ij      = PETSC_FALSE;
574222ddf1SHong Zhang 
584222ddf1SHong Zhang   for (ii=0; ii<m; ii++) {
594222ddf1SHong Zhang     aij->ilen[ii] = aij->imax[ii] = i[ii+1] - i[ii];
6025023028SHong Zhang   }
614222ddf1SHong Zhang 
625c66b693SKris Buschelman   PetscFunctionReturn(0);
635c66b693SKris Buschelman }
641c24bd37SHong Zhang 
654222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
664222ddf1SHong Zhang {
674222ddf1SHong Zhang   PetscErrorCode      ierr;
684222ddf1SHong Zhang   Mat_Product         *product = C->product;
694222ddf1SHong Zhang   MatProductAlgorithm alg;
704222ddf1SHong Zhang   PetscBool           flg;
714222ddf1SHong Zhang 
724222ddf1SHong Zhang   PetscFunctionBegin;
734222ddf1SHong Zhang   if (product) {
744222ddf1SHong Zhang     alg = product->alg;
754222ddf1SHong Zhang   } else {
764222ddf1SHong Zhang     alg = "sorted";
774222ddf1SHong Zhang   }
784222ddf1SHong Zhang   /* sorted */
794222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"sorted",&flg);CHKERRQ(ierr);
804222ddf1SHong Zhang   if (flg) {
814222ddf1SHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A,B,fill,C);CHKERRQ(ierr);
824222ddf1SHong Zhang     PetscFunctionReturn(0);
834222ddf1SHong Zhang   }
844222ddf1SHong Zhang 
854222ddf1SHong Zhang   /* scalable */
864222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"scalable",&flg);CHKERRQ(ierr);
874222ddf1SHong Zhang   if (flg) {
884222ddf1SHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A,B,fill,C);CHKERRQ(ierr);
894222ddf1SHong Zhang     PetscFunctionReturn(0);
904222ddf1SHong Zhang   }
914222ddf1SHong Zhang 
924222ddf1SHong Zhang   /* scalable_fast */
934222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"scalable_fast",&flg);CHKERRQ(ierr);
944222ddf1SHong Zhang   if (flg) {
954222ddf1SHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A,B,fill,C);CHKERRQ(ierr);
964222ddf1SHong Zhang     PetscFunctionReturn(0);
974222ddf1SHong Zhang   }
984222ddf1SHong Zhang 
994222ddf1SHong Zhang   /* heap */
1004222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"heap",&flg);CHKERRQ(ierr);
1014222ddf1SHong Zhang   if (flg) {
1024222ddf1SHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A,B,fill,C);CHKERRQ(ierr);
1034222ddf1SHong Zhang     PetscFunctionReturn(0);
1044222ddf1SHong Zhang   }
1054222ddf1SHong Zhang 
1064222ddf1SHong Zhang   /* btheap */
1074222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"btheap",&flg);CHKERRQ(ierr);
1084222ddf1SHong Zhang   if (flg) {
1094222ddf1SHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A,B,fill,C);CHKERRQ(ierr);
1104222ddf1SHong Zhang     PetscFunctionReturn(0);
1114222ddf1SHong Zhang   }
1124222ddf1SHong Zhang 
1134222ddf1SHong Zhang   /* llcondensed */
1144222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"llcondensed",&flg);CHKERRQ(ierr);
1154222ddf1SHong Zhang   if (flg) {
1164222ddf1SHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A,B,fill,C);CHKERRQ(ierr);
1174222ddf1SHong Zhang     PetscFunctionReturn(0);
1184222ddf1SHong Zhang   }
1194222ddf1SHong Zhang 
1204222ddf1SHong Zhang   /* rowmerge */
1214222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"rowmerge",&flg);CHKERRQ(ierr);
1224222ddf1SHong Zhang   if (flg) {
1234222ddf1SHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A,B,fill,C);CHKERRQ(ierr);
1244222ddf1SHong Zhang     PetscFunctionReturn(0);
1254222ddf1SHong Zhang   }
1264222ddf1SHong Zhang 
1274222ddf1SHong Zhang #if defined(PETSC_HAVE_HYPRE)
1284222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"hypre",&flg);CHKERRQ(ierr);
1294222ddf1SHong Zhang   if (flg) {
1304222ddf1SHong Zhang     ierr = MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A,B,fill,C);CHKERRQ(ierr);
1314222ddf1SHong Zhang     PetscFunctionReturn(0);
1324222ddf1SHong Zhang   }
1334222ddf1SHong Zhang #endif
1344222ddf1SHong Zhang 
1354222ddf1SHong Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported");
1364222ddf1SHong Zhang   PetscFunctionReturn(0);
1374222ddf1SHong Zhang }
1384222ddf1SHong Zhang 
1394222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A,Mat B,PetscReal fill,Mat C)
140b561aa9dSHong Zhang {
141b561aa9dSHong Zhang   PetscErrorCode     ierr;
142b561aa9dSHong Zhang   Mat_SeqAIJ         *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
143971236abSHong Zhang   PetscInt           *ai=a->i,*bi=b->i,*ci,*cj;
144c1ba5575SJed Brown   PetscInt           am =A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
145b561aa9dSHong Zhang   PetscReal          afill;
146eca6b86aSHong Zhang   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
147eca6b86aSHong Zhang   PetscTable         ta;
148fb908031SHong Zhang   PetscBT            lnkbt;
1490298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
150b561aa9dSHong Zhang 
151b561aa9dSHong Zhang   PetscFunctionBegin;
152bd958071SHong Zhang   /* Get ci and cj */
153bd958071SHong Zhang   /*---------------*/
154fb908031SHong Zhang   /* Allocate ci array, arrays for fill computation and */
155fb908031SHong Zhang   /* free space for accumulating nonzero column info */
156854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
157fb908031SHong Zhang   ci[0] = 0;
158fb908031SHong Zhang 
159fb908031SHong Zhang   /* create and initialize a linked list */
160c373ccc6SHong Zhang   ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr);
161c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b,bm,ta);
162eca6b86aSHong Zhang   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
163eca6b86aSHong Zhang   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
164eca6b86aSHong Zhang 
165eca6b86aSHong Zhang   ierr = PetscLLCondensedCreate(Crmax,bn,&lnk,&lnkbt);CHKERRQ(ierr);
166fb908031SHong Zhang 
167fb908031SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
168f91af8c7SBarry Smith   ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
1692205254eSKarl Rupp 
170fb908031SHong Zhang   current_space = free_space;
171fb908031SHong Zhang 
172fb908031SHong Zhang   /* Determine ci and cj */
173fb908031SHong Zhang   for (i=0; i<am; i++) {
174fb908031SHong Zhang     anzi = ai[i+1] - ai[i];
175fb908031SHong Zhang     aj   = a->j + ai[i];
176fb908031SHong Zhang     for (j=0; j<anzi; j++) {
177fb908031SHong Zhang       brow = aj[j];
178fb908031SHong Zhang       bnzj = bi[brow+1] - bi[brow];
179fb908031SHong Zhang       bj   = b->j + bi[brow];
180fb908031SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
181fb908031SHong Zhang       ierr = PetscLLCondensedAddSorted(bnzj,bj,lnk,lnkbt);CHKERRQ(ierr);
182fb908031SHong Zhang     }
183fb908031SHong Zhang     cnzi = lnk[0];
184fb908031SHong Zhang 
185fb908031SHong Zhang     /* If free space is not available, make more free space */
186fb908031SHong Zhang     /* Double the amount of total space in the list */
187fb908031SHong Zhang     if (current_space->local_remaining<cnzi) {
188f91af8c7SBarry Smith       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);CHKERRQ(ierr);
189fb908031SHong Zhang       ndouble++;
190fb908031SHong Zhang     }
191fb908031SHong Zhang 
192fb908031SHong Zhang     /* Copy data into free space, then initialize lnk */
193fb908031SHong Zhang     ierr = PetscLLCondensedClean(bn,cnzi,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
1942205254eSKarl Rupp 
195fb908031SHong Zhang     current_space->array           += cnzi;
196fb908031SHong Zhang     current_space->local_used      += cnzi;
197fb908031SHong Zhang     current_space->local_remaining -= cnzi;
1982205254eSKarl Rupp 
199fb908031SHong Zhang     ci[i+1] = ci[i] + cnzi;
200fb908031SHong Zhang   }
201fb908031SHong Zhang 
202fb908031SHong Zhang   /* Column indices are in the list of free space */
203fb908031SHong Zhang   /* Allocate space for cj, initialize cj, and */
204fb908031SHong Zhang   /* destroy list of free space and other temporary array(s) */
205854ce69bSBarry Smith   ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr);
206fb908031SHong Zhang   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
207fb908031SHong Zhang   ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr);
208b561aa9dSHong Zhang 
20926be0446SHong Zhang   /* put together the new symbolic matrix */
210e4e71118SStefano Zampini   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
2114222ddf1SHong Zhang   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
21258c24d83SHong Zhang 
21358c24d83SHong Zhang   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
21458c24d83SHong Zhang   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
2154222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
216aa1aec99SHong Zhang   c->free_a  = PETSC_FALSE;
217e6b907acSBarry Smith   c->free_ij = PETSC_TRUE;
21858c24d83SHong Zhang   c->nonew   = 0;
2194222ddf1SHong Zhang 
2204222ddf1SHong Zhang   /* fast, needs non-scalable O(bn) array 'abdense' */
2214222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
2220b7e3e3dSHong Zhang 
2237212da7cSHong Zhang   /* set MatInfo */
2247212da7cSHong Zhang   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
225f2b054eeSHong Zhang   if (afill < 1.0) afill = 1.0;
2267212da7cSHong Zhang   c->maxnz                  = ci[am];
2277212da7cSHong Zhang   c->nz                     = ci[am];
2284222ddf1SHong Zhang   C->info.mallocs           = ndouble;
2294222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
2304222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
2317212da7cSHong Zhang 
2327212da7cSHong Zhang #if defined(PETSC_USE_INFO)
2337212da7cSHong Zhang   if (ci[am]) {
2344222ddf1SHong Zhang     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
2354222ddf1SHong Zhang     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
236f2b054eeSHong Zhang   } else {
2374222ddf1SHong Zhang     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
238be0fcf8dSHong Zhang   }
239f2b054eeSHong Zhang #endif
24058c24d83SHong Zhang   PetscFunctionReturn(0);
24158c24d83SHong Zhang }
242d50806bdSBarry Smith 
243df97dc6dSFande Kong PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,Mat C)
244d50806bdSBarry Smith {
245dfbe8321SBarry Smith   PetscErrorCode ierr;
246d13dce4bSSatish Balay   PetscLogDouble flops=0.0;
247d50806bdSBarry Smith   Mat_SeqAIJ     *a   = (Mat_SeqAIJ*)A->data;
248d50806bdSBarry Smith   Mat_SeqAIJ     *b   = (Mat_SeqAIJ*)B->data;
249d50806bdSBarry Smith   Mat_SeqAIJ     *c   = (Mat_SeqAIJ*)C->data;
25038baddfdSBarry Smith   PetscInt       *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
251c58ca2e3SHong Zhang   PetscInt       am   =A->rmap->n,cm=C->rmap->n;
252519eb980SHong Zhang   PetscInt       i,j,k,anzi,bnzi,cnzi,brow;
253aa1aec99SHong Zhang   PetscScalar    *aa=a->a,*ba=b->a,*baj,*ca,valtmp;
254aa1aec99SHong Zhang   PetscScalar    *ab_dense;
2556718818eSStefano Zampini   PetscContainer cab_dense;
256d50806bdSBarry Smith 
257d50806bdSBarry Smith   PetscFunctionBegin;
258aa1aec99SHong Zhang   if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
259854ce69bSBarry Smith     ierr      = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
260aa1aec99SHong Zhang     c->a      = ca;
261aa1aec99SHong Zhang     c->free_a = PETSC_TRUE;
2626718818eSStefano Zampini   } else ca = c->a;
2636718818eSStefano Zampini 
2646718818eSStefano Zampini   /* TODO this should be done in the symbolic phase */
2656718818eSStefano Zampini   /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers
2666718818eSStefano Zampini      that is hard to eradicate) */
2676718818eSStefano Zampini   ierr = PetscObjectQuery((PetscObject)C,"__PETSc__ab_dense",(PetscObject*)&cab_dense);CHKERRQ(ierr);
2686718818eSStefano Zampini   if (!cab_dense) {
2696718818eSStefano Zampini     ierr = PetscMalloc1(B->cmap->N,&ab_dense);CHKERRQ(ierr);
2706718818eSStefano Zampini     ierr = PetscContainerCreate(PETSC_COMM_SELF,&cab_dense);CHKERRQ(ierr);
2716718818eSStefano Zampini     ierr = PetscContainerSetPointer(cab_dense,ab_dense);CHKERRQ(ierr);
2726718818eSStefano Zampini     ierr = PetscContainerSetUserDestroy(cab_dense,PetscContainerUserDestroyDefault);CHKERRQ(ierr);
2736718818eSStefano Zampini     ierr = PetscObjectCompose((PetscObject)C,"__PETSc__ab_dense",(PetscObject)cab_dense);CHKERRQ(ierr);
2746718818eSStefano Zampini     ierr = PetscObjectDereference((PetscObject)cab_dense);CHKERRQ(ierr);
275d90d86d0SMatthew G. Knepley   }
2766718818eSStefano Zampini   ierr = PetscContainerGetPointer(cab_dense,(void**)&ab_dense);CHKERRQ(ierr);
2776718818eSStefano Zampini   ierr = PetscArrayzero(ab_dense,B->cmap->N);CHKERRQ(ierr);
278aa1aec99SHong Zhang 
279c124e916SHong Zhang   /* clean old values in C */
280580bdb30SBarry Smith   ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr);
281d50806bdSBarry Smith   /* Traverse A row-wise. */
282d50806bdSBarry Smith   /* Build the ith row in C by summing over nonzero columns in A, */
283d50806bdSBarry Smith   /* the rows of B corresponding to nonzeros of A. */
284d50806bdSBarry Smith   for (i=0; i<am; i++) {
285d50806bdSBarry Smith     anzi = ai[i+1] - ai[i];
286d50806bdSBarry Smith     for (j=0; j<anzi; j++) {
287519eb980SHong Zhang       brow = aj[j];
288d50806bdSBarry Smith       bnzi = bi[brow+1] - bi[brow];
289d50806bdSBarry Smith       bjj  = bj + bi[brow];
290d50806bdSBarry Smith       baj  = ba + bi[brow];
291519eb980SHong Zhang       /* perform dense axpy */
29236ec6d2dSHong Zhang       valtmp = aa[j];
293519eb980SHong Zhang       for (k=0; k<bnzi; k++) {
29436ec6d2dSHong Zhang         ab_dense[bjj[k]] += valtmp*baj[k];
295519eb980SHong Zhang       }
296519eb980SHong Zhang       flops += 2*bnzi;
297519eb980SHong Zhang     }
298c58ca2e3SHong Zhang     aj += anzi; aa += anzi;
299c58ca2e3SHong Zhang 
300c58ca2e3SHong Zhang     cnzi = ci[i+1] - ci[i];
301519eb980SHong Zhang     for (k=0; k<cnzi; k++) {
302519eb980SHong Zhang       ca[k]          += ab_dense[cj[k]];
303519eb980SHong Zhang       ab_dense[cj[k]] = 0.0; /* zero ab_dense */
304519eb980SHong Zhang     }
305519eb980SHong Zhang     flops += cnzi;
306519eb980SHong Zhang     cj    += cnzi; ca += cnzi;
307519eb980SHong Zhang   }
308c58ca2e3SHong Zhang   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
309c58ca2e3SHong Zhang   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
310c58ca2e3SHong Zhang   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
311c58ca2e3SHong Zhang   PetscFunctionReturn(0);
312c58ca2e3SHong Zhang }
313c58ca2e3SHong Zhang 
31425023028SHong Zhang PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C)
315c58ca2e3SHong Zhang {
316c58ca2e3SHong Zhang   PetscErrorCode ierr;
317c58ca2e3SHong Zhang   PetscLogDouble flops=0.0;
318c58ca2e3SHong Zhang   Mat_SeqAIJ     *a   = (Mat_SeqAIJ*)A->data;
319c58ca2e3SHong Zhang   Mat_SeqAIJ     *b   = (Mat_SeqAIJ*)B->data;
320c58ca2e3SHong Zhang   Mat_SeqAIJ     *c   = (Mat_SeqAIJ*)C->data;
321c58ca2e3SHong Zhang   PetscInt       *ai  = a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
322c58ca2e3SHong Zhang   PetscInt       am   = A->rmap->N,cm=C->rmap->N;
323c58ca2e3SHong Zhang   PetscInt       i,j,k,anzi,bnzi,cnzi,brow;
32436ec6d2dSHong Zhang   PetscScalar    *aa=a->a,*ba=b->a,*baj,*ca=c->a,valtmp;
325c58ca2e3SHong Zhang   PetscInt       nextb;
326c58ca2e3SHong Zhang 
327c58ca2e3SHong Zhang   PetscFunctionBegin;
328cf742fccSHong Zhang   if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
329cf742fccSHong Zhang     ierr      = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
330cf742fccSHong Zhang     c->a      = ca;
331cf742fccSHong Zhang     c->free_a = PETSC_TRUE;
332cf742fccSHong Zhang   }
333cf742fccSHong Zhang 
334c58ca2e3SHong Zhang   /* clean old values in C */
335580bdb30SBarry Smith   ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr);
336c58ca2e3SHong Zhang   /* Traverse A row-wise. */
337c58ca2e3SHong Zhang   /* Build the ith row in C by summing over nonzero columns in A, */
338c58ca2e3SHong Zhang   /* the rows of B corresponding to nonzeros of A. */
339519eb980SHong Zhang   for (i=0; i<am; i++) {
340519eb980SHong Zhang     anzi = ai[i+1] - ai[i];
341519eb980SHong Zhang     cnzi = ci[i+1] - ci[i];
342519eb980SHong Zhang     for (j=0; j<anzi; j++) {
343519eb980SHong Zhang       brow = aj[j];
344519eb980SHong Zhang       bnzi = bi[brow+1] - bi[brow];
345519eb980SHong Zhang       bjj  = bj + bi[brow];
346519eb980SHong Zhang       baj  = ba + bi[brow];
347519eb980SHong Zhang       /* perform sparse axpy */
34836ec6d2dSHong Zhang       valtmp = aa[j];
349c124e916SHong Zhang       nextb  = 0;
350c124e916SHong Zhang       for (k=0; nextb<bnzi; k++) {
351c124e916SHong Zhang         if (cj[k] == bjj[nextb]) { /* ccol == bcol */
35236ec6d2dSHong Zhang           ca[k] += valtmp*baj[nextb++];
353c124e916SHong Zhang         }
354d50806bdSBarry Smith       }
355d50806bdSBarry Smith       flops += 2*bnzi;
356d50806bdSBarry Smith     }
357519eb980SHong Zhang     aj += anzi; aa += anzi;
358519eb980SHong Zhang     cj += cnzi; ca += cnzi;
359d50806bdSBarry Smith   }
360716bacf3SKris Buschelman   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
361716bacf3SKris Buschelman   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
362d50806bdSBarry Smith   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
363d50806bdSBarry Smith   PetscFunctionReturn(0);
364d50806bdSBarry Smith }
365bc011b1eSHong Zhang 
3664222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A,Mat B,PetscReal fill,Mat C)
36725296bd5SBarry Smith {
36825296bd5SBarry Smith   PetscErrorCode     ierr;
36925296bd5SBarry Smith   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
37025296bd5SBarry Smith   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
3713c50cad2SHong Zhang   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
37225296bd5SBarry Smith   MatScalar          *ca;
37325296bd5SBarry Smith   PetscReal          afill;
374eca6b86aSHong Zhang   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
375eca6b86aSHong Zhang   PetscTable         ta;
3760298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
37725296bd5SBarry Smith 
37825296bd5SBarry Smith   PetscFunctionBegin;
3793c50cad2SHong Zhang   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
3803c50cad2SHong Zhang   /*-----------------------------------------------------------------------------------------*/
3813c50cad2SHong Zhang   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
382854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
3833c50cad2SHong Zhang   ci[0] = 0;
3843c50cad2SHong Zhang 
3853c50cad2SHong Zhang   /* create and initialize a linked list */
386c373ccc6SHong Zhang   ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr);
387c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b,bm,ta);
388eca6b86aSHong Zhang   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
389eca6b86aSHong Zhang   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
390eca6b86aSHong Zhang 
391eca6b86aSHong Zhang   ierr = PetscLLCondensedCreate_fast(Crmax,&lnk);CHKERRQ(ierr);
3923c50cad2SHong Zhang 
3933c50cad2SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
394f91af8c7SBarry Smith   ierr          = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
3953c50cad2SHong Zhang   current_space = free_space;
3963c50cad2SHong Zhang 
3973c50cad2SHong Zhang   /* Determine ci and cj */
3983c50cad2SHong Zhang   for (i=0; i<am; i++) {
3993c50cad2SHong Zhang     anzi = ai[i+1] - ai[i];
4003c50cad2SHong Zhang     aj   = a->j + ai[i];
4013c50cad2SHong Zhang     for (j=0; j<anzi; j++) {
4023c50cad2SHong Zhang       brow = aj[j];
4033c50cad2SHong Zhang       bnzj = bi[brow+1] - bi[brow];
4043c50cad2SHong Zhang       bj   = b->j + bi[brow];
4053c50cad2SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
4063c50cad2SHong Zhang       ierr = PetscLLCondensedAddSorted_fast(bnzj,bj,lnk);CHKERRQ(ierr);
4073c50cad2SHong Zhang     }
4083c50cad2SHong Zhang     cnzi = lnk[1];
4093c50cad2SHong Zhang 
4103c50cad2SHong Zhang     /* If free space is not available, make more free space */
4113c50cad2SHong Zhang     /* Double the amount of total space in the list */
4123c50cad2SHong Zhang     if (current_space->local_remaining<cnzi) {
413f91af8c7SBarry Smith       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);CHKERRQ(ierr);
4143c50cad2SHong Zhang       ndouble++;
4153c50cad2SHong Zhang     }
4163c50cad2SHong Zhang 
4173c50cad2SHong Zhang     /* Copy data into free space, then initialize lnk */
4183c50cad2SHong Zhang     ierr = PetscLLCondensedClean_fast(cnzi,current_space->array,lnk);CHKERRQ(ierr);
4192205254eSKarl Rupp 
4203c50cad2SHong Zhang     current_space->array           += cnzi;
4213c50cad2SHong Zhang     current_space->local_used      += cnzi;
4223c50cad2SHong Zhang     current_space->local_remaining -= cnzi;
4232205254eSKarl Rupp 
4243c50cad2SHong Zhang     ci[i+1] = ci[i] + cnzi;
4253c50cad2SHong Zhang   }
4263c50cad2SHong Zhang 
4273c50cad2SHong Zhang   /* Column indices are in the list of free space */
4283c50cad2SHong Zhang   /* Allocate space for cj, initialize cj, and */
4293c50cad2SHong Zhang   /* destroy list of free space and other temporary array(s) */
430854ce69bSBarry Smith   ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr);
4313c50cad2SHong Zhang   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
4323c50cad2SHong Zhang   ierr = PetscLLCondensedDestroy_fast(lnk);CHKERRQ(ierr);
43325296bd5SBarry Smith 
43425296bd5SBarry Smith   /* Allocate space for ca */
435580bdb30SBarry Smith   ierr = PetscCalloc1(ci[am]+1,&ca);CHKERRQ(ierr);
43625296bd5SBarry Smith 
43725296bd5SBarry Smith   /* put together the new symbolic matrix */
438e4e71118SStefano Zampini   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
4394222ddf1SHong Zhang   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
44025296bd5SBarry Smith 
44125296bd5SBarry Smith   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
44225296bd5SBarry Smith   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
4434222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
44425296bd5SBarry Smith   c->free_a  = PETSC_TRUE;
44525296bd5SBarry Smith   c->free_ij = PETSC_TRUE;
44625296bd5SBarry Smith   c->nonew   = 0;
4472205254eSKarl Rupp 
4484222ddf1SHong Zhang   /* slower, less memory */
4494222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
45025296bd5SBarry Smith 
45125296bd5SBarry Smith   /* set MatInfo */
45225296bd5SBarry Smith   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
45325296bd5SBarry Smith   if (afill < 1.0) afill = 1.0;
45425296bd5SBarry Smith   c->maxnz                     = ci[am];
45525296bd5SBarry Smith   c->nz                        = ci[am];
4564222ddf1SHong Zhang   C->info.mallocs           = ndouble;
4574222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
4584222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
45925296bd5SBarry Smith 
46025296bd5SBarry Smith #if defined(PETSC_USE_INFO)
46125296bd5SBarry Smith   if (ci[am]) {
4624222ddf1SHong Zhang     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
4634222ddf1SHong Zhang     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
46425296bd5SBarry Smith   } else {
4654222ddf1SHong Zhang     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
46625296bd5SBarry Smith   }
46725296bd5SBarry Smith #endif
46825296bd5SBarry Smith   PetscFunctionReturn(0);
46925296bd5SBarry Smith }
47025296bd5SBarry Smith 
4714222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat C)
472e9e4536cSHong Zhang {
473e9e4536cSHong Zhang   PetscErrorCode     ierr;
474e9e4536cSHong Zhang   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
475bf9555e6SHong Zhang   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
47625c41797SHong Zhang   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
477e9e4536cSHong Zhang   MatScalar          *ca;
478e9e4536cSHong Zhang   PetscReal          afill;
479eca6b86aSHong Zhang   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
480eca6b86aSHong Zhang   PetscTable         ta;
4810298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
482e9e4536cSHong Zhang 
483e9e4536cSHong Zhang   PetscFunctionBegin;
484bd958071SHong Zhang   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
485bd958071SHong Zhang   /*---------------------------------------------------------------------------------------------*/
486bd958071SHong Zhang   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
487854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
488bd958071SHong Zhang   ci[0] = 0;
489bd958071SHong Zhang 
490bd958071SHong Zhang   /* create and initialize a linked list */
491c373ccc6SHong Zhang   ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr);
492c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b,bm,ta);
493eca6b86aSHong Zhang   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
494eca6b86aSHong Zhang   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
495eca6b86aSHong Zhang   ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);
496bd958071SHong Zhang 
497bd958071SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
498f91af8c7SBarry Smith   ierr          = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
499bd958071SHong Zhang   current_space = free_space;
500bd958071SHong Zhang 
501bd958071SHong Zhang   /* Determine ci and cj */
502bd958071SHong Zhang   for (i=0; i<am; i++) {
503bd958071SHong Zhang     anzi = ai[i+1] - ai[i];
504bd958071SHong Zhang     aj   = a->j + ai[i];
505bd958071SHong Zhang     for (j=0; j<anzi; j++) {
506bd958071SHong Zhang       brow = aj[j];
507bd958071SHong Zhang       bnzj = bi[brow+1] - bi[brow];
508bd958071SHong Zhang       bj   = b->j + bi[brow];
509bd958071SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
510bd958071SHong Zhang       ierr = PetscLLCondensedAddSorted_Scalable(bnzj,bj,lnk);CHKERRQ(ierr);
511bd958071SHong Zhang     }
512bd958071SHong Zhang     cnzi = lnk[0];
513bd958071SHong Zhang 
514bd958071SHong Zhang     /* If free space is not available, make more free space */
515bd958071SHong Zhang     /* Double the amount of total space in the list */
516bd958071SHong Zhang     if (current_space->local_remaining<cnzi) {
517f91af8c7SBarry Smith       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);CHKERRQ(ierr);
518bd958071SHong Zhang       ndouble++;
519bd958071SHong Zhang     }
520bd958071SHong Zhang 
521bd958071SHong Zhang     /* Copy data into free space, then initialize lnk */
522bd958071SHong Zhang     ierr = PetscLLCondensedClean_Scalable(cnzi,current_space->array,lnk);CHKERRQ(ierr);
5232205254eSKarl Rupp 
524bd958071SHong Zhang     current_space->array           += cnzi;
525bd958071SHong Zhang     current_space->local_used      += cnzi;
526bd958071SHong Zhang     current_space->local_remaining -= cnzi;
5272205254eSKarl Rupp 
528bd958071SHong Zhang     ci[i+1] = ci[i] + cnzi;
529bd958071SHong Zhang   }
530bd958071SHong Zhang 
531bd958071SHong Zhang   /* Column indices are in the list of free space */
532bd958071SHong Zhang   /* Allocate space for cj, initialize cj, and */
533bd958071SHong Zhang   /* destroy list of free space and other temporary array(s) */
534854ce69bSBarry Smith   ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr);
535bd958071SHong Zhang   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
536bd958071SHong Zhang   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
537e9e4536cSHong Zhang 
538e9e4536cSHong Zhang   /* Allocate space for ca */
539bd958071SHong Zhang   /*-----------------------*/
540580bdb30SBarry Smith   ierr = PetscCalloc1(ci[am]+1,&ca);CHKERRQ(ierr);
541e9e4536cSHong Zhang 
542e9e4536cSHong Zhang   /* put together the new symbolic matrix */
543e4e71118SStefano Zampini   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
5444222ddf1SHong Zhang   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
545e9e4536cSHong Zhang 
546e9e4536cSHong Zhang   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
547e9e4536cSHong Zhang   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
5484222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
549e9e4536cSHong Zhang   c->free_a  = PETSC_TRUE;
550e9e4536cSHong Zhang   c->free_ij = PETSC_TRUE;
551e9e4536cSHong Zhang   c->nonew   = 0;
5522205254eSKarl Rupp 
5534222ddf1SHong Zhang   /* slower, less memory */
5544222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
555e9e4536cSHong Zhang 
556e9e4536cSHong Zhang   /* set MatInfo */
557e9e4536cSHong Zhang   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
558e9e4536cSHong Zhang   if (afill < 1.0) afill = 1.0;
559e9e4536cSHong Zhang   c->maxnz                     = ci[am];
560e9e4536cSHong Zhang   c->nz                        = ci[am];
5614222ddf1SHong Zhang   C->info.mallocs           = ndouble;
5624222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
5634222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
564e9e4536cSHong Zhang 
565e9e4536cSHong Zhang #if defined(PETSC_USE_INFO)
566e9e4536cSHong Zhang   if (ci[am]) {
5674222ddf1SHong Zhang     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
5684222ddf1SHong Zhang     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
569e9e4536cSHong Zhang   } else {
5704222ddf1SHong Zhang     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
571e9e4536cSHong Zhang   }
572e9e4536cSHong Zhang #endif
573e9e4536cSHong Zhang   PetscFunctionReturn(0);
574e9e4536cSHong Zhang }
575e9e4536cSHong Zhang 
5764222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A,Mat B,PetscReal fill,Mat C)
5770ced3a2bSJed Brown {
5780ced3a2bSJed Brown   PetscErrorCode     ierr;
5790ced3a2bSJed Brown   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
5800ced3a2bSJed Brown   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j;
5810ced3a2bSJed Brown   PetscInt           *ci,*cj,*bb;
5820ced3a2bSJed Brown   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
5830ced3a2bSJed Brown   PetscReal          afill;
5840ced3a2bSJed Brown   PetscInt           i,j,col,ndouble = 0;
5850298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
5860ced3a2bSJed Brown   PetscHeap          h;
5870ced3a2bSJed Brown 
5880ced3a2bSJed Brown   PetscFunctionBegin;
589cd093f1dSJed Brown   /* Get ci and cj - by merging sorted rows using a heap */
5900ced3a2bSJed Brown   /*---------------------------------------------------------------------------------------------*/
5910ced3a2bSJed Brown   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
592854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
5930ced3a2bSJed Brown   ci[0] = 0;
5940ced3a2bSJed Brown 
5950ced3a2bSJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
596f91af8c7SBarry Smith   ierr          = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
5970ced3a2bSJed Brown   current_space = free_space;
5980ced3a2bSJed Brown 
5990ced3a2bSJed Brown   ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr);
600785e854fSJed Brown   ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr);
6010ced3a2bSJed Brown 
6020ced3a2bSJed Brown   /* Determine ci and cj */
6030ced3a2bSJed Brown   for (i=0; i<am; i++) {
6040ced3a2bSJed Brown     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
6050ced3a2bSJed Brown     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
6060ced3a2bSJed Brown     ci[i+1] = ci[i];
6070ced3a2bSJed Brown     /* Populate the min heap */
6080ced3a2bSJed Brown     for (j=0; j<anzi; j++) {
6090ced3a2bSJed Brown       bb[j] = bi[acol[j]];         /* bb points at the start of the row */
6100ced3a2bSJed Brown       if (bb[j] < bi[acol[j]+1]) { /* Add if row is nonempty */
6110ced3a2bSJed Brown         ierr = PetscHeapAdd(h,j,bj[bb[j]++]);CHKERRQ(ierr);
6120ced3a2bSJed Brown       }
6130ced3a2bSJed Brown     }
6140ced3a2bSJed Brown     /* Pick off the min element, adding it to free space */
6150ced3a2bSJed Brown     ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
6160ced3a2bSJed Brown     while (j >= 0) {
6170ced3a2bSJed Brown       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
618f91af8c7SBarry Smith         ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space);CHKERRQ(ierr);
6190ced3a2bSJed Brown         ndouble++;
6200ced3a2bSJed Brown       }
6210ced3a2bSJed Brown       *(current_space->array++) = col;
6220ced3a2bSJed Brown       current_space->local_used++;
6230ced3a2bSJed Brown       current_space->local_remaining--;
6240ced3a2bSJed Brown       ci[i+1]++;
6250ced3a2bSJed Brown 
6260ced3a2bSJed Brown       /* stash if anything else remains in this row of B */
6270ced3a2bSJed Brown       if (bb[j] < bi[acol[j]+1]) {ierr = PetscHeapStash(h,j,bj[bb[j]++]);CHKERRQ(ierr);}
6280ced3a2bSJed Brown       while (1) {               /* pop and stash any other rows of B that also had an entry in this column */
6290ced3a2bSJed Brown         PetscInt j2,col2;
6300ced3a2bSJed Brown         ierr = PetscHeapPeek(h,&j2,&col2);CHKERRQ(ierr);
6310ced3a2bSJed Brown         if (col2 != col) break;
6320ced3a2bSJed Brown         ierr = PetscHeapPop(h,&j2,&col2);CHKERRQ(ierr);
6330ced3a2bSJed Brown         if (bb[j2] < bi[acol[j2]+1]) {ierr = PetscHeapStash(h,j2,bj[bb[j2]++]);CHKERRQ(ierr);}
6340ced3a2bSJed Brown       }
6350ced3a2bSJed Brown       /* Put any stashed elements back into the min heap */
6360ced3a2bSJed Brown       ierr = PetscHeapUnstash(h);CHKERRQ(ierr);
6370ced3a2bSJed Brown       ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
6380ced3a2bSJed Brown     }
6390ced3a2bSJed Brown   }
6400ced3a2bSJed Brown   ierr = PetscFree(bb);CHKERRQ(ierr);
6410ced3a2bSJed Brown   ierr = PetscHeapDestroy(&h);CHKERRQ(ierr);
6420ced3a2bSJed Brown 
6430ced3a2bSJed Brown   /* Column indices are in the list of free space */
6440ced3a2bSJed Brown   /* Allocate space for cj, initialize cj, and */
6450ced3a2bSJed Brown   /* destroy list of free space and other temporary array(s) */
646785e854fSJed Brown   ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr);
6470ced3a2bSJed Brown   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
6480ced3a2bSJed Brown 
6490ced3a2bSJed Brown   /* put together the new symbolic matrix */
650e4e71118SStefano Zampini   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
6514222ddf1SHong Zhang   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
6520ced3a2bSJed Brown 
6530ced3a2bSJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
6540ced3a2bSJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
6554222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
6560ced3a2bSJed Brown   c->free_a  = PETSC_TRUE;
6570ced3a2bSJed Brown   c->free_ij = PETSC_TRUE;
6580ced3a2bSJed Brown   c->nonew   = 0;
65926fbe8dcSKarl Rupp 
6604222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
6610ced3a2bSJed Brown 
6620ced3a2bSJed Brown   /* set MatInfo */
6630ced3a2bSJed Brown   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
6640ced3a2bSJed Brown   if (afill < 1.0) afill = 1.0;
6650ced3a2bSJed Brown   c->maxnz                     = ci[am];
6660ced3a2bSJed Brown   c->nz                        = ci[am];
6674222ddf1SHong Zhang   C->info.mallocs           = ndouble;
6684222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
6694222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
6700ced3a2bSJed Brown 
6710ced3a2bSJed Brown #if defined(PETSC_USE_INFO)
6720ced3a2bSJed Brown   if (ci[am]) {
6734222ddf1SHong Zhang     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
6744222ddf1SHong Zhang     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
6750ced3a2bSJed Brown   } else {
6764222ddf1SHong Zhang     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
6770ced3a2bSJed Brown   }
6780ced3a2bSJed Brown #endif
6790ced3a2bSJed Brown   PetscFunctionReturn(0);
6800ced3a2bSJed Brown }
681e9e4536cSHong Zhang 
6824222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A,Mat B,PetscReal fill,Mat C)
6838a07c6f1SJed Brown {
6848a07c6f1SJed Brown   PetscErrorCode     ierr;
6858a07c6f1SJed Brown   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
6868a07c6f1SJed Brown   const PetscInt     *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
6878a07c6f1SJed Brown   PetscInt           *ci,*cj,*bb;
6888a07c6f1SJed Brown   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
6898a07c6f1SJed Brown   PetscReal          afill;
6908a07c6f1SJed Brown   PetscInt           i,j,col,ndouble = 0;
6910298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
6928a07c6f1SJed Brown   PetscHeap          h;
6938a07c6f1SJed Brown   PetscBT            bt;
6948a07c6f1SJed Brown 
6958a07c6f1SJed Brown   PetscFunctionBegin;
696cd093f1dSJed Brown   /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */
6978a07c6f1SJed Brown   /*---------------------------------------------------------------------------------------------*/
6988a07c6f1SJed Brown   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
699854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
7008a07c6f1SJed Brown   ci[0] = 0;
7018a07c6f1SJed Brown 
7028a07c6f1SJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
703f91af8c7SBarry Smith   ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
7042205254eSKarl Rupp 
7058a07c6f1SJed Brown   current_space = free_space;
7068a07c6f1SJed Brown 
7078a07c6f1SJed Brown   ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr);
708785e854fSJed Brown   ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr);
7098a07c6f1SJed Brown   ierr = PetscBTCreate(bn,&bt);CHKERRQ(ierr);
7108a07c6f1SJed Brown 
7118a07c6f1SJed Brown   /* Determine ci and cj */
7128a07c6f1SJed Brown   for (i=0; i<am; i++) {
7138a07c6f1SJed Brown     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
7148a07c6f1SJed Brown     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
7158a07c6f1SJed Brown     const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */
7168a07c6f1SJed Brown     ci[i+1] = ci[i];
7178a07c6f1SJed Brown     /* Populate the min heap */
7188a07c6f1SJed Brown     for (j=0; j<anzi; j++) {
7198a07c6f1SJed Brown       PetscInt brow = acol[j];
7208a07c6f1SJed Brown       for (bb[j] = bi[brow]; bb[j] < bi[brow+1]; bb[j]++) {
7218a07c6f1SJed Brown         PetscInt bcol = bj[bb[j]];
7228a07c6f1SJed Brown         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
7238a07c6f1SJed Brown           ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr);
7248a07c6f1SJed Brown           bb[j]++;
7258a07c6f1SJed Brown           break;
7268a07c6f1SJed Brown         }
7278a07c6f1SJed Brown       }
7288a07c6f1SJed Brown     }
7298a07c6f1SJed Brown     /* Pick off the min element, adding it to free space */
7308a07c6f1SJed Brown     ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
7318a07c6f1SJed Brown     while (j >= 0) {
7328a07c6f1SJed Brown       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
7330298fd71SBarry Smith         fptr = NULL;                      /* need PetscBTMemzero */
734f91af8c7SBarry Smith         ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space);CHKERRQ(ierr);
7358a07c6f1SJed Brown         ndouble++;
7368a07c6f1SJed Brown       }
7378a07c6f1SJed Brown       *(current_space->array++) = col;
7388a07c6f1SJed Brown       current_space->local_used++;
7398a07c6f1SJed Brown       current_space->local_remaining--;
7408a07c6f1SJed Brown       ci[i+1]++;
7418a07c6f1SJed Brown 
7428a07c6f1SJed Brown       /* stash if anything else remains in this row of B */
7438a07c6f1SJed Brown       for (; bb[j] < bi[acol[j]+1]; bb[j]++) {
7448a07c6f1SJed Brown         PetscInt bcol = bj[bb[j]];
7458a07c6f1SJed Brown         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
7468a07c6f1SJed Brown           ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr);
7478a07c6f1SJed Brown           bb[j]++;
7488a07c6f1SJed Brown           break;
7498a07c6f1SJed Brown         }
7508a07c6f1SJed Brown       }
7518a07c6f1SJed Brown       ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
7528a07c6f1SJed Brown     }
7538a07c6f1SJed Brown     if (fptr) {                 /* Clear the bits for this row */
7548a07c6f1SJed Brown       for (; fptr<current_space->array; fptr++) {ierr = PetscBTClear(bt,*fptr);CHKERRQ(ierr);}
7558a07c6f1SJed Brown     } else {                    /* We reallocated so we don't remember (easily) how to clear only the bits we changed */
7568a07c6f1SJed Brown       ierr = PetscBTMemzero(bn,bt);CHKERRQ(ierr);
7578a07c6f1SJed Brown     }
7588a07c6f1SJed Brown   }
7598a07c6f1SJed Brown   ierr = PetscFree(bb);CHKERRQ(ierr);
7608a07c6f1SJed Brown   ierr = PetscHeapDestroy(&h);CHKERRQ(ierr);
7618a07c6f1SJed Brown   ierr = PetscBTDestroy(&bt);CHKERRQ(ierr);
7628a07c6f1SJed Brown 
7638a07c6f1SJed Brown   /* Column indices are in the list of free space */
7648a07c6f1SJed Brown   /* Allocate space for cj, initialize cj, and */
7658a07c6f1SJed Brown   /* destroy list of free space and other temporary array(s) */
766785e854fSJed Brown   ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr);
7678a07c6f1SJed Brown   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
7688a07c6f1SJed Brown 
7698a07c6f1SJed Brown   /* put together the new symbolic matrix */
770e4e71118SStefano Zampini   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
7714222ddf1SHong Zhang   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
7728a07c6f1SJed Brown 
7738a07c6f1SJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
7748a07c6f1SJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
7754222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
7768a07c6f1SJed Brown   c->free_a  = PETSC_TRUE;
7778a07c6f1SJed Brown   c->free_ij = PETSC_TRUE;
7788a07c6f1SJed Brown   c->nonew   = 0;
77926fbe8dcSKarl Rupp 
7804222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
7818a07c6f1SJed Brown 
7828a07c6f1SJed Brown   /* set MatInfo */
7838a07c6f1SJed Brown   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
7848a07c6f1SJed Brown   if (afill < 1.0) afill = 1.0;
7858a07c6f1SJed Brown   c->maxnz                     = ci[am];
7868a07c6f1SJed Brown   c->nz                        = ci[am];
7874222ddf1SHong Zhang   C->info.mallocs           = ndouble;
7884222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
7894222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
7908a07c6f1SJed Brown 
7918a07c6f1SJed Brown #if defined(PETSC_USE_INFO)
7928a07c6f1SJed Brown   if (ci[am]) {
7934222ddf1SHong Zhang     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
7944222ddf1SHong Zhang     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
7958a07c6f1SJed Brown   } else {
7964222ddf1SHong Zhang     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
7978a07c6f1SJed Brown   }
7988a07c6f1SJed Brown #endif
7998a07c6f1SJed Brown   PetscFunctionReturn(0);
8008a07c6f1SJed Brown }
8018a07c6f1SJed Brown 
802d7ed1a4dSandi selinger 
8034222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A,Mat B,PetscReal fill,Mat C)
804d7ed1a4dSandi selinger {
805d7ed1a4dSandi selinger   PetscErrorCode     ierr;
806d7ed1a4dSandi selinger   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
807d7ed1a4dSandi selinger   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j,*inputi,*inputj,*inputcol,*inputcol_L1;
808d7ed1a4dSandi selinger   PetscInt           *ci,*cj,*outputj,worki_L1[9],worki_L2[9];
809d7ed1a4dSandi selinger   PetscInt           c_maxmem,a_maxrownnz=0,a_rownnz;
810d7ed1a4dSandi selinger   const PetscInt     workcol[8]={0,1,2,3,4,5,6,7};
811d7ed1a4dSandi selinger   const PetscInt     am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
812d7ed1a4dSandi selinger   const PetscInt     *brow_ptr[8],*brow_end[8];
813d7ed1a4dSandi selinger   PetscInt           window[8];
814d7ed1a4dSandi selinger   PetscInt           window_min,old_window_min,ci_nnz,outputi_nnz=0,L1_nrows,L2_nrows;
815d7ed1a4dSandi selinger   PetscInt           i,k,ndouble=0,L1_rowsleft,rowsleft;
816d7ed1a4dSandi selinger   PetscReal          afill;
817f83700f2Sandi selinger   PetscInt           *workj_L1,*workj_L2,*workj_L3;
8187660c4dbSandi selinger   PetscInt           L1_nnz,L2_nnz;
819d7ed1a4dSandi selinger 
820d7ed1a4dSandi selinger   /* Step 1: Get upper bound on memory required for allocation.
821d7ed1a4dSandi selinger              Because of the way virtual memory works,
822d7ed1a4dSandi selinger              only the memory pages that are actually needed will be physically allocated. */
823d7ed1a4dSandi selinger   PetscFunctionBegin;
824d7ed1a4dSandi selinger   ierr = PetscMalloc1(am+1,&ci);CHKERRQ(ierr);
825d7ed1a4dSandi selinger   for (i=0; i<am; i++) {
826d7ed1a4dSandi selinger     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
827d7ed1a4dSandi selinger     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
828d7ed1a4dSandi selinger     a_rownnz = 0;
829d7ed1a4dSandi selinger     for (k=0; k<anzi; ++k) {
830d7ed1a4dSandi selinger       a_rownnz += bi[acol[k]+1] - bi[acol[k]];
831d7ed1a4dSandi selinger       if (a_rownnz > bn) {
832d7ed1a4dSandi selinger         a_rownnz = bn;
833d7ed1a4dSandi selinger         break;
834d7ed1a4dSandi selinger       }
835d7ed1a4dSandi selinger     }
836d7ed1a4dSandi selinger     a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz);
837d7ed1a4dSandi selinger   }
838d7ed1a4dSandi selinger   /* temporary work areas for merging rows */
839d7ed1a4dSandi selinger   ierr = PetscMalloc1(a_maxrownnz*8,&workj_L1);CHKERRQ(ierr);
840f83700f2Sandi selinger   ierr = PetscMalloc1(a_maxrownnz*8,&workj_L2);CHKERRQ(ierr);
841f83700f2Sandi selinger   ierr = PetscMalloc1(a_maxrownnz,&workj_L3);CHKERRQ(ierr);
842d7ed1a4dSandi selinger 
8437660c4dbSandi selinger   /* This should be enough for almost all matrices. If not, memory is reallocated later. */
8447660c4dbSandi selinger   c_maxmem = 8*(ai[am]+bi[bm]);
845d7ed1a4dSandi selinger   /* Step 2: Populate pattern for C */
846d7ed1a4dSandi selinger   ierr  = PetscMalloc1(c_maxmem,&cj);CHKERRQ(ierr);
847d7ed1a4dSandi selinger 
848d7ed1a4dSandi selinger   ci_nnz       = 0;
849d7ed1a4dSandi selinger   ci[0]        = 0;
850d7ed1a4dSandi selinger   worki_L1[0]  = 0;
851d7ed1a4dSandi selinger   worki_L2[0]  = 0;
852d7ed1a4dSandi selinger   for (i=0; i<am; i++) {
853d7ed1a4dSandi selinger     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
854d7ed1a4dSandi selinger     const PetscInt *acol = aj + ai[i];      /* column indices of nonzero entries in this row */
855d7ed1a4dSandi selinger     rowsleft             = anzi;
856d7ed1a4dSandi selinger     inputcol_L1          = acol;
857d7ed1a4dSandi selinger     L2_nnz               = 0;
8587660c4dbSandi selinger     L2_nrows             = 1;  /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1   */
8597660c4dbSandi selinger     worki_L2[1]          = 0;
86008fe1b3cSKarl Rupp     outputi_nnz          = 0;
861d7ed1a4dSandi selinger 
862d7ed1a4dSandi selinger     /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem  -> allocate more memory */
863d7ed1a4dSandi selinger     while (ci_nnz+a_maxrownnz > c_maxmem) {
864d7ed1a4dSandi selinger       c_maxmem *= 2;
865d7ed1a4dSandi selinger       ndouble++;
866d7ed1a4dSandi selinger       ierr = PetscRealloc(sizeof(PetscInt)*c_maxmem,&cj);CHKERRQ(ierr);
867d7ed1a4dSandi selinger     }
868d7ed1a4dSandi selinger 
869d7ed1a4dSandi selinger     while (rowsleft) {
870d7ed1a4dSandi selinger       L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */
871d7ed1a4dSandi selinger       L1_nrows    = 0;
8727660c4dbSandi selinger       L1_nnz      = 0;
873d7ed1a4dSandi selinger       inputcol    = inputcol_L1;
874d7ed1a4dSandi selinger       inputi      = bi;
875d7ed1a4dSandi selinger       inputj      = bj;
876d7ed1a4dSandi selinger 
877d7ed1a4dSandi selinger       /* The following macro is used to specialize for small rows in A.
878d7ed1a4dSandi selinger          This helps with compiler unrolling, improving performance substantially.
879f83700f2Sandi selinger           Input:  inputj   inputi  inputcol  bn
880d7ed1a4dSandi selinger           Output: outputj  outputi_nnz                       */
881d7ed1a4dSandi selinger        #define MatMatMultSymbolic_RowMergeMacro(ANNZ)                        \
882d7ed1a4dSandi selinger          window_min  = bn;                                                   \
8837660c4dbSandi selinger          outputi_nnz = 0;                                                    \
8847660c4dbSandi selinger          for (k=0; k<ANNZ; ++k) {                                            \
885d7ed1a4dSandi selinger            brow_ptr[k] = inputj + inputi[inputcol[k]];                       \
886d7ed1a4dSandi selinger            brow_end[k] = inputj + inputi[inputcol[k]+1];                     \
887d7ed1a4dSandi selinger            window[k]   = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn;   \
888d7ed1a4dSandi selinger            window_min  = PetscMin(window[k], window_min);                    \
889d7ed1a4dSandi selinger          }                                                                   \
890d7ed1a4dSandi selinger          while (window_min < bn) {                                           \
891d7ed1a4dSandi selinger            outputj[outputi_nnz++] = window_min;                              \
892d7ed1a4dSandi selinger            /* advance front and compute new minimum */                       \
893d7ed1a4dSandi selinger            old_window_min = window_min;                                      \
894d7ed1a4dSandi selinger            window_min = bn;                                                  \
895d7ed1a4dSandi selinger            for (k=0; k<ANNZ; ++k) {                                          \
896d7ed1a4dSandi selinger              if (window[k] == old_window_min) {                              \
897d7ed1a4dSandi selinger                brow_ptr[k]++;                                                \
898d7ed1a4dSandi selinger                window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
899d7ed1a4dSandi selinger              }                                                               \
900d7ed1a4dSandi selinger              window_min = PetscMin(window[k], window_min);                   \
901d7ed1a4dSandi selinger            }                                                                 \
902d7ed1a4dSandi selinger          }
903d7ed1a4dSandi selinger 
904d7ed1a4dSandi selinger       /************** L E V E L  1 ***************/
905d7ed1a4dSandi selinger       /* Merge up to 8 rows of B to L1 work array*/
906d7ed1a4dSandi selinger       while (L1_rowsleft) {
9077660c4dbSandi selinger         outputi_nnz = 0;
9087660c4dbSandi selinger         if (anzi > 8)  outputj = workj_L1 + L1_nnz;     /* Level 1 rowmerge*/
9097660c4dbSandi selinger         else           outputj = cj + ci_nnz;           /* Merge directly to C */
9107660c4dbSandi selinger 
911d7ed1a4dSandi selinger         switch (L1_rowsleft) {
912d7ed1a4dSandi selinger         case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
913d7ed1a4dSandi selinger                  brow_end[0] = inputj + inputi[inputcol[0]+1];
914d7ed1a4dSandi selinger                  for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
915d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
916d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
917d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
918d7ed1a4dSandi selinger                  break;
919d7ed1a4dSandi selinger         case 2:  MatMatMultSymbolic_RowMergeMacro(2);
920d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
921d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
922d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
923d7ed1a4dSandi selinger                  break;
924d7ed1a4dSandi selinger         case 3: MatMatMultSymbolic_RowMergeMacro(3);
925d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
926d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
927d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
928d7ed1a4dSandi selinger                  break;
929d7ed1a4dSandi selinger         case 4:  MatMatMultSymbolic_RowMergeMacro(4);
930d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
931d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
932d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
933d7ed1a4dSandi selinger                  break;
934d7ed1a4dSandi selinger         case 5:  MatMatMultSymbolic_RowMergeMacro(5);
935d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
936d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
937d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
938d7ed1a4dSandi selinger                  break;
939d7ed1a4dSandi selinger         case 6:  MatMatMultSymbolic_RowMergeMacro(6);
940d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
941d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
942d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
943d7ed1a4dSandi selinger                  break;
944d7ed1a4dSandi selinger         case 7:  MatMatMultSymbolic_RowMergeMacro(7);
945d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
946d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
947d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
948d7ed1a4dSandi selinger                  break;
949d7ed1a4dSandi selinger         default: MatMatMultSymbolic_RowMergeMacro(8);
950d7ed1a4dSandi selinger                  inputcol    += 8;
951d7ed1a4dSandi selinger                  rowsleft    -= 8;
952d7ed1a4dSandi selinger                  L1_rowsleft -= 8;
953d7ed1a4dSandi selinger                  break;
954d7ed1a4dSandi selinger         }
955d7ed1a4dSandi selinger         inputcol_L1           = inputcol;
9567660c4dbSandi selinger         L1_nnz               += outputi_nnz;
9577660c4dbSandi selinger         worki_L1[++L1_nrows]  = L1_nnz;
958d7ed1a4dSandi selinger       }
959d7ed1a4dSandi selinger 
960d7ed1a4dSandi selinger       /********************** L E V E L  2 ************************/
961d7ed1a4dSandi selinger       /* Merge from L1 work array to either C or to L2 work array */
962d7ed1a4dSandi selinger       if (anzi > 8) {
963d7ed1a4dSandi selinger         inputi      = worki_L1;
964d7ed1a4dSandi selinger         inputj      = workj_L1;
965d7ed1a4dSandi selinger         inputcol    = workcol;
966d7ed1a4dSandi selinger         outputi_nnz = 0;
967d7ed1a4dSandi selinger 
968d7ed1a4dSandi selinger         if (anzi <= 64) outputj = cj + ci_nnz;        /* Merge from L1 work array to C */
969d7ed1a4dSandi selinger         else            outputj = workj_L2 + L2_nnz;  /* Merge from L1 work array to L2 work array */
970d7ed1a4dSandi selinger 
971d7ed1a4dSandi selinger         switch (L1_nrows) {
972d7ed1a4dSandi selinger         case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
973d7ed1a4dSandi selinger                  brow_end[0] = inputj + inputi[inputcol[0]+1];
974d7ed1a4dSandi selinger                  for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
975d7ed1a4dSandi selinger                  break;
976d7ed1a4dSandi selinger         case 2:  MatMatMultSymbolic_RowMergeMacro(2); break;
977d7ed1a4dSandi selinger         case 3:  MatMatMultSymbolic_RowMergeMacro(3); break;
978d7ed1a4dSandi selinger         case 4:  MatMatMultSymbolic_RowMergeMacro(4); break;
979d7ed1a4dSandi selinger         case 5:  MatMatMultSymbolic_RowMergeMacro(5); break;
980d7ed1a4dSandi selinger         case 6:  MatMatMultSymbolic_RowMergeMacro(6); break;
981d7ed1a4dSandi selinger         case 7:  MatMatMultSymbolic_RowMergeMacro(7); break;
982d7ed1a4dSandi selinger         case 8:  MatMatMultSymbolic_RowMergeMacro(8); break;
983d7ed1a4dSandi selinger         default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L1 work array!");
984d7ed1a4dSandi selinger         }
985d7ed1a4dSandi selinger         L2_nnz               += outputi_nnz;
986d7ed1a4dSandi selinger         worki_L2[++L2_nrows]  = L2_nnz;
987d7ed1a4dSandi selinger 
9887660c4dbSandi selinger         /************************ L E V E L  3 **********************/
9897660c4dbSandi selinger         /* Merge from L2 work array to either C or to L2 work array */
990d7ed1a4dSandi selinger         if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) {
991d7ed1a4dSandi selinger           inputi      = worki_L2;
992d7ed1a4dSandi selinger           inputj      = workj_L2;
993d7ed1a4dSandi selinger           inputcol    = workcol;
994d7ed1a4dSandi selinger           outputi_nnz = 0;
995f83700f2Sandi selinger           if (rowsleft) outputj = workj_L3;
996d7ed1a4dSandi selinger           else          outputj = cj + ci_nnz;
997d7ed1a4dSandi selinger           switch (L2_nrows) {
998d7ed1a4dSandi selinger           case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
999d7ed1a4dSandi selinger                    brow_end[0] = inputj + inputi[inputcol[0]+1];
1000d7ed1a4dSandi selinger                    for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
1001d7ed1a4dSandi selinger                    break;
1002d7ed1a4dSandi selinger           case 2:  MatMatMultSymbolic_RowMergeMacro(2); break;
1003d7ed1a4dSandi selinger           case 3:  MatMatMultSymbolic_RowMergeMacro(3); break;
1004d7ed1a4dSandi selinger           case 4:  MatMatMultSymbolic_RowMergeMacro(4); break;
1005d7ed1a4dSandi selinger           case 5:  MatMatMultSymbolic_RowMergeMacro(5); break;
1006d7ed1a4dSandi selinger           case 6:  MatMatMultSymbolic_RowMergeMacro(6); break;
1007d7ed1a4dSandi selinger           case 7:  MatMatMultSymbolic_RowMergeMacro(7); break;
1008d7ed1a4dSandi selinger           case 8:  MatMatMultSymbolic_RowMergeMacro(8); break;
1009d7ed1a4dSandi selinger           default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L2 work array!");
1010d7ed1a4dSandi selinger           }
1011d7ed1a4dSandi selinger           L2_nrows    = 1;
10127660c4dbSandi selinger           L2_nnz      = outputi_nnz;
10137660c4dbSandi selinger           worki_L2[1] = outputi_nnz;
10147660c4dbSandi selinger           /* Copy to workj_L2 */
1015d7ed1a4dSandi selinger           if (rowsleft) {
10167660c4dbSandi selinger             for (k=0; k<outputi_nnz; ++k)  workj_L2[k] = outputj[k];
1017d7ed1a4dSandi selinger           }
1018d7ed1a4dSandi selinger         }
1019d7ed1a4dSandi selinger       }
1020d7ed1a4dSandi selinger     }  /* while (rowsleft) */
1021d7ed1a4dSandi selinger #undef MatMatMultSymbolic_RowMergeMacro
1022d7ed1a4dSandi selinger 
1023d7ed1a4dSandi selinger     /* terminate current row */
1024d7ed1a4dSandi selinger     ci_nnz += outputi_nnz;
1025d7ed1a4dSandi selinger     ci[i+1] = ci_nnz;
1026d7ed1a4dSandi selinger   }
1027d7ed1a4dSandi selinger 
1028d7ed1a4dSandi selinger   /* Step 3: Create the new symbolic matrix */
1029e4e71118SStefano Zampini   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
10304222ddf1SHong Zhang   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
1031d7ed1a4dSandi selinger 
1032d7ed1a4dSandi selinger   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1033d7ed1a4dSandi selinger   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
10344222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
1035d7ed1a4dSandi selinger   c->free_a  = PETSC_TRUE;
1036d7ed1a4dSandi selinger   c->free_ij = PETSC_TRUE;
1037d7ed1a4dSandi selinger   c->nonew   = 0;
1038d7ed1a4dSandi selinger 
10394222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1040d7ed1a4dSandi selinger 
1041d7ed1a4dSandi selinger   /* set MatInfo */
1042d7ed1a4dSandi selinger   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
1043d7ed1a4dSandi selinger   if (afill < 1.0) afill = 1.0;
1044d7ed1a4dSandi selinger   c->maxnz                     = ci[am];
1045d7ed1a4dSandi selinger   c->nz                        = ci[am];
10464222ddf1SHong Zhang   C->info.mallocs           = ndouble;
10474222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
10484222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
1049d7ed1a4dSandi selinger 
1050d7ed1a4dSandi selinger #if defined(PETSC_USE_INFO)
1051d7ed1a4dSandi selinger   if (ci[am]) {
10524222ddf1SHong Zhang     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
10534222ddf1SHong Zhang     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
1054d7ed1a4dSandi selinger   } else {
10554222ddf1SHong Zhang     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
1056d7ed1a4dSandi selinger   }
1057d7ed1a4dSandi selinger #endif
1058d7ed1a4dSandi selinger 
1059d7ed1a4dSandi selinger   /* Step 4: Free temporary work areas */
1060d7ed1a4dSandi selinger   ierr = PetscFree(workj_L1);CHKERRQ(ierr);
1061d7ed1a4dSandi selinger   ierr = PetscFree(workj_L2);CHKERRQ(ierr);
1062f83700f2Sandi selinger   ierr = PetscFree(workj_L3);CHKERRQ(ierr);
1063d7ed1a4dSandi selinger   PetscFunctionReturn(0);
1064d7ed1a4dSandi selinger }
1065d7ed1a4dSandi selinger 
1066cd093f1dSJed Brown /* concatenate unique entries and then sort */
10674222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,PetscReal fill,Mat C)
1068cd093f1dSJed Brown {
1069cd093f1dSJed Brown   PetscErrorCode ierr;
1070cd093f1dSJed Brown   Mat_SeqAIJ     *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
1071cd093f1dSJed Brown   const PetscInt *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
1072cd093f1dSJed Brown   PetscInt       *ci,*cj;
1073cd093f1dSJed Brown   PetscInt       am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
1074cd093f1dSJed Brown   PetscReal      afill;
1075cd093f1dSJed Brown   PetscInt       i,j,ndouble = 0;
1076cd093f1dSJed Brown   PetscSegBuffer seg,segrow;
1077cd093f1dSJed Brown   char           *seen;
1078cd093f1dSJed Brown 
1079cd093f1dSJed Brown   PetscFunctionBegin;
1080854ce69bSBarry Smith   ierr  = PetscMalloc1(am+1,&ci);CHKERRQ(ierr);
1081cd093f1dSJed Brown   ci[0] = 0;
1082cd093f1dSJed Brown 
1083cd093f1dSJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
1084cd093f1dSJed Brown   ierr = PetscSegBufferCreate(sizeof(PetscInt),(PetscInt)(fill*(ai[am]+bi[bm])),&seg);CHKERRQ(ierr);
1085cd093f1dSJed Brown   ierr = PetscSegBufferCreate(sizeof(PetscInt),100,&segrow);CHKERRQ(ierr);
1086580bdb30SBarry Smith   ierr = PetscCalloc1(bn,&seen);CHKERRQ(ierr);
1087cd093f1dSJed Brown 
1088cd093f1dSJed Brown   /* Determine ci and cj */
1089cd093f1dSJed Brown   for (i=0; i<am; i++) {
1090cd093f1dSJed Brown     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
1091cd093f1dSJed Brown     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
1092cd093f1dSJed Brown     PetscInt packlen = 0,*PETSC_RESTRICT crow;
1093cd093f1dSJed Brown     /* Pack segrow */
1094cd093f1dSJed Brown     for (j=0; j<anzi; j++) {
1095cd093f1dSJed Brown       PetscInt brow = acol[j],bjstart = bi[brow],bjend = bi[brow+1],k;
1096cd093f1dSJed Brown       for (k=bjstart; k<bjend; k++) {
1097cd093f1dSJed Brown         PetscInt bcol = bj[k];
1098cd093f1dSJed Brown         if (!seen[bcol]) { /* new entry */
1099cd093f1dSJed Brown           PetscInt *PETSC_RESTRICT slot;
1100cd093f1dSJed Brown           ierr = PetscSegBufferGetInts(segrow,1,&slot);CHKERRQ(ierr);
1101cd093f1dSJed Brown           *slot = bcol;
1102cd093f1dSJed Brown           seen[bcol] = 1;
1103cd093f1dSJed Brown           packlen++;
1104cd093f1dSJed Brown         }
1105cd093f1dSJed Brown       }
1106cd093f1dSJed Brown     }
1107cd093f1dSJed Brown     ierr = PetscSegBufferGetInts(seg,packlen,&crow);CHKERRQ(ierr);
1108cd093f1dSJed Brown     ierr = PetscSegBufferExtractTo(segrow,crow);CHKERRQ(ierr);
1109cd093f1dSJed Brown     ierr = PetscSortInt(packlen,crow);CHKERRQ(ierr);
1110cd093f1dSJed Brown     ci[i+1] = ci[i] + packlen;
1111cd093f1dSJed Brown     for (j=0; j<packlen; j++) seen[crow[j]] = 0;
1112cd093f1dSJed Brown   }
1113cd093f1dSJed Brown   ierr = PetscSegBufferDestroy(&segrow);CHKERRQ(ierr);
1114cd093f1dSJed Brown   ierr = PetscFree(seen);CHKERRQ(ierr);
1115cd093f1dSJed Brown 
1116cd093f1dSJed Brown   /* Column indices are in the segmented buffer */
1117cd093f1dSJed Brown   ierr = PetscSegBufferExtractAlloc(seg,&cj);CHKERRQ(ierr);
1118cd093f1dSJed Brown   ierr = PetscSegBufferDestroy(&seg);CHKERRQ(ierr);
1119cd093f1dSJed Brown 
1120cd093f1dSJed Brown   /* put together the new symbolic matrix */
1121e4e71118SStefano Zampini   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
11224222ddf1SHong Zhang   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
1123cd093f1dSJed Brown 
1124cd093f1dSJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1125cd093f1dSJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
11264222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
1127cd093f1dSJed Brown   c->free_a  = PETSC_TRUE;
1128cd093f1dSJed Brown   c->free_ij = PETSC_TRUE;
1129cd093f1dSJed Brown   c->nonew   = 0;
1130cd093f1dSJed Brown 
11314222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1132cd093f1dSJed Brown 
1133cd093f1dSJed Brown   /* set MatInfo */
11342a09556fSStefano Zampini   afill = (PetscReal)ci[am]/PetscMax(ai[am]+bi[bm],1) + 1.e-5;
1135cd093f1dSJed Brown   if (afill < 1.0) afill = 1.0;
1136cd093f1dSJed Brown   c->maxnz                  = ci[am];
1137cd093f1dSJed Brown   c->nz                     = ci[am];
11384222ddf1SHong Zhang   C->info.mallocs           = ndouble;
11394222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
11404222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
1141cd093f1dSJed Brown 
1142cd093f1dSJed Brown #if defined(PETSC_USE_INFO)
1143cd093f1dSJed Brown   if (ci[am]) {
11444222ddf1SHong Zhang     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
11454222ddf1SHong Zhang     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
1146cd093f1dSJed Brown   } else {
11474222ddf1SHong Zhang     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
1148cd093f1dSJed Brown   }
1149cd093f1dSJed Brown #endif
1150cd093f1dSJed Brown   PetscFunctionReturn(0);
1151cd093f1dSJed Brown }
1152cd093f1dSJed Brown 
11536718818eSStefano Zampini PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(void *data)
11542128a86cSHong Zhang {
11552128a86cSHong Zhang   PetscErrorCode      ierr;
11566718818eSStefano Zampini   Mat_MatMatTransMult *abt=(Mat_MatMatTransMult *)data;
11572128a86cSHong Zhang 
11582128a86cSHong Zhang   PetscFunctionBegin;
115940192850SHong Zhang   ierr = MatTransposeColoringDestroy(&abt->matcoloring);CHKERRQ(ierr);
116040192850SHong Zhang   ierr = MatDestroy(&abt->Bt_den);CHKERRQ(ierr);
116140192850SHong Zhang   ierr = MatDestroy(&abt->ABt_den);CHKERRQ(ierr);
116240192850SHong Zhang   ierr = PetscFree(abt);CHKERRQ(ierr);
11632128a86cSHong Zhang   PetscFunctionReturn(0);
11642128a86cSHong Zhang }
11652128a86cSHong Zhang 
11664222ddf1SHong Zhang PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
1167bc011b1eSHong Zhang {
11685df89d91SHong Zhang   PetscErrorCode      ierr;
116981d82fe4SHong Zhang   Mat                 Bt;
117081d82fe4SHong Zhang   PetscInt            *bti,*btj;
117140192850SHong Zhang   Mat_MatMatTransMult *abt;
11724222ddf1SHong Zhang   Mat_Product         *product = C->product;
11736718818eSStefano Zampini   char                *alg;
1174d2853540SHong Zhang 
117581d82fe4SHong Zhang   PetscFunctionBegin;
11766718818eSStefano Zampini   if (!product) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
11776718818eSStefano Zampini   if (product->data) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Extra product struct not empty");
11786718818eSStefano Zampini 
117981d82fe4SHong Zhang   /* create symbolic Bt */
118081d82fe4SHong Zhang   ierr = MatGetSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
11810298fd71SBarry Smith   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,B->cmap->n,B->rmap->n,bti,btj,NULL,&Bt);CHKERRQ(ierr);
118233d57670SJed Brown   ierr = MatSetBlockSizes(Bt,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr);
118304b858e0SBarry Smith   ierr = MatSetType(Bt,((PetscObject)A)->type_name);CHKERRQ(ierr);
118481d82fe4SHong Zhang 
118581d82fe4SHong Zhang   /* get symbolic C=A*Bt */
11866718818eSStefano Zampini   ierr = PetscStrallocpy(product->alg,&alg);CHKERRQ(ierr);
11874222ddf1SHong Zhang   ierr = MatProductSetAlgorithm(C,"sorted");CHKERRQ(ierr); /* set algorithm for C = A*Bt */
118881d82fe4SHong Zhang   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C);CHKERRQ(ierr);
11894222ddf1SHong Zhang   ierr = MatProductSetAlgorithm(C,alg);CHKERRQ(ierr); /* resume original algorithm for ABt product */
11906718818eSStefano Zampini   ierr = PetscFree(alg);CHKERRQ(ierr);
119181d82fe4SHong Zhang 
11922128a86cSHong Zhang   /* create a supporting struct for reuse intermidiate dense matrices with matcoloring */
1193b00a9115SJed Brown   ierr = PetscNew(&abt);CHKERRQ(ierr);
11942128a86cSHong Zhang 
11956718818eSStefano Zampini   product->data    = abt;
11966718818eSStefano Zampini   product->destroy = MatDestroy_SeqAIJ_MatMatMultTrans;
11976718818eSStefano Zampini 
11984222ddf1SHong Zhang   C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ;
11992128a86cSHong Zhang 
12004222ddf1SHong Zhang   abt->usecoloring = PETSC_FALSE;
12014222ddf1SHong Zhang   ierr = PetscStrcmp(product->alg,"color",&abt->usecoloring);CHKERRQ(ierr);
120240192850SHong Zhang   if (abt->usecoloring) {
1203b9af6bddSHong Zhang     /* Create MatTransposeColoring from symbolic C=A*B^T */
1204b9af6bddSHong Zhang     MatTransposeColoring matcoloring;
1205335efc43SPeter Brune     MatColoring          coloring;
12062128a86cSHong Zhang     ISColoring           iscoloring;
12072128a86cSHong Zhang     Mat                  Bt_dense,C_dense;
1208e8354b3eSHong Zhang 
12094222ddf1SHong Zhang     /* inode causes memory problem */
12104222ddf1SHong Zhang     ierr = MatSetOption(C,MAT_USE_INODES,PETSC_FALSE);CHKERRQ(ierr);
12114222ddf1SHong Zhang 
12124222ddf1SHong Zhang     ierr = MatColoringCreate(C,&coloring);CHKERRQ(ierr);
1213335efc43SPeter Brune     ierr = MatColoringSetDistance(coloring,2);CHKERRQ(ierr);
1214335efc43SPeter Brune     ierr = MatColoringSetType(coloring,MATCOLORINGSL);CHKERRQ(ierr);
1215335efc43SPeter Brune     ierr = MatColoringSetFromOptions(coloring);CHKERRQ(ierr);
1216335efc43SPeter Brune     ierr = MatColoringApply(coloring,&iscoloring);CHKERRQ(ierr);
1217335efc43SPeter Brune     ierr = MatColoringDestroy(&coloring);CHKERRQ(ierr);
12184222ddf1SHong Zhang     ierr = MatTransposeColoringCreate(C,iscoloring,&matcoloring);CHKERRQ(ierr);
12192205254eSKarl Rupp 
122040192850SHong Zhang     abt->matcoloring = matcoloring;
12212205254eSKarl Rupp 
12222128a86cSHong Zhang     ierr = ISColoringDestroy(&iscoloring);CHKERRQ(ierr);
12232128a86cSHong Zhang 
12242128a86cSHong Zhang     /* Create Bt_dense and C_dense = A*Bt_dense */
12252128a86cSHong Zhang     ierr = MatCreate(PETSC_COMM_SELF,&Bt_dense);CHKERRQ(ierr);
12262128a86cSHong Zhang     ierr = MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors);CHKERRQ(ierr);
12272128a86cSHong Zhang     ierr = MatSetType(Bt_dense,MATSEQDENSE);CHKERRQ(ierr);
12280298fd71SBarry Smith     ierr = MatSeqDenseSetPreallocation(Bt_dense,NULL);CHKERRQ(ierr);
12292205254eSKarl Rupp 
12302128a86cSHong Zhang     Bt_dense->assembled = PETSC_TRUE;
123140192850SHong Zhang     abt->Bt_den         = Bt_dense;
12322128a86cSHong Zhang 
12332128a86cSHong Zhang     ierr = MatCreate(PETSC_COMM_SELF,&C_dense);CHKERRQ(ierr);
12342128a86cSHong Zhang     ierr = MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors);CHKERRQ(ierr);
12352128a86cSHong Zhang     ierr = MatSetType(C_dense,MATSEQDENSE);CHKERRQ(ierr);
12360298fd71SBarry Smith     ierr = MatSeqDenseSetPreallocation(C_dense,NULL);CHKERRQ(ierr);
12372205254eSKarl Rupp 
12382128a86cSHong Zhang     Bt_dense->assembled = PETSC_TRUE;
123940192850SHong Zhang     abt->ABt_den  = C_dense;
1240f94ccd6cSHong Zhang 
1241f94ccd6cSHong Zhang #if defined(PETSC_USE_INFO)
12421ce71dffSSatish Balay     {
12434222ddf1SHong Zhang       Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data;
12444222ddf1SHong Zhang       ierr = PetscInfo7(C,"Use coloring of C=A*B^T; B^T: %D %D, Bt_dense: %D,%D; Cnz %D / (cm*ncolors %D) = %g\n",B->cmap->n,B->rmap->n,Bt_dense->rmap->n,Bt_dense->cmap->n,c->nz,A->rmap->n*matcoloring->ncolors,(PetscReal)(c->nz)/(A->rmap->n*matcoloring->ncolors));CHKERRQ(ierr);
12451ce71dffSSatish Balay     }
1246f94ccd6cSHong Zhang #endif
12472128a86cSHong Zhang   }
1248e99be685SHong Zhang   /* clean up */
1249e99be685SHong Zhang   ierr = MatDestroy(&Bt);CHKERRQ(ierr);
1250e99be685SHong Zhang   ierr = MatRestoreSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
12515df89d91SHong Zhang   PetscFunctionReturn(0);
12525df89d91SHong Zhang }
12535df89d91SHong Zhang 
12546fc122caSHong Zhang PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
12555df89d91SHong Zhang {
12565df89d91SHong Zhang   PetscErrorCode      ierr;
12575df89d91SHong Zhang   Mat_SeqAIJ          *a   =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1258e2cac8caSJed Brown   PetscInt            *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow;
125981d82fe4SHong Zhang   PetscInt            cm   =C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol;
12605df89d91SHong Zhang   PetscLogDouble      flops=0.0;
1261aa1aec99SHong Zhang   MatScalar           *aa  =a->a,*aval,*ba=b->a,*bval,*ca,*cval;
12626718818eSStefano Zampini   Mat_MatMatTransMult *abt;
12636718818eSStefano Zampini   Mat_Product         *product = C->product;
12645df89d91SHong Zhang 
12655df89d91SHong Zhang   PetscFunctionBegin;
12666718818eSStefano Zampini   if (!product) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
12676718818eSStefano Zampini   abt = (Mat_MatMatTransMult *)product->data;
12686718818eSStefano Zampini   if (!abt) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
126958ed3ceaSHong Zhang   /* clear old values in C */
127058ed3ceaSHong Zhang   if (!c->a) {
1271580bdb30SBarry Smith     ierr      = PetscCalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
127258ed3ceaSHong Zhang     c->a      = ca;
127358ed3ceaSHong Zhang     c->free_a = PETSC_TRUE;
127458ed3ceaSHong Zhang   } else {
127558ed3ceaSHong Zhang     ca =  c->a;
1276580bdb30SBarry Smith     ierr = PetscArrayzero(ca,ci[cm]+1);CHKERRQ(ierr);
127758ed3ceaSHong Zhang   }
127858ed3ceaSHong Zhang 
127940192850SHong Zhang   if (abt->usecoloring) {
128040192850SHong Zhang     MatTransposeColoring matcoloring = abt->matcoloring;
128140192850SHong Zhang     Mat                  Bt_dense,C_dense = abt->ABt_den;
1282c8db22f6SHong Zhang 
1283b9af6bddSHong Zhang     /* Get Bt_dense by Apply MatTransposeColoring to B */
128440192850SHong Zhang     Bt_dense = abt->Bt_den;
1285b9af6bddSHong Zhang     ierr = MatTransColoringApplySpToDen(matcoloring,B,Bt_dense);CHKERRQ(ierr);
1286c8db22f6SHong Zhang 
1287c8db22f6SHong Zhang     /* C_dense = A*Bt_dense */
12882128a86cSHong Zhang     ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense);CHKERRQ(ierr);
1289c8db22f6SHong Zhang 
12902128a86cSHong Zhang     /* Recover C from C_dense */
1291b9af6bddSHong Zhang     ierr = MatTransColoringApplyDenToSp(matcoloring,C_dense,C);CHKERRQ(ierr);
1292c8db22f6SHong Zhang     PetscFunctionReturn(0);
1293c8db22f6SHong Zhang   }
1294c8db22f6SHong Zhang 
12951fa1209cSHong Zhang   for (i=0; i<cm; i++) {
129681d82fe4SHong Zhang     anzi = ai[i+1] - ai[i];
129781d82fe4SHong Zhang     acol = aj + ai[i];
12986973769bSHong Zhang     aval = aa + ai[i];
12991fa1209cSHong Zhang     cnzi = ci[i+1] - ci[i];
13001fa1209cSHong Zhang     ccol = cj + ci[i];
13016973769bSHong Zhang     cval = ca + ci[i];
13021fa1209cSHong Zhang     for (j=0; j<cnzi; j++) {
130381d82fe4SHong Zhang       brow = ccol[j];
130481d82fe4SHong Zhang       bnzj = bi[brow+1] - bi[brow];
130581d82fe4SHong Zhang       bcol = bj + bi[brow];
13066973769bSHong Zhang       bval = ba + bi[brow];
13076973769bSHong Zhang 
130881d82fe4SHong Zhang       /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
130981d82fe4SHong Zhang       nexta = 0; nextb = 0;
131081d82fe4SHong Zhang       while (nexta<anzi && nextb<bnzj) {
13117b6d5e96SMark Adams         while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++;
131281d82fe4SHong Zhang         if (nexta == anzi) break;
13137b6d5e96SMark Adams         while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++;
131481d82fe4SHong Zhang         if (nextb == bnzj) break;
131581d82fe4SHong Zhang         if (acol[nexta] == bcol[nextb]) {
13166973769bSHong Zhang           cval[j] += aval[nexta]*bval[nextb];
131781d82fe4SHong Zhang           nexta++; nextb++;
131881d82fe4SHong Zhang           flops += 2;
13191fa1209cSHong Zhang         }
13201fa1209cSHong Zhang       }
132181d82fe4SHong Zhang     }
132281d82fe4SHong Zhang   }
132381d82fe4SHong Zhang   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
132481d82fe4SHong Zhang   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
132581d82fe4SHong Zhang   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
13265df89d91SHong Zhang   PetscFunctionReturn(0);
13275df89d91SHong Zhang }
13285df89d91SHong Zhang 
13296718818eSStefano Zampini PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(void *data)
13306d373c3eSHong Zhang {
13316d373c3eSHong Zhang   PetscErrorCode      ierr;
13326718818eSStefano Zampini   Mat_MatTransMatMult *atb = (Mat_MatTransMatMult*)data;
13336d373c3eSHong Zhang 
13346d373c3eSHong Zhang   PetscFunctionBegin;
13356d373c3eSHong Zhang   ierr = MatDestroy(&atb->At);CHKERRQ(ierr);
13366718818eSStefano Zampini   if (atb->destroy) {
13376718818eSStefano Zampini     ierr = (*atb->destroy)(atb->data);CHKERRQ(ierr);
13386473ade5SStefano Zampini   }
13396d373c3eSHong Zhang   ierr = PetscFree(atb);CHKERRQ(ierr);
13406d373c3eSHong Zhang   PetscFunctionReturn(0);
13416d373c3eSHong Zhang }
13426d373c3eSHong Zhang 
13434222ddf1SHong Zhang PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
13445df89d91SHong Zhang {
1345bc011b1eSHong Zhang   PetscErrorCode      ierr;
1346bc011b1eSHong Zhang   Mat                 At;
134738baddfdSBarry Smith   PetscInt            *ati,*atj;
13484222ddf1SHong Zhang   Mat_Product         *product = C->product;
13494222ddf1SHong Zhang   MatProductAlgorithm alg;
13504222ddf1SHong Zhang   PetscBool           flg;
1351bc011b1eSHong Zhang 
1352bc011b1eSHong Zhang   PetscFunctionBegin;
13534222ddf1SHong Zhang   if (product) {
13544222ddf1SHong Zhang     alg = product->alg;
13554222ddf1SHong Zhang   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"!product, not supported yet");
13564222ddf1SHong Zhang 
13574222ddf1SHong Zhang   /* outerproduct */
13584222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"outerproduct",&flg);CHKERRQ(ierr);
13594222ddf1SHong Zhang   if (flg) {
1360bc011b1eSHong Zhang     /* create symbolic At */
1361bc011b1eSHong Zhang     ierr = MatGetSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
13620298fd71SBarry Smith     ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,A->cmap->n,A->rmap->n,ati,atj,NULL,&At);CHKERRQ(ierr);
136333d57670SJed Brown     ierr = MatSetBlockSizes(At,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr);
136404b858e0SBarry Smith     ierr = MatSetType(At,((PetscObject)A)->type_name);CHKERRQ(ierr);
1365bc011b1eSHong Zhang 
1366bc011b1eSHong Zhang     /* get symbolic C=At*B */
13677a3c3d58SStefano Zampini     ierr = MatProductSetAlgorithm(C,"sorted");CHKERRQ(ierr);
1368bc011b1eSHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(At,B,fill,C);CHKERRQ(ierr);
1369bc011b1eSHong Zhang 
1370bc011b1eSHong Zhang     /* clean up */
13716bf464f9SBarry Smith     ierr = MatDestroy(&At);CHKERRQ(ierr);
1372bc011b1eSHong Zhang     ierr = MatRestoreSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
13736d373c3eSHong Zhang 
13744222ddf1SHong Zhang     C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */
13757a3c3d58SStefano Zampini     ierr = MatProductSetAlgorithm(C,"outerproduct");CHKERRQ(ierr);
13764222ddf1SHong Zhang     PetscFunctionReturn(0);
13774222ddf1SHong Zhang   }
13784222ddf1SHong Zhang 
13794222ddf1SHong Zhang   /* matmatmult */
13804222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"at*b",&flg);CHKERRQ(ierr);
13814222ddf1SHong Zhang   if (flg) {
13824222ddf1SHong Zhang     Mat_MatTransMatMult *atb;
13834222ddf1SHong Zhang 
13846718818eSStefano Zampini     if (product->data) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Extra product struct not empty");
13854222ddf1SHong Zhang     ierr = PetscNew(&atb);CHKERRQ(ierr);
13864222ddf1SHong Zhang     ierr = MatTranspose_SeqAIJ(A,MAT_INITIAL_MATRIX,&At);CHKERRQ(ierr);
13877a3c3d58SStefano Zampini     ierr = MatProductSetAlgorithm(C,"sorted");CHKERRQ(ierr);
13884222ddf1SHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(At,B,fill,C);CHKERRQ(ierr);
13896718818eSStefano Zampini     ierr = MatProductSetAlgorithm(C,"at*b");CHKERRQ(ierr);
13906718818eSStefano Zampini     product->data    = atb;
13916718818eSStefano Zampini     product->destroy = MatDestroy_SeqAIJ_MatTransMatMult;
13924222ddf1SHong Zhang     atb->At          = At;
13934222ddf1SHong Zhang     atb->updateAt    = PETSC_FALSE; /* because At is computed here */
13944222ddf1SHong Zhang 
13954222ddf1SHong Zhang     C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */
13964222ddf1SHong Zhang     PetscFunctionReturn(0);
13974222ddf1SHong Zhang   }
13984222ddf1SHong Zhang 
13994222ddf1SHong Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported");
1400bc011b1eSHong Zhang   PetscFunctionReturn(0);
1401bc011b1eSHong Zhang }
1402bc011b1eSHong Zhang 
140375648e8dSHong Zhang PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
1404bc011b1eSHong Zhang {
1405bc011b1eSHong Zhang   PetscErrorCode ierr;
14060fbc74f4SHong Zhang   Mat_SeqAIJ     *a   =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1407d0f46423SBarry Smith   PetscInt       am   =A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb;
1408d0f46423SBarry Smith   PetscInt       cm   =C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k;
1409d13dce4bSSatish Balay   PetscLogDouble flops=0.0;
1410aa1aec99SHong Zhang   MatScalar      *aa  =a->a,*ba,*ca,*caj;
1411bc011b1eSHong Zhang 
1412bc011b1eSHong Zhang   PetscFunctionBegin;
1413aa1aec99SHong Zhang   if (!c->a) {
1414580bdb30SBarry Smith     ierr = PetscCalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
14152205254eSKarl Rupp 
1416aa1aec99SHong Zhang     c->a      = ca;
1417aa1aec99SHong Zhang     c->free_a = PETSC_TRUE;
1418aa1aec99SHong Zhang   } else {
1419aa1aec99SHong Zhang     ca   = c->a;
1420580bdb30SBarry Smith     ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr);
1421aa1aec99SHong Zhang   }
1422bc011b1eSHong Zhang 
1423bc011b1eSHong Zhang   /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
1424bc011b1eSHong Zhang   for (i=0; i<am; i++) {
1425bc011b1eSHong Zhang     bj   = b->j + bi[i];
1426bc011b1eSHong Zhang     ba   = b->a + bi[i];
1427bc011b1eSHong Zhang     bnzi = bi[i+1] - bi[i];
1428bc011b1eSHong Zhang     anzi = ai[i+1] - ai[i];
1429bc011b1eSHong Zhang     for (j=0; j<anzi; j++) {
1430bc011b1eSHong Zhang       nextb = 0;
14310fbc74f4SHong Zhang       crow  = *aj++;
1432bc011b1eSHong Zhang       cjj   = cj + ci[crow];
1433bc011b1eSHong Zhang       caj   = ca + ci[crow];
1434bc011b1eSHong Zhang       /* perform sparse axpy operation.  Note cjj includes bj. */
1435bc011b1eSHong Zhang       for (k=0; nextb<bnzi; k++) {
14360fbc74f4SHong Zhang         if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */
14370fbc74f4SHong Zhang           caj[k] += (*aa)*(*(ba+nextb));
1438bc011b1eSHong Zhang           nextb++;
1439bc011b1eSHong Zhang         }
1440bc011b1eSHong Zhang       }
1441bc011b1eSHong Zhang       flops += 2*bnzi;
14420fbc74f4SHong Zhang       aa++;
1443bc011b1eSHong Zhang     }
1444bc011b1eSHong Zhang   }
1445bc011b1eSHong Zhang 
1446bc011b1eSHong Zhang   /* Assemble the final matrix and clean up */
1447bc011b1eSHong Zhang   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1448bc011b1eSHong Zhang   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1449bc011b1eSHong Zhang   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
1450bc011b1eSHong Zhang   PetscFunctionReturn(0);
1451bc011b1eSHong Zhang }
14529123193aSHong Zhang 
14534222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
14549123193aSHong Zhang {
14559123193aSHong Zhang   PetscErrorCode ierr;
14569123193aSHong Zhang 
14579123193aSHong Zhang   PetscFunctionBegin;
14585a586d82SBarry Smith   ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr);
14594222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense;
14609123193aSHong Zhang   PetscFunctionReturn(0);
14619123193aSHong Zhang }
14629123193aSHong Zhang 
146393aa15f2SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C,const PetscBool add)
14649123193aSHong Zhang {
1465f32d5d43SBarry Smith   Mat_SeqAIJ        *a=(Mat_SeqAIJ*)A->data;
1466612438f1SStefano Zampini   Mat_SeqDense      *bd=(Mat_SeqDense*)B->data;
14671ca9667aSStefano Zampini   Mat_SeqDense      *cd=(Mat_SeqDense*)C->data;
14689123193aSHong Zhang   PetscErrorCode    ierr;
14691ca9667aSStefano Zampini   PetscScalar       *c,r1,r2,r3,r4,*c1,*c2,*c3,*c4;
1470a4af7ca8SStefano Zampini   const PetscScalar *aa,*b,*b1,*b2,*b3,*b4,*av;
1471daf57211SHong Zhang   const PetscInt    *aj;
1472612438f1SStefano Zampini   PetscInt          cm=C->rmap->n,cn=B->cmap->n,bm=bd->lda,am=A->rmap->n;
14731ca9667aSStefano Zampini   PetscInt          clda=cd->lda;
14741ca9667aSStefano Zampini   PetscInt          am4=4*clda,bm4=4*bm,col,i,j,n;
14759123193aSHong Zhang 
14769123193aSHong Zhang   PetscFunctionBegin;
1477f32d5d43SBarry Smith   if (!cm || !cn) PetscFunctionReturn(0);
1478a4af7ca8SStefano Zampini   ierr = MatSeqAIJGetArrayRead(A,&av);CHKERRQ(ierr);
147993aa15f2SStefano Zampini   if (add) {
14808c778c55SBarry Smith     ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr);
148193aa15f2SStefano Zampini   } else {
148293aa15f2SStefano Zampini     ierr = MatDenseGetArrayWrite(C,&c);CHKERRQ(ierr);
148393aa15f2SStefano Zampini   }
1484a4af7ca8SStefano Zampini   ierr = MatDenseGetArrayRead(B,&b);CHKERRQ(ierr);
1485f32d5d43SBarry Smith   b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
14861ca9667aSStefano Zampini   c1 = c; c2 = c1 + clda; c3 = c2 + clda; c4 = c3 + clda;
14871ca9667aSStefano Zampini   for (col=0; col<(cn/4)*4; col += 4) {  /* over columns of C */
14881ca9667aSStefano Zampini     for (i=0; i<am; i++) {        /* over rows of A in those columns */
1489f32d5d43SBarry Smith       r1 = r2 = r3 = r4 = 0.0;
1490f32d5d43SBarry Smith       n  = a->i[i+1] - a->i[i];
1491f32d5d43SBarry Smith       aj = a->j + a->i[i];
1492a4af7ca8SStefano Zampini       aa = av + a->i[i];
1493f32d5d43SBarry Smith       for (j=0; j<n; j++) {
14941ca9667aSStefano Zampini         const PetscScalar aatmp = aa[j];
14951ca9667aSStefano Zampini         const PetscInt    ajtmp = aj[j];
1496730858b9SHong Zhang         r1 += aatmp*b1[ajtmp];
1497730858b9SHong Zhang         r2 += aatmp*b2[ajtmp];
1498730858b9SHong Zhang         r3 += aatmp*b3[ajtmp];
1499730858b9SHong Zhang         r4 += aatmp*b4[ajtmp];
15009123193aSHong Zhang       }
150193aa15f2SStefano Zampini       if (add) {
150287753ddeSHong Zhang         c1[i] += r1;
150387753ddeSHong Zhang         c2[i] += r2;
150487753ddeSHong Zhang         c3[i] += r3;
150587753ddeSHong Zhang         c4[i] += r4;
150693aa15f2SStefano Zampini       } else {
150793aa15f2SStefano Zampini         c1[i] = r1;
150893aa15f2SStefano Zampini         c2[i] = r2;
150993aa15f2SStefano Zampini         c3[i] = r3;
151093aa15f2SStefano Zampini         c4[i] = r4;
151193aa15f2SStefano Zampini       }
1512f32d5d43SBarry Smith     }
1513730858b9SHong Zhang     b1 += bm4; b2 += bm4; b3 += bm4; b4 += bm4;
1514730858b9SHong Zhang     c1 += am4; c2 += am4; c3 += am4; c4 += am4;
1515f32d5d43SBarry Smith   }
151693aa15f2SStefano Zampini   /* process remaining columns */
151793aa15f2SStefano Zampini   if (col != cn) {
151893aa15f2SStefano Zampini     PetscInt rc = cn-col;
151993aa15f2SStefano Zampini 
152093aa15f2SStefano Zampini     if (rc == 1) {
152193aa15f2SStefano Zampini       for (i=0; i<am; i++) {
1522f32d5d43SBarry Smith         r1 = 0.0;
1523f32d5d43SBarry Smith         n  = a->i[i+1] - a->i[i];
1524f32d5d43SBarry Smith         aj = a->j + a->i[i];
1525a4af7ca8SStefano Zampini         aa = av + a->i[i];
152693aa15f2SStefano Zampini         for (j=0; j<n; j++) r1 += aa[j]*b1[aj[j]];
152793aa15f2SStefano Zampini         if (add) c1[i] += r1;
152893aa15f2SStefano Zampini         else c1[i] = r1;
152993aa15f2SStefano Zampini       }
153093aa15f2SStefano Zampini     } else if (rc == 2) {
153193aa15f2SStefano Zampini       for (i=0; i<am; i++) {
153293aa15f2SStefano Zampini         r1 = r2 = 0.0;
153393aa15f2SStefano Zampini         n  = a->i[i+1] - a->i[i];
153493aa15f2SStefano Zampini         aj = a->j + a->i[i];
153593aa15f2SStefano Zampini         aa = av + a->i[i];
1536f32d5d43SBarry Smith         for (j=0; j<n; j++) {
153793aa15f2SStefano Zampini           const PetscScalar aatmp = aa[j];
153893aa15f2SStefano Zampini           const PetscInt    ajtmp = aj[j];
153993aa15f2SStefano Zampini           r1 += aatmp*b1[ajtmp];
154093aa15f2SStefano Zampini           r2 += aatmp*b2[ajtmp];
1541f32d5d43SBarry Smith         }
154293aa15f2SStefano Zampini         if (add) {
154387753ddeSHong Zhang           c1[i] += r1;
154493aa15f2SStefano Zampini           c2[i] += r2;
154593aa15f2SStefano Zampini         } else {
154693aa15f2SStefano Zampini           c1[i] = r1;
154793aa15f2SStefano Zampini           c2[i] = r2;
1548f32d5d43SBarry Smith         }
154993aa15f2SStefano Zampini       }
155093aa15f2SStefano Zampini     } else {
155193aa15f2SStefano Zampini       for (i=0; i<am; i++) {
155293aa15f2SStefano Zampini         r1 = r2 = r3 = 0.0;
155393aa15f2SStefano Zampini         n  = a->i[i+1] - a->i[i];
155493aa15f2SStefano Zampini         aj = a->j + a->i[i];
155593aa15f2SStefano Zampini         aa = av + a->i[i];
155693aa15f2SStefano Zampini         for (j=0; j<n; j++) {
155793aa15f2SStefano Zampini           const PetscScalar aatmp = aa[j];
155893aa15f2SStefano Zampini           const PetscInt    ajtmp = aj[j];
155993aa15f2SStefano Zampini           r1 += aatmp*b1[ajtmp];
156093aa15f2SStefano Zampini           r2 += aatmp*b2[ajtmp];
156193aa15f2SStefano Zampini           r3 += aatmp*b3[ajtmp];
156293aa15f2SStefano Zampini         }
156393aa15f2SStefano Zampini         if (add) {
156493aa15f2SStefano Zampini           c1[i] += r1;
156593aa15f2SStefano Zampini           c2[i] += r2;
156693aa15f2SStefano Zampini           c3[i] += r3;
156793aa15f2SStefano Zampini         } else {
156893aa15f2SStefano Zampini           c1[i] = r1;
156993aa15f2SStefano Zampini           c2[i] = r2;
157093aa15f2SStefano Zampini           c3[i] = r3;
157193aa15f2SStefano Zampini         }
157293aa15f2SStefano Zampini       }
157393aa15f2SStefano Zampini     }
1574f32d5d43SBarry Smith   }
1575dc0b31edSSatish Balay   ierr = PetscLogFlops(cn*(2.0*a->nz));CHKERRQ(ierr);
157693aa15f2SStefano Zampini   if (add) {
15778c778c55SBarry Smith     ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr);
157893aa15f2SStefano Zampini   } else {
157993aa15f2SStefano Zampini     ierr = MatDenseRestoreArrayWrite(C,&c);CHKERRQ(ierr);
158093aa15f2SStefano Zampini   }
1581a4af7ca8SStefano Zampini   ierr = MatDenseRestoreArrayRead(B,&b);CHKERRQ(ierr);
1582a4af7ca8SStefano Zampini   ierr = MatSeqAIJRestoreArrayRead(A,&av);CHKERRQ(ierr);
1583f32d5d43SBarry Smith   PetscFunctionReturn(0);
1584f32d5d43SBarry Smith }
1585f32d5d43SBarry Smith 
158687753ddeSHong Zhang PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1587f32d5d43SBarry Smith {
1588f32d5d43SBarry Smith   PetscErrorCode ierr;
1589f32d5d43SBarry Smith 
1590f32d5d43SBarry Smith   PetscFunctionBegin;
159187753ddeSHong Zhang   if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n);
159287753ddeSHong Zhang   if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
159387753ddeSHong Zhang   if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
15944324174eSBarry Smith 
159593aa15f2SStefano Zampini   ierr = MatMatMultNumericAdd_SeqAIJ_SeqDense(A,B,C,PETSC_FALSE);CHKERRQ(ierr);
15969123193aSHong Zhang   PetscFunctionReturn(0);
15979123193aSHong Zhang }
1598b1683b59SHong Zhang 
15994222ddf1SHong Zhang /* ------------------------------------------------------- */
16004222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C)
16014222ddf1SHong Zhang {
16024222ddf1SHong Zhang   PetscFunctionBegin;
16034222ddf1SHong Zhang   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense;
16044222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
16054222ddf1SHong Zhang   PetscFunctionReturn(0);
16064222ddf1SHong Zhang }
16074222ddf1SHong Zhang 
16086718818eSStefano Zampini PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat,Mat,PetscReal,Mat);
16096718818eSStefano Zampini 
16104222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C)
16114222ddf1SHong Zhang {
16124222ddf1SHong Zhang   PetscFunctionBegin;
16136718818eSStefano Zampini   C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
16144222ddf1SHong Zhang   C->ops->productsymbolic          = MatProductSymbolic_AtB;
16156718818eSStefano Zampini   PetscFunctionReturn(0);
16166718818eSStefano Zampini }
16176718818eSStefano Zampini 
16186718818eSStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C)
16196718818eSStefano Zampini {
16206718818eSStefano Zampini   PetscFunctionBegin;
16216718818eSStefano Zampini   C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
16226718818eSStefano Zampini   C->ops->productsymbolic          = MatProductSymbolic_ABt;
16234222ddf1SHong Zhang   PetscFunctionReturn(0);
16244222ddf1SHong Zhang }
16254222ddf1SHong Zhang 
16264222ddf1SHong Zhang PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C)
16274222ddf1SHong Zhang {
16284222ddf1SHong Zhang   PetscErrorCode ierr;
16294222ddf1SHong Zhang   Mat_Product    *product = C->product;
16304222ddf1SHong Zhang 
16314222ddf1SHong Zhang   PetscFunctionBegin;
16324222ddf1SHong Zhang   switch (product->type) {
16334222ddf1SHong Zhang   case MATPRODUCT_AB:
16344222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C);CHKERRQ(ierr);
16354222ddf1SHong Zhang     break;
16364222ddf1SHong Zhang   case MATPRODUCT_AtB:
16374222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C);CHKERRQ(ierr);
16384222ddf1SHong Zhang     break;
16396718818eSStefano Zampini   case MATPRODUCT_ABt:
16406718818eSStefano Zampini     ierr = MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C);CHKERRQ(ierr);
16414222ddf1SHong Zhang     break;
16424222ddf1SHong Zhang   default:
16436718818eSStefano Zampini     break;
16444222ddf1SHong Zhang   }
16454222ddf1SHong Zhang   PetscFunctionReturn(0);
16464222ddf1SHong Zhang }
16474222ddf1SHong Zhang /* ------------------------------------------------------- */
16484222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C)
16494222ddf1SHong Zhang {
16504222ddf1SHong Zhang   PetscErrorCode ierr;
16514222ddf1SHong Zhang   Mat_Product    *product = C->product;
16524222ddf1SHong Zhang   Mat            A = product->A;
16534222ddf1SHong Zhang   PetscBool      baij;
16544222ddf1SHong Zhang 
16554222ddf1SHong Zhang   PetscFunctionBegin;
16564222ddf1SHong Zhang   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQBAIJ,&baij);CHKERRQ(ierr);
16574222ddf1SHong Zhang   if (!baij) { /* A is seqsbaij */
16584222ddf1SHong Zhang     PetscBool sbaij;
16594222ddf1SHong Zhang     ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQSBAIJ,&sbaij);CHKERRQ(ierr);
16604222ddf1SHong Zhang     if (!sbaij) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"Mat must be either seqbaij or seqsbaij format");
16614222ddf1SHong Zhang 
16624222ddf1SHong Zhang     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense;
16634222ddf1SHong Zhang   } else { /* A is seqbaij */
16644222ddf1SHong Zhang     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense;
16654222ddf1SHong Zhang   }
16664222ddf1SHong Zhang 
16674222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
16684222ddf1SHong Zhang   PetscFunctionReturn(0);
16694222ddf1SHong Zhang }
16704222ddf1SHong Zhang 
16714222ddf1SHong Zhang PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C)
16724222ddf1SHong Zhang {
16734222ddf1SHong Zhang   PetscErrorCode ierr;
16744222ddf1SHong Zhang   Mat_Product    *product = C->product;
16754222ddf1SHong Zhang 
16764222ddf1SHong Zhang   PetscFunctionBegin;
16776718818eSStefano Zampini   MatCheckProduct(C,1);
16786718818eSStefano Zampini   if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
16796718818eSStefano Zampini   if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric)) {
16804222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C);CHKERRQ(ierr);
16816718818eSStefano Zampini   }
16824222ddf1SHong Zhang   PetscFunctionReturn(0);
16834222ddf1SHong Zhang }
16846718818eSStefano Zampini 
16854222ddf1SHong Zhang /* ------------------------------------------------------- */
16864222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C)
16874222ddf1SHong Zhang {
16884222ddf1SHong Zhang   PetscFunctionBegin;
16894222ddf1SHong Zhang   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ;
16904222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
16914222ddf1SHong Zhang   PetscFunctionReturn(0);
16924222ddf1SHong Zhang }
16934222ddf1SHong Zhang 
16944222ddf1SHong Zhang PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C)
16954222ddf1SHong Zhang {
16964222ddf1SHong Zhang   PetscErrorCode ierr;
16974222ddf1SHong Zhang   Mat_Product    *product = C->product;
16984222ddf1SHong Zhang 
16994222ddf1SHong Zhang   PetscFunctionBegin;
17004222ddf1SHong Zhang   if (product->type == MATPRODUCT_AB) {
17014222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C);CHKERRQ(ierr);
17026718818eSStefano Zampini   }
17034222ddf1SHong Zhang   PetscFunctionReturn(0);
17044222ddf1SHong Zhang }
17054222ddf1SHong Zhang /* ------------------------------------------------------- */
17064222ddf1SHong Zhang 
1707b9af6bddSHong Zhang PetscErrorCode  MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense)
1708c8db22f6SHong Zhang {
1709c8db22f6SHong Zhang   PetscErrorCode ierr;
17102f5f1f90SHong Zhang   Mat_SeqAIJ     *b       = (Mat_SeqAIJ*)B->data;
17112f5f1f90SHong Zhang   Mat_SeqDense   *btdense = (Mat_SeqDense*)Btdense->data;
17122f5f1f90SHong Zhang   PetscInt       *bi      = b->i,*bj=b->j;
17132f5f1f90SHong Zhang   PetscInt       m        = Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns;
17142f5f1f90SHong Zhang   MatScalar      *btval,*btval_den,*ba=b->a;
17152f5f1f90SHong Zhang   PetscInt       *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors;
1716c8db22f6SHong Zhang 
1717c8db22f6SHong Zhang   PetscFunctionBegin;
17182f5f1f90SHong Zhang   btval_den=btdense->v;
1719580bdb30SBarry Smith   ierr     = PetscArrayzero(btval_den,m*n);CHKERRQ(ierr);
17202f5f1f90SHong Zhang   for (k=0; k<ncolors; k++) {
17212f5f1f90SHong Zhang     ncolumns = coloring->ncolumns[k];
17222f5f1f90SHong Zhang     for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */
1723d2853540SHong Zhang       col   = *(columns + colorforcol[k] + l);
17242f5f1f90SHong Zhang       btcol = bj + bi[col];
17252f5f1f90SHong Zhang       btval = ba + bi[col];
17262f5f1f90SHong Zhang       anz   = bi[col+1] - bi[col];
1727d2853540SHong Zhang       for (j=0; j<anz; j++) {
17282f5f1f90SHong Zhang         brow            = btcol[j];
17292f5f1f90SHong Zhang         btval_den[brow] = btval[j];
1730c8db22f6SHong Zhang       }
1731c8db22f6SHong Zhang     }
17322f5f1f90SHong Zhang     btval_den += m;
1733c8db22f6SHong Zhang   }
1734c8db22f6SHong Zhang   PetscFunctionReturn(0);
1735c8db22f6SHong Zhang }
1736c8db22f6SHong Zhang 
1737b9af6bddSHong Zhang PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp)
17388972f759SHong Zhang {
17398972f759SHong Zhang   PetscErrorCode    ierr;
1740b2d2b04fSHong Zhang   Mat_SeqAIJ        *csp = (Mat_SeqAIJ*)Csp->data;
17411683a169SBarry Smith   const PetscScalar *ca_den,*ca_den_ptr;
17421683a169SBarry Smith   PetscScalar       *ca=csp->a;
1743f99a636bSHong Zhang   PetscInt          k,l,m=Cden->rmap->n,ncolors=matcoloring->ncolors;
1744e88777f2SHong Zhang   PetscInt          brows=matcoloring->brows,*den2sp=matcoloring->den2sp;
1745077f23c2SHong Zhang   PetscInt          nrows,*row,*idx;
1746077f23c2SHong Zhang   PetscInt          *rows=matcoloring->rows,*colorforrow=matcoloring->colorforrow;
17478972f759SHong Zhang 
17488972f759SHong Zhang   PetscFunctionBegin;
17491683a169SBarry Smith   ierr   = MatDenseGetArrayRead(Cden,&ca_den);CHKERRQ(ierr);
1750f99a636bSHong Zhang 
1751077f23c2SHong Zhang   if (brows > 0) {
1752077f23c2SHong Zhang     PetscInt *lstart,row_end,row_start;
1753980ae229SHong Zhang     lstart = matcoloring->lstart;
1754580bdb30SBarry Smith     ierr = PetscArrayzero(lstart,ncolors);CHKERRQ(ierr);
1755eeb4fd02SHong Zhang 
1756077f23c2SHong Zhang     row_end = brows;
1757eeb4fd02SHong Zhang     if (row_end > m) row_end = m;
1758077f23c2SHong Zhang     for (row_start=0; row_start<m; row_start+=brows) { /* loop over row blocks of Csp */
1759077f23c2SHong Zhang       ca_den_ptr = ca_den;
1760980ae229SHong Zhang       for (k=0; k<ncolors; k++) { /* loop over colors (columns of Cden) */
1761eeb4fd02SHong Zhang         nrows = matcoloring->nrows[k];
1762eeb4fd02SHong Zhang         row   = rows  + colorforrow[k];
1763eeb4fd02SHong Zhang         idx   = den2sp + colorforrow[k];
1764eeb4fd02SHong Zhang         for (l=lstart[k]; l<nrows; l++) {
1765eeb4fd02SHong Zhang           if (row[l] >= row_end) {
1766eeb4fd02SHong Zhang             lstart[k] = l;
1767eeb4fd02SHong Zhang             break;
1768eeb4fd02SHong Zhang           } else {
1769077f23c2SHong Zhang             ca[idx[l]] = ca_den_ptr[row[l]];
1770eeb4fd02SHong Zhang           }
1771eeb4fd02SHong Zhang         }
1772077f23c2SHong Zhang         ca_den_ptr += m;
1773eeb4fd02SHong Zhang       }
1774077f23c2SHong Zhang       row_end += brows;
1775eeb4fd02SHong Zhang       if (row_end > m) row_end = m;
1776eeb4fd02SHong Zhang     }
1777077f23c2SHong Zhang   } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */
1778077f23c2SHong Zhang     ca_den_ptr = ca_den;
1779077f23c2SHong Zhang     for (k=0; k<ncolors; k++) {
1780077f23c2SHong Zhang       nrows = matcoloring->nrows[k];
1781077f23c2SHong Zhang       row   = rows  + colorforrow[k];
1782077f23c2SHong Zhang       idx   = den2sp + colorforrow[k];
1783077f23c2SHong Zhang       for (l=0; l<nrows; l++) {
1784077f23c2SHong Zhang         ca[idx[l]] = ca_den_ptr[row[l]];
1785077f23c2SHong Zhang       }
1786077f23c2SHong Zhang       ca_den_ptr += m;
1787077f23c2SHong Zhang     }
1788f99a636bSHong Zhang   }
1789f99a636bSHong Zhang 
17901683a169SBarry Smith   ierr = MatDenseRestoreArrayRead(Cden,&ca_den);CHKERRQ(ierr);
1791f99a636bSHong Zhang #if defined(PETSC_USE_INFO)
1792077f23c2SHong Zhang   if (matcoloring->brows > 0) {
1793f99a636bSHong Zhang     ierr = PetscInfo1(Csp,"Loop over %D row blocks for den2sp\n",brows);CHKERRQ(ierr);
1794e88777f2SHong Zhang   } else {
1795077f23c2SHong Zhang     ierr = PetscInfo(Csp,"Loop over colors/columns of Cden, inefficient for large sparse matrix product \n");CHKERRQ(ierr);
1796e88777f2SHong Zhang   }
1797f99a636bSHong Zhang #endif
17988972f759SHong Zhang   PetscFunctionReturn(0);
17998972f759SHong Zhang }
18008972f759SHong Zhang 
1801b9af6bddSHong Zhang PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c)
1802b1683b59SHong Zhang {
1803b1683b59SHong Zhang   PetscErrorCode ierr;
1804e88777f2SHong Zhang   PetscInt       i,n,nrows,Nbs,j,k,m,ncols,col,cm;
18051a83f524SJed Brown   const PetscInt *is,*ci,*cj,*row_idx;
1806b28d80bdSHong Zhang   PetscInt       nis = iscoloring->n,*rowhit,bs = 1;
1807b1683b59SHong Zhang   IS             *isa;
1808b28d80bdSHong Zhang   Mat_SeqAIJ     *csp = (Mat_SeqAIJ*)mat->data;
1809e88777f2SHong Zhang   PetscInt       *colorforrow,*rows,*rows_i,*idxhit,*spidx,*den2sp,*den2sp_i;
1810e88777f2SHong Zhang   PetscInt       *colorforcol,*columns,*columns_i,brows;
1811e88777f2SHong Zhang   PetscBool      flg;
1812b1683b59SHong Zhang 
1813b1683b59SHong Zhang   PetscFunctionBegin;
1814071fcb05SBarry Smith   ierr = ISColoringGetIS(iscoloring,PETSC_USE_POINTER,PETSC_IGNORE,&isa);CHKERRQ(ierr);
1815e99be685SHong Zhang 
18167ecccc15SHong Zhang   /* bs >1 is not being tested yet! */
1817e88777f2SHong Zhang   Nbs       = mat->cmap->N/bs;
1818b1683b59SHong Zhang   c->M      = mat->rmap->N/bs;  /* set total rows, columns and local rows */
1819e88777f2SHong Zhang   c->N      = Nbs;
1820e88777f2SHong Zhang   c->m      = c->M;
1821b1683b59SHong Zhang   c->rstart = 0;
1822e88777f2SHong Zhang   c->brows  = 100;
1823b1683b59SHong Zhang 
1824b1683b59SHong Zhang   c->ncolors = nis;
1825dcca6d9dSJed Brown   ierr = PetscMalloc3(nis,&c->ncolumns,nis,&c->nrows,nis+1,&colorforrow);CHKERRQ(ierr);
1826854ce69bSBarry Smith   ierr = PetscMalloc1(csp->nz+1,&rows);CHKERRQ(ierr);
1827854ce69bSBarry Smith   ierr = PetscMalloc1(csp->nz+1,&den2sp);CHKERRQ(ierr);
1828e88777f2SHong Zhang 
1829e88777f2SHong Zhang   brows = c->brows;
1830c5929fdfSBarry Smith   ierr = PetscOptionsGetInt(NULL,NULL,"-matden2sp_brows",&brows,&flg);CHKERRQ(ierr);
1831e88777f2SHong Zhang   if (flg) c->brows = brows;
1832eeb4fd02SHong Zhang   if (brows > 0) {
1833854ce69bSBarry Smith     ierr = PetscMalloc1(nis+1,&c->lstart);CHKERRQ(ierr);
1834e88777f2SHong Zhang   }
18352205254eSKarl Rupp 
1836d2853540SHong Zhang   colorforrow[0] = 0;
1837d2853540SHong Zhang   rows_i         = rows;
1838f99a636bSHong Zhang   den2sp_i       = den2sp;
1839b1683b59SHong Zhang 
1840854ce69bSBarry Smith   ierr = PetscMalloc1(nis+1,&colorforcol);CHKERRQ(ierr);
1841854ce69bSBarry Smith   ierr = PetscMalloc1(Nbs+1,&columns);CHKERRQ(ierr);
18422205254eSKarl Rupp 
1843d2853540SHong Zhang   colorforcol[0] = 0;
1844d2853540SHong Zhang   columns_i      = columns;
1845d2853540SHong Zhang 
1846077f23c2SHong Zhang   /* get column-wise storage of mat */
1847077f23c2SHong Zhang   ierr = MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr);
1848b1683b59SHong Zhang 
1849b28d80bdSHong Zhang   cm   = c->m;
1850854ce69bSBarry Smith   ierr = PetscMalloc1(cm+1,&rowhit);CHKERRQ(ierr);
1851854ce69bSBarry Smith   ierr = PetscMalloc1(cm+1,&idxhit);CHKERRQ(ierr);
1852f99a636bSHong Zhang   for (i=0; i<nis; i++) { /* loop over color */
1853b1683b59SHong Zhang     ierr = ISGetLocalSize(isa[i],&n);CHKERRQ(ierr);
1854b1683b59SHong Zhang     ierr = ISGetIndices(isa[i],&is);CHKERRQ(ierr);
18552205254eSKarl Rupp 
1856b1683b59SHong Zhang     c->ncolumns[i] = n;
1857b1683b59SHong Zhang     if (n) {
1858580bdb30SBarry Smith       ierr = PetscArraycpy(columns_i,is,n);CHKERRQ(ierr);
1859b1683b59SHong Zhang     }
1860d2853540SHong Zhang     colorforcol[i+1] = colorforcol[i] + n;
1861d2853540SHong Zhang     columns_i       += n;
1862b1683b59SHong Zhang 
1863b1683b59SHong Zhang     /* fast, crude version requires O(N*N) work */
1864580bdb30SBarry Smith     ierr = PetscArrayzero(rowhit,cm);CHKERRQ(ierr);
1865e99be685SHong Zhang 
1866b7caf3d6SHong Zhang     for (j=0; j<n; j++) { /* loop over columns*/
1867b1683b59SHong Zhang       col     = is[j];
1868d2853540SHong Zhang       row_idx = cj + ci[col];
1869b1683b59SHong Zhang       m       = ci[col+1] - ci[col];
1870b7caf3d6SHong Zhang       for (k=0; k<m; k++) { /* loop over columns marking them in rowhit */
1871e99be685SHong Zhang         idxhit[*row_idx]   = spidx[ci[col] + k];
1872d2853540SHong Zhang         rowhit[*row_idx++] = col + 1;
1873b1683b59SHong Zhang       }
1874b1683b59SHong Zhang     }
1875b1683b59SHong Zhang     /* count the number of hits */
1876b1683b59SHong Zhang     nrows = 0;
1877e8354b3eSHong Zhang     for (j=0; j<cm; j++) {
1878b1683b59SHong Zhang       if (rowhit[j]) nrows++;
1879b1683b59SHong Zhang     }
1880b1683b59SHong Zhang     c->nrows[i]      = nrows;
1881d2853540SHong Zhang     colorforrow[i+1] = colorforrow[i] + nrows;
1882d2853540SHong Zhang 
1883b1683b59SHong Zhang     nrows = 0;
1884b7caf3d6SHong Zhang     for (j=0; j<cm; j++) { /* loop over rows */
1885b1683b59SHong Zhang       if (rowhit[j]) {
1886d2853540SHong Zhang         rows_i[nrows]   = j;
188712b89a43SHong Zhang         den2sp_i[nrows] = idxhit[j];
1888b1683b59SHong Zhang         nrows++;
1889b1683b59SHong Zhang       }
1890b1683b59SHong Zhang     }
1891e88777f2SHong Zhang     den2sp_i += nrows;
1892077f23c2SHong Zhang 
1893b1683b59SHong Zhang     ierr    = ISRestoreIndices(isa[i],&is);CHKERRQ(ierr);
1894f99a636bSHong Zhang     rows_i += nrows;
1895b1683b59SHong Zhang   }
18960298fd71SBarry Smith   ierr = MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr);
1897b28d80bdSHong Zhang   ierr = PetscFree(rowhit);CHKERRQ(ierr);
1898071fcb05SBarry Smith   ierr = ISColoringRestoreIS(iscoloring,PETSC_USE_POINTER,&isa);CHKERRQ(ierr);
1899d2853540SHong Zhang   if (csp->nz != colorforrow[nis]) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"csp->nz %d != colorforrow[nis] %d",csp->nz,colorforrow[nis]);
1900b28d80bdSHong Zhang 
1901d2853540SHong Zhang   c->colorforrow = colorforrow;
1902d2853540SHong Zhang   c->rows        = rows;
1903f99a636bSHong Zhang   c->den2sp      = den2sp;
1904d2853540SHong Zhang   c->colorforcol = colorforcol;
1905d2853540SHong Zhang   c->columns     = columns;
19062205254eSKarl Rupp 
1907f94ccd6cSHong Zhang   ierr = PetscFree(idxhit);CHKERRQ(ierr);
1908b1683b59SHong Zhang   PetscFunctionReturn(0);
1909b1683b59SHong Zhang }
1910d013fc79Sandi selinger 
19114222ddf1SHong Zhang /* --------------------------------------------------------------- */
19124222ddf1SHong Zhang static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C)
1913df97dc6dSFande Kong {
19144222ddf1SHong Zhang   PetscErrorCode ierr;
19154222ddf1SHong Zhang   Mat_Product    *product = C->product;
19164222ddf1SHong Zhang   Mat            A=product->A,B=product->B;
19174222ddf1SHong Zhang 
1918df97dc6dSFande Kong   PetscFunctionBegin;
19194222ddf1SHong Zhang   if (C->ops->mattransposemultnumeric) {
19204222ddf1SHong Zhang     /* Alg: "outerproduct" */
19216718818eSStefano Zampini     ierr = (*C->ops->mattransposemultnumeric)(A,B,C);CHKERRQ(ierr);
19224222ddf1SHong Zhang   } else {
19234222ddf1SHong Zhang     /* Alg: "matmatmult" -- C = At*B */
19246718818eSStefano Zampini     Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)product->data;
19256718818eSStefano Zampini     Mat                 At;
19264222ddf1SHong Zhang 
19276718818eSStefano Zampini     if (!atb) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
19286718818eSStefano Zampini     At = atb->At;
19294222ddf1SHong Zhang     if (atb->updateAt) { /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ() */
19304222ddf1SHong Zhang       ierr = MatTranspose_SeqAIJ(A,MAT_REUSE_MATRIX,&At);CHKERRQ(ierr);
19314222ddf1SHong Zhang     }
19324222ddf1SHong Zhang     ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ(At,B,C);CHKERRQ(ierr);
19334222ddf1SHong Zhang     atb->updateAt = PETSC_TRUE;
19344222ddf1SHong Zhang   }
1935df97dc6dSFande Kong   PetscFunctionReturn(0);
1936df97dc6dSFande Kong }
1937df97dc6dSFande Kong 
19384222ddf1SHong Zhang static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C)
1939d013fc79Sandi selinger {
1940d013fc79Sandi selinger   PetscErrorCode ierr;
19414222ddf1SHong Zhang   Mat_Product    *product = C->product;
19424222ddf1SHong Zhang   Mat            A=product->A,B=product->B;
19434222ddf1SHong Zhang   PetscReal      fill=product->fill;
1944d013fc79Sandi selinger 
1945d013fc79Sandi selinger   PetscFunctionBegin;
19464222ddf1SHong Zhang   ierr = MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
19472869b61bSandi selinger 
19484222ddf1SHong Zhang   C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ;
19494222ddf1SHong Zhang   PetscFunctionReturn(0);
19502869b61bSandi selinger }
1951d013fc79Sandi selinger 
19524222ddf1SHong Zhang /* --------------------------------------------------------------- */
19534222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C)
19544222ddf1SHong Zhang {
19554222ddf1SHong Zhang   PetscErrorCode ierr;
19564222ddf1SHong Zhang   Mat_Product    *product = C->product;
19574222ddf1SHong Zhang   PetscInt       alg = 0; /* default algorithm */
19584222ddf1SHong Zhang   PetscBool      flg = PETSC_FALSE;
19594222ddf1SHong Zhang #if !defined(PETSC_HAVE_HYPRE)
19604222ddf1SHong Zhang   const char     *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"};
19614222ddf1SHong Zhang   PetscInt       nalg = 7;
19624222ddf1SHong Zhang #else
19634222ddf1SHong Zhang   const char     *algTypes[8] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge","hypre"};
19644222ddf1SHong Zhang   PetscInt       nalg = 8;
19654222ddf1SHong Zhang #endif
19664222ddf1SHong Zhang 
19674222ddf1SHong Zhang   PetscFunctionBegin;
19684222ddf1SHong Zhang   /* Set default algorithm */
19694222ddf1SHong Zhang   ierr = PetscStrcmp(C->product->alg,"default",&flg);CHKERRQ(ierr);
19704222ddf1SHong Zhang   if (flg) {
19714222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
1972d013fc79Sandi selinger   }
1973d013fc79Sandi selinger 
19744222ddf1SHong Zhang   /* Get runtime option */
19754222ddf1SHong Zhang   if (product->api_user) {
19764222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
19774222ddf1SHong Zhang     ierr = PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
19784222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
19794222ddf1SHong Zhang   } else {
19804222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr);
19814222ddf1SHong Zhang     ierr = PetscOptionsEList("-matproduct_ab_via","Algorithmic approach","MatProduct_AB",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
19824222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
1983d013fc79Sandi selinger   }
19844222ddf1SHong Zhang   if (flg) {
19854222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
1986d013fc79Sandi selinger   }
1987d013fc79Sandi selinger 
19884222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
19894222ddf1SHong Zhang   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ;
19904222ddf1SHong Zhang   PetscFunctionReturn(0);
19914222ddf1SHong Zhang }
1992d013fc79Sandi selinger 
19934222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C)
19944222ddf1SHong Zhang {
19954222ddf1SHong Zhang   PetscErrorCode ierr;
19964222ddf1SHong Zhang   Mat_Product    *product = C->product;
19974222ddf1SHong Zhang   PetscInt       alg = 0; /* default algorithm */
19984222ddf1SHong Zhang   PetscBool      flg = PETSC_FALSE;
19994222ddf1SHong Zhang   const char     *algTypes[2] = {"at*b","outerproduct"};
20004222ddf1SHong Zhang   PetscInt       nalg = 2;
2001d013fc79Sandi selinger 
20024222ddf1SHong Zhang   PetscFunctionBegin;
20034222ddf1SHong Zhang   /* Set default algorithm */
20044222ddf1SHong Zhang   ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr);
20054222ddf1SHong Zhang   if (flg) {
20064222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
20074222ddf1SHong Zhang   }
2008d013fc79Sandi selinger 
20094222ddf1SHong Zhang   /* Get runtime option */
20104222ddf1SHong Zhang   if (product->api_user) {
20114222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
20124222ddf1SHong Zhang     ierr = PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
20134222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
20144222ddf1SHong Zhang   } else {
20154222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr);
20164222ddf1SHong Zhang     ierr = PetscOptionsEList("-matproduct_atb_via","Algorithmic approach","MatProduct_AtB",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
20174222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
20184222ddf1SHong Zhang   }
20194222ddf1SHong Zhang   if (flg) {
20204222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
20214222ddf1SHong Zhang   }
2022d013fc79Sandi selinger 
20234222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ;
20244222ddf1SHong Zhang   PetscFunctionReturn(0);
20254222ddf1SHong Zhang }
20264222ddf1SHong Zhang 
20274222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C)
20284222ddf1SHong Zhang {
20294222ddf1SHong Zhang   PetscErrorCode ierr;
20304222ddf1SHong Zhang   Mat_Product    *product = C->product;
20314222ddf1SHong Zhang   PetscInt       alg = 0; /* default algorithm */
20324222ddf1SHong Zhang   PetscBool      flg = PETSC_FALSE;
20334222ddf1SHong Zhang   const char     *algTypes[2] = {"default","color"};
20344222ddf1SHong Zhang   PetscInt       nalg = 2;
20354222ddf1SHong Zhang 
20364222ddf1SHong Zhang   PetscFunctionBegin;
20374222ddf1SHong Zhang   /* Set default algorithm */
20384222ddf1SHong Zhang   ierr = PetscStrcmp(C->product->alg,"default",&flg);CHKERRQ(ierr);
20394222ddf1SHong Zhang   if (!flg) {
20404222ddf1SHong Zhang     alg = 1;
20414222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
20424222ddf1SHong Zhang   }
20434222ddf1SHong Zhang 
20444222ddf1SHong Zhang   /* Get runtime option */
20454222ddf1SHong Zhang   if (product->api_user) {
20464222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatTransposeMult","Mat");CHKERRQ(ierr);
20474222ddf1SHong Zhang     ierr = PetscOptionsEList("-matmattransmult_via","Algorithmic approach","MatMatTransposeMult",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
20484222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
20494222ddf1SHong Zhang   } else {
20504222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABt","Mat");CHKERRQ(ierr);
20514222ddf1SHong Zhang     ierr = PetscOptionsEList("-matproduct_abt_via","Algorithmic approach","MatProduct_ABt",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
20524222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
20534222ddf1SHong Zhang   }
20544222ddf1SHong Zhang   if (flg) {
20554222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
20564222ddf1SHong Zhang   }
20574222ddf1SHong Zhang 
20584222ddf1SHong Zhang   C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ;
20594222ddf1SHong Zhang   C->ops->productsymbolic          = MatProductSymbolic_ABt;
20604222ddf1SHong Zhang   PetscFunctionReturn(0);
20614222ddf1SHong Zhang }
20624222ddf1SHong Zhang 
20634222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C)
20644222ddf1SHong Zhang {
20654222ddf1SHong Zhang   PetscErrorCode ierr;
20664222ddf1SHong Zhang   Mat_Product    *product = C->product;
20674222ddf1SHong Zhang   PetscBool      flg = PETSC_FALSE;
20684222ddf1SHong Zhang   PetscInt       alg = 0; /* default algorithm -- alg=1 should be default!!! */
20694222ddf1SHong Zhang #if !defined(PETSC_HAVE_HYPRE)
20704222ddf1SHong Zhang   const char      *algTypes[2] = {"scalable","rap"};
20714222ddf1SHong Zhang   PetscInt        nalg = 2;
20724222ddf1SHong Zhang #else
20734222ddf1SHong Zhang   const char      *algTypes[3] = {"scalable","rap","hypre"};
20744222ddf1SHong Zhang   PetscInt        nalg = 3;
20754222ddf1SHong Zhang #endif
20764222ddf1SHong Zhang 
20774222ddf1SHong Zhang   PetscFunctionBegin;
20784222ddf1SHong Zhang   /* Set default algorithm */
20794222ddf1SHong Zhang   ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr);
20804222ddf1SHong Zhang   if (flg) {
20814222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
20824222ddf1SHong Zhang   }
20834222ddf1SHong Zhang 
20844222ddf1SHong Zhang   /* Get runtime option */
20854222ddf1SHong Zhang   if (product->api_user) {
20864222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
20874222ddf1SHong Zhang     ierr = PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
20884222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
20894222ddf1SHong Zhang   } else {
20904222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr);
20914222ddf1SHong Zhang     ierr = PetscOptionsEList("-matproduct_ptap_via","Algorithmic approach","MatProduct_PtAP",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
20924222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
20934222ddf1SHong Zhang   }
20944222ddf1SHong Zhang   if (flg) {
20954222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
20964222ddf1SHong Zhang   }
20974222ddf1SHong Zhang 
20984222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ;
20994222ddf1SHong Zhang   PetscFunctionReturn(0);
21004222ddf1SHong Zhang }
21014222ddf1SHong Zhang 
21024222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C)
21034222ddf1SHong Zhang {
21044222ddf1SHong Zhang   PetscErrorCode ierr;
21054222ddf1SHong Zhang   Mat_Product    *product = C->product;
21064222ddf1SHong Zhang   PetscBool      flg = PETSC_FALSE;
21074222ddf1SHong Zhang   PetscInt       alg = 0; /* default algorithm */
21084222ddf1SHong Zhang   const char     *algTypes[3] = {"r*a*rt","r*art","coloring_rart"};
21094222ddf1SHong Zhang   PetscInt        nalg = 3;
21104222ddf1SHong Zhang 
21114222ddf1SHong Zhang   PetscFunctionBegin;
21124222ddf1SHong Zhang   /* Set default algorithm */
21134222ddf1SHong Zhang   ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr);
21144222ddf1SHong Zhang   if (flg) {
21154222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
21164222ddf1SHong Zhang   }
21174222ddf1SHong Zhang 
21184222ddf1SHong Zhang   /* Get runtime option */
21194222ddf1SHong Zhang   if (product->api_user) {
21204222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatRARt","Mat");CHKERRQ(ierr);
21214222ddf1SHong Zhang     ierr = PetscOptionsEList("-matrart_via","Algorithmic approach","MatRARt",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
21224222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
21234222ddf1SHong Zhang   } else {
21244222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr);
21254222ddf1SHong Zhang     ierr = PetscOptionsEList("-matproduct_rart_via","Algorithmic approach","MatProduct_RARt",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
21264222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
21274222ddf1SHong Zhang   }
21284222ddf1SHong Zhang   if (flg) {
21294222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
21304222ddf1SHong Zhang   }
21314222ddf1SHong Zhang 
21324222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ;
21334222ddf1SHong Zhang   PetscFunctionReturn(0);
21344222ddf1SHong Zhang }
21354222ddf1SHong Zhang 
21364222ddf1SHong Zhang /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */
21374222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C)
21384222ddf1SHong Zhang {
21394222ddf1SHong Zhang   PetscErrorCode ierr;
21404222ddf1SHong Zhang   Mat_Product    *product = C->product;
21414222ddf1SHong Zhang   PetscInt       alg = 0; /* default algorithm */
21424222ddf1SHong Zhang   PetscBool      flg = PETSC_FALSE;
21434222ddf1SHong Zhang   const char     *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"};
21444222ddf1SHong Zhang   PetscInt       nalg = 7;
21454222ddf1SHong Zhang 
21464222ddf1SHong Zhang   PetscFunctionBegin;
21474222ddf1SHong Zhang   /* Set default algorithm */
21484222ddf1SHong Zhang   ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr);
21494222ddf1SHong Zhang   if (flg) {
21504222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
21514222ddf1SHong Zhang   }
21524222ddf1SHong Zhang 
21534222ddf1SHong Zhang   /* Get runtime option */
21544222ddf1SHong Zhang   if (product->api_user) {
21554222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr);
21564222ddf1SHong Zhang     ierr = PetscOptionsEList("-matmatmatmult_via","Algorithmic approach","MatMatMatMult",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
21574222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
21584222ddf1SHong Zhang   } else {
21594222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr);
21604222ddf1SHong Zhang     ierr = PetscOptionsEList("-matproduct_abc_via","Algorithmic approach","MatProduct_ABC",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
21614222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
21624222ddf1SHong Zhang   }
21634222ddf1SHong Zhang   if (flg) {
21644222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
21654222ddf1SHong Zhang   }
21664222ddf1SHong Zhang 
21674222ddf1SHong Zhang   C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ;
21684222ddf1SHong Zhang   C->ops->productsymbolic    = MatProductSymbolic_ABC;
21694222ddf1SHong Zhang   PetscFunctionReturn(0);
21704222ddf1SHong Zhang }
21714222ddf1SHong Zhang 
21724222ddf1SHong Zhang PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C)
21734222ddf1SHong Zhang {
21744222ddf1SHong Zhang   PetscErrorCode ierr;
21754222ddf1SHong Zhang   Mat_Product    *product = C->product;
21764222ddf1SHong Zhang 
21774222ddf1SHong Zhang   PetscFunctionBegin;
21784222ddf1SHong Zhang   switch (product->type) {
21794222ddf1SHong Zhang   case MATPRODUCT_AB:
21804222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_AB(C);CHKERRQ(ierr);
21814222ddf1SHong Zhang     break;
21824222ddf1SHong Zhang   case MATPRODUCT_AtB:
21834222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_AtB(C);CHKERRQ(ierr);
21844222ddf1SHong Zhang     break;
21854222ddf1SHong Zhang   case MATPRODUCT_ABt:
21864222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_ABt(C);CHKERRQ(ierr);
21874222ddf1SHong Zhang     break;
21884222ddf1SHong Zhang   case MATPRODUCT_PtAP:
21894222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_PtAP(C);CHKERRQ(ierr);
21904222ddf1SHong Zhang     break;
21914222ddf1SHong Zhang   case MATPRODUCT_RARt:
21924222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_RARt(C);CHKERRQ(ierr);
21934222ddf1SHong Zhang     break;
21944222ddf1SHong Zhang   case MATPRODUCT_ABC:
21954222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_ABC(C);CHKERRQ(ierr);
21964222ddf1SHong Zhang     break;
21976718818eSStefano Zampini   default:
21986718818eSStefano Zampini     break;
21994222ddf1SHong Zhang   }
2200d013fc79Sandi selinger   PetscFunctionReturn(0);
2201d013fc79Sandi selinger }
2202