xref: /petsc/src/mat/impls/aij/seq/matmatmult.c (revision 6718818e67c4802797f2feae43fc3d52878b6955)
1be1d678aSKris Buschelman 
2d50806bdSBarry Smith /*
32499ec78SHong Zhang   Defines matrix-matrix product routines for pairs of SeqAIJ matrices
4d50806bdSBarry Smith           C = A * B
5d50806bdSBarry Smith */
6d50806bdSBarry Smith 
7c6db04a5SJed Brown #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
8c6db04a5SJed Brown #include <../src/mat/utils/freespace.h>
9c6db04a5SJed Brown #include <petscbt.h>
10af0996ceSBarry Smith #include <petsc/private/isimpl.h>
1107475bc1SBarry Smith #include <../src/mat/impls/dense/seq/dense.h>
127bab7c10SHong Zhang 
13df97dc6dSFande Kong PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
14df97dc6dSFande Kong {
15df97dc6dSFande Kong   PetscErrorCode ierr;
16df97dc6dSFande Kong 
17df97dc6dSFande Kong   PetscFunctionBegin;
18df97dc6dSFande Kong   if (C->ops->matmultnumeric) {
19*6718818eSStefano Zampini     if (C->ops->matmultnumeric == MatMatMultNumeric_SeqAIJ_SeqAIJ) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Recursive call");
20df97dc6dSFande Kong     ierr = (*C->ops->matmultnumeric)(A,B,C);CHKERRQ(ierr);
21df97dc6dSFande Kong   } else {
22df97dc6dSFande Kong     ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A,B,C);CHKERRQ(ierr);
23df97dc6dSFande Kong   }
24df97dc6dSFande Kong   PetscFunctionReturn(0);
25df97dc6dSFande Kong }
26df97dc6dSFande Kong 
274222ddf1SHong Zhang /* Modified from MatCreateSeqAIJWithArrays() */
28e4e71118SStefano Zampini PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt i[],PetscInt j[],PetscScalar a[],MatType mtype,Mat mat)
29df97dc6dSFande Kong {
30df97dc6dSFande Kong   PetscErrorCode ierr;
314222ddf1SHong Zhang   PetscInt       ii;
324222ddf1SHong Zhang   Mat_SeqAIJ     *aij;
33e4e71118SStefano Zampini   PetscBool      isseqaij;
345c66b693SKris Buschelman 
355c66b693SKris Buschelman   PetscFunctionBegin;
364222ddf1SHong Zhang   if (m > 0 && i[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"i (row indices) must start with 0");
374222ddf1SHong Zhang   ierr = MatSetSizes(mat,m,n,m,n);CHKERRQ(ierr);
384222ddf1SHong Zhang 
39e4e71118SStefano Zampini   if (!mtype) {
40e4e71118SStefano Zampini     ierr = PetscObjectBaseTypeCompare((PetscObject)mat,MATSEQAIJ,&isseqaij);CHKERRQ(ierr);
41e4e71118SStefano Zampini     if (!isseqaij) { ierr = MatSetType(mat,MATSEQAIJ);CHKERRQ(ierr); }
42e4e71118SStefano Zampini   } else {
43e4e71118SStefano Zampini     ierr = MatSetType(mat,mtype);CHKERRQ(ierr);
44e4e71118SStefano Zampini   }
454222ddf1SHong Zhang   ierr = MatSeqAIJSetPreallocation_SeqAIJ(mat,MAT_SKIP_ALLOCATION,0);CHKERRQ(ierr);
464222ddf1SHong Zhang   aij  = (Mat_SeqAIJ*)(mat)->data;
474222ddf1SHong Zhang   ierr = PetscMalloc1(m,&aij->imax);CHKERRQ(ierr);
484222ddf1SHong Zhang   ierr = PetscMalloc1(m,&aij->ilen);CHKERRQ(ierr);
494222ddf1SHong Zhang 
504222ddf1SHong Zhang   aij->i            = i;
514222ddf1SHong Zhang   aij->j            = j;
524222ddf1SHong Zhang   aij->a            = a;
534222ddf1SHong Zhang   aij->singlemalloc = PETSC_FALSE;
544222ddf1SHong Zhang   aij->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
554222ddf1SHong Zhang   aij->free_a       = PETSC_FALSE;
564222ddf1SHong Zhang   aij->free_ij      = PETSC_FALSE;
574222ddf1SHong Zhang 
584222ddf1SHong Zhang   for (ii=0; ii<m; ii++) {
594222ddf1SHong Zhang     aij->ilen[ii] = aij->imax[ii] = i[ii+1] - i[ii];
6025023028SHong Zhang   }
614222ddf1SHong Zhang 
625c66b693SKris Buschelman   PetscFunctionReturn(0);
635c66b693SKris Buschelman }
641c24bd37SHong Zhang 
654222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
664222ddf1SHong Zhang {
674222ddf1SHong Zhang   PetscErrorCode      ierr;
684222ddf1SHong Zhang   Mat_Product         *product = C->product;
694222ddf1SHong Zhang   MatProductAlgorithm alg;
704222ddf1SHong Zhang   PetscBool           flg;
714222ddf1SHong Zhang 
724222ddf1SHong Zhang   PetscFunctionBegin;
734222ddf1SHong Zhang   if (product) {
744222ddf1SHong Zhang     alg = product->alg;
754222ddf1SHong Zhang   } else {
764222ddf1SHong Zhang     alg = "sorted";
774222ddf1SHong Zhang   }
784222ddf1SHong Zhang   /* sorted */
794222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"sorted",&flg);CHKERRQ(ierr);
804222ddf1SHong Zhang   if (flg) {
814222ddf1SHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A,B,fill,C);CHKERRQ(ierr);
824222ddf1SHong Zhang     PetscFunctionReturn(0);
834222ddf1SHong Zhang   }
844222ddf1SHong Zhang 
854222ddf1SHong Zhang   /* scalable */
864222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"scalable",&flg);CHKERRQ(ierr);
874222ddf1SHong Zhang   if (flg) {
884222ddf1SHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A,B,fill,C);CHKERRQ(ierr);
894222ddf1SHong Zhang     PetscFunctionReturn(0);
904222ddf1SHong Zhang   }
914222ddf1SHong Zhang 
924222ddf1SHong Zhang   /* scalable_fast */
934222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"scalable_fast",&flg);CHKERRQ(ierr);
944222ddf1SHong Zhang   if (flg) {
954222ddf1SHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A,B,fill,C);CHKERRQ(ierr);
964222ddf1SHong Zhang     PetscFunctionReturn(0);
974222ddf1SHong Zhang   }
984222ddf1SHong Zhang 
994222ddf1SHong Zhang   /* heap */
1004222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"heap",&flg);CHKERRQ(ierr);
1014222ddf1SHong Zhang   if (flg) {
1024222ddf1SHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A,B,fill,C);CHKERRQ(ierr);
1034222ddf1SHong Zhang     PetscFunctionReturn(0);
1044222ddf1SHong Zhang   }
1054222ddf1SHong Zhang 
1064222ddf1SHong Zhang   /* btheap */
1074222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"btheap",&flg);CHKERRQ(ierr);
1084222ddf1SHong Zhang   if (flg) {
1094222ddf1SHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A,B,fill,C);CHKERRQ(ierr);
1104222ddf1SHong Zhang     PetscFunctionReturn(0);
1114222ddf1SHong Zhang   }
1124222ddf1SHong Zhang 
1134222ddf1SHong Zhang   /* llcondensed */
1144222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"llcondensed",&flg);CHKERRQ(ierr);
1154222ddf1SHong Zhang   if (flg) {
1164222ddf1SHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A,B,fill,C);CHKERRQ(ierr);
1174222ddf1SHong Zhang     PetscFunctionReturn(0);
1184222ddf1SHong Zhang   }
1194222ddf1SHong Zhang 
1204222ddf1SHong Zhang   /* rowmerge */
1214222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"rowmerge",&flg);CHKERRQ(ierr);
1224222ddf1SHong Zhang   if (flg) {
1234222ddf1SHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A,B,fill,C);CHKERRQ(ierr);
1244222ddf1SHong Zhang     PetscFunctionReturn(0);
1254222ddf1SHong Zhang   }
1264222ddf1SHong Zhang 
1274222ddf1SHong Zhang #if defined(PETSC_HAVE_HYPRE)
1284222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"hypre",&flg);CHKERRQ(ierr);
1294222ddf1SHong Zhang   if (flg) {
1304222ddf1SHong Zhang     ierr = MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A,B,fill,C);CHKERRQ(ierr);
1314222ddf1SHong Zhang     PetscFunctionReturn(0);
1324222ddf1SHong Zhang   }
1334222ddf1SHong Zhang #endif
1344222ddf1SHong Zhang 
1354222ddf1SHong Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported");
1364222ddf1SHong Zhang   PetscFunctionReturn(0);
1374222ddf1SHong Zhang }
1384222ddf1SHong Zhang 
1394222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A,Mat B,PetscReal fill,Mat C)
140b561aa9dSHong Zhang {
141b561aa9dSHong Zhang   PetscErrorCode     ierr;
142b561aa9dSHong Zhang   Mat_SeqAIJ         *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
143971236abSHong Zhang   PetscInt           *ai=a->i,*bi=b->i,*ci,*cj;
144c1ba5575SJed Brown   PetscInt           am =A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
145b561aa9dSHong Zhang   PetscReal          afill;
146eca6b86aSHong Zhang   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
147eca6b86aSHong Zhang   PetscTable         ta;
148fb908031SHong Zhang   PetscBT            lnkbt;
1490298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
150b561aa9dSHong Zhang 
151b561aa9dSHong Zhang   PetscFunctionBegin;
152bd958071SHong Zhang   /* Get ci and cj */
153bd958071SHong Zhang   /*---------------*/
154fb908031SHong Zhang   /* Allocate ci array, arrays for fill computation and */
155fb908031SHong Zhang   /* free space for accumulating nonzero column info */
156854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
157fb908031SHong Zhang   ci[0] = 0;
158fb908031SHong Zhang 
159fb908031SHong Zhang   /* create and initialize a linked list */
160c373ccc6SHong Zhang   ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr);
161c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b,bm,ta);
162eca6b86aSHong Zhang   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
163eca6b86aSHong Zhang   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
164eca6b86aSHong Zhang 
165eca6b86aSHong Zhang   ierr = PetscLLCondensedCreate(Crmax,bn,&lnk,&lnkbt);CHKERRQ(ierr);
166fb908031SHong Zhang 
167fb908031SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
168f91af8c7SBarry Smith   ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
1692205254eSKarl Rupp 
170fb908031SHong Zhang   current_space = free_space;
171fb908031SHong Zhang 
172fb908031SHong Zhang   /* Determine ci and cj */
173fb908031SHong Zhang   for (i=0; i<am; i++) {
174fb908031SHong Zhang     anzi = ai[i+1] - ai[i];
175fb908031SHong Zhang     aj   = a->j + ai[i];
176fb908031SHong Zhang     for (j=0; j<anzi; j++) {
177fb908031SHong Zhang       brow = aj[j];
178fb908031SHong Zhang       bnzj = bi[brow+1] - bi[brow];
179fb908031SHong Zhang       bj   = b->j + bi[brow];
180fb908031SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
181fb908031SHong Zhang       ierr = PetscLLCondensedAddSorted(bnzj,bj,lnk,lnkbt);CHKERRQ(ierr);
182fb908031SHong Zhang     }
183fb908031SHong Zhang     cnzi = lnk[0];
184fb908031SHong Zhang 
185fb908031SHong Zhang     /* If free space is not available, make more free space */
186fb908031SHong Zhang     /* Double the amount of total space in the list */
187fb908031SHong Zhang     if (current_space->local_remaining<cnzi) {
188f91af8c7SBarry Smith       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);CHKERRQ(ierr);
189fb908031SHong Zhang       ndouble++;
190fb908031SHong Zhang     }
191fb908031SHong Zhang 
192fb908031SHong Zhang     /* Copy data into free space, then initialize lnk */
193fb908031SHong Zhang     ierr = PetscLLCondensedClean(bn,cnzi,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
1942205254eSKarl Rupp 
195fb908031SHong Zhang     current_space->array           += cnzi;
196fb908031SHong Zhang     current_space->local_used      += cnzi;
197fb908031SHong Zhang     current_space->local_remaining -= cnzi;
1982205254eSKarl Rupp 
199fb908031SHong Zhang     ci[i+1] = ci[i] + cnzi;
200fb908031SHong Zhang   }
201fb908031SHong Zhang 
202fb908031SHong Zhang   /* Column indices are in the list of free space */
203fb908031SHong Zhang   /* Allocate space for cj, initialize cj, and */
204fb908031SHong Zhang   /* destroy list of free space and other temporary array(s) */
205854ce69bSBarry Smith   ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr);
206fb908031SHong Zhang   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
207fb908031SHong Zhang   ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr);
208b561aa9dSHong Zhang 
20926be0446SHong Zhang   /* put together the new symbolic matrix */
210e4e71118SStefano Zampini   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
2114222ddf1SHong Zhang   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
21258c24d83SHong Zhang 
21358c24d83SHong Zhang   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
21458c24d83SHong Zhang   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
2154222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
216aa1aec99SHong Zhang   c->free_a  = PETSC_FALSE;
217e6b907acSBarry Smith   c->free_ij = PETSC_TRUE;
21858c24d83SHong Zhang   c->nonew   = 0;
2194222ddf1SHong Zhang 
2204222ddf1SHong Zhang   /* fast, needs non-scalable O(bn) array 'abdense' */
2214222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
2220b7e3e3dSHong Zhang 
2237212da7cSHong Zhang   /* set MatInfo */
2247212da7cSHong Zhang   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
225f2b054eeSHong Zhang   if (afill < 1.0) afill = 1.0;
2267212da7cSHong Zhang   c->maxnz                  = ci[am];
2277212da7cSHong Zhang   c->nz                     = ci[am];
2284222ddf1SHong Zhang   C->info.mallocs           = ndouble;
2294222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
2304222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
2317212da7cSHong Zhang 
2327212da7cSHong Zhang #if defined(PETSC_USE_INFO)
2337212da7cSHong Zhang   if (ci[am]) {
2344222ddf1SHong Zhang     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
2354222ddf1SHong Zhang     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
236f2b054eeSHong Zhang   } else {
2374222ddf1SHong Zhang     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
238be0fcf8dSHong Zhang   }
239f2b054eeSHong Zhang #endif
24058c24d83SHong Zhang   PetscFunctionReturn(0);
24158c24d83SHong Zhang }
242d50806bdSBarry Smith 
243df97dc6dSFande Kong PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,Mat C)
244d50806bdSBarry Smith {
245dfbe8321SBarry Smith   PetscErrorCode ierr;
246d13dce4bSSatish Balay   PetscLogDouble flops=0.0;
247d50806bdSBarry Smith   Mat_SeqAIJ     *a   = (Mat_SeqAIJ*)A->data;
248d50806bdSBarry Smith   Mat_SeqAIJ     *b   = (Mat_SeqAIJ*)B->data;
249d50806bdSBarry Smith   Mat_SeqAIJ     *c   = (Mat_SeqAIJ*)C->data;
25038baddfdSBarry Smith   PetscInt       *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
251c58ca2e3SHong Zhang   PetscInt       am   =A->rmap->n,cm=C->rmap->n;
252519eb980SHong Zhang   PetscInt       i,j,k,anzi,bnzi,cnzi,brow;
253aa1aec99SHong Zhang   PetscScalar    *aa=a->a,*ba=b->a,*baj,*ca,valtmp;
254aa1aec99SHong Zhang   PetscScalar    *ab_dense;
255*6718818eSStefano Zampini   PetscContainer cab_dense;
256d50806bdSBarry Smith 
257d50806bdSBarry Smith   PetscFunctionBegin;
258aa1aec99SHong Zhang   if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
259854ce69bSBarry Smith     ierr      = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
260aa1aec99SHong Zhang     c->a      = ca;
261aa1aec99SHong Zhang     c->free_a = PETSC_TRUE;
262*6718818eSStefano Zampini   } else ca = c->a;
263*6718818eSStefano Zampini 
264*6718818eSStefano Zampini   /* TODO this should be done in the symbolic phase */
265*6718818eSStefano Zampini   /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers
266*6718818eSStefano Zampini      that is hard to eradicate) */
267*6718818eSStefano Zampini   ierr = PetscObjectQuery((PetscObject)C,"__PETSc__ab_dense",(PetscObject*)&cab_dense);CHKERRQ(ierr);
268*6718818eSStefano Zampini   if (!cab_dense) {
269*6718818eSStefano Zampini     ierr = PetscMalloc1(B->cmap->N,&ab_dense);CHKERRQ(ierr);
270*6718818eSStefano Zampini     ierr = PetscContainerCreate(PETSC_COMM_SELF,&cab_dense);CHKERRQ(ierr);
271*6718818eSStefano Zampini     ierr = PetscContainerSetPointer(cab_dense,ab_dense);CHKERRQ(ierr);
272*6718818eSStefano Zampini     ierr = PetscContainerSetUserDestroy(cab_dense,PetscContainerUserDestroyDefault);CHKERRQ(ierr);
273*6718818eSStefano Zampini     ierr = PetscObjectCompose((PetscObject)C,"__PETSc__ab_dense",(PetscObject)cab_dense);CHKERRQ(ierr);
274*6718818eSStefano Zampini     ierr = PetscObjectDereference((PetscObject)cab_dense);CHKERRQ(ierr);
275d90d86d0SMatthew G. Knepley   }
276*6718818eSStefano Zampini   ierr = PetscContainerGetPointer(cab_dense,(void**)&ab_dense);CHKERRQ(ierr);
277*6718818eSStefano Zampini   ierr = PetscArrayzero(ab_dense,B->cmap->N);CHKERRQ(ierr);
278aa1aec99SHong Zhang 
279c124e916SHong Zhang   /* clean old values in C */
280580bdb30SBarry Smith   ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr);
281d50806bdSBarry Smith   /* Traverse A row-wise. */
282d50806bdSBarry Smith   /* Build the ith row in C by summing over nonzero columns in A, */
283d50806bdSBarry Smith   /* the rows of B corresponding to nonzeros of A. */
284d50806bdSBarry Smith   for (i=0; i<am; i++) {
285d50806bdSBarry Smith     anzi = ai[i+1] - ai[i];
286d50806bdSBarry Smith     for (j=0; j<anzi; j++) {
287519eb980SHong Zhang       brow = aj[j];
288d50806bdSBarry Smith       bnzi = bi[brow+1] - bi[brow];
289d50806bdSBarry Smith       bjj  = bj + bi[brow];
290d50806bdSBarry Smith       baj  = ba + bi[brow];
291519eb980SHong Zhang       /* perform dense axpy */
29236ec6d2dSHong Zhang       valtmp = aa[j];
293519eb980SHong Zhang       for (k=0; k<bnzi; k++) {
29436ec6d2dSHong Zhang         ab_dense[bjj[k]] += valtmp*baj[k];
295519eb980SHong Zhang       }
296519eb980SHong Zhang       flops += 2*bnzi;
297519eb980SHong Zhang     }
298c58ca2e3SHong Zhang     aj += anzi; aa += anzi;
299c58ca2e3SHong Zhang 
300c58ca2e3SHong Zhang     cnzi = ci[i+1] - ci[i];
301519eb980SHong Zhang     for (k=0; k<cnzi; k++) {
302519eb980SHong Zhang       ca[k]          += ab_dense[cj[k]];
303519eb980SHong Zhang       ab_dense[cj[k]] = 0.0; /* zero ab_dense */
304519eb980SHong Zhang     }
305519eb980SHong Zhang     flops += cnzi;
306519eb980SHong Zhang     cj    += cnzi; ca += cnzi;
307519eb980SHong Zhang   }
308c58ca2e3SHong Zhang   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
309c58ca2e3SHong Zhang   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
310c58ca2e3SHong Zhang   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
311c58ca2e3SHong Zhang   PetscFunctionReturn(0);
312c58ca2e3SHong Zhang }
313c58ca2e3SHong Zhang 
31425023028SHong Zhang PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C)
315c58ca2e3SHong Zhang {
316c58ca2e3SHong Zhang   PetscErrorCode ierr;
317c58ca2e3SHong Zhang   PetscLogDouble flops=0.0;
318c58ca2e3SHong Zhang   Mat_SeqAIJ     *a   = (Mat_SeqAIJ*)A->data;
319c58ca2e3SHong Zhang   Mat_SeqAIJ     *b   = (Mat_SeqAIJ*)B->data;
320c58ca2e3SHong Zhang   Mat_SeqAIJ     *c   = (Mat_SeqAIJ*)C->data;
321c58ca2e3SHong Zhang   PetscInt       *ai  = a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
322c58ca2e3SHong Zhang   PetscInt       am   = A->rmap->N,cm=C->rmap->N;
323c58ca2e3SHong Zhang   PetscInt       i,j,k,anzi,bnzi,cnzi,brow;
32436ec6d2dSHong Zhang   PetscScalar    *aa=a->a,*ba=b->a,*baj,*ca=c->a,valtmp;
325c58ca2e3SHong Zhang   PetscInt       nextb;
326c58ca2e3SHong Zhang 
327c58ca2e3SHong Zhang   PetscFunctionBegin;
328cf742fccSHong Zhang   if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
329cf742fccSHong Zhang     ierr      = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
330cf742fccSHong Zhang     c->a      = ca;
331cf742fccSHong Zhang     c->free_a = PETSC_TRUE;
332cf742fccSHong Zhang   }
333cf742fccSHong Zhang 
334c58ca2e3SHong Zhang   /* clean old values in C */
335580bdb30SBarry Smith   ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr);
336c58ca2e3SHong Zhang   /* Traverse A row-wise. */
337c58ca2e3SHong Zhang   /* Build the ith row in C by summing over nonzero columns in A, */
338c58ca2e3SHong Zhang   /* the rows of B corresponding to nonzeros of A. */
339519eb980SHong Zhang   for (i=0; i<am; i++) {
340519eb980SHong Zhang     anzi = ai[i+1] - ai[i];
341519eb980SHong Zhang     cnzi = ci[i+1] - ci[i];
342519eb980SHong Zhang     for (j=0; j<anzi; j++) {
343519eb980SHong Zhang       brow = aj[j];
344519eb980SHong Zhang       bnzi = bi[brow+1] - bi[brow];
345519eb980SHong Zhang       bjj  = bj + bi[brow];
346519eb980SHong Zhang       baj  = ba + bi[brow];
347519eb980SHong Zhang       /* perform sparse axpy */
34836ec6d2dSHong Zhang       valtmp = aa[j];
349c124e916SHong Zhang       nextb  = 0;
350c124e916SHong Zhang       for (k=0; nextb<bnzi; k++) {
351c124e916SHong Zhang         if (cj[k] == bjj[nextb]) { /* ccol == bcol */
35236ec6d2dSHong Zhang           ca[k] += valtmp*baj[nextb++];
353c124e916SHong Zhang         }
354d50806bdSBarry Smith       }
355d50806bdSBarry Smith       flops += 2*bnzi;
356d50806bdSBarry Smith     }
357519eb980SHong Zhang     aj += anzi; aa += anzi;
358519eb980SHong Zhang     cj += cnzi; ca += cnzi;
359d50806bdSBarry Smith   }
360c58ca2e3SHong Zhang 
361716bacf3SKris Buschelman   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
362716bacf3SKris Buschelman   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
363d50806bdSBarry Smith   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
364d50806bdSBarry Smith   PetscFunctionReturn(0);
365d50806bdSBarry Smith }
366bc011b1eSHong Zhang 
3674222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A,Mat B,PetscReal fill,Mat C)
36825296bd5SBarry Smith {
36925296bd5SBarry Smith   PetscErrorCode     ierr;
37025296bd5SBarry Smith   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
37125296bd5SBarry Smith   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
3723c50cad2SHong Zhang   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
37325296bd5SBarry Smith   MatScalar          *ca;
37425296bd5SBarry Smith   PetscReal          afill;
375eca6b86aSHong Zhang   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
376eca6b86aSHong Zhang   PetscTable         ta;
3770298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
37825296bd5SBarry Smith 
37925296bd5SBarry Smith   PetscFunctionBegin;
3803c50cad2SHong Zhang   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
3813c50cad2SHong Zhang   /*-----------------------------------------------------------------------------------------*/
3823c50cad2SHong Zhang   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
383854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
3843c50cad2SHong Zhang   ci[0] = 0;
3853c50cad2SHong Zhang 
3863c50cad2SHong Zhang   /* create and initialize a linked list */
387c373ccc6SHong Zhang   ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr);
388c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b,bm,ta);
389eca6b86aSHong Zhang   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
390eca6b86aSHong Zhang   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
391eca6b86aSHong Zhang 
392eca6b86aSHong Zhang   ierr = PetscLLCondensedCreate_fast(Crmax,&lnk);CHKERRQ(ierr);
3933c50cad2SHong Zhang 
3943c50cad2SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
395f91af8c7SBarry Smith   ierr          = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
3963c50cad2SHong Zhang   current_space = free_space;
3973c50cad2SHong Zhang 
3983c50cad2SHong Zhang   /* Determine ci and cj */
3993c50cad2SHong Zhang   for (i=0; i<am; i++) {
4003c50cad2SHong Zhang     anzi = ai[i+1] - ai[i];
4013c50cad2SHong Zhang     aj   = a->j + ai[i];
4023c50cad2SHong Zhang     for (j=0; j<anzi; j++) {
4033c50cad2SHong Zhang       brow = aj[j];
4043c50cad2SHong Zhang       bnzj = bi[brow+1] - bi[brow];
4053c50cad2SHong Zhang       bj   = b->j + bi[brow];
4063c50cad2SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
4073c50cad2SHong Zhang       ierr = PetscLLCondensedAddSorted_fast(bnzj,bj,lnk);CHKERRQ(ierr);
4083c50cad2SHong Zhang     }
4093c50cad2SHong Zhang     cnzi = lnk[1];
4103c50cad2SHong Zhang 
4113c50cad2SHong Zhang     /* If free space is not available, make more free space */
4123c50cad2SHong Zhang     /* Double the amount of total space in the list */
4133c50cad2SHong Zhang     if (current_space->local_remaining<cnzi) {
414f91af8c7SBarry Smith       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);CHKERRQ(ierr);
4153c50cad2SHong Zhang       ndouble++;
4163c50cad2SHong Zhang     }
4173c50cad2SHong Zhang 
4183c50cad2SHong Zhang     /* Copy data into free space, then initialize lnk */
4193c50cad2SHong Zhang     ierr = PetscLLCondensedClean_fast(cnzi,current_space->array,lnk);CHKERRQ(ierr);
4202205254eSKarl Rupp 
4213c50cad2SHong Zhang     current_space->array           += cnzi;
4223c50cad2SHong Zhang     current_space->local_used      += cnzi;
4233c50cad2SHong Zhang     current_space->local_remaining -= cnzi;
4242205254eSKarl Rupp 
4253c50cad2SHong Zhang     ci[i+1] = ci[i] + cnzi;
4263c50cad2SHong Zhang   }
4273c50cad2SHong Zhang 
4283c50cad2SHong Zhang   /* Column indices are in the list of free space */
4293c50cad2SHong Zhang   /* Allocate space for cj, initialize cj, and */
4303c50cad2SHong Zhang   /* destroy list of free space and other temporary array(s) */
431854ce69bSBarry Smith   ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr);
4323c50cad2SHong Zhang   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
4333c50cad2SHong Zhang   ierr = PetscLLCondensedDestroy_fast(lnk);CHKERRQ(ierr);
43425296bd5SBarry Smith 
43525296bd5SBarry Smith   /* Allocate space for ca */
436580bdb30SBarry Smith   ierr = PetscCalloc1(ci[am]+1,&ca);CHKERRQ(ierr);
43725296bd5SBarry Smith 
43825296bd5SBarry Smith   /* put together the new symbolic matrix */
439e4e71118SStefano Zampini   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
4404222ddf1SHong Zhang   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
44125296bd5SBarry Smith 
44225296bd5SBarry Smith   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
44325296bd5SBarry Smith   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
4444222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
44525296bd5SBarry Smith   c->free_a  = PETSC_TRUE;
44625296bd5SBarry Smith   c->free_ij = PETSC_TRUE;
44725296bd5SBarry Smith   c->nonew   = 0;
4482205254eSKarl Rupp 
4494222ddf1SHong Zhang   /* slower, less memory */
4504222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
45125296bd5SBarry Smith 
45225296bd5SBarry Smith   /* set MatInfo */
45325296bd5SBarry Smith   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
45425296bd5SBarry Smith   if (afill < 1.0) afill = 1.0;
45525296bd5SBarry Smith   c->maxnz                     = ci[am];
45625296bd5SBarry Smith   c->nz                        = ci[am];
4574222ddf1SHong Zhang   C->info.mallocs           = ndouble;
4584222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
4594222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
46025296bd5SBarry Smith 
46125296bd5SBarry Smith #if defined(PETSC_USE_INFO)
46225296bd5SBarry Smith   if (ci[am]) {
4634222ddf1SHong Zhang     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
4644222ddf1SHong Zhang     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
46525296bd5SBarry Smith   } else {
4664222ddf1SHong Zhang     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
46725296bd5SBarry Smith   }
46825296bd5SBarry Smith #endif
46925296bd5SBarry Smith   PetscFunctionReturn(0);
47025296bd5SBarry Smith }
47125296bd5SBarry Smith 
4724222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat C)
473e9e4536cSHong Zhang {
474e9e4536cSHong Zhang   PetscErrorCode     ierr;
475e9e4536cSHong Zhang   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
476bf9555e6SHong Zhang   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
47725c41797SHong Zhang   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
478e9e4536cSHong Zhang   MatScalar          *ca;
479e9e4536cSHong Zhang   PetscReal          afill;
480eca6b86aSHong Zhang   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
481eca6b86aSHong Zhang   PetscTable         ta;
4820298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
483e9e4536cSHong Zhang 
484e9e4536cSHong Zhang   PetscFunctionBegin;
485bd958071SHong Zhang   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
486bd958071SHong Zhang   /*---------------------------------------------------------------------------------------------*/
487bd958071SHong Zhang   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
488854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
489bd958071SHong Zhang   ci[0] = 0;
490bd958071SHong Zhang 
491bd958071SHong Zhang   /* create and initialize a linked list */
492c373ccc6SHong Zhang   ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr);
493c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b,bm,ta);
494eca6b86aSHong Zhang   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
495eca6b86aSHong Zhang   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
496eca6b86aSHong Zhang   ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);
497bd958071SHong Zhang 
498bd958071SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
499f91af8c7SBarry Smith   ierr          = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
500bd958071SHong Zhang   current_space = free_space;
501bd958071SHong Zhang 
502bd958071SHong Zhang   /* Determine ci and cj */
503bd958071SHong Zhang   for (i=0; i<am; i++) {
504bd958071SHong Zhang     anzi = ai[i+1] - ai[i];
505bd958071SHong Zhang     aj   = a->j + ai[i];
506bd958071SHong Zhang     for (j=0; j<anzi; j++) {
507bd958071SHong Zhang       brow = aj[j];
508bd958071SHong Zhang       bnzj = bi[brow+1] - bi[brow];
509bd958071SHong Zhang       bj   = b->j + bi[brow];
510bd958071SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
511bd958071SHong Zhang       ierr = PetscLLCondensedAddSorted_Scalable(bnzj,bj,lnk);CHKERRQ(ierr);
512bd958071SHong Zhang     }
513bd958071SHong Zhang     cnzi = lnk[0];
514bd958071SHong Zhang 
515bd958071SHong Zhang     /* If free space is not available, make more free space */
516bd958071SHong Zhang     /* Double the amount of total space in the list */
517bd958071SHong Zhang     if (current_space->local_remaining<cnzi) {
518f91af8c7SBarry Smith       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);CHKERRQ(ierr);
519bd958071SHong Zhang       ndouble++;
520bd958071SHong Zhang     }
521bd958071SHong Zhang 
522bd958071SHong Zhang     /* Copy data into free space, then initialize lnk */
523bd958071SHong Zhang     ierr = PetscLLCondensedClean_Scalable(cnzi,current_space->array,lnk);CHKERRQ(ierr);
5242205254eSKarl Rupp 
525bd958071SHong Zhang     current_space->array           += cnzi;
526bd958071SHong Zhang     current_space->local_used      += cnzi;
527bd958071SHong Zhang     current_space->local_remaining -= cnzi;
5282205254eSKarl Rupp 
529bd958071SHong Zhang     ci[i+1] = ci[i] + cnzi;
530bd958071SHong Zhang   }
531bd958071SHong Zhang 
532bd958071SHong Zhang   /* Column indices are in the list of free space */
533bd958071SHong Zhang   /* Allocate space for cj, initialize cj, and */
534bd958071SHong Zhang   /* destroy list of free space and other temporary array(s) */
535854ce69bSBarry Smith   ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr);
536bd958071SHong Zhang   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
537bd958071SHong Zhang   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
538e9e4536cSHong Zhang 
539e9e4536cSHong Zhang   /* Allocate space for ca */
540bd958071SHong Zhang   /*-----------------------*/
541580bdb30SBarry Smith   ierr = PetscCalloc1(ci[am]+1,&ca);CHKERRQ(ierr);
542e9e4536cSHong Zhang 
543e9e4536cSHong Zhang   /* put together the new symbolic matrix */
544e4e71118SStefano Zampini   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
5454222ddf1SHong Zhang   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
546e9e4536cSHong Zhang 
547e9e4536cSHong Zhang   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
548e9e4536cSHong Zhang   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
5494222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
550e9e4536cSHong Zhang   c->free_a  = PETSC_TRUE;
551e9e4536cSHong Zhang   c->free_ij = PETSC_TRUE;
552e9e4536cSHong Zhang   c->nonew   = 0;
5532205254eSKarl Rupp 
5544222ddf1SHong Zhang   /* slower, less memory */
5554222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
556e9e4536cSHong Zhang 
557e9e4536cSHong Zhang   /* set MatInfo */
558e9e4536cSHong Zhang   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
559e9e4536cSHong Zhang   if (afill < 1.0) afill = 1.0;
560e9e4536cSHong Zhang   c->maxnz                     = ci[am];
561e9e4536cSHong Zhang   c->nz                        = ci[am];
5624222ddf1SHong Zhang   C->info.mallocs           = ndouble;
5634222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
5644222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
565e9e4536cSHong Zhang 
566e9e4536cSHong Zhang #if defined(PETSC_USE_INFO)
567e9e4536cSHong Zhang   if (ci[am]) {
5684222ddf1SHong Zhang     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
5694222ddf1SHong Zhang     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
570e9e4536cSHong Zhang   } else {
5714222ddf1SHong Zhang     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
572e9e4536cSHong Zhang   }
573e9e4536cSHong Zhang #endif
574e9e4536cSHong Zhang   PetscFunctionReturn(0);
575e9e4536cSHong Zhang }
576e9e4536cSHong Zhang 
5774222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A,Mat B,PetscReal fill,Mat C)
5780ced3a2bSJed Brown {
5790ced3a2bSJed Brown   PetscErrorCode     ierr;
5800ced3a2bSJed Brown   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
5810ced3a2bSJed Brown   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j;
5820ced3a2bSJed Brown   PetscInt           *ci,*cj,*bb;
5830ced3a2bSJed Brown   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
5840ced3a2bSJed Brown   PetscReal          afill;
5850ced3a2bSJed Brown   PetscInt           i,j,col,ndouble = 0;
5860298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
5870ced3a2bSJed Brown   PetscHeap          h;
5880ced3a2bSJed Brown 
5890ced3a2bSJed Brown   PetscFunctionBegin;
590cd093f1dSJed Brown   /* Get ci and cj - by merging sorted rows using a heap */
5910ced3a2bSJed Brown   /*---------------------------------------------------------------------------------------------*/
5920ced3a2bSJed Brown   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
593854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
5940ced3a2bSJed Brown   ci[0] = 0;
5950ced3a2bSJed Brown 
5960ced3a2bSJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
597f91af8c7SBarry Smith   ierr          = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
5980ced3a2bSJed Brown   current_space = free_space;
5990ced3a2bSJed Brown 
6000ced3a2bSJed Brown   ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr);
601785e854fSJed Brown   ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr);
6020ced3a2bSJed Brown 
6030ced3a2bSJed Brown   /* Determine ci and cj */
6040ced3a2bSJed Brown   for (i=0; i<am; i++) {
6050ced3a2bSJed Brown     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
6060ced3a2bSJed Brown     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
6070ced3a2bSJed Brown     ci[i+1] = ci[i];
6080ced3a2bSJed Brown     /* Populate the min heap */
6090ced3a2bSJed Brown     for (j=0; j<anzi; j++) {
6100ced3a2bSJed Brown       bb[j] = bi[acol[j]];         /* bb points at the start of the row */
6110ced3a2bSJed Brown       if (bb[j] < bi[acol[j]+1]) { /* Add if row is nonempty */
6120ced3a2bSJed Brown         ierr = PetscHeapAdd(h,j,bj[bb[j]++]);CHKERRQ(ierr);
6130ced3a2bSJed Brown       }
6140ced3a2bSJed Brown     }
6150ced3a2bSJed Brown     /* Pick off the min element, adding it to free space */
6160ced3a2bSJed Brown     ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
6170ced3a2bSJed Brown     while (j >= 0) {
6180ced3a2bSJed Brown       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
619f91af8c7SBarry Smith         ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space);CHKERRQ(ierr);
6200ced3a2bSJed Brown         ndouble++;
6210ced3a2bSJed Brown       }
6220ced3a2bSJed Brown       *(current_space->array++) = col;
6230ced3a2bSJed Brown       current_space->local_used++;
6240ced3a2bSJed Brown       current_space->local_remaining--;
6250ced3a2bSJed Brown       ci[i+1]++;
6260ced3a2bSJed Brown 
6270ced3a2bSJed Brown       /* stash if anything else remains in this row of B */
6280ced3a2bSJed Brown       if (bb[j] < bi[acol[j]+1]) {ierr = PetscHeapStash(h,j,bj[bb[j]++]);CHKERRQ(ierr);}
6290ced3a2bSJed Brown       while (1) {               /* pop and stash any other rows of B that also had an entry in this column */
6300ced3a2bSJed Brown         PetscInt j2,col2;
6310ced3a2bSJed Brown         ierr = PetscHeapPeek(h,&j2,&col2);CHKERRQ(ierr);
6320ced3a2bSJed Brown         if (col2 != col) break;
6330ced3a2bSJed Brown         ierr = PetscHeapPop(h,&j2,&col2);CHKERRQ(ierr);
6340ced3a2bSJed Brown         if (bb[j2] < bi[acol[j2]+1]) {ierr = PetscHeapStash(h,j2,bj[bb[j2]++]);CHKERRQ(ierr);}
6350ced3a2bSJed Brown       }
6360ced3a2bSJed Brown       /* Put any stashed elements back into the min heap */
6370ced3a2bSJed Brown       ierr = PetscHeapUnstash(h);CHKERRQ(ierr);
6380ced3a2bSJed Brown       ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
6390ced3a2bSJed Brown     }
6400ced3a2bSJed Brown   }
6410ced3a2bSJed Brown   ierr = PetscFree(bb);CHKERRQ(ierr);
6420ced3a2bSJed Brown   ierr = PetscHeapDestroy(&h);CHKERRQ(ierr);
6430ced3a2bSJed Brown 
6440ced3a2bSJed Brown   /* Column indices are in the list of free space */
6450ced3a2bSJed Brown   /* Allocate space for cj, initialize cj, and */
6460ced3a2bSJed Brown   /* destroy list of free space and other temporary array(s) */
647785e854fSJed Brown   ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr);
6480ced3a2bSJed Brown   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
6490ced3a2bSJed Brown 
6500ced3a2bSJed Brown   /* put together the new symbolic matrix */
651e4e71118SStefano Zampini   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
6524222ddf1SHong Zhang   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
6530ced3a2bSJed Brown 
6540ced3a2bSJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
6550ced3a2bSJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
6564222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
6570ced3a2bSJed Brown   c->free_a  = PETSC_TRUE;
6580ced3a2bSJed Brown   c->free_ij = PETSC_TRUE;
6590ced3a2bSJed Brown   c->nonew   = 0;
66026fbe8dcSKarl Rupp 
6614222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
6620ced3a2bSJed Brown 
6630ced3a2bSJed Brown   /* set MatInfo */
6640ced3a2bSJed Brown   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
6650ced3a2bSJed Brown   if (afill < 1.0) afill = 1.0;
6660ced3a2bSJed Brown   c->maxnz                     = ci[am];
6670ced3a2bSJed Brown   c->nz                        = ci[am];
6684222ddf1SHong Zhang   C->info.mallocs           = ndouble;
6694222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
6704222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
6710ced3a2bSJed Brown 
6720ced3a2bSJed Brown #if defined(PETSC_USE_INFO)
6730ced3a2bSJed Brown   if (ci[am]) {
6744222ddf1SHong Zhang     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
6754222ddf1SHong Zhang     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
6760ced3a2bSJed Brown   } else {
6774222ddf1SHong Zhang     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
6780ced3a2bSJed Brown   }
6790ced3a2bSJed Brown #endif
6800ced3a2bSJed Brown   PetscFunctionReturn(0);
6810ced3a2bSJed Brown }
682e9e4536cSHong Zhang 
6834222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A,Mat B,PetscReal fill,Mat C)
6848a07c6f1SJed Brown {
6858a07c6f1SJed Brown   PetscErrorCode     ierr;
6868a07c6f1SJed Brown   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
6878a07c6f1SJed Brown   const PetscInt     *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
6888a07c6f1SJed Brown   PetscInt           *ci,*cj,*bb;
6898a07c6f1SJed Brown   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
6908a07c6f1SJed Brown   PetscReal          afill;
6918a07c6f1SJed Brown   PetscInt           i,j,col,ndouble = 0;
6920298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
6938a07c6f1SJed Brown   PetscHeap          h;
6948a07c6f1SJed Brown   PetscBT            bt;
6958a07c6f1SJed Brown 
6968a07c6f1SJed Brown   PetscFunctionBegin;
697cd093f1dSJed Brown   /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */
6988a07c6f1SJed Brown   /*---------------------------------------------------------------------------------------------*/
6998a07c6f1SJed Brown   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
700854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
7018a07c6f1SJed Brown   ci[0] = 0;
7028a07c6f1SJed Brown 
7038a07c6f1SJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
704f91af8c7SBarry Smith   ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
7052205254eSKarl Rupp 
7068a07c6f1SJed Brown   current_space = free_space;
7078a07c6f1SJed Brown 
7088a07c6f1SJed Brown   ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr);
709785e854fSJed Brown   ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr);
7108a07c6f1SJed Brown   ierr = PetscBTCreate(bn,&bt);CHKERRQ(ierr);
7118a07c6f1SJed Brown 
7128a07c6f1SJed Brown   /* Determine ci and cj */
7138a07c6f1SJed Brown   for (i=0; i<am; i++) {
7148a07c6f1SJed Brown     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
7158a07c6f1SJed Brown     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
7168a07c6f1SJed Brown     const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */
7178a07c6f1SJed Brown     ci[i+1] = ci[i];
7188a07c6f1SJed Brown     /* Populate the min heap */
7198a07c6f1SJed Brown     for (j=0; j<anzi; j++) {
7208a07c6f1SJed Brown       PetscInt brow = acol[j];
7218a07c6f1SJed Brown       for (bb[j] = bi[brow]; bb[j] < bi[brow+1]; bb[j]++) {
7228a07c6f1SJed Brown         PetscInt bcol = bj[bb[j]];
7238a07c6f1SJed Brown         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
7248a07c6f1SJed Brown           ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr);
7258a07c6f1SJed Brown           bb[j]++;
7268a07c6f1SJed Brown           break;
7278a07c6f1SJed Brown         }
7288a07c6f1SJed Brown       }
7298a07c6f1SJed Brown     }
7308a07c6f1SJed Brown     /* Pick off the min element, adding it to free space */
7318a07c6f1SJed Brown     ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
7328a07c6f1SJed Brown     while (j >= 0) {
7338a07c6f1SJed Brown       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
7340298fd71SBarry Smith         fptr = NULL;                      /* need PetscBTMemzero */
735f91af8c7SBarry Smith         ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space);CHKERRQ(ierr);
7368a07c6f1SJed Brown         ndouble++;
7378a07c6f1SJed Brown       }
7388a07c6f1SJed Brown       *(current_space->array++) = col;
7398a07c6f1SJed Brown       current_space->local_used++;
7408a07c6f1SJed Brown       current_space->local_remaining--;
7418a07c6f1SJed Brown       ci[i+1]++;
7428a07c6f1SJed Brown 
7438a07c6f1SJed Brown       /* stash if anything else remains in this row of B */
7448a07c6f1SJed Brown       for (; bb[j] < bi[acol[j]+1]; bb[j]++) {
7458a07c6f1SJed Brown         PetscInt bcol = bj[bb[j]];
7468a07c6f1SJed Brown         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
7478a07c6f1SJed Brown           ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr);
7488a07c6f1SJed Brown           bb[j]++;
7498a07c6f1SJed Brown           break;
7508a07c6f1SJed Brown         }
7518a07c6f1SJed Brown       }
7528a07c6f1SJed Brown       ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
7538a07c6f1SJed Brown     }
7548a07c6f1SJed Brown     if (fptr) {                 /* Clear the bits for this row */
7558a07c6f1SJed Brown       for (; fptr<current_space->array; fptr++) {ierr = PetscBTClear(bt,*fptr);CHKERRQ(ierr);}
7568a07c6f1SJed Brown     } else {                    /* We reallocated so we don't remember (easily) how to clear only the bits we changed */
7578a07c6f1SJed Brown       ierr = PetscBTMemzero(bn,bt);CHKERRQ(ierr);
7588a07c6f1SJed Brown     }
7598a07c6f1SJed Brown   }
7608a07c6f1SJed Brown   ierr = PetscFree(bb);CHKERRQ(ierr);
7618a07c6f1SJed Brown   ierr = PetscHeapDestroy(&h);CHKERRQ(ierr);
7628a07c6f1SJed Brown   ierr = PetscBTDestroy(&bt);CHKERRQ(ierr);
7638a07c6f1SJed Brown 
7648a07c6f1SJed Brown   /* Column indices are in the list of free space */
7658a07c6f1SJed Brown   /* Allocate space for cj, initialize cj, and */
7668a07c6f1SJed Brown   /* destroy list of free space and other temporary array(s) */
767785e854fSJed Brown   ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr);
7688a07c6f1SJed Brown   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
7698a07c6f1SJed Brown 
7708a07c6f1SJed Brown   /* put together the new symbolic matrix */
771e4e71118SStefano Zampini   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
7724222ddf1SHong Zhang   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
7738a07c6f1SJed Brown 
7748a07c6f1SJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
7758a07c6f1SJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
7764222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
7778a07c6f1SJed Brown   c->free_a  = PETSC_TRUE;
7788a07c6f1SJed Brown   c->free_ij = PETSC_TRUE;
7798a07c6f1SJed Brown   c->nonew   = 0;
78026fbe8dcSKarl Rupp 
7814222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
7828a07c6f1SJed Brown 
7838a07c6f1SJed Brown   /* set MatInfo */
7848a07c6f1SJed Brown   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
7858a07c6f1SJed Brown   if (afill < 1.0) afill = 1.0;
7868a07c6f1SJed Brown   c->maxnz                     = ci[am];
7878a07c6f1SJed Brown   c->nz                        = ci[am];
7884222ddf1SHong Zhang   C->info.mallocs           = ndouble;
7894222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
7904222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
7918a07c6f1SJed Brown 
7928a07c6f1SJed Brown #if defined(PETSC_USE_INFO)
7938a07c6f1SJed Brown   if (ci[am]) {
7944222ddf1SHong Zhang     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
7954222ddf1SHong Zhang     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
7968a07c6f1SJed Brown   } else {
7974222ddf1SHong Zhang     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
7988a07c6f1SJed Brown   }
7998a07c6f1SJed Brown #endif
8008a07c6f1SJed Brown   PetscFunctionReturn(0);
8018a07c6f1SJed Brown }
8028a07c6f1SJed Brown 
803d7ed1a4dSandi selinger 
8044222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A,Mat B,PetscReal fill,Mat C)
805d7ed1a4dSandi selinger {
806d7ed1a4dSandi selinger   PetscErrorCode     ierr;
807d7ed1a4dSandi selinger   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
808d7ed1a4dSandi selinger   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j,*inputi,*inputj,*inputcol,*inputcol_L1;
809d7ed1a4dSandi selinger   PetscInt           *ci,*cj,*outputj,worki_L1[9],worki_L2[9];
810d7ed1a4dSandi selinger   PetscInt           c_maxmem,a_maxrownnz=0,a_rownnz;
811d7ed1a4dSandi selinger   const PetscInt     workcol[8]={0,1,2,3,4,5,6,7};
812d7ed1a4dSandi selinger   const PetscInt     am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
813d7ed1a4dSandi selinger   const PetscInt     *brow_ptr[8],*brow_end[8];
814d7ed1a4dSandi selinger   PetscInt           window[8];
815d7ed1a4dSandi selinger   PetscInt           window_min,old_window_min,ci_nnz,outputi_nnz=0,L1_nrows,L2_nrows;
816d7ed1a4dSandi selinger   PetscInt           i,k,ndouble=0,L1_rowsleft,rowsleft;
817d7ed1a4dSandi selinger   PetscReal          afill;
818f83700f2Sandi selinger   PetscInt           *workj_L1,*workj_L2,*workj_L3;
8197660c4dbSandi selinger   PetscInt           L1_nnz,L2_nnz;
820d7ed1a4dSandi selinger 
821d7ed1a4dSandi selinger   /* Step 1: Get upper bound on memory required for allocation.
822d7ed1a4dSandi selinger              Because of the way virtual memory works,
823d7ed1a4dSandi selinger              only the memory pages that are actually needed will be physically allocated. */
824d7ed1a4dSandi selinger   PetscFunctionBegin;
825d7ed1a4dSandi selinger   ierr = PetscMalloc1(am+1,&ci);CHKERRQ(ierr);
826d7ed1a4dSandi selinger   for (i=0; i<am; i++) {
827d7ed1a4dSandi selinger     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
828d7ed1a4dSandi selinger     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
829d7ed1a4dSandi selinger     a_rownnz = 0;
830d7ed1a4dSandi selinger     for (k=0; k<anzi; ++k) {
831d7ed1a4dSandi selinger       a_rownnz += bi[acol[k]+1] - bi[acol[k]];
832d7ed1a4dSandi selinger       if (a_rownnz > bn) {
833d7ed1a4dSandi selinger         a_rownnz = bn;
834d7ed1a4dSandi selinger         break;
835d7ed1a4dSandi selinger       }
836d7ed1a4dSandi selinger     }
837d7ed1a4dSandi selinger     a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz);
838d7ed1a4dSandi selinger   }
839d7ed1a4dSandi selinger   /* temporary work areas for merging rows */
840d7ed1a4dSandi selinger   ierr = PetscMalloc1(a_maxrownnz*8,&workj_L1);CHKERRQ(ierr);
841f83700f2Sandi selinger   ierr = PetscMalloc1(a_maxrownnz*8,&workj_L2);CHKERRQ(ierr);
842f83700f2Sandi selinger   ierr = PetscMalloc1(a_maxrownnz,&workj_L3);CHKERRQ(ierr);
843d7ed1a4dSandi selinger 
8447660c4dbSandi selinger   /* This should be enough for almost all matrices. If not, memory is reallocated later. */
8457660c4dbSandi selinger   c_maxmem = 8*(ai[am]+bi[bm]);
846d7ed1a4dSandi selinger   /* Step 2: Populate pattern for C */
847d7ed1a4dSandi selinger   ierr  = PetscMalloc1(c_maxmem,&cj);CHKERRQ(ierr);
848d7ed1a4dSandi selinger 
849d7ed1a4dSandi selinger   ci_nnz       = 0;
850d7ed1a4dSandi selinger   ci[0]        = 0;
851d7ed1a4dSandi selinger   worki_L1[0]  = 0;
852d7ed1a4dSandi selinger   worki_L2[0]  = 0;
853d7ed1a4dSandi selinger   for (i=0; i<am; i++) {
854d7ed1a4dSandi selinger     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
855d7ed1a4dSandi selinger     const PetscInt *acol = aj + ai[i];      /* column indices of nonzero entries in this row */
856d7ed1a4dSandi selinger     rowsleft             = anzi;
857d7ed1a4dSandi selinger     inputcol_L1          = acol;
858d7ed1a4dSandi selinger     L2_nnz               = 0;
8597660c4dbSandi selinger     L2_nrows             = 1;  /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1   */
8607660c4dbSandi selinger     worki_L2[1]          = 0;
86108fe1b3cSKarl Rupp     outputi_nnz          = 0;
862d7ed1a4dSandi selinger 
863d7ed1a4dSandi selinger     /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem  -> allocate more memory */
864d7ed1a4dSandi selinger     while (ci_nnz+a_maxrownnz > c_maxmem) {
865d7ed1a4dSandi selinger       c_maxmem *= 2;
866d7ed1a4dSandi selinger       ndouble++;
867d7ed1a4dSandi selinger       ierr = PetscRealloc(sizeof(PetscInt)*c_maxmem,&cj);CHKERRQ(ierr);
868d7ed1a4dSandi selinger     }
869d7ed1a4dSandi selinger 
870d7ed1a4dSandi selinger     while (rowsleft) {
871d7ed1a4dSandi selinger       L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */
872d7ed1a4dSandi selinger       L1_nrows    = 0;
8737660c4dbSandi selinger       L1_nnz      = 0;
874d7ed1a4dSandi selinger       inputcol    = inputcol_L1;
875d7ed1a4dSandi selinger       inputi      = bi;
876d7ed1a4dSandi selinger       inputj      = bj;
877d7ed1a4dSandi selinger 
878d7ed1a4dSandi selinger       /* The following macro is used to specialize for small rows in A.
879d7ed1a4dSandi selinger          This helps with compiler unrolling, improving performance substantially.
880f83700f2Sandi selinger           Input:  inputj   inputi  inputcol  bn
881d7ed1a4dSandi selinger           Output: outputj  outputi_nnz                       */
882d7ed1a4dSandi selinger        #define MatMatMultSymbolic_RowMergeMacro(ANNZ)                        \
883d7ed1a4dSandi selinger          window_min  = bn;                                                   \
8847660c4dbSandi selinger          outputi_nnz = 0;                                                    \
8857660c4dbSandi selinger          for (k=0; k<ANNZ; ++k) {                                            \
886d7ed1a4dSandi selinger            brow_ptr[k] = inputj + inputi[inputcol[k]];                       \
887d7ed1a4dSandi selinger            brow_end[k] = inputj + inputi[inputcol[k]+1];                     \
888d7ed1a4dSandi selinger            window[k]   = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn;   \
889d7ed1a4dSandi selinger            window_min  = PetscMin(window[k], window_min);                    \
890d7ed1a4dSandi selinger          }                                                                   \
891d7ed1a4dSandi selinger          while (window_min < bn) {                                           \
892d7ed1a4dSandi selinger            outputj[outputi_nnz++] = window_min;                              \
893d7ed1a4dSandi selinger            /* advance front and compute new minimum */                       \
894d7ed1a4dSandi selinger            old_window_min = window_min;                                      \
895d7ed1a4dSandi selinger            window_min = bn;                                                  \
896d7ed1a4dSandi selinger            for (k=0; k<ANNZ; ++k) {                                          \
897d7ed1a4dSandi selinger              if (window[k] == old_window_min) {                              \
898d7ed1a4dSandi selinger                brow_ptr[k]++;                                                \
899d7ed1a4dSandi selinger                window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
900d7ed1a4dSandi selinger              }                                                               \
901d7ed1a4dSandi selinger              window_min = PetscMin(window[k], window_min);                   \
902d7ed1a4dSandi selinger            }                                                                 \
903d7ed1a4dSandi selinger          }
904d7ed1a4dSandi selinger 
905d7ed1a4dSandi selinger       /************** L E V E L  1 ***************/
906d7ed1a4dSandi selinger       /* Merge up to 8 rows of B to L1 work array*/
907d7ed1a4dSandi selinger       while (L1_rowsleft) {
9087660c4dbSandi selinger         outputi_nnz = 0;
9097660c4dbSandi selinger         if (anzi > 8)  outputj = workj_L1 + L1_nnz;     /* Level 1 rowmerge*/
9107660c4dbSandi selinger         else           outputj = cj + ci_nnz;           /* Merge directly to C */
9117660c4dbSandi selinger 
912d7ed1a4dSandi selinger         switch (L1_rowsleft) {
913d7ed1a4dSandi selinger         case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
914d7ed1a4dSandi selinger                  brow_end[0] = inputj + inputi[inputcol[0]+1];
915d7ed1a4dSandi selinger                  for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
916d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
917d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
918d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
919d7ed1a4dSandi selinger                  break;
920d7ed1a4dSandi selinger         case 2:  MatMatMultSymbolic_RowMergeMacro(2);
921d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
922d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
923d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
924d7ed1a4dSandi selinger                  break;
925d7ed1a4dSandi selinger         case 3: MatMatMultSymbolic_RowMergeMacro(3);
926d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
927d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
928d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
929d7ed1a4dSandi selinger                  break;
930d7ed1a4dSandi selinger         case 4:  MatMatMultSymbolic_RowMergeMacro(4);
931d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
932d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
933d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
934d7ed1a4dSandi selinger                  break;
935d7ed1a4dSandi selinger         case 5:  MatMatMultSymbolic_RowMergeMacro(5);
936d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
937d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
938d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
939d7ed1a4dSandi selinger                  break;
940d7ed1a4dSandi selinger         case 6:  MatMatMultSymbolic_RowMergeMacro(6);
941d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
942d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
943d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
944d7ed1a4dSandi selinger                  break;
945d7ed1a4dSandi selinger         case 7:  MatMatMultSymbolic_RowMergeMacro(7);
946d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
947d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
948d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
949d7ed1a4dSandi selinger                  break;
950d7ed1a4dSandi selinger         default: MatMatMultSymbolic_RowMergeMacro(8);
951d7ed1a4dSandi selinger                  inputcol    += 8;
952d7ed1a4dSandi selinger                  rowsleft    -= 8;
953d7ed1a4dSandi selinger                  L1_rowsleft -= 8;
954d7ed1a4dSandi selinger                  break;
955d7ed1a4dSandi selinger         }
956d7ed1a4dSandi selinger         inputcol_L1           = inputcol;
9577660c4dbSandi selinger         L1_nnz               += outputi_nnz;
9587660c4dbSandi selinger         worki_L1[++L1_nrows]  = L1_nnz;
959d7ed1a4dSandi selinger       }
960d7ed1a4dSandi selinger 
961d7ed1a4dSandi selinger       /********************** L E V E L  2 ************************/
962d7ed1a4dSandi selinger       /* Merge from L1 work array to either C or to L2 work array */
963d7ed1a4dSandi selinger       if (anzi > 8) {
964d7ed1a4dSandi selinger         inputi      = worki_L1;
965d7ed1a4dSandi selinger         inputj      = workj_L1;
966d7ed1a4dSandi selinger         inputcol    = workcol;
967d7ed1a4dSandi selinger         outputi_nnz = 0;
968d7ed1a4dSandi selinger 
969d7ed1a4dSandi selinger         if (anzi <= 64) outputj = cj + ci_nnz;        /* Merge from L1 work array to C */
970d7ed1a4dSandi selinger         else            outputj = workj_L2 + L2_nnz;  /* Merge from L1 work array to L2 work array */
971d7ed1a4dSandi selinger 
972d7ed1a4dSandi selinger         switch (L1_nrows) {
973d7ed1a4dSandi selinger         case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
974d7ed1a4dSandi selinger                  brow_end[0] = inputj + inputi[inputcol[0]+1];
975d7ed1a4dSandi selinger                  for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
976d7ed1a4dSandi selinger                  break;
977d7ed1a4dSandi selinger         case 2:  MatMatMultSymbolic_RowMergeMacro(2); break;
978d7ed1a4dSandi selinger         case 3:  MatMatMultSymbolic_RowMergeMacro(3); break;
979d7ed1a4dSandi selinger         case 4:  MatMatMultSymbolic_RowMergeMacro(4); break;
980d7ed1a4dSandi selinger         case 5:  MatMatMultSymbolic_RowMergeMacro(5); break;
981d7ed1a4dSandi selinger         case 6:  MatMatMultSymbolic_RowMergeMacro(6); break;
982d7ed1a4dSandi selinger         case 7:  MatMatMultSymbolic_RowMergeMacro(7); break;
983d7ed1a4dSandi selinger         case 8:  MatMatMultSymbolic_RowMergeMacro(8); break;
984d7ed1a4dSandi selinger         default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L1 work array!");
985d7ed1a4dSandi selinger         }
986d7ed1a4dSandi selinger         L2_nnz               += outputi_nnz;
987d7ed1a4dSandi selinger         worki_L2[++L2_nrows]  = L2_nnz;
988d7ed1a4dSandi selinger 
9897660c4dbSandi selinger         /************************ L E V E L  3 **********************/
9907660c4dbSandi selinger         /* Merge from L2 work array to either C or to L2 work array */
991d7ed1a4dSandi selinger         if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) {
992d7ed1a4dSandi selinger           inputi      = worki_L2;
993d7ed1a4dSandi selinger           inputj      = workj_L2;
994d7ed1a4dSandi selinger           inputcol    = workcol;
995d7ed1a4dSandi selinger           outputi_nnz = 0;
996f83700f2Sandi selinger           if (rowsleft) outputj = workj_L3;
997d7ed1a4dSandi selinger           else          outputj = cj + ci_nnz;
998d7ed1a4dSandi selinger           switch (L2_nrows) {
999d7ed1a4dSandi selinger           case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
1000d7ed1a4dSandi selinger                    brow_end[0] = inputj + inputi[inputcol[0]+1];
1001d7ed1a4dSandi selinger                    for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
1002d7ed1a4dSandi selinger                    break;
1003d7ed1a4dSandi selinger           case 2:  MatMatMultSymbolic_RowMergeMacro(2); break;
1004d7ed1a4dSandi selinger           case 3:  MatMatMultSymbolic_RowMergeMacro(3); break;
1005d7ed1a4dSandi selinger           case 4:  MatMatMultSymbolic_RowMergeMacro(4); break;
1006d7ed1a4dSandi selinger           case 5:  MatMatMultSymbolic_RowMergeMacro(5); break;
1007d7ed1a4dSandi selinger           case 6:  MatMatMultSymbolic_RowMergeMacro(6); break;
1008d7ed1a4dSandi selinger           case 7:  MatMatMultSymbolic_RowMergeMacro(7); break;
1009d7ed1a4dSandi selinger           case 8:  MatMatMultSymbolic_RowMergeMacro(8); break;
1010d7ed1a4dSandi selinger           default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L2 work array!");
1011d7ed1a4dSandi selinger           }
1012d7ed1a4dSandi selinger           L2_nrows    = 1;
10137660c4dbSandi selinger           L2_nnz      = outputi_nnz;
10147660c4dbSandi selinger           worki_L2[1] = outputi_nnz;
10157660c4dbSandi selinger           /* Copy to workj_L2 */
1016d7ed1a4dSandi selinger           if (rowsleft) {
10177660c4dbSandi selinger             for (k=0; k<outputi_nnz; ++k)  workj_L2[k] = outputj[k];
1018d7ed1a4dSandi selinger           }
1019d7ed1a4dSandi selinger         }
1020d7ed1a4dSandi selinger       }
1021d7ed1a4dSandi selinger     }  /* while (rowsleft) */
1022d7ed1a4dSandi selinger #undef MatMatMultSymbolic_RowMergeMacro
1023d7ed1a4dSandi selinger 
1024d7ed1a4dSandi selinger     /* terminate current row */
1025d7ed1a4dSandi selinger     ci_nnz += outputi_nnz;
1026d7ed1a4dSandi selinger     ci[i+1] = ci_nnz;
1027d7ed1a4dSandi selinger   }
1028d7ed1a4dSandi selinger 
1029d7ed1a4dSandi selinger   /* Step 3: Create the new symbolic matrix */
1030e4e71118SStefano Zampini   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
10314222ddf1SHong Zhang   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
1032d7ed1a4dSandi selinger 
1033d7ed1a4dSandi selinger   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1034d7ed1a4dSandi selinger   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
10354222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
1036d7ed1a4dSandi selinger   c->free_a  = PETSC_TRUE;
1037d7ed1a4dSandi selinger   c->free_ij = PETSC_TRUE;
1038d7ed1a4dSandi selinger   c->nonew   = 0;
1039d7ed1a4dSandi selinger 
10404222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1041d7ed1a4dSandi selinger 
1042d7ed1a4dSandi selinger   /* set MatInfo */
1043d7ed1a4dSandi selinger   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
1044d7ed1a4dSandi selinger   if (afill < 1.0) afill = 1.0;
1045d7ed1a4dSandi selinger   c->maxnz                     = ci[am];
1046d7ed1a4dSandi selinger   c->nz                        = ci[am];
10474222ddf1SHong Zhang   C->info.mallocs           = ndouble;
10484222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
10494222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
1050d7ed1a4dSandi selinger 
1051d7ed1a4dSandi selinger #if defined(PETSC_USE_INFO)
1052d7ed1a4dSandi selinger   if (ci[am]) {
10534222ddf1SHong Zhang     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
10544222ddf1SHong Zhang     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
1055d7ed1a4dSandi selinger   } else {
10564222ddf1SHong Zhang     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
1057d7ed1a4dSandi selinger   }
1058d7ed1a4dSandi selinger #endif
1059d7ed1a4dSandi selinger 
1060d7ed1a4dSandi selinger   /* Step 4: Free temporary work areas */
1061d7ed1a4dSandi selinger   ierr = PetscFree(workj_L1);CHKERRQ(ierr);
1062d7ed1a4dSandi selinger   ierr = PetscFree(workj_L2);CHKERRQ(ierr);
1063f83700f2Sandi selinger   ierr = PetscFree(workj_L3);CHKERRQ(ierr);
1064d7ed1a4dSandi selinger   PetscFunctionReturn(0);
1065d7ed1a4dSandi selinger }
1066d7ed1a4dSandi selinger 
1067cd093f1dSJed Brown /* concatenate unique entries and then sort */
10684222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,PetscReal fill,Mat C)
1069cd093f1dSJed Brown {
1070cd093f1dSJed Brown   PetscErrorCode ierr;
1071cd093f1dSJed Brown   Mat_SeqAIJ     *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
1072cd093f1dSJed Brown   const PetscInt *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
1073cd093f1dSJed Brown   PetscInt       *ci,*cj;
1074cd093f1dSJed Brown   PetscInt       am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
1075cd093f1dSJed Brown   PetscReal      afill;
1076cd093f1dSJed Brown   PetscInt       i,j,ndouble = 0;
1077cd093f1dSJed Brown   PetscSegBuffer seg,segrow;
1078cd093f1dSJed Brown   char           *seen;
1079cd093f1dSJed Brown 
1080cd093f1dSJed Brown   PetscFunctionBegin;
1081854ce69bSBarry Smith   ierr  = PetscMalloc1(am+1,&ci);CHKERRQ(ierr);
1082cd093f1dSJed Brown   ci[0] = 0;
1083cd093f1dSJed Brown 
1084cd093f1dSJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
1085cd093f1dSJed Brown   ierr = PetscSegBufferCreate(sizeof(PetscInt),(PetscInt)(fill*(ai[am]+bi[bm])),&seg);CHKERRQ(ierr);
1086cd093f1dSJed Brown   ierr = PetscSegBufferCreate(sizeof(PetscInt),100,&segrow);CHKERRQ(ierr);
1087580bdb30SBarry Smith   ierr = PetscCalloc1(bn,&seen);CHKERRQ(ierr);
1088cd093f1dSJed Brown 
1089cd093f1dSJed Brown   /* Determine ci and cj */
1090cd093f1dSJed Brown   for (i=0; i<am; i++) {
1091cd093f1dSJed Brown     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
1092cd093f1dSJed Brown     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
1093cd093f1dSJed Brown     PetscInt packlen = 0,*PETSC_RESTRICT crow;
1094cd093f1dSJed Brown     /* Pack segrow */
1095cd093f1dSJed Brown     for (j=0; j<anzi; j++) {
1096cd093f1dSJed Brown       PetscInt brow = acol[j],bjstart = bi[brow],bjend = bi[brow+1],k;
1097cd093f1dSJed Brown       for (k=bjstart; k<bjend; k++) {
1098cd093f1dSJed Brown         PetscInt bcol = bj[k];
1099cd093f1dSJed Brown         if (!seen[bcol]) { /* new entry */
1100cd093f1dSJed Brown           PetscInt *PETSC_RESTRICT slot;
1101cd093f1dSJed Brown           ierr = PetscSegBufferGetInts(segrow,1,&slot);CHKERRQ(ierr);
1102cd093f1dSJed Brown           *slot = bcol;
1103cd093f1dSJed Brown           seen[bcol] = 1;
1104cd093f1dSJed Brown           packlen++;
1105cd093f1dSJed Brown         }
1106cd093f1dSJed Brown       }
1107cd093f1dSJed Brown     }
1108cd093f1dSJed Brown     ierr = PetscSegBufferGetInts(seg,packlen,&crow);CHKERRQ(ierr);
1109cd093f1dSJed Brown     ierr = PetscSegBufferExtractTo(segrow,crow);CHKERRQ(ierr);
1110cd093f1dSJed Brown     ierr = PetscSortInt(packlen,crow);CHKERRQ(ierr);
1111cd093f1dSJed Brown     ci[i+1] = ci[i] + packlen;
1112cd093f1dSJed Brown     for (j=0; j<packlen; j++) seen[crow[j]] = 0;
1113cd093f1dSJed Brown   }
1114cd093f1dSJed Brown   ierr = PetscSegBufferDestroy(&segrow);CHKERRQ(ierr);
1115cd093f1dSJed Brown   ierr = PetscFree(seen);CHKERRQ(ierr);
1116cd093f1dSJed Brown 
1117cd093f1dSJed Brown   /* Column indices are in the segmented buffer */
1118cd093f1dSJed Brown   ierr = PetscSegBufferExtractAlloc(seg,&cj);CHKERRQ(ierr);
1119cd093f1dSJed Brown   ierr = PetscSegBufferDestroy(&seg);CHKERRQ(ierr);
1120cd093f1dSJed Brown 
1121cd093f1dSJed Brown   /* put together the new symbolic matrix */
1122e4e71118SStefano Zampini   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
11234222ddf1SHong Zhang   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
1124cd093f1dSJed Brown 
1125cd093f1dSJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1126cd093f1dSJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
11274222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
1128cd093f1dSJed Brown   c->free_a  = PETSC_TRUE;
1129cd093f1dSJed Brown   c->free_ij = PETSC_TRUE;
1130cd093f1dSJed Brown   c->nonew   = 0;
1131cd093f1dSJed Brown 
11324222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1133cd093f1dSJed Brown 
1134cd093f1dSJed Brown   /* set MatInfo */
1135cd093f1dSJed Brown   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
1136cd093f1dSJed Brown   if (afill < 1.0) afill = 1.0;
1137cd093f1dSJed Brown   c->maxnz                     = ci[am];
1138cd093f1dSJed Brown   c->nz                        = ci[am];
11394222ddf1SHong Zhang   C->info.mallocs           = ndouble;
11404222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
11414222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
1142cd093f1dSJed Brown 
1143cd093f1dSJed Brown #if defined(PETSC_USE_INFO)
1144cd093f1dSJed Brown   if (ci[am]) {
11454222ddf1SHong Zhang     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
11464222ddf1SHong Zhang     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
1147cd093f1dSJed Brown   } else {
11484222ddf1SHong Zhang     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
1149cd093f1dSJed Brown   }
1150cd093f1dSJed Brown #endif
1151cd093f1dSJed Brown   PetscFunctionReturn(0);
1152cd093f1dSJed Brown }
1153cd093f1dSJed Brown 
1154*6718818eSStefano Zampini PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(void *data)
11552128a86cSHong Zhang {
11562128a86cSHong Zhang   PetscErrorCode      ierr;
1157*6718818eSStefano Zampini   Mat_MatMatTransMult *abt=(Mat_MatMatTransMult *)data;
11582128a86cSHong Zhang 
11592128a86cSHong Zhang   PetscFunctionBegin;
116040192850SHong Zhang   ierr = MatTransposeColoringDestroy(&abt->matcoloring);CHKERRQ(ierr);
116140192850SHong Zhang   ierr = MatDestroy(&abt->Bt_den);CHKERRQ(ierr);
116240192850SHong Zhang   ierr = MatDestroy(&abt->ABt_den);CHKERRQ(ierr);
116340192850SHong Zhang   ierr = PetscFree(abt);CHKERRQ(ierr);
11642128a86cSHong Zhang   PetscFunctionReturn(0);
11652128a86cSHong Zhang }
11662128a86cSHong Zhang 
11674222ddf1SHong Zhang PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
1168bc011b1eSHong Zhang {
11695df89d91SHong Zhang   PetscErrorCode      ierr;
117081d82fe4SHong Zhang   Mat                 Bt;
117181d82fe4SHong Zhang   PetscInt            *bti,*btj;
117240192850SHong Zhang   Mat_MatMatTransMult *abt;
11734222ddf1SHong Zhang   Mat_Product         *product = C->product;
1174*6718818eSStefano Zampini   char                *alg;
1175d2853540SHong Zhang 
117681d82fe4SHong Zhang   PetscFunctionBegin;
1177*6718818eSStefano Zampini   if (!product) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
1178*6718818eSStefano Zampini   if (product->data) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Extra product struct not empty");
1179*6718818eSStefano Zampini 
118081d82fe4SHong Zhang   /* create symbolic Bt */
118181d82fe4SHong Zhang   ierr = MatGetSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
11820298fd71SBarry Smith   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,B->cmap->n,B->rmap->n,bti,btj,NULL,&Bt);CHKERRQ(ierr);
118333d57670SJed Brown   ierr = MatSetBlockSizes(Bt,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr);
118404b858e0SBarry Smith   ierr = MatSetType(Bt,((PetscObject)A)->type_name);CHKERRQ(ierr);
118581d82fe4SHong Zhang 
118681d82fe4SHong Zhang   /* get symbolic C=A*Bt */
1187*6718818eSStefano Zampini   ierr = PetscStrallocpy(product->alg,&alg);CHKERRQ(ierr);
11884222ddf1SHong Zhang   ierr = MatProductSetAlgorithm(C,"sorted");CHKERRQ(ierr); /* set algorithm for C = A*Bt */
118981d82fe4SHong Zhang   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C);CHKERRQ(ierr);
11904222ddf1SHong Zhang   ierr = MatProductSetAlgorithm(C,alg);CHKERRQ(ierr); /* resume original algorithm for ABt product */
1191*6718818eSStefano Zampini   ierr = PetscFree(alg);CHKERRQ(ierr);
119281d82fe4SHong Zhang 
11932128a86cSHong Zhang   /* create a supporting struct for reuse intermidiate dense matrices with matcoloring */
1194b00a9115SJed Brown   ierr = PetscNew(&abt);CHKERRQ(ierr);
11952128a86cSHong Zhang 
1196*6718818eSStefano Zampini   product->data    = abt;
1197*6718818eSStefano Zampini   product->destroy = MatDestroy_SeqAIJ_MatMatMultTrans;
1198*6718818eSStefano Zampini 
11994222ddf1SHong Zhang   C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ;
12002128a86cSHong Zhang 
12014222ddf1SHong Zhang   abt->usecoloring = PETSC_FALSE;
12024222ddf1SHong Zhang   ierr = PetscStrcmp(product->alg,"color",&abt->usecoloring);CHKERRQ(ierr);
120340192850SHong Zhang   if (abt->usecoloring) {
1204b9af6bddSHong Zhang     /* Create MatTransposeColoring from symbolic C=A*B^T */
1205b9af6bddSHong Zhang     MatTransposeColoring matcoloring;
1206335efc43SPeter Brune     MatColoring          coloring;
12072128a86cSHong Zhang     ISColoring           iscoloring;
12082128a86cSHong Zhang     Mat                  Bt_dense,C_dense;
1209e8354b3eSHong Zhang 
12104222ddf1SHong Zhang     /* inode causes memory problem */
12114222ddf1SHong Zhang     ierr = MatSetOption(C,MAT_USE_INODES,PETSC_FALSE);CHKERRQ(ierr);
12124222ddf1SHong Zhang 
12134222ddf1SHong Zhang     ierr = MatColoringCreate(C,&coloring);CHKERRQ(ierr);
1214335efc43SPeter Brune     ierr = MatColoringSetDistance(coloring,2);CHKERRQ(ierr);
1215335efc43SPeter Brune     ierr = MatColoringSetType(coloring,MATCOLORINGSL);CHKERRQ(ierr);
1216335efc43SPeter Brune     ierr = MatColoringSetFromOptions(coloring);CHKERRQ(ierr);
1217335efc43SPeter Brune     ierr = MatColoringApply(coloring,&iscoloring);CHKERRQ(ierr);
1218335efc43SPeter Brune     ierr = MatColoringDestroy(&coloring);CHKERRQ(ierr);
12194222ddf1SHong Zhang     ierr = MatTransposeColoringCreate(C,iscoloring,&matcoloring);CHKERRQ(ierr);
12202205254eSKarl Rupp 
122140192850SHong Zhang     abt->matcoloring = matcoloring;
12222205254eSKarl Rupp 
12232128a86cSHong Zhang     ierr = ISColoringDestroy(&iscoloring);CHKERRQ(ierr);
12242128a86cSHong Zhang 
12252128a86cSHong Zhang     /* Create Bt_dense and C_dense = A*Bt_dense */
12262128a86cSHong Zhang     ierr = MatCreate(PETSC_COMM_SELF,&Bt_dense);CHKERRQ(ierr);
12272128a86cSHong Zhang     ierr = MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors);CHKERRQ(ierr);
12282128a86cSHong Zhang     ierr = MatSetType(Bt_dense,MATSEQDENSE);CHKERRQ(ierr);
12290298fd71SBarry Smith     ierr = MatSeqDenseSetPreallocation(Bt_dense,NULL);CHKERRQ(ierr);
12302205254eSKarl Rupp 
12312128a86cSHong Zhang     Bt_dense->assembled = PETSC_TRUE;
123240192850SHong Zhang     abt->Bt_den         = Bt_dense;
12332128a86cSHong Zhang 
12342128a86cSHong Zhang     ierr = MatCreate(PETSC_COMM_SELF,&C_dense);CHKERRQ(ierr);
12352128a86cSHong Zhang     ierr = MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors);CHKERRQ(ierr);
12362128a86cSHong Zhang     ierr = MatSetType(C_dense,MATSEQDENSE);CHKERRQ(ierr);
12370298fd71SBarry Smith     ierr = MatSeqDenseSetPreallocation(C_dense,NULL);CHKERRQ(ierr);
12382205254eSKarl Rupp 
12392128a86cSHong Zhang     Bt_dense->assembled = PETSC_TRUE;
124040192850SHong Zhang     abt->ABt_den  = C_dense;
1241f94ccd6cSHong Zhang 
1242f94ccd6cSHong Zhang #if defined(PETSC_USE_INFO)
12431ce71dffSSatish Balay     {
12444222ddf1SHong Zhang       Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data;
12454222ddf1SHong Zhang       ierr = PetscInfo7(C,"Use coloring of C=A*B^T; B^T: %D %D, Bt_dense: %D,%D; Cnz %D / (cm*ncolors %D) = %g\n",B->cmap->n,B->rmap->n,Bt_dense->rmap->n,Bt_dense->cmap->n,c->nz,A->rmap->n*matcoloring->ncolors,(PetscReal)(c->nz)/(A->rmap->n*matcoloring->ncolors));CHKERRQ(ierr);
12461ce71dffSSatish Balay     }
1247f94ccd6cSHong Zhang #endif
12482128a86cSHong Zhang   }
1249e99be685SHong Zhang   /* clean up */
1250e99be685SHong Zhang   ierr = MatDestroy(&Bt);CHKERRQ(ierr);
1251e99be685SHong Zhang   ierr = MatRestoreSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
12525df89d91SHong Zhang   PetscFunctionReturn(0);
12535df89d91SHong Zhang }
12545df89d91SHong Zhang 
12556fc122caSHong Zhang PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
12565df89d91SHong Zhang {
12575df89d91SHong Zhang   PetscErrorCode      ierr;
12585df89d91SHong Zhang   Mat_SeqAIJ          *a   =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1259e2cac8caSJed Brown   PetscInt            *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow;
126081d82fe4SHong Zhang   PetscInt            cm   =C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol;
12615df89d91SHong Zhang   PetscLogDouble      flops=0.0;
1262aa1aec99SHong Zhang   MatScalar           *aa  =a->a,*aval,*ba=b->a,*bval,*ca,*cval;
1263*6718818eSStefano Zampini   Mat_MatMatTransMult *abt;
1264*6718818eSStefano Zampini   Mat_Product         *product = C->product;
12655df89d91SHong Zhang 
12665df89d91SHong Zhang   PetscFunctionBegin;
1267*6718818eSStefano Zampini   if (!product) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
1268*6718818eSStefano Zampini   abt = (Mat_MatMatTransMult *)product->data;
1269*6718818eSStefano Zampini   if (!abt) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
127058ed3ceaSHong Zhang   /* clear old values in C */
127158ed3ceaSHong Zhang   if (!c->a) {
1272580bdb30SBarry Smith     ierr      = PetscCalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
127358ed3ceaSHong Zhang     c->a      = ca;
127458ed3ceaSHong Zhang     c->free_a = PETSC_TRUE;
127558ed3ceaSHong Zhang   } else {
127658ed3ceaSHong Zhang     ca =  c->a;
1277580bdb30SBarry Smith     ierr = PetscArrayzero(ca,ci[cm]+1);CHKERRQ(ierr);
127858ed3ceaSHong Zhang   }
127958ed3ceaSHong Zhang 
128040192850SHong Zhang   if (abt->usecoloring) {
128140192850SHong Zhang     MatTransposeColoring matcoloring = abt->matcoloring;
128240192850SHong Zhang     Mat                  Bt_dense,C_dense = abt->ABt_den;
1283c8db22f6SHong Zhang 
1284b9af6bddSHong Zhang     /* Get Bt_dense by Apply MatTransposeColoring to B */
128540192850SHong Zhang     Bt_dense = abt->Bt_den;
1286b9af6bddSHong Zhang     ierr = MatTransColoringApplySpToDen(matcoloring,B,Bt_dense);CHKERRQ(ierr);
1287c8db22f6SHong Zhang 
1288c8db22f6SHong Zhang     /* C_dense = A*Bt_dense */
12892128a86cSHong Zhang     ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense);CHKERRQ(ierr);
1290c8db22f6SHong Zhang 
12912128a86cSHong Zhang     /* Recover C from C_dense */
1292b9af6bddSHong Zhang     ierr = MatTransColoringApplyDenToSp(matcoloring,C_dense,C);CHKERRQ(ierr);
1293c8db22f6SHong Zhang     PetscFunctionReturn(0);
1294c8db22f6SHong Zhang   }
1295c8db22f6SHong Zhang 
12961fa1209cSHong Zhang   for (i=0; i<cm; i++) {
129781d82fe4SHong Zhang     anzi = ai[i+1] - ai[i];
129881d82fe4SHong Zhang     acol = aj + ai[i];
12996973769bSHong Zhang     aval = aa + ai[i];
13001fa1209cSHong Zhang     cnzi = ci[i+1] - ci[i];
13011fa1209cSHong Zhang     ccol = cj + ci[i];
13026973769bSHong Zhang     cval = ca + ci[i];
13031fa1209cSHong Zhang     for (j=0; j<cnzi; j++) {
130481d82fe4SHong Zhang       brow = ccol[j];
130581d82fe4SHong Zhang       bnzj = bi[brow+1] - bi[brow];
130681d82fe4SHong Zhang       bcol = bj + bi[brow];
13076973769bSHong Zhang       bval = ba + bi[brow];
13086973769bSHong Zhang 
130981d82fe4SHong Zhang       /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
131081d82fe4SHong Zhang       nexta = 0; nextb = 0;
131181d82fe4SHong Zhang       while (nexta<anzi && nextb<bnzj) {
13127b6d5e96SMark Adams         while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++;
131381d82fe4SHong Zhang         if (nexta == anzi) break;
13147b6d5e96SMark Adams         while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++;
131581d82fe4SHong Zhang         if (nextb == bnzj) break;
131681d82fe4SHong Zhang         if (acol[nexta] == bcol[nextb]) {
13176973769bSHong Zhang           cval[j] += aval[nexta]*bval[nextb];
131881d82fe4SHong Zhang           nexta++; nextb++;
131981d82fe4SHong Zhang           flops += 2;
13201fa1209cSHong Zhang         }
13211fa1209cSHong Zhang       }
132281d82fe4SHong Zhang     }
132381d82fe4SHong Zhang   }
132481d82fe4SHong Zhang   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
132581d82fe4SHong Zhang   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
132681d82fe4SHong Zhang   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
13275df89d91SHong Zhang   PetscFunctionReturn(0);
13285df89d91SHong Zhang }
13295df89d91SHong Zhang 
1330*6718818eSStefano Zampini PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(void *data)
13316d373c3eSHong Zhang {
13326d373c3eSHong Zhang   PetscErrorCode      ierr;
1333*6718818eSStefano Zampini   Mat_MatTransMatMult *atb = (Mat_MatTransMatMult*)data;
13346d373c3eSHong Zhang 
13356d373c3eSHong Zhang   PetscFunctionBegin;
13366d373c3eSHong Zhang   ierr = MatDestroy(&atb->At);CHKERRQ(ierr);
1337*6718818eSStefano Zampini   if (atb->destroy) {
1338*6718818eSStefano Zampini     ierr = (*atb->destroy)(atb->data);CHKERRQ(ierr);
13396473ade5SStefano Zampini   }
13406d373c3eSHong Zhang   ierr = PetscFree(atb);CHKERRQ(ierr);
13416d373c3eSHong Zhang   PetscFunctionReturn(0);
13426d373c3eSHong Zhang }
13436d373c3eSHong Zhang 
13444222ddf1SHong Zhang PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
13455df89d91SHong Zhang {
1346bc011b1eSHong Zhang   PetscErrorCode      ierr;
1347bc011b1eSHong Zhang   Mat                 At;
134838baddfdSBarry Smith   PetscInt            *ati,*atj;
13494222ddf1SHong Zhang   Mat_Product         *product = C->product;
13504222ddf1SHong Zhang   MatProductAlgorithm alg;
13514222ddf1SHong Zhang   PetscBool           flg;
1352bc011b1eSHong Zhang 
1353bc011b1eSHong Zhang   PetscFunctionBegin;
13544222ddf1SHong Zhang   if (product) {
13554222ddf1SHong Zhang     alg = product->alg;
13564222ddf1SHong Zhang   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"!product, not supported yet");
13574222ddf1SHong Zhang 
13584222ddf1SHong Zhang   /* outerproduct */
13594222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"outerproduct",&flg);CHKERRQ(ierr);
13604222ddf1SHong Zhang   if (flg) {
1361bc011b1eSHong Zhang     /* create symbolic At */
1362bc011b1eSHong Zhang     ierr = MatGetSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
13630298fd71SBarry Smith     ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,A->cmap->n,A->rmap->n,ati,atj,NULL,&At);CHKERRQ(ierr);
136433d57670SJed Brown     ierr = MatSetBlockSizes(At,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr);
136504b858e0SBarry Smith     ierr = MatSetType(At,((PetscObject)A)->type_name);CHKERRQ(ierr);
1366bc011b1eSHong Zhang 
1367bc011b1eSHong Zhang     /* get symbolic C=At*B */
13687a3c3d58SStefano Zampini     ierr = MatProductSetAlgorithm(C,"sorted");CHKERRQ(ierr);
1369bc011b1eSHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(At,B,fill,C);CHKERRQ(ierr);
1370bc011b1eSHong Zhang 
1371bc011b1eSHong Zhang     /* clean up */
13726bf464f9SBarry Smith     ierr = MatDestroy(&At);CHKERRQ(ierr);
1373bc011b1eSHong Zhang     ierr = MatRestoreSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
13746d373c3eSHong Zhang 
13754222ddf1SHong Zhang     C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */
13767a3c3d58SStefano Zampini     ierr = MatProductSetAlgorithm(C,"outerproduct");CHKERRQ(ierr);
13774222ddf1SHong Zhang     PetscFunctionReturn(0);
13784222ddf1SHong Zhang   }
13794222ddf1SHong Zhang 
13804222ddf1SHong Zhang   /* matmatmult */
13814222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"at*b",&flg);CHKERRQ(ierr);
13824222ddf1SHong Zhang   if (flg) {
13834222ddf1SHong Zhang     Mat_MatTransMatMult *atb;
13844222ddf1SHong Zhang 
1385*6718818eSStefano Zampini     if (product->data) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Extra product struct not empty");
13864222ddf1SHong Zhang     ierr = PetscNew(&atb);CHKERRQ(ierr);
13874222ddf1SHong Zhang     ierr = MatTranspose_SeqAIJ(A,MAT_INITIAL_MATRIX,&At);CHKERRQ(ierr);
13887a3c3d58SStefano Zampini     ierr = MatProductSetAlgorithm(C,"sorted");CHKERRQ(ierr);
13894222ddf1SHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(At,B,fill,C);CHKERRQ(ierr);
1390*6718818eSStefano Zampini     ierr = MatProductSetAlgorithm(C,"at*b");CHKERRQ(ierr);
1391*6718818eSStefano Zampini     product->data    = atb;
1392*6718818eSStefano Zampini     product->destroy = MatDestroy_SeqAIJ_MatTransMatMult;
13934222ddf1SHong Zhang     atb->At          = At;
13944222ddf1SHong Zhang     atb->updateAt    = PETSC_FALSE; /* because At is computed here */
13954222ddf1SHong Zhang 
13964222ddf1SHong Zhang     C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */
13974222ddf1SHong Zhang     PetscFunctionReturn(0);
13984222ddf1SHong Zhang   }
13994222ddf1SHong Zhang 
14004222ddf1SHong Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported");
1401bc011b1eSHong Zhang   PetscFunctionReturn(0);
1402bc011b1eSHong Zhang }
1403bc011b1eSHong Zhang 
140475648e8dSHong Zhang PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
1405bc011b1eSHong Zhang {
1406bc011b1eSHong Zhang   PetscErrorCode ierr;
14070fbc74f4SHong Zhang   Mat_SeqAIJ     *a   =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1408d0f46423SBarry Smith   PetscInt       am   =A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb;
1409d0f46423SBarry Smith   PetscInt       cm   =C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k;
1410d13dce4bSSatish Balay   PetscLogDouble flops=0.0;
1411aa1aec99SHong Zhang   MatScalar      *aa  =a->a,*ba,*ca,*caj;
1412bc011b1eSHong Zhang 
1413bc011b1eSHong Zhang   PetscFunctionBegin;
1414aa1aec99SHong Zhang   if (!c->a) {
1415580bdb30SBarry Smith     ierr = PetscCalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
14162205254eSKarl Rupp 
1417aa1aec99SHong Zhang     c->a      = ca;
1418aa1aec99SHong Zhang     c->free_a = PETSC_TRUE;
1419aa1aec99SHong Zhang   } else {
1420aa1aec99SHong Zhang     ca   = c->a;
1421580bdb30SBarry Smith     ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr);
1422aa1aec99SHong Zhang   }
1423bc011b1eSHong Zhang 
1424bc011b1eSHong Zhang   /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
1425bc011b1eSHong Zhang   for (i=0; i<am; i++) {
1426bc011b1eSHong Zhang     bj   = b->j + bi[i];
1427bc011b1eSHong Zhang     ba   = b->a + bi[i];
1428bc011b1eSHong Zhang     bnzi = bi[i+1] - bi[i];
1429bc011b1eSHong Zhang     anzi = ai[i+1] - ai[i];
1430bc011b1eSHong Zhang     for (j=0; j<anzi; j++) {
1431bc011b1eSHong Zhang       nextb = 0;
14320fbc74f4SHong Zhang       crow  = *aj++;
1433bc011b1eSHong Zhang       cjj   = cj + ci[crow];
1434bc011b1eSHong Zhang       caj   = ca + ci[crow];
1435bc011b1eSHong Zhang       /* perform sparse axpy operation.  Note cjj includes bj. */
1436bc011b1eSHong Zhang       for (k=0; nextb<bnzi; k++) {
14370fbc74f4SHong Zhang         if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */
14380fbc74f4SHong Zhang           caj[k] += (*aa)*(*(ba+nextb));
1439bc011b1eSHong Zhang           nextb++;
1440bc011b1eSHong Zhang         }
1441bc011b1eSHong Zhang       }
1442bc011b1eSHong Zhang       flops += 2*bnzi;
14430fbc74f4SHong Zhang       aa++;
1444bc011b1eSHong Zhang     }
1445bc011b1eSHong Zhang   }
1446bc011b1eSHong Zhang 
1447bc011b1eSHong Zhang   /* Assemble the final matrix and clean up */
1448bc011b1eSHong Zhang   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1449bc011b1eSHong Zhang   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1450bc011b1eSHong Zhang   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
1451bc011b1eSHong Zhang   PetscFunctionReturn(0);
1452bc011b1eSHong Zhang }
14539123193aSHong Zhang 
14544222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
14559123193aSHong Zhang {
14569123193aSHong Zhang   PetscErrorCode ierr;
14579123193aSHong Zhang 
14589123193aSHong Zhang   PetscFunctionBegin;
14595a586d82SBarry Smith   ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr);
14602205254eSKarl Rupp 
14614222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense;
14629123193aSHong Zhang   PetscFunctionReturn(0);
14639123193aSHong Zhang }
14649123193aSHong Zhang 
1465*6718818eSStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
14669123193aSHong Zhang {
1467f32d5d43SBarry Smith   Mat_SeqAIJ        *a=(Mat_SeqAIJ*)A->data;
1468612438f1SStefano Zampini   Mat_SeqDense      *bd=(Mat_SeqDense*)B->data;
14691ca9667aSStefano Zampini   Mat_SeqDense      *cd=(Mat_SeqDense*)C->data;
14709123193aSHong Zhang   PetscErrorCode    ierr;
14711ca9667aSStefano Zampini   PetscScalar       *c,r1,r2,r3,r4,*c1,*c2,*c3,*c4;
1472a4af7ca8SStefano Zampini   const PetscScalar *aa,*b,*b1,*b2,*b3,*b4,*av;
1473daf57211SHong Zhang   const PetscInt    *aj;
1474612438f1SStefano Zampini   PetscInt          cm=C->rmap->n,cn=B->cmap->n,bm=bd->lda,am=A->rmap->n;
14751ca9667aSStefano Zampini   PetscInt          clda=cd->lda;
14761ca9667aSStefano Zampini   PetscInt          am4=4*clda,bm4=4*bm,col,i,j,n;
14779123193aSHong Zhang 
14789123193aSHong Zhang   PetscFunctionBegin;
1479f32d5d43SBarry Smith   if (!cm || !cn) PetscFunctionReturn(0);
1480a4af7ca8SStefano Zampini   ierr = MatSeqAIJGetArrayRead(A,&av);CHKERRQ(ierr);
14818c778c55SBarry Smith   ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr);
1482a4af7ca8SStefano Zampini   ierr = MatDenseGetArrayRead(B,&b);CHKERRQ(ierr);
1483f32d5d43SBarry Smith   b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
14841ca9667aSStefano Zampini   c1 = c; c2 = c1 + clda; c3 = c2 + clda; c4 = c3 + clda;
14851ca9667aSStefano Zampini   for (col=0; col<(cn/4)*4; col += 4) {  /* over columns of C */
14861ca9667aSStefano Zampini     for (i=0; i<am; i++) {        /* over rows of A in those columns */
1487f32d5d43SBarry Smith       r1 = r2 = r3 = r4 = 0.0;
1488f32d5d43SBarry Smith       n  = a->i[i+1] - a->i[i];
1489f32d5d43SBarry Smith       aj = a->j + a->i[i];
1490a4af7ca8SStefano Zampini       aa = av + a->i[i];
1491f32d5d43SBarry Smith       for (j=0; j<n; j++) {
14921ca9667aSStefano Zampini         const PetscScalar aatmp = aa[j];
14931ca9667aSStefano Zampini         const PetscInt    ajtmp = aj[j];
1494730858b9SHong Zhang         r1 += aatmp*b1[ajtmp];
1495730858b9SHong Zhang         r2 += aatmp*b2[ajtmp];
1496730858b9SHong Zhang         r3 += aatmp*b3[ajtmp];
1497730858b9SHong Zhang         r4 += aatmp*b4[ajtmp];
14989123193aSHong Zhang       }
149987753ddeSHong Zhang       c1[i] += r1;
150087753ddeSHong Zhang       c2[i] += r2;
150187753ddeSHong Zhang       c3[i] += r3;
150287753ddeSHong Zhang       c4[i] += r4;
1503f32d5d43SBarry Smith     }
1504730858b9SHong Zhang     b1 += bm4; b2 += bm4; b3 += bm4; b4 += bm4;
1505730858b9SHong Zhang     c1 += am4; c2 += am4; c3 += am4; c4 += am4;
1506f32d5d43SBarry Smith   }
1507f32d5d43SBarry Smith   for (; col<cn; col++) {   /* over extra columns of C */
1508f32d5d43SBarry Smith     for (i=0; i<am; i++) {  /* over rows of C in those columns */
1509f32d5d43SBarry Smith       r1 = 0.0;
1510f32d5d43SBarry Smith       n  = a->i[i+1] - a->i[i];
1511f32d5d43SBarry Smith       aj = a->j + a->i[i];
1512a4af7ca8SStefano Zampini       aa = av + a->i[i];
1513f32d5d43SBarry Smith       for (j=0; j<n; j++) {
1514730858b9SHong Zhang         r1 += aa[j]*b1[aj[j]];
1515f32d5d43SBarry Smith       }
151687753ddeSHong Zhang       c1[i] += r1;
1517f32d5d43SBarry Smith     }
1518f32d5d43SBarry Smith     b1 += bm;
15191ca9667aSStefano Zampini     c1 += clda;
1520f32d5d43SBarry Smith   }
1521dc0b31edSSatish Balay   ierr = PetscLogFlops(cn*(2.0*a->nz));CHKERRQ(ierr);
15228c778c55SBarry Smith   ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr);
1523a4af7ca8SStefano Zampini   ierr = MatDenseRestoreArrayRead(B,&b);CHKERRQ(ierr);
1524a4af7ca8SStefano Zampini   ierr = MatSeqAIJRestoreArrayRead(A,&av);CHKERRQ(ierr);
1525f32d5d43SBarry Smith   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1526f32d5d43SBarry Smith   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1527f32d5d43SBarry Smith   PetscFunctionReturn(0);
1528f32d5d43SBarry Smith }
1529f32d5d43SBarry Smith 
153087753ddeSHong Zhang PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1531f32d5d43SBarry Smith {
1532f32d5d43SBarry Smith   PetscErrorCode ierr;
1533f32d5d43SBarry Smith 
1534f32d5d43SBarry Smith   PetscFunctionBegin;
153587753ddeSHong Zhang   if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n);
153687753ddeSHong Zhang   if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
153787753ddeSHong Zhang   if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
15384324174eSBarry Smith 
153987753ddeSHong Zhang   ierr = MatZeroEntries(C);CHKERRQ(ierr);
154087753ddeSHong Zhang   ierr = MatMatMultNumericAdd_SeqAIJ_SeqDense(A,B,C);CHKERRQ(ierr);
15419123193aSHong Zhang   PetscFunctionReturn(0);
15429123193aSHong Zhang }
1543b1683b59SHong Zhang 
15444222ddf1SHong Zhang /* ------------------------------------------------------- */
15454222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C)
15464222ddf1SHong Zhang {
15474222ddf1SHong Zhang   PetscFunctionBegin;
15484222ddf1SHong Zhang   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense;
15494222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
15504222ddf1SHong Zhang   PetscFunctionReturn(0);
15514222ddf1SHong Zhang }
15524222ddf1SHong Zhang 
1553*6718818eSStefano Zampini PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat,Mat,PetscReal,Mat);
1554*6718818eSStefano Zampini 
15554222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C)
15564222ddf1SHong Zhang {
15574222ddf1SHong Zhang   PetscFunctionBegin;
1558*6718818eSStefano Zampini   C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
15594222ddf1SHong Zhang   C->ops->productsymbolic          = MatProductSymbolic_AtB;
1560*6718818eSStefano Zampini   PetscFunctionReturn(0);
1561*6718818eSStefano Zampini }
1562*6718818eSStefano Zampini 
1563*6718818eSStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C)
1564*6718818eSStefano Zampini {
1565*6718818eSStefano Zampini   PetscFunctionBegin;
1566*6718818eSStefano Zampini   C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
1567*6718818eSStefano Zampini   C->ops->productsymbolic          = MatProductSymbolic_ABt;
15684222ddf1SHong Zhang   PetscFunctionReturn(0);
15694222ddf1SHong Zhang }
15704222ddf1SHong Zhang 
15714222ddf1SHong Zhang PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C)
15724222ddf1SHong Zhang {
15734222ddf1SHong Zhang   PetscErrorCode ierr;
15744222ddf1SHong Zhang   Mat_Product    *product = C->product;
15754222ddf1SHong Zhang 
15764222ddf1SHong Zhang   PetscFunctionBegin;
15774222ddf1SHong Zhang   switch (product->type) {
15784222ddf1SHong Zhang   case MATPRODUCT_AB:
15794222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C);CHKERRQ(ierr);
15804222ddf1SHong Zhang     break;
15814222ddf1SHong Zhang   case MATPRODUCT_AtB:
15824222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C);CHKERRQ(ierr);
15834222ddf1SHong Zhang     break;
1584*6718818eSStefano Zampini   case MATPRODUCT_ABt:
1585*6718818eSStefano Zampini     ierr = MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C);CHKERRQ(ierr);
15864222ddf1SHong Zhang     break;
15874222ddf1SHong Zhang   default:
1588*6718818eSStefano Zampini     break;
15894222ddf1SHong Zhang   }
15904222ddf1SHong Zhang   PetscFunctionReturn(0);
15914222ddf1SHong Zhang }
15924222ddf1SHong Zhang /* ------------------------------------------------------- */
15934222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C)
15944222ddf1SHong Zhang {
15954222ddf1SHong Zhang   PetscErrorCode ierr;
15964222ddf1SHong Zhang   Mat_Product    *product = C->product;
15974222ddf1SHong Zhang   Mat            A = product->A;
15984222ddf1SHong Zhang   PetscBool      baij;
15994222ddf1SHong Zhang 
16004222ddf1SHong Zhang   PetscFunctionBegin;
16014222ddf1SHong Zhang   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQBAIJ,&baij);CHKERRQ(ierr);
16024222ddf1SHong Zhang   if (!baij) { /* A is seqsbaij */
16034222ddf1SHong Zhang     PetscBool sbaij;
16044222ddf1SHong Zhang     ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQSBAIJ,&sbaij);CHKERRQ(ierr);
16054222ddf1SHong Zhang     if (!sbaij) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"Mat must be either seqbaij or seqsbaij format");
16064222ddf1SHong Zhang 
16074222ddf1SHong Zhang     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense;
16084222ddf1SHong Zhang   } else { /* A is seqbaij */
16094222ddf1SHong Zhang     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense;
16104222ddf1SHong Zhang   }
16114222ddf1SHong Zhang 
16124222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
16134222ddf1SHong Zhang   PetscFunctionReturn(0);
16144222ddf1SHong Zhang }
16154222ddf1SHong Zhang 
16164222ddf1SHong Zhang PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C)
16174222ddf1SHong Zhang {
16184222ddf1SHong Zhang   PetscErrorCode ierr;
16194222ddf1SHong Zhang   Mat_Product    *product = C->product;
16204222ddf1SHong Zhang 
16214222ddf1SHong Zhang   PetscFunctionBegin;
1622*6718818eSStefano Zampini   MatCheckProduct(C,1);
1623*6718818eSStefano Zampini   if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
1624*6718818eSStefano Zampini   if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric)) {
16254222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C);CHKERRQ(ierr);
1626*6718818eSStefano Zampini   }
16274222ddf1SHong Zhang   PetscFunctionReturn(0);
16284222ddf1SHong Zhang }
1629*6718818eSStefano Zampini 
16304222ddf1SHong Zhang /* ------------------------------------------------------- */
16314222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C)
16324222ddf1SHong Zhang {
16334222ddf1SHong Zhang   PetscFunctionBegin;
16344222ddf1SHong Zhang   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ;
16354222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
16364222ddf1SHong Zhang   PetscFunctionReturn(0);
16374222ddf1SHong Zhang }
16384222ddf1SHong Zhang 
16394222ddf1SHong Zhang PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C)
16404222ddf1SHong Zhang {
16414222ddf1SHong Zhang   PetscErrorCode ierr;
16424222ddf1SHong Zhang   Mat_Product    *product = C->product;
16434222ddf1SHong Zhang 
16444222ddf1SHong Zhang   PetscFunctionBegin;
16454222ddf1SHong Zhang   if (product->type == MATPRODUCT_AB) {
16464222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C);CHKERRQ(ierr);
1647*6718818eSStefano Zampini   }
16484222ddf1SHong Zhang   PetscFunctionReturn(0);
16494222ddf1SHong Zhang }
16504222ddf1SHong Zhang /* ------------------------------------------------------- */
16514222ddf1SHong Zhang 
1652b9af6bddSHong Zhang PetscErrorCode  MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense)
1653c8db22f6SHong Zhang {
1654c8db22f6SHong Zhang   PetscErrorCode ierr;
16552f5f1f90SHong Zhang   Mat_SeqAIJ     *b       = (Mat_SeqAIJ*)B->data;
16562f5f1f90SHong Zhang   Mat_SeqDense   *btdense = (Mat_SeqDense*)Btdense->data;
16572f5f1f90SHong Zhang   PetscInt       *bi      = b->i,*bj=b->j;
16582f5f1f90SHong Zhang   PetscInt       m        = Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns;
16592f5f1f90SHong Zhang   MatScalar      *btval,*btval_den,*ba=b->a;
16602f5f1f90SHong Zhang   PetscInt       *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors;
1661c8db22f6SHong Zhang 
1662c8db22f6SHong Zhang   PetscFunctionBegin;
16632f5f1f90SHong Zhang   btval_den=btdense->v;
1664580bdb30SBarry Smith   ierr     = PetscArrayzero(btval_den,m*n);CHKERRQ(ierr);
16652f5f1f90SHong Zhang   for (k=0; k<ncolors; k++) {
16662f5f1f90SHong Zhang     ncolumns = coloring->ncolumns[k];
16672f5f1f90SHong Zhang     for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */
1668d2853540SHong Zhang       col   = *(columns + colorforcol[k] + l);
16692f5f1f90SHong Zhang       btcol = bj + bi[col];
16702f5f1f90SHong Zhang       btval = ba + bi[col];
16712f5f1f90SHong Zhang       anz   = bi[col+1] - bi[col];
1672d2853540SHong Zhang       for (j=0; j<anz; j++) {
16732f5f1f90SHong Zhang         brow            = btcol[j];
16742f5f1f90SHong Zhang         btval_den[brow] = btval[j];
1675c8db22f6SHong Zhang       }
1676c8db22f6SHong Zhang     }
16772f5f1f90SHong Zhang     btval_den += m;
1678c8db22f6SHong Zhang   }
1679c8db22f6SHong Zhang   PetscFunctionReturn(0);
1680c8db22f6SHong Zhang }
1681c8db22f6SHong Zhang 
1682b9af6bddSHong Zhang PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp)
16838972f759SHong Zhang {
16848972f759SHong Zhang   PetscErrorCode    ierr;
1685b2d2b04fSHong Zhang   Mat_SeqAIJ        *csp = (Mat_SeqAIJ*)Csp->data;
16861683a169SBarry Smith   const PetscScalar *ca_den,*ca_den_ptr;
16871683a169SBarry Smith   PetscScalar       *ca=csp->a;
1688f99a636bSHong Zhang   PetscInt          k,l,m=Cden->rmap->n,ncolors=matcoloring->ncolors;
1689e88777f2SHong Zhang   PetscInt          brows=matcoloring->brows,*den2sp=matcoloring->den2sp;
1690077f23c2SHong Zhang   PetscInt          nrows,*row,*idx;
1691077f23c2SHong Zhang   PetscInt          *rows=matcoloring->rows,*colorforrow=matcoloring->colorforrow;
16928972f759SHong Zhang 
16938972f759SHong Zhang   PetscFunctionBegin;
16941683a169SBarry Smith   ierr   = MatDenseGetArrayRead(Cden,&ca_den);CHKERRQ(ierr);
1695f99a636bSHong Zhang 
1696077f23c2SHong Zhang   if (brows > 0) {
1697077f23c2SHong Zhang     PetscInt *lstart,row_end,row_start;
1698980ae229SHong Zhang     lstart = matcoloring->lstart;
1699580bdb30SBarry Smith     ierr = PetscArrayzero(lstart,ncolors);CHKERRQ(ierr);
1700eeb4fd02SHong Zhang 
1701077f23c2SHong Zhang     row_end = brows;
1702eeb4fd02SHong Zhang     if (row_end > m) row_end = m;
1703077f23c2SHong Zhang     for (row_start=0; row_start<m; row_start+=brows) { /* loop over row blocks of Csp */
1704077f23c2SHong Zhang       ca_den_ptr = ca_den;
1705980ae229SHong Zhang       for (k=0; k<ncolors; k++) { /* loop over colors (columns of Cden) */
1706eeb4fd02SHong Zhang         nrows = matcoloring->nrows[k];
1707eeb4fd02SHong Zhang         row   = rows  + colorforrow[k];
1708eeb4fd02SHong Zhang         idx   = den2sp + colorforrow[k];
1709eeb4fd02SHong Zhang         for (l=lstart[k]; l<nrows; l++) {
1710eeb4fd02SHong Zhang           if (row[l] >= row_end) {
1711eeb4fd02SHong Zhang             lstart[k] = l;
1712eeb4fd02SHong Zhang             break;
1713eeb4fd02SHong Zhang           } else {
1714077f23c2SHong Zhang             ca[idx[l]] = ca_den_ptr[row[l]];
1715eeb4fd02SHong Zhang           }
1716eeb4fd02SHong Zhang         }
1717077f23c2SHong Zhang         ca_den_ptr += m;
1718eeb4fd02SHong Zhang       }
1719077f23c2SHong Zhang       row_end += brows;
1720eeb4fd02SHong Zhang       if (row_end > m) row_end = m;
1721eeb4fd02SHong Zhang     }
1722077f23c2SHong Zhang   } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */
1723077f23c2SHong Zhang     ca_den_ptr = ca_den;
1724077f23c2SHong Zhang     for (k=0; k<ncolors; k++) {
1725077f23c2SHong Zhang       nrows = matcoloring->nrows[k];
1726077f23c2SHong Zhang       row   = rows  + colorforrow[k];
1727077f23c2SHong Zhang       idx   = den2sp + colorforrow[k];
1728077f23c2SHong Zhang       for (l=0; l<nrows; l++) {
1729077f23c2SHong Zhang         ca[idx[l]] = ca_den_ptr[row[l]];
1730077f23c2SHong Zhang       }
1731077f23c2SHong Zhang       ca_den_ptr += m;
1732077f23c2SHong Zhang     }
1733f99a636bSHong Zhang   }
1734f99a636bSHong Zhang 
17351683a169SBarry Smith   ierr = MatDenseRestoreArrayRead(Cden,&ca_den);CHKERRQ(ierr);
1736f99a636bSHong Zhang #if defined(PETSC_USE_INFO)
1737077f23c2SHong Zhang   if (matcoloring->brows > 0) {
1738f99a636bSHong Zhang     ierr = PetscInfo1(Csp,"Loop over %D row blocks for den2sp\n",brows);CHKERRQ(ierr);
1739e88777f2SHong Zhang   } else {
1740077f23c2SHong Zhang     ierr = PetscInfo(Csp,"Loop over colors/columns of Cden, inefficient for large sparse matrix product \n");CHKERRQ(ierr);
1741e88777f2SHong Zhang   }
1742f99a636bSHong Zhang #endif
17438972f759SHong Zhang   PetscFunctionReturn(0);
17448972f759SHong Zhang }
17458972f759SHong Zhang 
1746b9af6bddSHong Zhang PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c)
1747b1683b59SHong Zhang {
1748b1683b59SHong Zhang   PetscErrorCode ierr;
1749e88777f2SHong Zhang   PetscInt       i,n,nrows,Nbs,j,k,m,ncols,col,cm;
17501a83f524SJed Brown   const PetscInt *is,*ci,*cj,*row_idx;
1751b28d80bdSHong Zhang   PetscInt       nis = iscoloring->n,*rowhit,bs = 1;
1752b1683b59SHong Zhang   IS             *isa;
1753b28d80bdSHong Zhang   Mat_SeqAIJ     *csp = (Mat_SeqAIJ*)mat->data;
1754e88777f2SHong Zhang   PetscInt       *colorforrow,*rows,*rows_i,*idxhit,*spidx,*den2sp,*den2sp_i;
1755e88777f2SHong Zhang   PetscInt       *colorforcol,*columns,*columns_i,brows;
1756e88777f2SHong Zhang   PetscBool      flg;
1757b1683b59SHong Zhang 
1758b1683b59SHong Zhang   PetscFunctionBegin;
1759071fcb05SBarry Smith   ierr = ISColoringGetIS(iscoloring,PETSC_USE_POINTER,PETSC_IGNORE,&isa);CHKERRQ(ierr);
1760e99be685SHong Zhang 
17617ecccc15SHong Zhang   /* bs >1 is not being tested yet! */
1762e88777f2SHong Zhang   Nbs       = mat->cmap->N/bs;
1763b1683b59SHong Zhang   c->M      = mat->rmap->N/bs;  /* set total rows, columns and local rows */
1764e88777f2SHong Zhang   c->N      = Nbs;
1765e88777f2SHong Zhang   c->m      = c->M;
1766b1683b59SHong Zhang   c->rstart = 0;
1767e88777f2SHong Zhang   c->brows  = 100;
1768b1683b59SHong Zhang 
1769b1683b59SHong Zhang   c->ncolors = nis;
1770dcca6d9dSJed Brown   ierr = PetscMalloc3(nis,&c->ncolumns,nis,&c->nrows,nis+1,&colorforrow);CHKERRQ(ierr);
1771854ce69bSBarry Smith   ierr = PetscMalloc1(csp->nz+1,&rows);CHKERRQ(ierr);
1772854ce69bSBarry Smith   ierr = PetscMalloc1(csp->nz+1,&den2sp);CHKERRQ(ierr);
1773e88777f2SHong Zhang 
1774e88777f2SHong Zhang   brows = c->brows;
1775c5929fdfSBarry Smith   ierr = PetscOptionsGetInt(NULL,NULL,"-matden2sp_brows",&brows,&flg);CHKERRQ(ierr);
1776e88777f2SHong Zhang   if (flg) c->brows = brows;
1777eeb4fd02SHong Zhang   if (brows > 0) {
1778854ce69bSBarry Smith     ierr = PetscMalloc1(nis+1,&c->lstart);CHKERRQ(ierr);
1779e88777f2SHong Zhang   }
17802205254eSKarl Rupp 
1781d2853540SHong Zhang   colorforrow[0] = 0;
1782d2853540SHong Zhang   rows_i         = rows;
1783f99a636bSHong Zhang   den2sp_i       = den2sp;
1784b1683b59SHong Zhang 
1785854ce69bSBarry Smith   ierr = PetscMalloc1(nis+1,&colorforcol);CHKERRQ(ierr);
1786854ce69bSBarry Smith   ierr = PetscMalloc1(Nbs+1,&columns);CHKERRQ(ierr);
17872205254eSKarl Rupp 
1788d2853540SHong Zhang   colorforcol[0] = 0;
1789d2853540SHong Zhang   columns_i      = columns;
1790d2853540SHong Zhang 
1791077f23c2SHong Zhang   /* get column-wise storage of mat */
1792077f23c2SHong Zhang   ierr = MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr);
1793b1683b59SHong Zhang 
1794b28d80bdSHong Zhang   cm   = c->m;
1795854ce69bSBarry Smith   ierr = PetscMalloc1(cm+1,&rowhit);CHKERRQ(ierr);
1796854ce69bSBarry Smith   ierr = PetscMalloc1(cm+1,&idxhit);CHKERRQ(ierr);
1797f99a636bSHong Zhang   for (i=0; i<nis; i++) { /* loop over color */
1798b1683b59SHong Zhang     ierr = ISGetLocalSize(isa[i],&n);CHKERRQ(ierr);
1799b1683b59SHong Zhang     ierr = ISGetIndices(isa[i],&is);CHKERRQ(ierr);
18002205254eSKarl Rupp 
1801b1683b59SHong Zhang     c->ncolumns[i] = n;
1802b1683b59SHong Zhang     if (n) {
1803580bdb30SBarry Smith       ierr = PetscArraycpy(columns_i,is,n);CHKERRQ(ierr);
1804b1683b59SHong Zhang     }
1805d2853540SHong Zhang     colorforcol[i+1] = colorforcol[i] + n;
1806d2853540SHong Zhang     columns_i       += n;
1807b1683b59SHong Zhang 
1808b1683b59SHong Zhang     /* fast, crude version requires O(N*N) work */
1809580bdb30SBarry Smith     ierr = PetscArrayzero(rowhit,cm);CHKERRQ(ierr);
1810e99be685SHong Zhang 
1811b7caf3d6SHong Zhang     for (j=0; j<n; j++) { /* loop over columns*/
1812b1683b59SHong Zhang       col     = is[j];
1813d2853540SHong Zhang       row_idx = cj + ci[col];
1814b1683b59SHong Zhang       m       = ci[col+1] - ci[col];
1815b7caf3d6SHong Zhang       for (k=0; k<m; k++) { /* loop over columns marking them in rowhit */
1816e99be685SHong Zhang         idxhit[*row_idx]   = spidx[ci[col] + k];
1817d2853540SHong Zhang         rowhit[*row_idx++] = col + 1;
1818b1683b59SHong Zhang       }
1819b1683b59SHong Zhang     }
1820b1683b59SHong Zhang     /* count the number of hits */
1821b1683b59SHong Zhang     nrows = 0;
1822e8354b3eSHong Zhang     for (j=0; j<cm; j++) {
1823b1683b59SHong Zhang       if (rowhit[j]) nrows++;
1824b1683b59SHong Zhang     }
1825b1683b59SHong Zhang     c->nrows[i]      = nrows;
1826d2853540SHong Zhang     colorforrow[i+1] = colorforrow[i] + nrows;
1827d2853540SHong Zhang 
1828b1683b59SHong Zhang     nrows = 0;
1829b7caf3d6SHong Zhang     for (j=0; j<cm; j++) { /* loop over rows */
1830b1683b59SHong Zhang       if (rowhit[j]) {
1831d2853540SHong Zhang         rows_i[nrows]   = j;
183212b89a43SHong Zhang         den2sp_i[nrows] = idxhit[j];
1833b1683b59SHong Zhang         nrows++;
1834b1683b59SHong Zhang       }
1835b1683b59SHong Zhang     }
1836e88777f2SHong Zhang     den2sp_i += nrows;
1837077f23c2SHong Zhang 
1838b1683b59SHong Zhang     ierr    = ISRestoreIndices(isa[i],&is);CHKERRQ(ierr);
1839f99a636bSHong Zhang     rows_i += nrows;
1840b1683b59SHong Zhang   }
18410298fd71SBarry Smith   ierr = MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr);
1842b28d80bdSHong Zhang   ierr = PetscFree(rowhit);CHKERRQ(ierr);
1843071fcb05SBarry Smith   ierr = ISColoringRestoreIS(iscoloring,PETSC_USE_POINTER,&isa);CHKERRQ(ierr);
1844d2853540SHong Zhang   if (csp->nz != colorforrow[nis]) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"csp->nz %d != colorforrow[nis] %d",csp->nz,colorforrow[nis]);
1845b28d80bdSHong Zhang 
1846d2853540SHong Zhang   c->colorforrow = colorforrow;
1847d2853540SHong Zhang   c->rows        = rows;
1848f99a636bSHong Zhang   c->den2sp      = den2sp;
1849d2853540SHong Zhang   c->colorforcol = colorforcol;
1850d2853540SHong Zhang   c->columns     = columns;
18512205254eSKarl Rupp 
1852f94ccd6cSHong Zhang   ierr = PetscFree(idxhit);CHKERRQ(ierr);
1853b1683b59SHong Zhang   PetscFunctionReturn(0);
1854b1683b59SHong Zhang }
1855d013fc79Sandi selinger 
18564222ddf1SHong Zhang /* --------------------------------------------------------------- */
18574222ddf1SHong Zhang static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C)
1858df97dc6dSFande Kong {
18594222ddf1SHong Zhang   PetscErrorCode ierr;
18604222ddf1SHong Zhang   Mat_Product    *product = C->product;
18614222ddf1SHong Zhang   Mat            A=product->A,B=product->B;
18624222ddf1SHong Zhang 
1863df97dc6dSFande Kong   PetscFunctionBegin;
18644222ddf1SHong Zhang   if (C->ops->mattransposemultnumeric) {
18654222ddf1SHong Zhang     /* Alg: "outerproduct" */
1866*6718818eSStefano Zampini     ierr = (*C->ops->mattransposemultnumeric)(A,B,C);CHKERRQ(ierr);
18674222ddf1SHong Zhang   } else {
18684222ddf1SHong Zhang     /* Alg: "matmatmult" -- C = At*B */
1869*6718818eSStefano Zampini     Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)product->data;
1870*6718818eSStefano Zampini     Mat                 At;
18714222ddf1SHong Zhang 
1872*6718818eSStefano Zampini     if (!atb) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
1873*6718818eSStefano Zampini     At = atb->At;
18744222ddf1SHong Zhang     if (atb->updateAt) { /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ() */
18754222ddf1SHong Zhang       ierr = MatTranspose_SeqAIJ(A,MAT_REUSE_MATRIX,&At);CHKERRQ(ierr);
18764222ddf1SHong Zhang     }
18774222ddf1SHong Zhang     ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ(At,B,C);CHKERRQ(ierr);
18784222ddf1SHong Zhang     atb->updateAt = PETSC_TRUE;
18794222ddf1SHong Zhang   }
1880df97dc6dSFande Kong   PetscFunctionReturn(0);
1881df97dc6dSFande Kong }
1882df97dc6dSFande Kong 
18834222ddf1SHong Zhang static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C)
1884d013fc79Sandi selinger {
1885d013fc79Sandi selinger   PetscErrorCode ierr;
18864222ddf1SHong Zhang   Mat_Product    *product = C->product;
18874222ddf1SHong Zhang   Mat            A=product->A,B=product->B;
18884222ddf1SHong Zhang   PetscReal      fill=product->fill;
1889d013fc79Sandi selinger 
1890d013fc79Sandi selinger   PetscFunctionBegin;
18914222ddf1SHong Zhang   ierr = MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
18922869b61bSandi selinger 
18934222ddf1SHong Zhang   C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ;
18944222ddf1SHong Zhang   PetscFunctionReturn(0);
18952869b61bSandi selinger }
1896d013fc79Sandi selinger 
18974222ddf1SHong Zhang /* --------------------------------------------------------------- */
18984222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C)
18994222ddf1SHong Zhang {
19004222ddf1SHong Zhang   PetscErrorCode ierr;
19014222ddf1SHong Zhang   Mat_Product    *product = C->product;
19024222ddf1SHong Zhang   PetscInt       alg = 0; /* default algorithm */
19034222ddf1SHong Zhang   PetscBool      flg = PETSC_FALSE;
19044222ddf1SHong Zhang #if !defined(PETSC_HAVE_HYPRE)
19054222ddf1SHong Zhang   const char     *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"};
19064222ddf1SHong Zhang   PetscInt       nalg = 7;
19074222ddf1SHong Zhang #else
19084222ddf1SHong Zhang   const char     *algTypes[8] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge","hypre"};
19094222ddf1SHong Zhang   PetscInt       nalg = 8;
19104222ddf1SHong Zhang #endif
19114222ddf1SHong Zhang 
19124222ddf1SHong Zhang   PetscFunctionBegin;
19134222ddf1SHong Zhang   /* Set default algorithm */
19144222ddf1SHong Zhang   ierr = PetscStrcmp(C->product->alg,"default",&flg);CHKERRQ(ierr);
19154222ddf1SHong Zhang   if (flg) {
19164222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
1917d013fc79Sandi selinger   }
1918d013fc79Sandi selinger 
19194222ddf1SHong Zhang   /* Get runtime option */
19204222ddf1SHong Zhang   if (product->api_user) {
19214222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
19224222ddf1SHong Zhang     ierr = PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
19234222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
19244222ddf1SHong Zhang   } else {
19254222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr);
19264222ddf1SHong Zhang     ierr = PetscOptionsEList("-matproduct_ab_via","Algorithmic approach","MatProduct_AB",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
19274222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
1928d013fc79Sandi selinger   }
19294222ddf1SHong Zhang   if (flg) {
19304222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
1931d013fc79Sandi selinger   }
1932d013fc79Sandi selinger 
19334222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
19344222ddf1SHong Zhang   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ;
19354222ddf1SHong Zhang   PetscFunctionReturn(0);
19364222ddf1SHong Zhang }
1937d013fc79Sandi selinger 
19384222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C)
19394222ddf1SHong Zhang {
19404222ddf1SHong Zhang   PetscErrorCode ierr;
19414222ddf1SHong Zhang   Mat_Product    *product = C->product;
19424222ddf1SHong Zhang   PetscInt       alg = 0; /* default algorithm */
19434222ddf1SHong Zhang   PetscBool      flg = PETSC_FALSE;
19444222ddf1SHong Zhang   const char     *algTypes[2] = {"at*b","outerproduct"};
19454222ddf1SHong Zhang   PetscInt       nalg = 2;
1946d013fc79Sandi selinger 
19474222ddf1SHong Zhang   PetscFunctionBegin;
19484222ddf1SHong Zhang   /* Set default algorithm */
19494222ddf1SHong Zhang   ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr);
19504222ddf1SHong Zhang   if (flg) {
19514222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
19524222ddf1SHong Zhang   }
1953d013fc79Sandi selinger 
19544222ddf1SHong Zhang   /* Get runtime option */
19554222ddf1SHong Zhang   if (product->api_user) {
19564222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
19574222ddf1SHong Zhang     ierr = PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
19584222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
19594222ddf1SHong Zhang   } else {
19604222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr);
19614222ddf1SHong Zhang     ierr = PetscOptionsEList("-matproduct_atb_via","Algorithmic approach","MatProduct_AtB",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
19624222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
19634222ddf1SHong Zhang   }
19644222ddf1SHong Zhang   if (flg) {
19654222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
19664222ddf1SHong Zhang   }
1967d013fc79Sandi selinger 
19684222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ;
19694222ddf1SHong Zhang   PetscFunctionReturn(0);
19704222ddf1SHong Zhang }
19714222ddf1SHong Zhang 
19724222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C)
19734222ddf1SHong Zhang {
19744222ddf1SHong Zhang   PetscErrorCode ierr;
19754222ddf1SHong Zhang   Mat_Product    *product = C->product;
19764222ddf1SHong Zhang   PetscInt       alg = 0; /* default algorithm */
19774222ddf1SHong Zhang   PetscBool      flg = PETSC_FALSE;
19784222ddf1SHong Zhang   const char     *algTypes[2] = {"default","color"};
19794222ddf1SHong Zhang   PetscInt       nalg = 2;
19804222ddf1SHong Zhang 
19814222ddf1SHong Zhang   PetscFunctionBegin;
19824222ddf1SHong Zhang   /* Set default algorithm */
19834222ddf1SHong Zhang   ierr = PetscStrcmp(C->product->alg,"default",&flg);CHKERRQ(ierr);
19844222ddf1SHong Zhang   if (!flg) {
19854222ddf1SHong Zhang     alg = 1;
19864222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
19874222ddf1SHong Zhang   }
19884222ddf1SHong Zhang 
19894222ddf1SHong Zhang   /* Get runtime option */
19904222ddf1SHong Zhang   if (product->api_user) {
19914222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatTransposeMult","Mat");CHKERRQ(ierr);
19924222ddf1SHong Zhang     ierr = PetscOptionsEList("-matmattransmult_via","Algorithmic approach","MatMatTransposeMult",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
19934222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
19944222ddf1SHong Zhang   } else {
19954222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABt","Mat");CHKERRQ(ierr);
19964222ddf1SHong Zhang     ierr = PetscOptionsEList("-matproduct_abt_via","Algorithmic approach","MatProduct_ABt",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
19974222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
19984222ddf1SHong Zhang   }
19994222ddf1SHong Zhang   if (flg) {
20004222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
20014222ddf1SHong Zhang   }
20024222ddf1SHong Zhang 
20034222ddf1SHong Zhang   C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ;
20044222ddf1SHong Zhang   C->ops->productsymbolic          = MatProductSymbolic_ABt;
20054222ddf1SHong Zhang   PetscFunctionReturn(0);
20064222ddf1SHong Zhang }
20074222ddf1SHong Zhang 
20084222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C)
20094222ddf1SHong Zhang {
20104222ddf1SHong Zhang   PetscErrorCode ierr;
20114222ddf1SHong Zhang   Mat_Product    *product = C->product;
20124222ddf1SHong Zhang   PetscBool      flg = PETSC_FALSE;
20134222ddf1SHong Zhang   PetscInt       alg = 0; /* default algorithm -- alg=1 should be default!!! */
20144222ddf1SHong Zhang #if !defined(PETSC_HAVE_HYPRE)
20154222ddf1SHong Zhang   const char      *algTypes[2] = {"scalable","rap"};
20164222ddf1SHong Zhang   PetscInt        nalg = 2;
20174222ddf1SHong Zhang #else
20184222ddf1SHong Zhang   const char      *algTypes[3] = {"scalable","rap","hypre"};
20194222ddf1SHong Zhang   PetscInt        nalg = 3;
20204222ddf1SHong Zhang #endif
20214222ddf1SHong Zhang 
20224222ddf1SHong Zhang   PetscFunctionBegin;
20234222ddf1SHong Zhang   /* Set default algorithm */
20244222ddf1SHong Zhang   ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr);
20254222ddf1SHong Zhang   if (flg) {
20264222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
20274222ddf1SHong Zhang   }
20284222ddf1SHong Zhang 
20294222ddf1SHong Zhang   /* Get runtime option */
20304222ddf1SHong Zhang   if (product->api_user) {
20314222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
20324222ddf1SHong Zhang     ierr = PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
20334222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
20344222ddf1SHong Zhang   } else {
20354222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr);
20364222ddf1SHong Zhang     ierr = PetscOptionsEList("-matproduct_ptap_via","Algorithmic approach","MatProduct_PtAP",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
20374222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
20384222ddf1SHong Zhang   }
20394222ddf1SHong Zhang   if (flg) {
20404222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
20414222ddf1SHong Zhang   }
20424222ddf1SHong Zhang 
20434222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ;
20444222ddf1SHong Zhang   PetscFunctionReturn(0);
20454222ddf1SHong Zhang }
20464222ddf1SHong Zhang 
20474222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C)
20484222ddf1SHong Zhang {
20494222ddf1SHong Zhang   PetscErrorCode ierr;
20504222ddf1SHong Zhang   Mat_Product    *product = C->product;
20514222ddf1SHong Zhang   PetscBool      flg = PETSC_FALSE;
20524222ddf1SHong Zhang   PetscInt       alg = 0; /* default algorithm */
20534222ddf1SHong Zhang   const char     *algTypes[3] = {"r*a*rt","r*art","coloring_rart"};
20544222ddf1SHong Zhang   PetscInt        nalg = 3;
20554222ddf1SHong Zhang 
20564222ddf1SHong Zhang   PetscFunctionBegin;
20574222ddf1SHong Zhang   /* Set default algorithm */
20584222ddf1SHong Zhang   ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr);
20594222ddf1SHong Zhang   if (flg) {
20604222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
20614222ddf1SHong Zhang   }
20624222ddf1SHong Zhang 
20634222ddf1SHong Zhang   /* Get runtime option */
20644222ddf1SHong Zhang   if (product->api_user) {
20654222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatRARt","Mat");CHKERRQ(ierr);
20664222ddf1SHong Zhang     ierr = PetscOptionsEList("-matrart_via","Algorithmic approach","MatRARt",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
20674222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
20684222ddf1SHong Zhang   } else {
20694222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr);
20704222ddf1SHong Zhang     ierr = PetscOptionsEList("-matproduct_rart_via","Algorithmic approach","MatProduct_RARt",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
20714222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
20724222ddf1SHong Zhang   }
20734222ddf1SHong Zhang   if (flg) {
20744222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
20754222ddf1SHong Zhang   }
20764222ddf1SHong Zhang 
20774222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ;
20784222ddf1SHong Zhang   PetscFunctionReturn(0);
20794222ddf1SHong Zhang }
20804222ddf1SHong Zhang 
20814222ddf1SHong Zhang /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */
20824222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C)
20834222ddf1SHong Zhang {
20844222ddf1SHong Zhang   PetscErrorCode ierr;
20854222ddf1SHong Zhang   Mat_Product    *product = C->product;
20864222ddf1SHong Zhang   PetscInt       alg = 0; /* default algorithm */
20874222ddf1SHong Zhang   PetscBool      flg = PETSC_FALSE;
20884222ddf1SHong Zhang   const char     *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"};
20894222ddf1SHong Zhang   PetscInt       nalg = 7;
20904222ddf1SHong Zhang 
20914222ddf1SHong Zhang   PetscFunctionBegin;
20924222ddf1SHong Zhang   /* Set default algorithm */
20934222ddf1SHong Zhang   ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr);
20944222ddf1SHong Zhang   if (flg) {
20954222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
20964222ddf1SHong Zhang   }
20974222ddf1SHong Zhang 
20984222ddf1SHong Zhang   /* Get runtime option */
20994222ddf1SHong Zhang   if (product->api_user) {
21004222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr);
21014222ddf1SHong Zhang     ierr = PetscOptionsEList("-matmatmatmult_via","Algorithmic approach","MatMatMatMult",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
21024222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
21034222ddf1SHong Zhang   } else {
21044222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr);
21054222ddf1SHong Zhang     ierr = PetscOptionsEList("-matproduct_abc_via","Algorithmic approach","MatProduct_ABC",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
21064222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
21074222ddf1SHong Zhang   }
21084222ddf1SHong Zhang   if (flg) {
21094222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
21104222ddf1SHong Zhang   }
21114222ddf1SHong Zhang 
21124222ddf1SHong Zhang   C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ;
21134222ddf1SHong Zhang   C->ops->productsymbolic    = MatProductSymbolic_ABC;
21144222ddf1SHong Zhang   PetscFunctionReturn(0);
21154222ddf1SHong Zhang }
21164222ddf1SHong Zhang 
21174222ddf1SHong Zhang PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C)
21184222ddf1SHong Zhang {
21194222ddf1SHong Zhang   PetscErrorCode ierr;
21204222ddf1SHong Zhang   Mat_Product    *product = C->product;
21214222ddf1SHong Zhang 
21224222ddf1SHong Zhang   PetscFunctionBegin;
21234222ddf1SHong Zhang   switch (product->type) {
21244222ddf1SHong Zhang   case MATPRODUCT_AB:
21254222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_AB(C);CHKERRQ(ierr);
21264222ddf1SHong Zhang     break;
21274222ddf1SHong Zhang   case MATPRODUCT_AtB:
21284222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_AtB(C);CHKERRQ(ierr);
21294222ddf1SHong Zhang     break;
21304222ddf1SHong Zhang   case MATPRODUCT_ABt:
21314222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_ABt(C);CHKERRQ(ierr);
21324222ddf1SHong Zhang     break;
21334222ddf1SHong Zhang   case MATPRODUCT_PtAP:
21344222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_PtAP(C);CHKERRQ(ierr);
21354222ddf1SHong Zhang     break;
21364222ddf1SHong Zhang   case MATPRODUCT_RARt:
21374222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_RARt(C);CHKERRQ(ierr);
21384222ddf1SHong Zhang     break;
21394222ddf1SHong Zhang   case MATPRODUCT_ABC:
21404222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_ABC(C);CHKERRQ(ierr);
21414222ddf1SHong Zhang     break;
2142*6718818eSStefano Zampini   default:
2143*6718818eSStefano Zampini     break;
21444222ddf1SHong Zhang   }
2145d013fc79Sandi selinger   PetscFunctionReturn(0);
2146d013fc79Sandi selinger }
2147