xref: /petsc/src/mat/impls/aij/seq/matmatmult.c (revision 2e5835c6908701226356db06d805ef70170dbd55)
1be1d678aSKris Buschelman 
2d50806bdSBarry Smith /*
32499ec78SHong Zhang   Defines matrix-matrix product routines for pairs of SeqAIJ matrices
4d50806bdSBarry Smith           C = A * B
5d50806bdSBarry Smith */
6d50806bdSBarry Smith 
7c6db04a5SJed Brown #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
8c6db04a5SJed Brown #include <../src/mat/utils/freespace.h>
9c6db04a5SJed Brown #include <petscbt.h>
10af0996ceSBarry Smith #include <petsc/private/isimpl.h>
1107475bc1SBarry Smith #include <../src/mat/impls/dense/seq/dense.h>
127bab7c10SHong Zhang 
13df97dc6dSFande Kong PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
14df97dc6dSFande Kong {
15df97dc6dSFande Kong   PetscErrorCode ierr;
16df97dc6dSFande Kong 
17df97dc6dSFande Kong   PetscFunctionBegin;
18df97dc6dSFande Kong   if (C->ops->matmultnumeric) {
196718818eSStefano Zampini     if (C->ops->matmultnumeric == MatMatMultNumeric_SeqAIJ_SeqAIJ) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Recursive call");
20df97dc6dSFande Kong     ierr = (*C->ops->matmultnumeric)(A,B,C);CHKERRQ(ierr);
21df97dc6dSFande Kong   } else {
22df97dc6dSFande Kong     ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A,B,C);CHKERRQ(ierr);
23df97dc6dSFande Kong   }
24df97dc6dSFande Kong   PetscFunctionReturn(0);
25df97dc6dSFande Kong }
26df97dc6dSFande Kong 
274222ddf1SHong Zhang /* Modified from MatCreateSeqAIJWithArrays() */
28e4e71118SStefano Zampini PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt i[],PetscInt j[],PetscScalar a[],MatType mtype,Mat mat)
29df97dc6dSFande Kong {
30df97dc6dSFande Kong   PetscErrorCode ierr;
314222ddf1SHong Zhang   PetscInt       ii;
324222ddf1SHong Zhang   Mat_SeqAIJ     *aij;
33e4e71118SStefano Zampini   PetscBool      isseqaij;
345c66b693SKris Buschelman 
355c66b693SKris Buschelman   PetscFunctionBegin;
364222ddf1SHong Zhang   if (m > 0 && i[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"i (row indices) must start with 0");
374222ddf1SHong Zhang   ierr = MatSetSizes(mat,m,n,m,n);CHKERRQ(ierr);
384222ddf1SHong Zhang 
39e4e71118SStefano Zampini   if (!mtype) {
40e4e71118SStefano Zampini     ierr = PetscObjectBaseTypeCompare((PetscObject)mat,MATSEQAIJ,&isseqaij);CHKERRQ(ierr);
41e4e71118SStefano Zampini     if (!isseqaij) { ierr = MatSetType(mat,MATSEQAIJ);CHKERRQ(ierr); }
42e4e71118SStefano Zampini   } else {
43e4e71118SStefano Zampini     ierr = MatSetType(mat,mtype);CHKERRQ(ierr);
44e4e71118SStefano Zampini   }
45f4259b30SLisandro Dalcin   ierr = MatSeqAIJSetPreallocation_SeqAIJ(mat,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
464222ddf1SHong Zhang   aij  = (Mat_SeqAIJ*)(mat)->data;
474222ddf1SHong Zhang   ierr = PetscMalloc1(m,&aij->imax);CHKERRQ(ierr);
484222ddf1SHong Zhang   ierr = PetscMalloc1(m,&aij->ilen);CHKERRQ(ierr);
494222ddf1SHong Zhang 
504222ddf1SHong Zhang   aij->i            = i;
514222ddf1SHong Zhang   aij->j            = j;
524222ddf1SHong Zhang   aij->a            = a;
534222ddf1SHong Zhang   aij->singlemalloc = PETSC_FALSE;
544222ddf1SHong Zhang   aij->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
554222ddf1SHong Zhang   aij->free_a       = PETSC_FALSE;
564222ddf1SHong Zhang   aij->free_ij      = PETSC_FALSE;
574222ddf1SHong Zhang 
584222ddf1SHong Zhang   for (ii=0; ii<m; ii++) {
594222ddf1SHong Zhang     aij->ilen[ii] = aij->imax[ii] = i[ii+1] - i[ii];
6025023028SHong Zhang   }
614222ddf1SHong Zhang 
625c66b693SKris Buschelman   PetscFunctionReturn(0);
635c66b693SKris Buschelman }
641c24bd37SHong Zhang 
654222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
664222ddf1SHong Zhang {
674222ddf1SHong Zhang   PetscErrorCode      ierr;
684222ddf1SHong Zhang   Mat_Product         *product = C->product;
694222ddf1SHong Zhang   MatProductAlgorithm alg;
704222ddf1SHong Zhang   PetscBool           flg;
714222ddf1SHong Zhang 
724222ddf1SHong Zhang   PetscFunctionBegin;
734222ddf1SHong Zhang   if (product) {
744222ddf1SHong Zhang     alg = product->alg;
754222ddf1SHong Zhang   } else {
764222ddf1SHong Zhang     alg = "sorted";
774222ddf1SHong Zhang   }
784222ddf1SHong Zhang   /* sorted */
794222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"sorted",&flg);CHKERRQ(ierr);
804222ddf1SHong Zhang   if (flg) {
814222ddf1SHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A,B,fill,C);CHKERRQ(ierr);
824222ddf1SHong Zhang     PetscFunctionReturn(0);
834222ddf1SHong Zhang   }
844222ddf1SHong Zhang 
854222ddf1SHong Zhang   /* scalable */
864222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"scalable",&flg);CHKERRQ(ierr);
874222ddf1SHong Zhang   if (flg) {
884222ddf1SHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A,B,fill,C);CHKERRQ(ierr);
894222ddf1SHong Zhang     PetscFunctionReturn(0);
904222ddf1SHong Zhang   }
914222ddf1SHong Zhang 
924222ddf1SHong Zhang   /* scalable_fast */
934222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"scalable_fast",&flg);CHKERRQ(ierr);
944222ddf1SHong Zhang   if (flg) {
954222ddf1SHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A,B,fill,C);CHKERRQ(ierr);
964222ddf1SHong Zhang     PetscFunctionReturn(0);
974222ddf1SHong Zhang   }
984222ddf1SHong Zhang 
994222ddf1SHong Zhang   /* heap */
1004222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"heap",&flg);CHKERRQ(ierr);
1014222ddf1SHong Zhang   if (flg) {
1024222ddf1SHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A,B,fill,C);CHKERRQ(ierr);
1034222ddf1SHong Zhang     PetscFunctionReturn(0);
1044222ddf1SHong Zhang   }
1054222ddf1SHong Zhang 
1064222ddf1SHong Zhang   /* btheap */
1074222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"btheap",&flg);CHKERRQ(ierr);
1084222ddf1SHong Zhang   if (flg) {
1094222ddf1SHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A,B,fill,C);CHKERRQ(ierr);
1104222ddf1SHong Zhang     PetscFunctionReturn(0);
1114222ddf1SHong Zhang   }
1124222ddf1SHong Zhang 
1134222ddf1SHong Zhang   /* llcondensed */
1144222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"llcondensed",&flg);CHKERRQ(ierr);
1154222ddf1SHong Zhang   if (flg) {
1164222ddf1SHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A,B,fill,C);CHKERRQ(ierr);
1174222ddf1SHong Zhang     PetscFunctionReturn(0);
1184222ddf1SHong Zhang   }
1194222ddf1SHong Zhang 
1204222ddf1SHong Zhang   /* rowmerge */
1214222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"rowmerge",&flg);CHKERRQ(ierr);
1224222ddf1SHong Zhang   if (flg) {
1234222ddf1SHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A,B,fill,C);CHKERRQ(ierr);
1244222ddf1SHong Zhang     PetscFunctionReturn(0);
1254222ddf1SHong Zhang   }
1264222ddf1SHong Zhang 
1274222ddf1SHong Zhang #if defined(PETSC_HAVE_HYPRE)
1284222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"hypre",&flg);CHKERRQ(ierr);
1294222ddf1SHong Zhang   if (flg) {
1304222ddf1SHong Zhang     ierr = MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A,B,fill,C);CHKERRQ(ierr);
1314222ddf1SHong Zhang     PetscFunctionReturn(0);
1324222ddf1SHong Zhang   }
1334222ddf1SHong Zhang #endif
1344222ddf1SHong Zhang 
1354222ddf1SHong Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported");
1364222ddf1SHong Zhang }
1374222ddf1SHong Zhang 
1384222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A,Mat B,PetscReal fill,Mat C)
139b561aa9dSHong Zhang {
140b561aa9dSHong Zhang   PetscErrorCode     ierr;
141b561aa9dSHong Zhang   Mat_SeqAIJ         *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
142971236abSHong Zhang   PetscInt           *ai=a->i,*bi=b->i,*ci,*cj;
143c1ba5575SJed Brown   PetscInt           am =A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
144b561aa9dSHong Zhang   PetscReal          afill;
145eca6b86aSHong Zhang   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
146eca6b86aSHong Zhang   PetscTable         ta;
147fb908031SHong Zhang   PetscBT            lnkbt;
1480298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
149b561aa9dSHong Zhang 
150b561aa9dSHong Zhang   PetscFunctionBegin;
151bd958071SHong Zhang   /* Get ci and cj */
152bd958071SHong Zhang   /*---------------*/
153fb908031SHong Zhang   /* Allocate ci array, arrays for fill computation and */
154fb908031SHong Zhang   /* free space for accumulating nonzero column info */
155854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
156fb908031SHong Zhang   ci[0] = 0;
157fb908031SHong Zhang 
158fb908031SHong Zhang   /* create and initialize a linked list */
159c373ccc6SHong Zhang   ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr);
160c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b,bm,ta);
161eca6b86aSHong Zhang   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
162eca6b86aSHong Zhang   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
163eca6b86aSHong Zhang 
164eca6b86aSHong Zhang   ierr = PetscLLCondensedCreate(Crmax,bn,&lnk,&lnkbt);CHKERRQ(ierr);
165fb908031SHong Zhang 
166fb908031SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
167f91af8c7SBarry Smith   ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
1682205254eSKarl Rupp 
169fb908031SHong Zhang   current_space = free_space;
170fb908031SHong Zhang 
171fb908031SHong Zhang   /* Determine ci and cj */
172fb908031SHong Zhang   for (i=0; i<am; i++) {
173fb908031SHong Zhang     anzi = ai[i+1] - ai[i];
174fb908031SHong Zhang     aj   = a->j + ai[i];
175fb908031SHong Zhang     for (j=0; j<anzi; j++) {
176fb908031SHong Zhang       brow = aj[j];
177fb908031SHong Zhang       bnzj = bi[brow+1] - bi[brow];
178fb908031SHong Zhang       bj   = b->j + bi[brow];
179fb908031SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
180fb908031SHong Zhang       ierr = PetscLLCondensedAddSorted(bnzj,bj,lnk,lnkbt);CHKERRQ(ierr);
181fb908031SHong Zhang     }
1828c78258cSHong Zhang     /* add possible missing diagonal entry */
1838c78258cSHong Zhang     if (C->force_diagonals) {
1848c78258cSHong Zhang       ierr = PetscLLCondensedAddSorted(1,&i,lnk,lnkbt);CHKERRQ(ierr);
1858c78258cSHong Zhang     }
186fb908031SHong Zhang     cnzi = lnk[0];
187fb908031SHong Zhang 
188fb908031SHong Zhang     /* If free space is not available, make more free space */
189fb908031SHong Zhang     /* Double the amount of total space in the list */
190fb908031SHong Zhang     if (current_space->local_remaining<cnzi) {
191f91af8c7SBarry Smith       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);CHKERRQ(ierr);
192fb908031SHong Zhang       ndouble++;
193fb908031SHong Zhang     }
194fb908031SHong Zhang 
195fb908031SHong Zhang     /* Copy data into free space, then initialize lnk */
196fb908031SHong Zhang     ierr = PetscLLCondensedClean(bn,cnzi,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
1972205254eSKarl Rupp 
198fb908031SHong Zhang     current_space->array           += cnzi;
199fb908031SHong Zhang     current_space->local_used      += cnzi;
200fb908031SHong Zhang     current_space->local_remaining -= cnzi;
2012205254eSKarl Rupp 
202fb908031SHong Zhang     ci[i+1] = ci[i] + cnzi;
203fb908031SHong Zhang   }
204fb908031SHong Zhang 
205fb908031SHong Zhang   /* Column indices are in the list of free space */
206fb908031SHong Zhang   /* Allocate space for cj, initialize cj, and */
207fb908031SHong Zhang   /* destroy list of free space and other temporary array(s) */
208854ce69bSBarry Smith   ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr);
209fb908031SHong Zhang   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
210fb908031SHong Zhang   ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr);
211b561aa9dSHong Zhang 
21226be0446SHong Zhang   /* put together the new symbolic matrix */
213e4e71118SStefano Zampini   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
2144222ddf1SHong Zhang   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
21558c24d83SHong Zhang 
21658c24d83SHong Zhang   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
21758c24d83SHong Zhang   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
2184222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
219aa1aec99SHong Zhang   c->free_a  = PETSC_FALSE;
220e6b907acSBarry Smith   c->free_ij = PETSC_TRUE;
22158c24d83SHong Zhang   c->nonew   = 0;
2224222ddf1SHong Zhang 
2234222ddf1SHong Zhang   /* fast, needs non-scalable O(bn) array 'abdense' */
2244222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
2250b7e3e3dSHong Zhang 
2267212da7cSHong Zhang   /* set MatInfo */
2277212da7cSHong Zhang   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
228f2b054eeSHong Zhang   if (afill < 1.0) afill = 1.0;
2297212da7cSHong Zhang   c->maxnz                  = ci[am];
2307212da7cSHong Zhang   c->nz                     = ci[am];
2314222ddf1SHong Zhang   C->info.mallocs           = ndouble;
2324222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
2334222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
2347212da7cSHong Zhang 
2357212da7cSHong Zhang #if defined(PETSC_USE_INFO)
2367212da7cSHong Zhang   if (ci[am]) {
2374222ddf1SHong Zhang     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
2384222ddf1SHong Zhang     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
239f2b054eeSHong Zhang   } else {
2404222ddf1SHong Zhang     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
241be0fcf8dSHong Zhang   }
242f2b054eeSHong Zhang #endif
24358c24d83SHong Zhang   PetscFunctionReturn(0);
24458c24d83SHong Zhang }
245d50806bdSBarry Smith 
246df97dc6dSFande Kong PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,Mat C)
247d50806bdSBarry Smith {
248dfbe8321SBarry Smith   PetscErrorCode    ierr;
249d13dce4bSSatish Balay   PetscLogDouble    flops=0.0;
250d50806bdSBarry Smith   Mat_SeqAIJ        *a   = (Mat_SeqAIJ*)A->data;
251d50806bdSBarry Smith   Mat_SeqAIJ        *b   = (Mat_SeqAIJ*)B->data;
252d50806bdSBarry Smith   Mat_SeqAIJ        *c   = (Mat_SeqAIJ*)C->data;
25338baddfdSBarry Smith   PetscInt          *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
254c58ca2e3SHong Zhang   PetscInt          am   =A->rmap->n,cm=C->rmap->n;
255519eb980SHong Zhang   PetscInt          i,j,k,anzi,bnzi,cnzi,brow;
256*2e5835c6SStefano Zampini   PetscScalar       *ca,valtmp;
257aa1aec99SHong Zhang   PetscScalar       *ab_dense;
2586718818eSStefano Zampini   PetscContainer    cab_dense;
259*2e5835c6SStefano Zampini   const PetscScalar *aa,*ba,*baj;
260d50806bdSBarry Smith 
261d50806bdSBarry Smith   PetscFunctionBegin;
262*2e5835c6SStefano Zampini   ierr = MatSeqAIJGetArrayRead(A,&aa);CHKERRQ(ierr);
263*2e5835c6SStefano Zampini   ierr = MatSeqAIJGetArrayRead(B,&ba);CHKERRQ(ierr);
264aa1aec99SHong Zhang   if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
265854ce69bSBarry Smith     ierr      = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
266aa1aec99SHong Zhang     c->a      = ca;
267aa1aec99SHong Zhang     c->free_a = PETSC_TRUE;
2686718818eSStefano Zampini   } else ca = c->a;
2696718818eSStefano Zampini 
2706718818eSStefano Zampini   /* TODO this should be done in the symbolic phase */
2716718818eSStefano Zampini   /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers
2726718818eSStefano Zampini      that is hard to eradicate) */
2736718818eSStefano Zampini   ierr = PetscObjectQuery((PetscObject)C,"__PETSc__ab_dense",(PetscObject*)&cab_dense);CHKERRQ(ierr);
2746718818eSStefano Zampini   if (!cab_dense) {
2756718818eSStefano Zampini     ierr = PetscMalloc1(B->cmap->N,&ab_dense);CHKERRQ(ierr);
2766718818eSStefano Zampini     ierr = PetscContainerCreate(PETSC_COMM_SELF,&cab_dense);CHKERRQ(ierr);
2776718818eSStefano Zampini     ierr = PetscContainerSetPointer(cab_dense,ab_dense);CHKERRQ(ierr);
2786718818eSStefano Zampini     ierr = PetscContainerSetUserDestroy(cab_dense,PetscContainerUserDestroyDefault);CHKERRQ(ierr);
2796718818eSStefano Zampini     ierr = PetscObjectCompose((PetscObject)C,"__PETSc__ab_dense",(PetscObject)cab_dense);CHKERRQ(ierr);
2806718818eSStefano Zampini     ierr = PetscObjectDereference((PetscObject)cab_dense);CHKERRQ(ierr);
281d90d86d0SMatthew G. Knepley   }
2826718818eSStefano Zampini   ierr = PetscContainerGetPointer(cab_dense,(void**)&ab_dense);CHKERRQ(ierr);
2836718818eSStefano Zampini   ierr = PetscArrayzero(ab_dense,B->cmap->N);CHKERRQ(ierr);
284aa1aec99SHong Zhang 
285c124e916SHong Zhang   /* clean old values in C */
286580bdb30SBarry Smith   ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr);
287d50806bdSBarry Smith   /* Traverse A row-wise. */
288d50806bdSBarry Smith   /* Build the ith row in C by summing over nonzero columns in A, */
289d50806bdSBarry Smith   /* the rows of B corresponding to nonzeros of A. */
290d50806bdSBarry Smith   for (i=0; i<am; i++) {
291d50806bdSBarry Smith     anzi = ai[i+1] - ai[i];
292d50806bdSBarry Smith     for (j=0; j<anzi; j++) {
293519eb980SHong Zhang       brow = aj[j];
294d50806bdSBarry Smith       bnzi = bi[brow+1] - bi[brow];
295d50806bdSBarry Smith       bjj  = bj + bi[brow];
296d50806bdSBarry Smith       baj  = ba + bi[brow];
297519eb980SHong Zhang       /* perform dense axpy */
29836ec6d2dSHong Zhang       valtmp = aa[j];
299519eb980SHong Zhang       for (k=0; k<bnzi; k++) {
30036ec6d2dSHong Zhang         ab_dense[bjj[k]] += valtmp*baj[k];
301519eb980SHong Zhang       }
302519eb980SHong Zhang       flops += 2*bnzi;
303519eb980SHong Zhang     }
304c58ca2e3SHong Zhang     aj += anzi; aa += anzi;
305c58ca2e3SHong Zhang 
306c58ca2e3SHong Zhang     cnzi = ci[i+1] - ci[i];
307519eb980SHong Zhang     for (k=0; k<cnzi; k++) {
308519eb980SHong Zhang       ca[k]          += ab_dense[cj[k]];
309519eb980SHong Zhang       ab_dense[cj[k]] = 0.0; /* zero ab_dense */
310519eb980SHong Zhang     }
311519eb980SHong Zhang     flops += cnzi;
312519eb980SHong Zhang     cj    += cnzi; ca += cnzi;
313519eb980SHong Zhang   }
314*2e5835c6SStefano Zampini #if defined(PETSC_HAVE_DEVICE)
315*2e5835c6SStefano Zampini   if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
316*2e5835c6SStefano Zampini #endif
317c58ca2e3SHong Zhang   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
318c58ca2e3SHong Zhang   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
319c58ca2e3SHong Zhang   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
320*2e5835c6SStefano Zampini   ierr = MatSeqAIJRestoreArrayRead(A,&aa);CHKERRQ(ierr);
321*2e5835c6SStefano Zampini   ierr = MatSeqAIJRestoreArrayRead(B,&ba);CHKERRQ(ierr);
322c58ca2e3SHong Zhang   PetscFunctionReturn(0);
323c58ca2e3SHong Zhang }
324c58ca2e3SHong Zhang 
32525023028SHong Zhang PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C)
326c58ca2e3SHong Zhang {
327c58ca2e3SHong Zhang   PetscErrorCode    ierr;
328c58ca2e3SHong Zhang   PetscLogDouble    flops=0.0;
329c58ca2e3SHong Zhang   Mat_SeqAIJ        *a   = (Mat_SeqAIJ*)A->data;
330c58ca2e3SHong Zhang   Mat_SeqAIJ        *b   = (Mat_SeqAIJ*)B->data;
331c58ca2e3SHong Zhang   Mat_SeqAIJ        *c   = (Mat_SeqAIJ*)C->data;
332c58ca2e3SHong Zhang   PetscInt          *ai  = a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
333c58ca2e3SHong Zhang   PetscInt          am   = A->rmap->N,cm=C->rmap->N;
334c58ca2e3SHong Zhang   PetscInt          i,j,k,anzi,bnzi,cnzi,brow;
335*2e5835c6SStefano Zampini   PetscScalar       *ca=c->a,valtmp;
336*2e5835c6SStefano Zampini   const PetscScalar *aa,*ba,*baj;
337c58ca2e3SHong Zhang   PetscInt          nextb;
338c58ca2e3SHong Zhang 
339c58ca2e3SHong Zhang   PetscFunctionBegin;
340*2e5835c6SStefano Zampini   ierr = MatSeqAIJGetArrayRead(A,&aa);CHKERRQ(ierr);
341*2e5835c6SStefano Zampini   ierr = MatSeqAIJGetArrayRead(B,&ba);CHKERRQ(ierr);
342cf742fccSHong Zhang   if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
343cf742fccSHong Zhang     ierr      = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
344cf742fccSHong Zhang     c->a      = ca;
345cf742fccSHong Zhang     c->free_a = PETSC_TRUE;
346cf742fccSHong Zhang   }
347cf742fccSHong Zhang 
348c58ca2e3SHong Zhang   /* clean old values in C */
349580bdb30SBarry Smith   ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr);
350c58ca2e3SHong Zhang   /* Traverse A row-wise. */
351c58ca2e3SHong Zhang   /* Build the ith row in C by summing over nonzero columns in A, */
352c58ca2e3SHong Zhang   /* the rows of B corresponding to nonzeros of A. */
353519eb980SHong Zhang   for (i=0; i<am; i++) {
354519eb980SHong Zhang     anzi = ai[i+1] - ai[i];
355519eb980SHong Zhang     cnzi = ci[i+1] - ci[i];
356519eb980SHong Zhang     for (j=0; j<anzi; j++) {
357519eb980SHong Zhang       brow = aj[j];
358519eb980SHong Zhang       bnzi = bi[brow+1] - bi[brow];
359519eb980SHong Zhang       bjj  = bj + bi[brow];
360519eb980SHong Zhang       baj  = ba + bi[brow];
361519eb980SHong Zhang       /* perform sparse axpy */
36236ec6d2dSHong Zhang       valtmp = aa[j];
363c124e916SHong Zhang       nextb  = 0;
364c124e916SHong Zhang       for (k=0; nextb<bnzi; k++) {
365c124e916SHong Zhang         if (cj[k] == bjj[nextb]) { /* ccol == bcol */
36636ec6d2dSHong Zhang           ca[k] += valtmp*baj[nextb++];
367c124e916SHong Zhang         }
368d50806bdSBarry Smith       }
369d50806bdSBarry Smith       flops += 2*bnzi;
370d50806bdSBarry Smith     }
371519eb980SHong Zhang     aj += anzi; aa += anzi;
372519eb980SHong Zhang     cj += cnzi; ca += cnzi;
373d50806bdSBarry Smith   }
374*2e5835c6SStefano Zampini #if defined(PETSC_HAVE_DEVICE)
375*2e5835c6SStefano Zampini   if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
376*2e5835c6SStefano Zampini #endif
377716bacf3SKris Buschelman   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
378716bacf3SKris Buschelman   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
379d50806bdSBarry Smith   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
380*2e5835c6SStefano Zampini   ierr = MatSeqAIJRestoreArrayRead(A,&aa);CHKERRQ(ierr);
381*2e5835c6SStefano Zampini   ierr = MatSeqAIJRestoreArrayRead(B,&ba);CHKERRQ(ierr);
382d50806bdSBarry Smith   PetscFunctionReturn(0);
383d50806bdSBarry Smith }
384bc011b1eSHong Zhang 
3854222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A,Mat B,PetscReal fill,Mat C)
38625296bd5SBarry Smith {
38725296bd5SBarry Smith   PetscErrorCode     ierr;
38825296bd5SBarry Smith   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
38925296bd5SBarry Smith   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
3903c50cad2SHong Zhang   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
39125296bd5SBarry Smith   MatScalar          *ca;
39225296bd5SBarry Smith   PetscReal          afill;
393eca6b86aSHong Zhang   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
394eca6b86aSHong Zhang   PetscTable         ta;
3950298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
39625296bd5SBarry Smith 
39725296bd5SBarry Smith   PetscFunctionBegin;
3983c50cad2SHong Zhang   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
3993c50cad2SHong Zhang   /*-----------------------------------------------------------------------------------------*/
4003c50cad2SHong Zhang   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
401854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
4023c50cad2SHong Zhang   ci[0] = 0;
4033c50cad2SHong Zhang 
4043c50cad2SHong Zhang   /* create and initialize a linked list */
405c373ccc6SHong Zhang   ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr);
406c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b,bm,ta);
407eca6b86aSHong Zhang   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
408eca6b86aSHong Zhang   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
409eca6b86aSHong Zhang 
410eca6b86aSHong Zhang   ierr = PetscLLCondensedCreate_fast(Crmax,&lnk);CHKERRQ(ierr);
4113c50cad2SHong Zhang 
4123c50cad2SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
413f91af8c7SBarry Smith   ierr          = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
4143c50cad2SHong Zhang   current_space = free_space;
4153c50cad2SHong Zhang 
4163c50cad2SHong Zhang   /* Determine ci and cj */
4173c50cad2SHong Zhang   for (i=0; i<am; i++) {
4183c50cad2SHong Zhang     anzi = ai[i+1] - ai[i];
4193c50cad2SHong Zhang     aj   = a->j + ai[i];
4203c50cad2SHong Zhang     for (j=0; j<anzi; j++) {
4213c50cad2SHong Zhang       brow = aj[j];
4223c50cad2SHong Zhang       bnzj = bi[brow+1] - bi[brow];
4233c50cad2SHong Zhang       bj   = b->j + bi[brow];
4243c50cad2SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
4253c50cad2SHong Zhang       ierr = PetscLLCondensedAddSorted_fast(bnzj,bj,lnk);CHKERRQ(ierr);
4263c50cad2SHong Zhang     }
4278c78258cSHong Zhang     /* add possible missing diagonal entry */
4288c78258cSHong Zhang     if (C->force_diagonals) {
4298c78258cSHong Zhang       ierr = PetscLLCondensedAddSorted_fast(1,&i,lnk);CHKERRQ(ierr);
4308c78258cSHong Zhang     }
4313c50cad2SHong Zhang     cnzi = lnk[1];
4323c50cad2SHong Zhang 
4333c50cad2SHong Zhang     /* If free space is not available, make more free space */
4343c50cad2SHong Zhang     /* Double the amount of total space in the list */
4353c50cad2SHong Zhang     if (current_space->local_remaining<cnzi) {
436f91af8c7SBarry Smith       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);CHKERRQ(ierr);
4373c50cad2SHong Zhang       ndouble++;
4383c50cad2SHong Zhang     }
4393c50cad2SHong Zhang 
4403c50cad2SHong Zhang     /* Copy data into free space, then initialize lnk */
4413c50cad2SHong Zhang     ierr = PetscLLCondensedClean_fast(cnzi,current_space->array,lnk);CHKERRQ(ierr);
4422205254eSKarl Rupp 
4433c50cad2SHong Zhang     current_space->array           += cnzi;
4443c50cad2SHong Zhang     current_space->local_used      += cnzi;
4453c50cad2SHong Zhang     current_space->local_remaining -= cnzi;
4462205254eSKarl Rupp 
4473c50cad2SHong Zhang     ci[i+1] = ci[i] + cnzi;
4483c50cad2SHong Zhang   }
4493c50cad2SHong Zhang 
4503c50cad2SHong Zhang   /* Column indices are in the list of free space */
4513c50cad2SHong Zhang   /* Allocate space for cj, initialize cj, and */
4523c50cad2SHong Zhang   /* destroy list of free space and other temporary array(s) */
453854ce69bSBarry Smith   ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr);
4543c50cad2SHong Zhang   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
4553c50cad2SHong Zhang   ierr = PetscLLCondensedDestroy_fast(lnk);CHKERRQ(ierr);
45625296bd5SBarry Smith 
45725296bd5SBarry Smith   /* Allocate space for ca */
458580bdb30SBarry Smith   ierr = PetscCalloc1(ci[am]+1,&ca);CHKERRQ(ierr);
45925296bd5SBarry Smith 
46025296bd5SBarry Smith   /* put together the new symbolic matrix */
461e4e71118SStefano Zampini   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
4624222ddf1SHong Zhang   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
46325296bd5SBarry Smith 
46425296bd5SBarry Smith   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
46525296bd5SBarry Smith   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
4664222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
46725296bd5SBarry Smith   c->free_a  = PETSC_TRUE;
46825296bd5SBarry Smith   c->free_ij = PETSC_TRUE;
46925296bd5SBarry Smith   c->nonew   = 0;
4702205254eSKarl Rupp 
4714222ddf1SHong Zhang   /* slower, less memory */
4724222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
47325296bd5SBarry Smith 
47425296bd5SBarry Smith   /* set MatInfo */
47525296bd5SBarry Smith   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
47625296bd5SBarry Smith   if (afill < 1.0) afill = 1.0;
47725296bd5SBarry Smith   c->maxnz                     = ci[am];
47825296bd5SBarry Smith   c->nz                        = ci[am];
4794222ddf1SHong Zhang   C->info.mallocs           = ndouble;
4804222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
4814222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
48225296bd5SBarry Smith 
48325296bd5SBarry Smith #if defined(PETSC_USE_INFO)
48425296bd5SBarry Smith   if (ci[am]) {
4854222ddf1SHong Zhang     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
4864222ddf1SHong Zhang     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
48725296bd5SBarry Smith   } else {
4884222ddf1SHong Zhang     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
48925296bd5SBarry Smith   }
49025296bd5SBarry Smith #endif
49125296bd5SBarry Smith   PetscFunctionReturn(0);
49225296bd5SBarry Smith }
49325296bd5SBarry Smith 
4944222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat C)
495e9e4536cSHong Zhang {
496e9e4536cSHong Zhang   PetscErrorCode     ierr;
497e9e4536cSHong Zhang   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
498bf9555e6SHong Zhang   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
49925c41797SHong Zhang   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
500e9e4536cSHong Zhang   MatScalar          *ca;
501e9e4536cSHong Zhang   PetscReal          afill;
502eca6b86aSHong Zhang   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
503eca6b86aSHong Zhang   PetscTable         ta;
5040298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
505e9e4536cSHong Zhang 
506e9e4536cSHong Zhang   PetscFunctionBegin;
507bd958071SHong Zhang   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
508bd958071SHong Zhang   /*---------------------------------------------------------------------------------------------*/
509bd958071SHong Zhang   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
510854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
511bd958071SHong Zhang   ci[0] = 0;
512bd958071SHong Zhang 
513bd958071SHong Zhang   /* create and initialize a linked list */
514c373ccc6SHong Zhang   ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr);
515c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b,bm,ta);
516eca6b86aSHong Zhang   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
517eca6b86aSHong Zhang   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
518eca6b86aSHong Zhang   ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);
519bd958071SHong Zhang 
520bd958071SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
521f91af8c7SBarry Smith   ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
522bd958071SHong Zhang   current_space = free_space;
523bd958071SHong Zhang 
524bd958071SHong Zhang   /* Determine ci and cj */
525bd958071SHong Zhang   for (i=0; i<am; i++) {
526bd958071SHong Zhang     anzi = ai[i+1] - ai[i];
527bd958071SHong Zhang     aj   = a->j + ai[i];
528bd958071SHong Zhang     for (j=0; j<anzi; j++) {
529bd958071SHong Zhang       brow = aj[j];
530bd958071SHong Zhang       bnzj = bi[brow+1] - bi[brow];
531bd958071SHong Zhang       bj   = b->j + bi[brow];
532bd958071SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
533bd958071SHong Zhang       ierr = PetscLLCondensedAddSorted_Scalable(bnzj,bj,lnk);CHKERRQ(ierr);
534bd958071SHong Zhang     }
5358c78258cSHong Zhang     /* add possible missing diagonal entry */
5368c78258cSHong Zhang     if (C->force_diagonals) {
5378c78258cSHong Zhang       ierr = PetscLLCondensedAddSorted_Scalable(1,&i,lnk);CHKERRQ(ierr);
5388c78258cSHong Zhang     }
5398c78258cSHong Zhang 
540bd958071SHong Zhang     cnzi = lnk[0];
541bd958071SHong Zhang 
542bd958071SHong Zhang     /* If free space is not available, make more free space */
543bd958071SHong Zhang     /* Double the amount of total space in the list */
544bd958071SHong Zhang     if (current_space->local_remaining<cnzi) {
545f91af8c7SBarry Smith       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);CHKERRQ(ierr);
546bd958071SHong Zhang       ndouble++;
547bd958071SHong Zhang     }
548bd958071SHong Zhang 
549bd958071SHong Zhang     /* Copy data into free space, then initialize lnk */
550bd958071SHong Zhang     ierr = PetscLLCondensedClean_Scalable(cnzi,current_space->array,lnk);CHKERRQ(ierr);
5512205254eSKarl Rupp 
552bd958071SHong Zhang     current_space->array           += cnzi;
553bd958071SHong Zhang     current_space->local_used      += cnzi;
554bd958071SHong Zhang     current_space->local_remaining -= cnzi;
5552205254eSKarl Rupp 
556bd958071SHong Zhang     ci[i+1] = ci[i] + cnzi;
557bd958071SHong Zhang   }
558bd958071SHong Zhang 
559bd958071SHong Zhang   /* Column indices are in the list of free space */
560bd958071SHong Zhang   /* Allocate space for cj, initialize cj, and */
561bd958071SHong Zhang   /* destroy list of free space and other temporary array(s) */
562854ce69bSBarry Smith   ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr);
563bd958071SHong Zhang   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
564bd958071SHong Zhang   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
565e9e4536cSHong Zhang 
566e9e4536cSHong Zhang   /* Allocate space for ca */
567bd958071SHong Zhang   /*-----------------------*/
568580bdb30SBarry Smith   ierr = PetscCalloc1(ci[am]+1,&ca);CHKERRQ(ierr);
569e9e4536cSHong Zhang 
570e9e4536cSHong Zhang   /* put together the new symbolic matrix */
571e4e71118SStefano Zampini   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
5724222ddf1SHong Zhang   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
573e9e4536cSHong Zhang 
574e9e4536cSHong Zhang   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
575e9e4536cSHong Zhang   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
5764222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
577e9e4536cSHong Zhang   c->free_a  = PETSC_TRUE;
578e9e4536cSHong Zhang   c->free_ij = PETSC_TRUE;
579e9e4536cSHong Zhang   c->nonew   = 0;
5802205254eSKarl Rupp 
5814222ddf1SHong Zhang   /* slower, less memory */
5824222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
583e9e4536cSHong Zhang 
584e9e4536cSHong Zhang   /* set MatInfo */
585e9e4536cSHong Zhang   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
586e9e4536cSHong Zhang   if (afill < 1.0) afill = 1.0;
587e9e4536cSHong Zhang   c->maxnz                     = ci[am];
588e9e4536cSHong Zhang   c->nz                        = ci[am];
5894222ddf1SHong Zhang   C->info.mallocs           = ndouble;
5904222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
5914222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
592e9e4536cSHong Zhang 
593e9e4536cSHong Zhang #if defined(PETSC_USE_INFO)
594e9e4536cSHong Zhang   if (ci[am]) {
5954222ddf1SHong Zhang     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
5964222ddf1SHong Zhang     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
597e9e4536cSHong Zhang   } else {
5984222ddf1SHong Zhang     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
599e9e4536cSHong Zhang   }
600e9e4536cSHong Zhang #endif
601e9e4536cSHong Zhang   PetscFunctionReturn(0);
602e9e4536cSHong Zhang }
603e9e4536cSHong Zhang 
6044222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A,Mat B,PetscReal fill,Mat C)
6050ced3a2bSJed Brown {
6060ced3a2bSJed Brown   PetscErrorCode     ierr;
6070ced3a2bSJed Brown   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
6080ced3a2bSJed Brown   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j;
6090ced3a2bSJed Brown   PetscInt           *ci,*cj,*bb;
6100ced3a2bSJed Brown   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
6110ced3a2bSJed Brown   PetscReal          afill;
6120ced3a2bSJed Brown   PetscInt           i,j,col,ndouble = 0;
6130298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
6140ced3a2bSJed Brown   PetscHeap          h;
6150ced3a2bSJed Brown 
6160ced3a2bSJed Brown   PetscFunctionBegin;
617cd093f1dSJed Brown   /* Get ci and cj - by merging sorted rows using a heap */
6180ced3a2bSJed Brown   /*---------------------------------------------------------------------------------------------*/
6190ced3a2bSJed Brown   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
620854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
6210ced3a2bSJed Brown   ci[0] = 0;
6220ced3a2bSJed Brown 
6230ced3a2bSJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
624f91af8c7SBarry Smith   ierr          = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
6250ced3a2bSJed Brown   current_space = free_space;
6260ced3a2bSJed Brown 
6270ced3a2bSJed Brown   ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr);
628785e854fSJed Brown   ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr);
6290ced3a2bSJed Brown 
6300ced3a2bSJed Brown   /* Determine ci and cj */
6310ced3a2bSJed Brown   for (i=0; i<am; i++) {
6320ced3a2bSJed Brown     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
6330ced3a2bSJed Brown     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
6340ced3a2bSJed Brown     ci[i+1] = ci[i];
6350ced3a2bSJed Brown     /* Populate the min heap */
6360ced3a2bSJed Brown     for (j=0; j<anzi; j++) {
6370ced3a2bSJed Brown       bb[j] = bi[acol[j]];         /* bb points at the start of the row */
6380ced3a2bSJed Brown       if (bb[j] < bi[acol[j]+1]) { /* Add if row is nonempty */
6390ced3a2bSJed Brown         ierr = PetscHeapAdd(h,j,bj[bb[j]++]);CHKERRQ(ierr);
6400ced3a2bSJed Brown       }
6410ced3a2bSJed Brown     }
6420ced3a2bSJed Brown     /* Pick off the min element, adding it to free space */
6430ced3a2bSJed Brown     ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
6440ced3a2bSJed Brown     while (j >= 0) {
6450ced3a2bSJed Brown       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
646f91af8c7SBarry Smith         ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space);CHKERRQ(ierr);
6470ced3a2bSJed Brown         ndouble++;
6480ced3a2bSJed Brown       }
6490ced3a2bSJed Brown       *(current_space->array++) = col;
6500ced3a2bSJed Brown       current_space->local_used++;
6510ced3a2bSJed Brown       current_space->local_remaining--;
6520ced3a2bSJed Brown       ci[i+1]++;
6530ced3a2bSJed Brown 
6540ced3a2bSJed Brown       /* stash if anything else remains in this row of B */
6550ced3a2bSJed Brown       if (bb[j] < bi[acol[j]+1]) {ierr = PetscHeapStash(h,j,bj[bb[j]++]);CHKERRQ(ierr);}
6560ced3a2bSJed Brown       while (1) {               /* pop and stash any other rows of B that also had an entry in this column */
6570ced3a2bSJed Brown         PetscInt j2,col2;
6580ced3a2bSJed Brown         ierr = PetscHeapPeek(h,&j2,&col2);CHKERRQ(ierr);
6590ced3a2bSJed Brown         if (col2 != col) break;
6600ced3a2bSJed Brown         ierr = PetscHeapPop(h,&j2,&col2);CHKERRQ(ierr);
6610ced3a2bSJed Brown         if (bb[j2] < bi[acol[j2]+1]) {ierr = PetscHeapStash(h,j2,bj[bb[j2]++]);CHKERRQ(ierr);}
6620ced3a2bSJed Brown       }
6630ced3a2bSJed Brown       /* Put any stashed elements back into the min heap */
6640ced3a2bSJed Brown       ierr = PetscHeapUnstash(h);CHKERRQ(ierr);
6650ced3a2bSJed Brown       ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
6660ced3a2bSJed Brown     }
6670ced3a2bSJed Brown   }
6680ced3a2bSJed Brown   ierr = PetscFree(bb);CHKERRQ(ierr);
6690ced3a2bSJed Brown   ierr = PetscHeapDestroy(&h);CHKERRQ(ierr);
6700ced3a2bSJed Brown 
6710ced3a2bSJed Brown   /* Column indices are in the list of free space */
6720ced3a2bSJed Brown   /* Allocate space for cj, initialize cj, and */
6730ced3a2bSJed Brown   /* destroy list of free space and other temporary array(s) */
674785e854fSJed Brown   ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr);
6750ced3a2bSJed Brown   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
6760ced3a2bSJed Brown 
6770ced3a2bSJed Brown   /* put together the new symbolic matrix */
678e4e71118SStefano Zampini   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
6794222ddf1SHong Zhang   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
6800ced3a2bSJed Brown 
6810ced3a2bSJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
6820ced3a2bSJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
6834222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
6840ced3a2bSJed Brown   c->free_a  = PETSC_TRUE;
6850ced3a2bSJed Brown   c->free_ij = PETSC_TRUE;
6860ced3a2bSJed Brown   c->nonew   = 0;
68726fbe8dcSKarl Rupp 
6884222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
6890ced3a2bSJed Brown 
6900ced3a2bSJed Brown   /* set MatInfo */
6910ced3a2bSJed Brown   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
6920ced3a2bSJed Brown   if (afill < 1.0) afill = 1.0;
6930ced3a2bSJed Brown   c->maxnz                     = ci[am];
6940ced3a2bSJed Brown   c->nz                        = ci[am];
6954222ddf1SHong Zhang   C->info.mallocs           = ndouble;
6964222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
6974222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
6980ced3a2bSJed Brown 
6990ced3a2bSJed Brown #if defined(PETSC_USE_INFO)
7000ced3a2bSJed Brown   if (ci[am]) {
7014222ddf1SHong Zhang     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
7024222ddf1SHong Zhang     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
7030ced3a2bSJed Brown   } else {
7044222ddf1SHong Zhang     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
7050ced3a2bSJed Brown   }
7060ced3a2bSJed Brown #endif
7070ced3a2bSJed Brown   PetscFunctionReturn(0);
7080ced3a2bSJed Brown }
709e9e4536cSHong Zhang 
7104222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A,Mat B,PetscReal fill,Mat C)
7118a07c6f1SJed Brown {
7128a07c6f1SJed Brown   PetscErrorCode     ierr;
7138a07c6f1SJed Brown   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
7148a07c6f1SJed Brown   const PetscInt     *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
7158a07c6f1SJed Brown   PetscInt           *ci,*cj,*bb;
7168a07c6f1SJed Brown   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
7178a07c6f1SJed Brown   PetscReal          afill;
7188a07c6f1SJed Brown   PetscInt           i,j,col,ndouble = 0;
7190298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
7208a07c6f1SJed Brown   PetscHeap          h;
7218a07c6f1SJed Brown   PetscBT            bt;
7228a07c6f1SJed Brown 
7238a07c6f1SJed Brown   PetscFunctionBegin;
724cd093f1dSJed Brown   /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */
7258a07c6f1SJed Brown   /*---------------------------------------------------------------------------------------------*/
7268a07c6f1SJed Brown   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
727854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
7288a07c6f1SJed Brown   ci[0] = 0;
7298a07c6f1SJed Brown 
7308a07c6f1SJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
731f91af8c7SBarry Smith   ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
7322205254eSKarl Rupp 
7338a07c6f1SJed Brown   current_space = free_space;
7348a07c6f1SJed Brown 
7358a07c6f1SJed Brown   ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr);
736785e854fSJed Brown   ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr);
7378a07c6f1SJed Brown   ierr = PetscBTCreate(bn,&bt);CHKERRQ(ierr);
7388a07c6f1SJed Brown 
7398a07c6f1SJed Brown   /* Determine ci and cj */
7408a07c6f1SJed Brown   for (i=0; i<am; i++) {
7418a07c6f1SJed Brown     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
7428a07c6f1SJed Brown     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
7438a07c6f1SJed Brown     const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */
7448a07c6f1SJed Brown     ci[i+1] = ci[i];
7458a07c6f1SJed Brown     /* Populate the min heap */
7468a07c6f1SJed Brown     for (j=0; j<anzi; j++) {
7478a07c6f1SJed Brown       PetscInt brow = acol[j];
7488a07c6f1SJed Brown       for (bb[j] = bi[brow]; bb[j] < bi[brow+1]; bb[j]++) {
7498a07c6f1SJed Brown         PetscInt bcol = bj[bb[j]];
7508a07c6f1SJed Brown         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
7518a07c6f1SJed Brown           ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr);
7528a07c6f1SJed Brown           bb[j]++;
7538a07c6f1SJed Brown           break;
7548a07c6f1SJed Brown         }
7558a07c6f1SJed Brown       }
7568a07c6f1SJed Brown     }
7578a07c6f1SJed Brown     /* Pick off the min element, adding it to free space */
7588a07c6f1SJed Brown     ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
7598a07c6f1SJed Brown     while (j >= 0) {
7608a07c6f1SJed Brown       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
7610298fd71SBarry Smith         fptr = NULL;                      /* need PetscBTMemzero */
762f91af8c7SBarry Smith         ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space);CHKERRQ(ierr);
7638a07c6f1SJed Brown         ndouble++;
7648a07c6f1SJed Brown       }
7658a07c6f1SJed Brown       *(current_space->array++) = col;
7668a07c6f1SJed Brown       current_space->local_used++;
7678a07c6f1SJed Brown       current_space->local_remaining--;
7688a07c6f1SJed Brown       ci[i+1]++;
7698a07c6f1SJed Brown 
7708a07c6f1SJed Brown       /* stash if anything else remains in this row of B */
7718a07c6f1SJed Brown       for (; bb[j] < bi[acol[j]+1]; bb[j]++) {
7728a07c6f1SJed Brown         PetscInt bcol = bj[bb[j]];
7738a07c6f1SJed Brown         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
7748a07c6f1SJed Brown           ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr);
7758a07c6f1SJed Brown           bb[j]++;
7768a07c6f1SJed Brown           break;
7778a07c6f1SJed Brown         }
7788a07c6f1SJed Brown       }
7798a07c6f1SJed Brown       ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
7808a07c6f1SJed Brown     }
7818a07c6f1SJed Brown     if (fptr) {                 /* Clear the bits for this row */
7828a07c6f1SJed Brown       for (; fptr<current_space->array; fptr++) {ierr = PetscBTClear(bt,*fptr);CHKERRQ(ierr);}
7838a07c6f1SJed Brown     } else {                    /* We reallocated so we don't remember (easily) how to clear only the bits we changed */
7848a07c6f1SJed Brown       ierr = PetscBTMemzero(bn,bt);CHKERRQ(ierr);
7858a07c6f1SJed Brown     }
7868a07c6f1SJed Brown   }
7878a07c6f1SJed Brown   ierr = PetscFree(bb);CHKERRQ(ierr);
7888a07c6f1SJed Brown   ierr = PetscHeapDestroy(&h);CHKERRQ(ierr);
7898a07c6f1SJed Brown   ierr = PetscBTDestroy(&bt);CHKERRQ(ierr);
7908a07c6f1SJed Brown 
7918a07c6f1SJed Brown   /* Column indices are in the list of free space */
7928a07c6f1SJed Brown   /* Allocate space for cj, initialize cj, and */
7938a07c6f1SJed Brown   /* destroy list of free space and other temporary array(s) */
794785e854fSJed Brown   ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr);
7958a07c6f1SJed Brown   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
7968a07c6f1SJed Brown 
7978a07c6f1SJed Brown   /* put together the new symbolic matrix */
798e4e71118SStefano Zampini   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
7994222ddf1SHong Zhang   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
8008a07c6f1SJed Brown 
8018a07c6f1SJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
8028a07c6f1SJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
8034222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
8048a07c6f1SJed Brown   c->free_a  = PETSC_TRUE;
8058a07c6f1SJed Brown   c->free_ij = PETSC_TRUE;
8068a07c6f1SJed Brown   c->nonew   = 0;
80726fbe8dcSKarl Rupp 
8084222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
8098a07c6f1SJed Brown 
8108a07c6f1SJed Brown   /* set MatInfo */
8118a07c6f1SJed Brown   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
8128a07c6f1SJed Brown   if (afill < 1.0) afill = 1.0;
8138a07c6f1SJed Brown   c->maxnz                     = ci[am];
8148a07c6f1SJed Brown   c->nz                        = ci[am];
8154222ddf1SHong Zhang   C->info.mallocs           = ndouble;
8164222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
8174222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
8188a07c6f1SJed Brown 
8198a07c6f1SJed Brown #if defined(PETSC_USE_INFO)
8208a07c6f1SJed Brown   if (ci[am]) {
8214222ddf1SHong Zhang     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
8224222ddf1SHong Zhang     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
8238a07c6f1SJed Brown   } else {
8244222ddf1SHong Zhang     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
8258a07c6f1SJed Brown   }
8268a07c6f1SJed Brown #endif
8278a07c6f1SJed Brown   PetscFunctionReturn(0);
8288a07c6f1SJed Brown }
8298a07c6f1SJed Brown 
830d7ed1a4dSandi selinger 
8314222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A,Mat B,PetscReal fill,Mat C)
832d7ed1a4dSandi selinger {
833d7ed1a4dSandi selinger   PetscErrorCode     ierr;
834d7ed1a4dSandi selinger   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
835d7ed1a4dSandi selinger   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j,*inputi,*inputj,*inputcol,*inputcol_L1;
836d7ed1a4dSandi selinger   PetscInt           *ci,*cj,*outputj,worki_L1[9],worki_L2[9];
837d7ed1a4dSandi selinger   PetscInt           c_maxmem,a_maxrownnz=0,a_rownnz;
838d7ed1a4dSandi selinger   const PetscInt     workcol[8]={0,1,2,3,4,5,6,7};
839d7ed1a4dSandi selinger   const PetscInt     am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
840d7ed1a4dSandi selinger   const PetscInt     *brow_ptr[8],*brow_end[8];
841d7ed1a4dSandi selinger   PetscInt           window[8];
842d7ed1a4dSandi selinger   PetscInt           window_min,old_window_min,ci_nnz,outputi_nnz=0,L1_nrows,L2_nrows;
843d7ed1a4dSandi selinger   PetscInt           i,k,ndouble=0,L1_rowsleft,rowsleft;
844d7ed1a4dSandi selinger   PetscReal          afill;
845f83700f2Sandi selinger   PetscInt           *workj_L1,*workj_L2,*workj_L3;
8467660c4dbSandi selinger   PetscInt           L1_nnz,L2_nnz;
847d7ed1a4dSandi selinger 
848d7ed1a4dSandi selinger   /* Step 1: Get upper bound on memory required for allocation.
849d7ed1a4dSandi selinger              Because of the way virtual memory works,
850d7ed1a4dSandi selinger              only the memory pages that are actually needed will be physically allocated. */
851d7ed1a4dSandi selinger   PetscFunctionBegin;
852d7ed1a4dSandi selinger   ierr = PetscMalloc1(am+1,&ci);CHKERRQ(ierr);
853d7ed1a4dSandi selinger   for (i=0; i<am; i++) {
854d7ed1a4dSandi selinger     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
855d7ed1a4dSandi selinger     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
856d7ed1a4dSandi selinger     a_rownnz = 0;
857d7ed1a4dSandi selinger     for (k=0; k<anzi; ++k) {
858d7ed1a4dSandi selinger       a_rownnz += bi[acol[k]+1] - bi[acol[k]];
859d7ed1a4dSandi selinger       if (a_rownnz > bn) {
860d7ed1a4dSandi selinger         a_rownnz = bn;
861d7ed1a4dSandi selinger         break;
862d7ed1a4dSandi selinger       }
863d7ed1a4dSandi selinger     }
864d7ed1a4dSandi selinger     a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz);
865d7ed1a4dSandi selinger   }
866d7ed1a4dSandi selinger   /* temporary work areas for merging rows */
867d7ed1a4dSandi selinger   ierr = PetscMalloc1(a_maxrownnz*8,&workj_L1);CHKERRQ(ierr);
868f83700f2Sandi selinger   ierr = PetscMalloc1(a_maxrownnz*8,&workj_L2);CHKERRQ(ierr);
869f83700f2Sandi selinger   ierr = PetscMalloc1(a_maxrownnz,&workj_L3);CHKERRQ(ierr);
870d7ed1a4dSandi selinger 
8717660c4dbSandi selinger   /* This should be enough for almost all matrices. If not, memory is reallocated later. */
8727660c4dbSandi selinger   c_maxmem = 8*(ai[am]+bi[bm]);
873d7ed1a4dSandi selinger   /* Step 2: Populate pattern for C */
874d7ed1a4dSandi selinger   ierr  = PetscMalloc1(c_maxmem,&cj);CHKERRQ(ierr);
875d7ed1a4dSandi selinger 
876d7ed1a4dSandi selinger   ci_nnz       = 0;
877d7ed1a4dSandi selinger   ci[0]        = 0;
878d7ed1a4dSandi selinger   worki_L1[0]  = 0;
879d7ed1a4dSandi selinger   worki_L2[0]  = 0;
880d7ed1a4dSandi selinger   for (i=0; i<am; i++) {
881d7ed1a4dSandi selinger     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
882d7ed1a4dSandi selinger     const PetscInt *acol = aj + ai[i];      /* column indices of nonzero entries in this row */
883d7ed1a4dSandi selinger     rowsleft             = anzi;
884d7ed1a4dSandi selinger     inputcol_L1          = acol;
885d7ed1a4dSandi selinger     L2_nnz               = 0;
8867660c4dbSandi selinger     L2_nrows             = 1;  /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1   */
8877660c4dbSandi selinger     worki_L2[1]          = 0;
88808fe1b3cSKarl Rupp     outputi_nnz          = 0;
889d7ed1a4dSandi selinger 
890d7ed1a4dSandi selinger     /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem  -> allocate more memory */
891d7ed1a4dSandi selinger     while (ci_nnz+a_maxrownnz > c_maxmem) {
892d7ed1a4dSandi selinger       c_maxmem *= 2;
893d7ed1a4dSandi selinger       ndouble++;
894d7ed1a4dSandi selinger       ierr = PetscRealloc(sizeof(PetscInt)*c_maxmem,&cj);CHKERRQ(ierr);
895d7ed1a4dSandi selinger     }
896d7ed1a4dSandi selinger 
897d7ed1a4dSandi selinger     while (rowsleft) {
898d7ed1a4dSandi selinger       L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */
899d7ed1a4dSandi selinger       L1_nrows    = 0;
9007660c4dbSandi selinger       L1_nnz      = 0;
901d7ed1a4dSandi selinger       inputcol    = inputcol_L1;
902d7ed1a4dSandi selinger       inputi      = bi;
903d7ed1a4dSandi selinger       inputj      = bj;
904d7ed1a4dSandi selinger 
905d7ed1a4dSandi selinger       /* The following macro is used to specialize for small rows in A.
906d7ed1a4dSandi selinger          This helps with compiler unrolling, improving performance substantially.
907f83700f2Sandi selinger           Input:  inputj   inputi  inputcol  bn
908d7ed1a4dSandi selinger           Output: outputj  outputi_nnz                       */
909d7ed1a4dSandi selinger        #define MatMatMultSymbolic_RowMergeMacro(ANNZ)                        \
910d7ed1a4dSandi selinger          window_min  = bn;                                                   \
9117660c4dbSandi selinger          outputi_nnz = 0;                                                    \
9127660c4dbSandi selinger          for (k=0; k<ANNZ; ++k) {                                            \
913d7ed1a4dSandi selinger            brow_ptr[k] = inputj + inputi[inputcol[k]];                       \
914d7ed1a4dSandi selinger            brow_end[k] = inputj + inputi[inputcol[k]+1];                     \
915d7ed1a4dSandi selinger            window[k]   = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn;   \
916d7ed1a4dSandi selinger            window_min  = PetscMin(window[k], window_min);                    \
917d7ed1a4dSandi selinger          }                                                                   \
918d7ed1a4dSandi selinger          while (window_min < bn) {                                           \
919d7ed1a4dSandi selinger            outputj[outputi_nnz++] = window_min;                              \
920d7ed1a4dSandi selinger            /* advance front and compute new minimum */                       \
921d7ed1a4dSandi selinger            old_window_min = window_min;                                      \
922d7ed1a4dSandi selinger            window_min = bn;                                                  \
923d7ed1a4dSandi selinger            for (k=0; k<ANNZ; ++k) {                                          \
924d7ed1a4dSandi selinger              if (window[k] == old_window_min) {                              \
925d7ed1a4dSandi selinger                brow_ptr[k]++;                                                \
926d7ed1a4dSandi selinger                window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
927d7ed1a4dSandi selinger              }                                                               \
928d7ed1a4dSandi selinger              window_min = PetscMin(window[k], window_min);                   \
929d7ed1a4dSandi selinger            }                                                                 \
930d7ed1a4dSandi selinger          }
931d7ed1a4dSandi selinger 
932d7ed1a4dSandi selinger       /************** L E V E L  1 ***************/
933d7ed1a4dSandi selinger       /* Merge up to 8 rows of B to L1 work array*/
934d7ed1a4dSandi selinger       while (L1_rowsleft) {
9357660c4dbSandi selinger         outputi_nnz = 0;
9367660c4dbSandi selinger         if (anzi > 8)  outputj = workj_L1 + L1_nnz;     /* Level 1 rowmerge*/
9377660c4dbSandi selinger         else           outputj = cj + ci_nnz;           /* Merge directly to C */
9387660c4dbSandi selinger 
939d7ed1a4dSandi selinger         switch (L1_rowsleft) {
940d7ed1a4dSandi selinger         case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
941d7ed1a4dSandi selinger                  brow_end[0] = inputj + inputi[inputcol[0]+1];
942d7ed1a4dSandi selinger                  for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
943d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
944d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
945d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
946d7ed1a4dSandi selinger                  break;
947d7ed1a4dSandi selinger         case 2:  MatMatMultSymbolic_RowMergeMacro(2);
948d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
949d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
950d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
951d7ed1a4dSandi selinger                  break;
952d7ed1a4dSandi selinger         case 3: MatMatMultSymbolic_RowMergeMacro(3);
953d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
954d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
955d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
956d7ed1a4dSandi selinger                  break;
957d7ed1a4dSandi selinger         case 4:  MatMatMultSymbolic_RowMergeMacro(4);
958d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
959d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
960d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
961d7ed1a4dSandi selinger                  break;
962d7ed1a4dSandi selinger         case 5:  MatMatMultSymbolic_RowMergeMacro(5);
963d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
964d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
965d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
966d7ed1a4dSandi selinger                  break;
967d7ed1a4dSandi selinger         case 6:  MatMatMultSymbolic_RowMergeMacro(6);
968d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
969d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
970d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
971d7ed1a4dSandi selinger                  break;
972d7ed1a4dSandi selinger         case 7:  MatMatMultSymbolic_RowMergeMacro(7);
973d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
974d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
975d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
976d7ed1a4dSandi selinger                  break;
977d7ed1a4dSandi selinger         default: MatMatMultSymbolic_RowMergeMacro(8);
978d7ed1a4dSandi selinger                  inputcol    += 8;
979d7ed1a4dSandi selinger                  rowsleft    -= 8;
980d7ed1a4dSandi selinger                  L1_rowsleft -= 8;
981d7ed1a4dSandi selinger                  break;
982d7ed1a4dSandi selinger         }
983d7ed1a4dSandi selinger         inputcol_L1           = inputcol;
9847660c4dbSandi selinger         L1_nnz               += outputi_nnz;
9857660c4dbSandi selinger         worki_L1[++L1_nrows]  = L1_nnz;
986d7ed1a4dSandi selinger       }
987d7ed1a4dSandi selinger 
988d7ed1a4dSandi selinger       /********************** L E V E L  2 ************************/
989d7ed1a4dSandi selinger       /* Merge from L1 work array to either C or to L2 work array */
990d7ed1a4dSandi selinger       if (anzi > 8) {
991d7ed1a4dSandi selinger         inputi      = worki_L1;
992d7ed1a4dSandi selinger         inputj      = workj_L1;
993d7ed1a4dSandi selinger         inputcol    = workcol;
994d7ed1a4dSandi selinger         outputi_nnz = 0;
995d7ed1a4dSandi selinger 
996d7ed1a4dSandi selinger         if (anzi <= 64) outputj = cj + ci_nnz;        /* Merge from L1 work array to C */
997d7ed1a4dSandi selinger         else            outputj = workj_L2 + L2_nnz;  /* Merge from L1 work array to L2 work array */
998d7ed1a4dSandi selinger 
999d7ed1a4dSandi selinger         switch (L1_nrows) {
1000d7ed1a4dSandi selinger         case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
1001d7ed1a4dSandi selinger                  brow_end[0] = inputj + inputi[inputcol[0]+1];
1002d7ed1a4dSandi selinger                  for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
1003d7ed1a4dSandi selinger                  break;
1004d7ed1a4dSandi selinger         case 2:  MatMatMultSymbolic_RowMergeMacro(2); break;
1005d7ed1a4dSandi selinger         case 3:  MatMatMultSymbolic_RowMergeMacro(3); break;
1006d7ed1a4dSandi selinger         case 4:  MatMatMultSymbolic_RowMergeMacro(4); break;
1007d7ed1a4dSandi selinger         case 5:  MatMatMultSymbolic_RowMergeMacro(5); break;
1008d7ed1a4dSandi selinger         case 6:  MatMatMultSymbolic_RowMergeMacro(6); break;
1009d7ed1a4dSandi selinger         case 7:  MatMatMultSymbolic_RowMergeMacro(7); break;
1010d7ed1a4dSandi selinger         case 8:  MatMatMultSymbolic_RowMergeMacro(8); break;
1011d7ed1a4dSandi selinger         default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L1 work array!");
1012d7ed1a4dSandi selinger         }
1013d7ed1a4dSandi selinger         L2_nnz               += outputi_nnz;
1014d7ed1a4dSandi selinger         worki_L2[++L2_nrows]  = L2_nnz;
1015d7ed1a4dSandi selinger 
10167660c4dbSandi selinger         /************************ L E V E L  3 **********************/
10177660c4dbSandi selinger         /* Merge from L2 work array to either C or to L2 work array */
1018d7ed1a4dSandi selinger         if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) {
1019d7ed1a4dSandi selinger           inputi      = worki_L2;
1020d7ed1a4dSandi selinger           inputj      = workj_L2;
1021d7ed1a4dSandi selinger           inputcol    = workcol;
1022d7ed1a4dSandi selinger           outputi_nnz = 0;
1023f83700f2Sandi selinger           if (rowsleft) outputj = workj_L3;
1024d7ed1a4dSandi selinger           else          outputj = cj + ci_nnz;
1025d7ed1a4dSandi selinger           switch (L2_nrows) {
1026d7ed1a4dSandi selinger           case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
1027d7ed1a4dSandi selinger                    brow_end[0] = inputj + inputi[inputcol[0]+1];
1028d7ed1a4dSandi selinger                    for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
1029d7ed1a4dSandi selinger                    break;
1030d7ed1a4dSandi selinger           case 2:  MatMatMultSymbolic_RowMergeMacro(2); break;
1031d7ed1a4dSandi selinger           case 3:  MatMatMultSymbolic_RowMergeMacro(3); break;
1032d7ed1a4dSandi selinger           case 4:  MatMatMultSymbolic_RowMergeMacro(4); break;
1033d7ed1a4dSandi selinger           case 5:  MatMatMultSymbolic_RowMergeMacro(5); break;
1034d7ed1a4dSandi selinger           case 6:  MatMatMultSymbolic_RowMergeMacro(6); break;
1035d7ed1a4dSandi selinger           case 7:  MatMatMultSymbolic_RowMergeMacro(7); break;
1036d7ed1a4dSandi selinger           case 8:  MatMatMultSymbolic_RowMergeMacro(8); break;
1037d7ed1a4dSandi selinger           default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L2 work array!");
1038d7ed1a4dSandi selinger           }
1039d7ed1a4dSandi selinger           L2_nrows    = 1;
10407660c4dbSandi selinger           L2_nnz      = outputi_nnz;
10417660c4dbSandi selinger           worki_L2[1] = outputi_nnz;
10427660c4dbSandi selinger           /* Copy to workj_L2 */
1043d7ed1a4dSandi selinger           if (rowsleft) {
10447660c4dbSandi selinger             for (k=0; k<outputi_nnz; ++k)  workj_L2[k] = outputj[k];
1045d7ed1a4dSandi selinger           }
1046d7ed1a4dSandi selinger         }
1047d7ed1a4dSandi selinger       }
1048d7ed1a4dSandi selinger     }  /* while (rowsleft) */
1049d7ed1a4dSandi selinger #undef MatMatMultSymbolic_RowMergeMacro
1050d7ed1a4dSandi selinger 
1051d7ed1a4dSandi selinger     /* terminate current row */
1052d7ed1a4dSandi selinger     ci_nnz += outputi_nnz;
1053d7ed1a4dSandi selinger     ci[i+1] = ci_nnz;
1054d7ed1a4dSandi selinger   }
1055d7ed1a4dSandi selinger 
1056d7ed1a4dSandi selinger   /* Step 3: Create the new symbolic matrix */
1057e4e71118SStefano Zampini   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
10584222ddf1SHong Zhang   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
1059d7ed1a4dSandi selinger 
1060d7ed1a4dSandi selinger   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1061d7ed1a4dSandi selinger   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
10624222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
1063d7ed1a4dSandi selinger   c->free_a  = PETSC_TRUE;
1064d7ed1a4dSandi selinger   c->free_ij = PETSC_TRUE;
1065d7ed1a4dSandi selinger   c->nonew   = 0;
1066d7ed1a4dSandi selinger 
10674222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1068d7ed1a4dSandi selinger 
1069d7ed1a4dSandi selinger   /* set MatInfo */
1070d7ed1a4dSandi selinger   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
1071d7ed1a4dSandi selinger   if (afill < 1.0) afill = 1.0;
1072d7ed1a4dSandi selinger   c->maxnz                     = ci[am];
1073d7ed1a4dSandi selinger   c->nz                        = ci[am];
10744222ddf1SHong Zhang   C->info.mallocs           = ndouble;
10754222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
10764222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
1077d7ed1a4dSandi selinger 
1078d7ed1a4dSandi selinger #if defined(PETSC_USE_INFO)
1079d7ed1a4dSandi selinger   if (ci[am]) {
10804222ddf1SHong Zhang     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
10814222ddf1SHong Zhang     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
1082d7ed1a4dSandi selinger   } else {
10834222ddf1SHong Zhang     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
1084d7ed1a4dSandi selinger   }
1085d7ed1a4dSandi selinger #endif
1086d7ed1a4dSandi selinger 
1087d7ed1a4dSandi selinger   /* Step 4: Free temporary work areas */
1088d7ed1a4dSandi selinger   ierr = PetscFree(workj_L1);CHKERRQ(ierr);
1089d7ed1a4dSandi selinger   ierr = PetscFree(workj_L2);CHKERRQ(ierr);
1090f83700f2Sandi selinger   ierr = PetscFree(workj_L3);CHKERRQ(ierr);
1091d7ed1a4dSandi selinger   PetscFunctionReturn(0);
1092d7ed1a4dSandi selinger }
1093d7ed1a4dSandi selinger 
1094cd093f1dSJed Brown /* concatenate unique entries and then sort */
10954222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,PetscReal fill,Mat C)
1096cd093f1dSJed Brown {
1097cd093f1dSJed Brown   PetscErrorCode ierr;
1098cd093f1dSJed Brown   Mat_SeqAIJ     *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
1099cd093f1dSJed Brown   const PetscInt *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
11008c78258cSHong Zhang   PetscInt       *ci,*cj,bcol;
1101cd093f1dSJed Brown   PetscInt       am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
1102cd093f1dSJed Brown   PetscReal      afill;
1103cd093f1dSJed Brown   PetscInt       i,j,ndouble = 0;
1104cd093f1dSJed Brown   PetscSegBuffer seg,segrow;
1105cd093f1dSJed Brown   char           *seen;
1106cd093f1dSJed Brown 
1107cd093f1dSJed Brown   PetscFunctionBegin;
1108854ce69bSBarry Smith   ierr  = PetscMalloc1(am+1,&ci);CHKERRQ(ierr);
1109cd093f1dSJed Brown   ci[0] = 0;
1110cd093f1dSJed Brown 
1111cd093f1dSJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
1112cd093f1dSJed Brown   ierr = PetscSegBufferCreate(sizeof(PetscInt),(PetscInt)(fill*(ai[am]+bi[bm])),&seg);CHKERRQ(ierr);
1113cd093f1dSJed Brown   ierr = PetscSegBufferCreate(sizeof(PetscInt),100,&segrow);CHKERRQ(ierr);
1114580bdb30SBarry Smith   ierr = PetscCalloc1(bn,&seen);CHKERRQ(ierr);
1115cd093f1dSJed Brown 
1116cd093f1dSJed Brown   /* Determine ci and cj */
1117cd093f1dSJed Brown   for (i=0; i<am; i++) {
1118cd093f1dSJed Brown     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
1119cd093f1dSJed Brown     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
1120cd093f1dSJed Brown     PetscInt packlen = 0,*PETSC_RESTRICT crow;
11218c78258cSHong Zhang 
1122cd093f1dSJed Brown     /* Pack segrow */
1123cd093f1dSJed Brown     for (j=0; j<anzi; j++) {
1124cd093f1dSJed Brown       PetscInt brow = acol[j],bjstart = bi[brow],bjend = bi[brow+1],k;
1125cd093f1dSJed Brown       for (k=bjstart; k<bjend; k++) {
11268c78258cSHong Zhang         bcol = bj[k];
1127cd093f1dSJed Brown         if (!seen[bcol]) { /* new entry */
1128cd093f1dSJed Brown           PetscInt *PETSC_RESTRICT slot;
1129cd093f1dSJed Brown           ierr = PetscSegBufferGetInts(segrow,1,&slot);CHKERRQ(ierr);
1130cd093f1dSJed Brown           *slot = bcol;
1131cd093f1dSJed Brown           seen[bcol] = 1;
1132cd093f1dSJed Brown           packlen++;
1133cd093f1dSJed Brown         }
1134cd093f1dSJed Brown       }
1135cd093f1dSJed Brown     }
11368c78258cSHong Zhang 
11378c78258cSHong Zhang     /* Check i-th diagonal entry */
11388c78258cSHong Zhang     if (C->force_diagonals && !seen[i]) {
11398c78258cSHong Zhang       PetscInt *PETSC_RESTRICT slot;
11408c78258cSHong Zhang       ierr = PetscSegBufferGetInts(segrow,1,&slot);CHKERRQ(ierr);
11418c78258cSHong Zhang       *slot   = i;
11428c78258cSHong Zhang       seen[i] = 1;
11438c78258cSHong Zhang       packlen++;
11448c78258cSHong Zhang     }
11458c78258cSHong Zhang 
1146cd093f1dSJed Brown     ierr = PetscSegBufferGetInts(seg,packlen,&crow);CHKERRQ(ierr);
1147cd093f1dSJed Brown     ierr = PetscSegBufferExtractTo(segrow,crow);CHKERRQ(ierr);
1148cd093f1dSJed Brown     ierr = PetscSortInt(packlen,crow);CHKERRQ(ierr);
1149cd093f1dSJed Brown     ci[i+1] = ci[i] + packlen;
1150cd093f1dSJed Brown     for (j=0; j<packlen; j++) seen[crow[j]] = 0;
1151cd093f1dSJed Brown   }
1152cd093f1dSJed Brown   ierr = PetscSegBufferDestroy(&segrow);CHKERRQ(ierr);
1153cd093f1dSJed Brown   ierr = PetscFree(seen);CHKERRQ(ierr);
1154cd093f1dSJed Brown 
1155cd093f1dSJed Brown   /* Column indices are in the segmented buffer */
1156cd093f1dSJed Brown   ierr = PetscSegBufferExtractAlloc(seg,&cj);CHKERRQ(ierr);
1157cd093f1dSJed Brown   ierr = PetscSegBufferDestroy(&seg);CHKERRQ(ierr);
1158cd093f1dSJed Brown 
1159cd093f1dSJed Brown   /* put together the new symbolic matrix */
1160e4e71118SStefano Zampini   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
11614222ddf1SHong Zhang   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
1162cd093f1dSJed Brown 
1163cd093f1dSJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1164cd093f1dSJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
11654222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
1166cd093f1dSJed Brown   c->free_a  = PETSC_TRUE;
1167cd093f1dSJed Brown   c->free_ij = PETSC_TRUE;
1168cd093f1dSJed Brown   c->nonew   = 0;
1169cd093f1dSJed Brown 
11704222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1171cd093f1dSJed Brown 
1172cd093f1dSJed Brown   /* set MatInfo */
11732a09556fSStefano Zampini   afill = (PetscReal)ci[am]/PetscMax(ai[am]+bi[bm],1) + 1.e-5;
1174cd093f1dSJed Brown   if (afill < 1.0) afill = 1.0;
1175cd093f1dSJed Brown   c->maxnz                  = ci[am];
1176cd093f1dSJed Brown   c->nz                     = ci[am];
11774222ddf1SHong Zhang   C->info.mallocs           = ndouble;
11784222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
11794222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
1180cd093f1dSJed Brown 
1181cd093f1dSJed Brown #if defined(PETSC_USE_INFO)
1182cd093f1dSJed Brown   if (ci[am]) {
11834222ddf1SHong Zhang     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
11844222ddf1SHong Zhang     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
1185cd093f1dSJed Brown   } else {
11864222ddf1SHong Zhang     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
1187cd093f1dSJed Brown   }
1188cd093f1dSJed Brown #endif
1189cd093f1dSJed Brown   PetscFunctionReturn(0);
1190cd093f1dSJed Brown }
1191cd093f1dSJed Brown 
11926718818eSStefano Zampini PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(void *data)
11932128a86cSHong Zhang {
11942128a86cSHong Zhang   PetscErrorCode      ierr;
11956718818eSStefano Zampini   Mat_MatMatTransMult *abt=(Mat_MatMatTransMult *)data;
11962128a86cSHong Zhang 
11972128a86cSHong Zhang   PetscFunctionBegin;
119840192850SHong Zhang   ierr = MatTransposeColoringDestroy(&abt->matcoloring);CHKERRQ(ierr);
119940192850SHong Zhang   ierr = MatDestroy(&abt->Bt_den);CHKERRQ(ierr);
120040192850SHong Zhang   ierr = MatDestroy(&abt->ABt_den);CHKERRQ(ierr);
120140192850SHong Zhang   ierr = PetscFree(abt);CHKERRQ(ierr);
12022128a86cSHong Zhang   PetscFunctionReturn(0);
12032128a86cSHong Zhang }
12042128a86cSHong Zhang 
12054222ddf1SHong Zhang PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
1206bc011b1eSHong Zhang {
12075df89d91SHong Zhang   PetscErrorCode      ierr;
120881d82fe4SHong Zhang   Mat                 Bt;
120981d82fe4SHong Zhang   PetscInt            *bti,*btj;
121040192850SHong Zhang   Mat_MatMatTransMult *abt;
12114222ddf1SHong Zhang   Mat_Product         *product = C->product;
12126718818eSStefano Zampini   char                *alg;
1213d2853540SHong Zhang 
121481d82fe4SHong Zhang   PetscFunctionBegin;
12156718818eSStefano Zampini   if (!product) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
12166718818eSStefano Zampini   if (product->data) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Extra product struct not empty");
12176718818eSStefano Zampini 
121881d82fe4SHong Zhang   /* create symbolic Bt */
121981d82fe4SHong Zhang   ierr = MatGetSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
12200298fd71SBarry Smith   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,B->cmap->n,B->rmap->n,bti,btj,NULL,&Bt);CHKERRQ(ierr);
122133d57670SJed Brown   ierr = MatSetBlockSizes(Bt,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr);
122204b858e0SBarry Smith   ierr = MatSetType(Bt,((PetscObject)A)->type_name);CHKERRQ(ierr);
122381d82fe4SHong Zhang 
122481d82fe4SHong Zhang   /* get symbolic C=A*Bt */
12256718818eSStefano Zampini   ierr = PetscStrallocpy(product->alg,&alg);CHKERRQ(ierr);
12264222ddf1SHong Zhang   ierr = MatProductSetAlgorithm(C,"sorted");CHKERRQ(ierr); /* set algorithm for C = A*Bt */
122781d82fe4SHong Zhang   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C);CHKERRQ(ierr);
12284222ddf1SHong Zhang   ierr = MatProductSetAlgorithm(C,alg);CHKERRQ(ierr); /* resume original algorithm for ABt product */
12296718818eSStefano Zampini   ierr = PetscFree(alg);CHKERRQ(ierr);
123081d82fe4SHong Zhang 
12312128a86cSHong Zhang   /* create a supporting struct for reuse intermidiate dense matrices with matcoloring */
1232b00a9115SJed Brown   ierr = PetscNew(&abt);CHKERRQ(ierr);
12332128a86cSHong Zhang 
12346718818eSStefano Zampini   product->data    = abt;
12356718818eSStefano Zampini   product->destroy = MatDestroy_SeqAIJ_MatMatMultTrans;
12366718818eSStefano Zampini 
12374222ddf1SHong Zhang   C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ;
12382128a86cSHong Zhang 
12394222ddf1SHong Zhang   abt->usecoloring = PETSC_FALSE;
12404222ddf1SHong Zhang   ierr = PetscStrcmp(product->alg,"color",&abt->usecoloring);CHKERRQ(ierr);
124140192850SHong Zhang   if (abt->usecoloring) {
1242b9af6bddSHong Zhang     /* Create MatTransposeColoring from symbolic C=A*B^T */
1243b9af6bddSHong Zhang     MatTransposeColoring matcoloring;
1244335efc43SPeter Brune     MatColoring          coloring;
12452128a86cSHong Zhang     ISColoring           iscoloring;
12462128a86cSHong Zhang     Mat                  Bt_dense,C_dense;
1247e8354b3eSHong Zhang 
12484222ddf1SHong Zhang     /* inode causes memory problem */
12494222ddf1SHong Zhang     ierr = MatSetOption(C,MAT_USE_INODES,PETSC_FALSE);CHKERRQ(ierr);
12504222ddf1SHong Zhang 
12514222ddf1SHong Zhang     ierr = MatColoringCreate(C,&coloring);CHKERRQ(ierr);
1252335efc43SPeter Brune     ierr = MatColoringSetDistance(coloring,2);CHKERRQ(ierr);
1253335efc43SPeter Brune     ierr = MatColoringSetType(coloring,MATCOLORINGSL);CHKERRQ(ierr);
1254335efc43SPeter Brune     ierr = MatColoringSetFromOptions(coloring);CHKERRQ(ierr);
1255335efc43SPeter Brune     ierr = MatColoringApply(coloring,&iscoloring);CHKERRQ(ierr);
1256335efc43SPeter Brune     ierr = MatColoringDestroy(&coloring);CHKERRQ(ierr);
12574222ddf1SHong Zhang     ierr = MatTransposeColoringCreate(C,iscoloring,&matcoloring);CHKERRQ(ierr);
12582205254eSKarl Rupp 
125940192850SHong Zhang     abt->matcoloring = matcoloring;
12602205254eSKarl Rupp 
12612128a86cSHong Zhang     ierr = ISColoringDestroy(&iscoloring);CHKERRQ(ierr);
12622128a86cSHong Zhang 
12632128a86cSHong Zhang     /* Create Bt_dense and C_dense = A*Bt_dense */
12642128a86cSHong Zhang     ierr = MatCreate(PETSC_COMM_SELF,&Bt_dense);CHKERRQ(ierr);
12652128a86cSHong Zhang     ierr = MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors);CHKERRQ(ierr);
12662128a86cSHong Zhang     ierr = MatSetType(Bt_dense,MATSEQDENSE);CHKERRQ(ierr);
12670298fd71SBarry Smith     ierr = MatSeqDenseSetPreallocation(Bt_dense,NULL);CHKERRQ(ierr);
12682205254eSKarl Rupp 
12692128a86cSHong Zhang     Bt_dense->assembled = PETSC_TRUE;
127040192850SHong Zhang     abt->Bt_den         = Bt_dense;
12712128a86cSHong Zhang 
12722128a86cSHong Zhang     ierr = MatCreate(PETSC_COMM_SELF,&C_dense);CHKERRQ(ierr);
12732128a86cSHong Zhang     ierr = MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors);CHKERRQ(ierr);
12742128a86cSHong Zhang     ierr = MatSetType(C_dense,MATSEQDENSE);CHKERRQ(ierr);
12750298fd71SBarry Smith     ierr = MatSeqDenseSetPreallocation(C_dense,NULL);CHKERRQ(ierr);
12762205254eSKarl Rupp 
12772128a86cSHong Zhang     Bt_dense->assembled = PETSC_TRUE;
127840192850SHong Zhang     abt->ABt_den  = C_dense;
1279f94ccd6cSHong Zhang 
1280f94ccd6cSHong Zhang #if defined(PETSC_USE_INFO)
12811ce71dffSSatish Balay     {
12824222ddf1SHong Zhang       Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data;
12834222ddf1SHong Zhang       ierr = PetscInfo7(C,"Use coloring of C=A*B^T; B^T: %D %D, Bt_dense: %D,%D; Cnz %D / (cm*ncolors %D) = %g\n",B->cmap->n,B->rmap->n,Bt_dense->rmap->n,Bt_dense->cmap->n,c->nz,A->rmap->n*matcoloring->ncolors,(PetscReal)(c->nz)/(A->rmap->n*matcoloring->ncolors));CHKERRQ(ierr);
12841ce71dffSSatish Balay     }
1285f94ccd6cSHong Zhang #endif
12862128a86cSHong Zhang   }
1287e99be685SHong Zhang   /* clean up */
1288e99be685SHong Zhang   ierr = MatDestroy(&Bt);CHKERRQ(ierr);
1289e99be685SHong Zhang   ierr = MatRestoreSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
12905df89d91SHong Zhang   PetscFunctionReturn(0);
12915df89d91SHong Zhang }
12925df89d91SHong Zhang 
12936fc122caSHong Zhang PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
12945df89d91SHong Zhang {
12955df89d91SHong Zhang   PetscErrorCode      ierr;
12965df89d91SHong Zhang   Mat_SeqAIJ          *a   =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1297e2cac8caSJed Brown   PetscInt            *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow;
129881d82fe4SHong Zhang   PetscInt            cm   =C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol;
12995df89d91SHong Zhang   PetscLogDouble      flops=0.0;
1300aa1aec99SHong Zhang   MatScalar           *aa  =a->a,*aval,*ba=b->a,*bval,*ca,*cval;
13016718818eSStefano Zampini   Mat_MatMatTransMult *abt;
13026718818eSStefano Zampini   Mat_Product         *product = C->product;
13035df89d91SHong Zhang 
13045df89d91SHong Zhang   PetscFunctionBegin;
13056718818eSStefano Zampini   if (!product) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
13066718818eSStefano Zampini   abt = (Mat_MatMatTransMult *)product->data;
13076718818eSStefano Zampini   if (!abt) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
130858ed3ceaSHong Zhang   /* clear old values in C */
130958ed3ceaSHong Zhang   if (!c->a) {
1310580bdb30SBarry Smith     ierr      = PetscCalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
131158ed3ceaSHong Zhang     c->a      = ca;
131258ed3ceaSHong Zhang     c->free_a = PETSC_TRUE;
131358ed3ceaSHong Zhang   } else {
131458ed3ceaSHong Zhang     ca =  c->a;
1315580bdb30SBarry Smith     ierr = PetscArrayzero(ca,ci[cm]+1);CHKERRQ(ierr);
131658ed3ceaSHong Zhang   }
131758ed3ceaSHong Zhang 
131840192850SHong Zhang   if (abt->usecoloring) {
131940192850SHong Zhang     MatTransposeColoring matcoloring = abt->matcoloring;
132040192850SHong Zhang     Mat                  Bt_dense,C_dense = abt->ABt_den;
1321c8db22f6SHong Zhang 
1322b9af6bddSHong Zhang     /* Get Bt_dense by Apply MatTransposeColoring to B */
132340192850SHong Zhang     Bt_dense = abt->Bt_den;
1324b9af6bddSHong Zhang     ierr = MatTransColoringApplySpToDen(matcoloring,B,Bt_dense);CHKERRQ(ierr);
1325c8db22f6SHong Zhang 
1326c8db22f6SHong Zhang     /* C_dense = A*Bt_dense */
13272128a86cSHong Zhang     ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense);CHKERRQ(ierr);
1328c8db22f6SHong Zhang 
13292128a86cSHong Zhang     /* Recover C from C_dense */
1330b9af6bddSHong Zhang     ierr = MatTransColoringApplyDenToSp(matcoloring,C_dense,C);CHKERRQ(ierr);
1331c8db22f6SHong Zhang     PetscFunctionReturn(0);
1332c8db22f6SHong Zhang   }
1333c8db22f6SHong Zhang 
13341fa1209cSHong Zhang   for (i=0; i<cm; i++) {
133581d82fe4SHong Zhang     anzi = ai[i+1] - ai[i];
133681d82fe4SHong Zhang     acol = aj + ai[i];
13376973769bSHong Zhang     aval = aa + ai[i];
13381fa1209cSHong Zhang     cnzi = ci[i+1] - ci[i];
13391fa1209cSHong Zhang     ccol = cj + ci[i];
13406973769bSHong Zhang     cval = ca + ci[i];
13411fa1209cSHong Zhang     for (j=0; j<cnzi; j++) {
134281d82fe4SHong Zhang       brow = ccol[j];
134381d82fe4SHong Zhang       bnzj = bi[brow+1] - bi[brow];
134481d82fe4SHong Zhang       bcol = bj + bi[brow];
13456973769bSHong Zhang       bval = ba + bi[brow];
13466973769bSHong Zhang 
134781d82fe4SHong Zhang       /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
134881d82fe4SHong Zhang       nexta = 0; nextb = 0;
134981d82fe4SHong Zhang       while (nexta<anzi && nextb<bnzj) {
13507b6d5e96SMark Adams         while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++;
135181d82fe4SHong Zhang         if (nexta == anzi) break;
13527b6d5e96SMark Adams         while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++;
135381d82fe4SHong Zhang         if (nextb == bnzj) break;
135481d82fe4SHong Zhang         if (acol[nexta] == bcol[nextb]) {
13556973769bSHong Zhang           cval[j] += aval[nexta]*bval[nextb];
135681d82fe4SHong Zhang           nexta++; nextb++;
135781d82fe4SHong Zhang           flops += 2;
13581fa1209cSHong Zhang         }
13591fa1209cSHong Zhang       }
136081d82fe4SHong Zhang     }
136181d82fe4SHong Zhang   }
136281d82fe4SHong Zhang   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
136381d82fe4SHong Zhang   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
136481d82fe4SHong Zhang   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
13655df89d91SHong Zhang   PetscFunctionReturn(0);
13665df89d91SHong Zhang }
13675df89d91SHong Zhang 
13686718818eSStefano Zampini PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(void *data)
13696d373c3eSHong Zhang {
13706d373c3eSHong Zhang   PetscErrorCode      ierr;
13716718818eSStefano Zampini   Mat_MatTransMatMult *atb = (Mat_MatTransMatMult*)data;
13726d373c3eSHong Zhang 
13736d373c3eSHong Zhang   PetscFunctionBegin;
13746d373c3eSHong Zhang   ierr = MatDestroy(&atb->At);CHKERRQ(ierr);
13756718818eSStefano Zampini   if (atb->destroy) {
13766718818eSStefano Zampini     ierr = (*atb->destroy)(atb->data);CHKERRQ(ierr);
13776473ade5SStefano Zampini   }
13786d373c3eSHong Zhang   ierr = PetscFree(atb);CHKERRQ(ierr);
13796d373c3eSHong Zhang   PetscFunctionReturn(0);
13806d373c3eSHong Zhang }
13816d373c3eSHong Zhang 
13824222ddf1SHong Zhang PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
13835df89d91SHong Zhang {
1384bc011b1eSHong Zhang   PetscErrorCode ierr;
1385089a957eSStefano Zampini   Mat            At = NULL;
138638baddfdSBarry Smith   PetscInt       *ati,*atj;
13874222ddf1SHong Zhang   Mat_Product    *product = C->product;
1388089a957eSStefano Zampini   PetscBool      flg,def,square;
1389bc011b1eSHong Zhang 
1390bc011b1eSHong Zhang   PetscFunctionBegin;
1391089a957eSStefano Zampini   MatCheckProduct(C,4);
1392089a957eSStefano Zampini   square = (PetscBool)(A == B && A->symmetric && A->symmetric_set);
13934222ddf1SHong Zhang   /* outerproduct */
1394089a957eSStefano Zampini   ierr = PetscStrcmp(product->alg,"outerproduct",&flg);CHKERRQ(ierr);
13954222ddf1SHong Zhang   if (flg) {
1396bc011b1eSHong Zhang     /* create symbolic At */
1397089a957eSStefano Zampini     if (!square) {
1398bc011b1eSHong Zhang       ierr = MatGetSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
13990298fd71SBarry Smith       ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,A->cmap->n,A->rmap->n,ati,atj,NULL,&At);CHKERRQ(ierr);
140033d57670SJed Brown       ierr = MatSetBlockSizes(At,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr);
140104b858e0SBarry Smith       ierr = MatSetType(At,((PetscObject)A)->type_name);CHKERRQ(ierr);
1402089a957eSStefano Zampini     }
1403bc011b1eSHong Zhang     /* get symbolic C=At*B */
14047a3c3d58SStefano Zampini     ierr = MatProductSetAlgorithm(C,"sorted");CHKERRQ(ierr);
1405089a957eSStefano Zampini     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At,B,fill,C);CHKERRQ(ierr);
1406bc011b1eSHong Zhang 
1407bc011b1eSHong Zhang     /* clean up */
1408089a957eSStefano Zampini     if (!square) {
14096bf464f9SBarry Smith       ierr = MatDestroy(&At);CHKERRQ(ierr);
1410bc011b1eSHong Zhang       ierr = MatRestoreSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
1411089a957eSStefano Zampini     }
14126d373c3eSHong Zhang 
14134222ddf1SHong Zhang     C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */
14147a3c3d58SStefano Zampini     ierr = MatProductSetAlgorithm(C,"outerproduct");CHKERRQ(ierr);
14154222ddf1SHong Zhang     PetscFunctionReturn(0);
14164222ddf1SHong Zhang   }
14174222ddf1SHong Zhang 
14184222ddf1SHong Zhang   /* matmatmult */
1419089a957eSStefano Zampini   ierr = PetscStrcmp(product->alg,"default",&def);CHKERRQ(ierr);
1420089a957eSStefano Zampini   ierr = PetscStrcmp(product->alg,"at*b",&flg);CHKERRQ(ierr);
1421089a957eSStefano Zampini   if (flg || def) {
14224222ddf1SHong Zhang     Mat_MatTransMatMult *atb;
14234222ddf1SHong Zhang 
14246718818eSStefano Zampini     if (product->data) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Extra product struct not empty");
14254222ddf1SHong Zhang     ierr = PetscNew(&atb);CHKERRQ(ierr);
1426089a957eSStefano Zampini     if (!square) {
14274222ddf1SHong Zhang       ierr = MatTranspose_SeqAIJ(A,MAT_INITIAL_MATRIX,&At);CHKERRQ(ierr);
1428089a957eSStefano Zampini     }
14297a3c3d58SStefano Zampini     ierr = MatProductSetAlgorithm(C,"sorted");CHKERRQ(ierr);
1430089a957eSStefano Zampini     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At,B,fill,C);CHKERRQ(ierr);
14316718818eSStefano Zampini     ierr = MatProductSetAlgorithm(C,"at*b");CHKERRQ(ierr);
14326718818eSStefano Zampini     product->data    = atb;
14336718818eSStefano Zampini     product->destroy = MatDestroy_SeqAIJ_MatTransMatMult;
14344222ddf1SHong Zhang     atb->At          = At;
14354222ddf1SHong Zhang     atb->updateAt    = PETSC_FALSE; /* because At is computed here */
14364222ddf1SHong Zhang 
14374222ddf1SHong Zhang     C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */
14384222ddf1SHong Zhang     PetscFunctionReturn(0);
14394222ddf1SHong Zhang   }
14404222ddf1SHong Zhang 
14414222ddf1SHong Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported");
1442bc011b1eSHong Zhang }
1443bc011b1eSHong Zhang 
144475648e8dSHong Zhang PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
1445bc011b1eSHong Zhang {
1446bc011b1eSHong Zhang   PetscErrorCode ierr;
14470fbc74f4SHong Zhang   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1448d0f46423SBarry Smith   PetscInt       am=A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb;
1449d0f46423SBarry Smith   PetscInt       cm=C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k;
1450d13dce4bSSatish Balay   PetscLogDouble flops=0.0;
1451aa1aec99SHong Zhang   MatScalar      *aa=a->a,*ba,*ca,*caj;
1452bc011b1eSHong Zhang 
1453bc011b1eSHong Zhang   PetscFunctionBegin;
1454aa1aec99SHong Zhang   if (!c->a) {
1455580bdb30SBarry Smith     ierr = PetscCalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
14562205254eSKarl Rupp 
1457aa1aec99SHong Zhang     c->a      = ca;
1458aa1aec99SHong Zhang     c->free_a = PETSC_TRUE;
1459aa1aec99SHong Zhang   } else {
1460aa1aec99SHong Zhang     ca   = c->a;
1461580bdb30SBarry Smith     ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr);
1462aa1aec99SHong Zhang   }
1463bc011b1eSHong Zhang 
1464bc011b1eSHong Zhang   /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
1465bc011b1eSHong Zhang   for (i=0; i<am; i++) {
1466bc011b1eSHong Zhang     bj   = b->j + bi[i];
1467bc011b1eSHong Zhang     ba   = b->a + bi[i];
1468bc011b1eSHong Zhang     bnzi = bi[i+1] - bi[i];
1469bc011b1eSHong Zhang     anzi = ai[i+1] - ai[i];
1470bc011b1eSHong Zhang     for (j=0; j<anzi; j++) {
1471bc011b1eSHong Zhang       nextb = 0;
14720fbc74f4SHong Zhang       crow  = *aj++;
1473bc011b1eSHong Zhang       cjj   = cj + ci[crow];
1474bc011b1eSHong Zhang       caj   = ca + ci[crow];
1475bc011b1eSHong Zhang       /* perform sparse axpy operation.  Note cjj includes bj. */
1476bc011b1eSHong Zhang       for (k=0; nextb<bnzi; k++) {
14770fbc74f4SHong Zhang         if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */
14780fbc74f4SHong Zhang           caj[k] += (*aa)*(*(ba+nextb));
1479bc011b1eSHong Zhang           nextb++;
1480bc011b1eSHong Zhang         }
1481bc011b1eSHong Zhang       }
1482bc011b1eSHong Zhang       flops += 2*bnzi;
14830fbc74f4SHong Zhang       aa++;
1484bc011b1eSHong Zhang     }
1485bc011b1eSHong Zhang   }
1486bc011b1eSHong Zhang 
1487bc011b1eSHong Zhang   /* Assemble the final matrix and clean up */
1488bc011b1eSHong Zhang   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1489bc011b1eSHong Zhang   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1490bc011b1eSHong Zhang   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
1491bc011b1eSHong Zhang   PetscFunctionReturn(0);
1492bc011b1eSHong Zhang }
14939123193aSHong Zhang 
14944222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
14959123193aSHong Zhang {
14969123193aSHong Zhang   PetscErrorCode ierr;
14979123193aSHong Zhang 
14989123193aSHong Zhang   PetscFunctionBegin;
14995a586d82SBarry Smith   ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr);
15004222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense;
15019123193aSHong Zhang   PetscFunctionReturn(0);
15029123193aSHong Zhang }
15039123193aSHong Zhang 
150493aa15f2SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C,const PetscBool add)
15059123193aSHong Zhang {
1506f32d5d43SBarry Smith   Mat_SeqAIJ        *a=(Mat_SeqAIJ*)A->data;
1507612438f1SStefano Zampini   Mat_SeqDense      *bd=(Mat_SeqDense*)B->data;
15081ca9667aSStefano Zampini   Mat_SeqDense      *cd=(Mat_SeqDense*)C->data;
15099123193aSHong Zhang   PetscErrorCode    ierr;
15101ca9667aSStefano Zampini   PetscScalar       *c,r1,r2,r3,r4,*c1,*c2,*c3,*c4;
1511a4af7ca8SStefano Zampini   const PetscScalar *aa,*b,*b1,*b2,*b3,*b4,*av;
1512daf57211SHong Zhang   const PetscInt    *aj;
1513612438f1SStefano Zampini   PetscInt          cm=C->rmap->n,cn=B->cmap->n,bm=bd->lda,am=A->rmap->n;
15141ca9667aSStefano Zampini   PetscInt          clda=cd->lda;
15151ca9667aSStefano Zampini   PetscInt          am4=4*clda,bm4=4*bm,col,i,j,n;
15169123193aSHong Zhang 
15179123193aSHong Zhang   PetscFunctionBegin;
1518f32d5d43SBarry Smith   if (!cm || !cn) PetscFunctionReturn(0);
1519a4af7ca8SStefano Zampini   ierr = MatSeqAIJGetArrayRead(A,&av);CHKERRQ(ierr);
152093aa15f2SStefano Zampini   if (add) {
15218c778c55SBarry Smith     ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr);
152293aa15f2SStefano Zampini   } else {
152393aa15f2SStefano Zampini     ierr = MatDenseGetArrayWrite(C,&c);CHKERRQ(ierr);
152493aa15f2SStefano Zampini   }
1525a4af7ca8SStefano Zampini   ierr = MatDenseGetArrayRead(B,&b);CHKERRQ(ierr);
1526f32d5d43SBarry Smith   b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
15271ca9667aSStefano Zampini   c1 = c; c2 = c1 + clda; c3 = c2 + clda; c4 = c3 + clda;
15281ca9667aSStefano Zampini   for (col=0; col<(cn/4)*4; col += 4) {  /* over columns of C */
15291ca9667aSStefano Zampini     for (i=0; i<am; i++) {        /* over rows of A in those columns */
1530f32d5d43SBarry Smith       r1 = r2 = r3 = r4 = 0.0;
1531f32d5d43SBarry Smith       n  = a->i[i+1] - a->i[i];
1532f32d5d43SBarry Smith       aj = a->j + a->i[i];
1533a4af7ca8SStefano Zampini       aa = av + a->i[i];
1534f32d5d43SBarry Smith       for (j=0; j<n; j++) {
15351ca9667aSStefano Zampini         const PetscScalar aatmp = aa[j];
15361ca9667aSStefano Zampini         const PetscInt    ajtmp = aj[j];
1537730858b9SHong Zhang         r1 += aatmp*b1[ajtmp];
1538730858b9SHong Zhang         r2 += aatmp*b2[ajtmp];
1539730858b9SHong Zhang         r3 += aatmp*b3[ajtmp];
1540730858b9SHong Zhang         r4 += aatmp*b4[ajtmp];
15419123193aSHong Zhang       }
154293aa15f2SStefano Zampini       if (add) {
154387753ddeSHong Zhang         c1[i] += r1;
154487753ddeSHong Zhang         c2[i] += r2;
154587753ddeSHong Zhang         c3[i] += r3;
154687753ddeSHong Zhang         c4[i] += r4;
154793aa15f2SStefano Zampini       } else {
154893aa15f2SStefano Zampini         c1[i] = r1;
154993aa15f2SStefano Zampini         c2[i] = r2;
155093aa15f2SStefano Zampini         c3[i] = r3;
155193aa15f2SStefano Zampini         c4[i] = r4;
155293aa15f2SStefano Zampini       }
1553f32d5d43SBarry Smith     }
1554730858b9SHong Zhang     b1 += bm4; b2 += bm4; b3 += bm4; b4 += bm4;
1555730858b9SHong Zhang     c1 += am4; c2 += am4; c3 += am4; c4 += am4;
1556f32d5d43SBarry Smith   }
155793aa15f2SStefano Zampini   /* process remaining columns */
155893aa15f2SStefano Zampini   if (col != cn) {
155993aa15f2SStefano Zampini     PetscInt rc = cn-col;
156093aa15f2SStefano Zampini 
156193aa15f2SStefano Zampini     if (rc == 1) {
156293aa15f2SStefano Zampini       for (i=0; i<am; i++) {
1563f32d5d43SBarry Smith         r1 = 0.0;
1564f32d5d43SBarry Smith         n  = a->i[i+1] - a->i[i];
1565f32d5d43SBarry Smith         aj = a->j + a->i[i];
1566a4af7ca8SStefano Zampini         aa = av + a->i[i];
156793aa15f2SStefano Zampini         for (j=0; j<n; j++) r1 += aa[j]*b1[aj[j]];
156893aa15f2SStefano Zampini         if (add) c1[i] += r1;
156993aa15f2SStefano Zampini         else c1[i] = r1;
157093aa15f2SStefano Zampini       }
157193aa15f2SStefano Zampini     } else if (rc == 2) {
157293aa15f2SStefano Zampini       for (i=0; i<am; i++) {
157393aa15f2SStefano Zampini         r1 = r2 = 0.0;
157493aa15f2SStefano Zampini         n  = a->i[i+1] - a->i[i];
157593aa15f2SStefano Zampini         aj = a->j + a->i[i];
157693aa15f2SStefano Zampini         aa = av + a->i[i];
1577f32d5d43SBarry Smith         for (j=0; j<n; j++) {
157893aa15f2SStefano Zampini           const PetscScalar aatmp = aa[j];
157993aa15f2SStefano Zampini           const PetscInt    ajtmp = aj[j];
158093aa15f2SStefano Zampini           r1 += aatmp*b1[ajtmp];
158193aa15f2SStefano Zampini           r2 += aatmp*b2[ajtmp];
1582f32d5d43SBarry Smith         }
158393aa15f2SStefano Zampini         if (add) {
158487753ddeSHong Zhang           c1[i] += r1;
158593aa15f2SStefano Zampini           c2[i] += r2;
158693aa15f2SStefano Zampini         } else {
158793aa15f2SStefano Zampini           c1[i] = r1;
158893aa15f2SStefano Zampini           c2[i] = r2;
1589f32d5d43SBarry Smith         }
159093aa15f2SStefano Zampini       }
159193aa15f2SStefano Zampini     } else {
159293aa15f2SStefano Zampini       for (i=0; i<am; i++) {
159393aa15f2SStefano Zampini         r1 = r2 = r3 = 0.0;
159493aa15f2SStefano Zampini         n  = a->i[i+1] - a->i[i];
159593aa15f2SStefano Zampini         aj = a->j + a->i[i];
159693aa15f2SStefano Zampini         aa = av + a->i[i];
159793aa15f2SStefano Zampini         for (j=0; j<n; j++) {
159893aa15f2SStefano Zampini           const PetscScalar aatmp = aa[j];
159993aa15f2SStefano Zampini           const PetscInt    ajtmp = aj[j];
160093aa15f2SStefano Zampini           r1 += aatmp*b1[ajtmp];
160193aa15f2SStefano Zampini           r2 += aatmp*b2[ajtmp];
160293aa15f2SStefano Zampini           r3 += aatmp*b3[ajtmp];
160393aa15f2SStefano Zampini         }
160493aa15f2SStefano Zampini         if (add) {
160593aa15f2SStefano Zampini           c1[i] += r1;
160693aa15f2SStefano Zampini           c2[i] += r2;
160793aa15f2SStefano Zampini           c3[i] += r3;
160893aa15f2SStefano Zampini         } else {
160993aa15f2SStefano Zampini           c1[i] = r1;
161093aa15f2SStefano Zampini           c2[i] = r2;
161193aa15f2SStefano Zampini           c3[i] = r3;
161293aa15f2SStefano Zampini         }
161393aa15f2SStefano Zampini       }
161493aa15f2SStefano Zampini     }
1615f32d5d43SBarry Smith   }
1616dc0b31edSSatish Balay   ierr = PetscLogFlops(cn*(2.0*a->nz));CHKERRQ(ierr);
161793aa15f2SStefano Zampini   if (add) {
16188c778c55SBarry Smith     ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr);
161993aa15f2SStefano Zampini   } else {
162093aa15f2SStefano Zampini     ierr = MatDenseRestoreArrayWrite(C,&c);CHKERRQ(ierr);
162193aa15f2SStefano Zampini   }
1622a4af7ca8SStefano Zampini   ierr = MatDenseRestoreArrayRead(B,&b);CHKERRQ(ierr);
1623a4af7ca8SStefano Zampini   ierr = MatSeqAIJRestoreArrayRead(A,&av);CHKERRQ(ierr);
1624f32d5d43SBarry Smith   PetscFunctionReturn(0);
1625f32d5d43SBarry Smith }
1626f32d5d43SBarry Smith 
162787753ddeSHong Zhang PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1628f32d5d43SBarry Smith {
1629f32d5d43SBarry Smith   PetscErrorCode ierr;
1630f32d5d43SBarry Smith 
1631f32d5d43SBarry Smith   PetscFunctionBegin;
163287753ddeSHong Zhang   if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n);
163387753ddeSHong Zhang   if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
163487753ddeSHong Zhang   if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
16354324174eSBarry Smith 
163693aa15f2SStefano Zampini   ierr = MatMatMultNumericAdd_SeqAIJ_SeqDense(A,B,C,PETSC_FALSE);CHKERRQ(ierr);
16379123193aSHong Zhang   PetscFunctionReturn(0);
16389123193aSHong Zhang }
1639b1683b59SHong Zhang 
16404222ddf1SHong Zhang /* ------------------------------------------------------- */
16414222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C)
16424222ddf1SHong Zhang {
16434222ddf1SHong Zhang   PetscFunctionBegin;
16444222ddf1SHong Zhang   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense;
16454222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
16464222ddf1SHong Zhang   PetscFunctionReturn(0);
16474222ddf1SHong Zhang }
16484222ddf1SHong Zhang 
16496718818eSStefano Zampini PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat,Mat,PetscReal,Mat);
16506718818eSStefano Zampini 
16514222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C)
16524222ddf1SHong Zhang {
16534222ddf1SHong Zhang   PetscFunctionBegin;
16546718818eSStefano Zampini   C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
16554222ddf1SHong Zhang   C->ops->productsymbolic          = MatProductSymbolic_AtB;
16566718818eSStefano Zampini   PetscFunctionReturn(0);
16576718818eSStefano Zampini }
16586718818eSStefano Zampini 
16596718818eSStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C)
16606718818eSStefano Zampini {
16616718818eSStefano Zampini   PetscFunctionBegin;
16626718818eSStefano Zampini   C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
16636718818eSStefano Zampini   C->ops->productsymbolic          = MatProductSymbolic_ABt;
16644222ddf1SHong Zhang   PetscFunctionReturn(0);
16654222ddf1SHong Zhang }
16664222ddf1SHong Zhang 
16674222ddf1SHong Zhang PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C)
16684222ddf1SHong Zhang {
16694222ddf1SHong Zhang   PetscErrorCode ierr;
16704222ddf1SHong Zhang   Mat_Product    *product = C->product;
16714222ddf1SHong Zhang 
16724222ddf1SHong Zhang   PetscFunctionBegin;
16734222ddf1SHong Zhang   switch (product->type) {
16744222ddf1SHong Zhang   case MATPRODUCT_AB:
16754222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C);CHKERRQ(ierr);
16764222ddf1SHong Zhang     break;
16774222ddf1SHong Zhang   case MATPRODUCT_AtB:
16784222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C);CHKERRQ(ierr);
16794222ddf1SHong Zhang     break;
16806718818eSStefano Zampini   case MATPRODUCT_ABt:
16816718818eSStefano Zampini     ierr = MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C);CHKERRQ(ierr);
16824222ddf1SHong Zhang     break;
16834222ddf1SHong Zhang   default:
16846718818eSStefano Zampini     break;
16854222ddf1SHong Zhang   }
16864222ddf1SHong Zhang   PetscFunctionReturn(0);
16874222ddf1SHong Zhang }
16884222ddf1SHong Zhang /* ------------------------------------------------------- */
16894222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C)
16904222ddf1SHong Zhang {
16914222ddf1SHong Zhang   PetscErrorCode ierr;
16924222ddf1SHong Zhang   Mat_Product    *product = C->product;
16934222ddf1SHong Zhang   Mat            A = product->A;
16944222ddf1SHong Zhang   PetscBool      baij;
16954222ddf1SHong Zhang 
16964222ddf1SHong Zhang   PetscFunctionBegin;
16974222ddf1SHong Zhang   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQBAIJ,&baij);CHKERRQ(ierr);
16984222ddf1SHong Zhang   if (!baij) { /* A is seqsbaij */
16994222ddf1SHong Zhang     PetscBool sbaij;
17004222ddf1SHong Zhang     ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQSBAIJ,&sbaij);CHKERRQ(ierr);
17014222ddf1SHong Zhang     if (!sbaij) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"Mat must be either seqbaij or seqsbaij format");
17024222ddf1SHong Zhang 
17034222ddf1SHong Zhang     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense;
17044222ddf1SHong Zhang   } else { /* A is seqbaij */
17054222ddf1SHong Zhang     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense;
17064222ddf1SHong Zhang   }
17074222ddf1SHong Zhang 
17084222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
17094222ddf1SHong Zhang   PetscFunctionReturn(0);
17104222ddf1SHong Zhang }
17114222ddf1SHong Zhang 
17124222ddf1SHong Zhang PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C)
17134222ddf1SHong Zhang {
17144222ddf1SHong Zhang   PetscErrorCode ierr;
17154222ddf1SHong Zhang   Mat_Product    *product = C->product;
17164222ddf1SHong Zhang 
17174222ddf1SHong Zhang   PetscFunctionBegin;
17186718818eSStefano Zampini   MatCheckProduct(C,1);
17196718818eSStefano Zampini   if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
17206718818eSStefano Zampini   if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric)) {
17214222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C);CHKERRQ(ierr);
17226718818eSStefano Zampini   }
17234222ddf1SHong Zhang   PetscFunctionReturn(0);
17244222ddf1SHong Zhang }
17256718818eSStefano Zampini 
17264222ddf1SHong Zhang /* ------------------------------------------------------- */
17274222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C)
17284222ddf1SHong Zhang {
17294222ddf1SHong Zhang   PetscFunctionBegin;
17304222ddf1SHong Zhang   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ;
17314222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
17324222ddf1SHong Zhang   PetscFunctionReturn(0);
17334222ddf1SHong Zhang }
17344222ddf1SHong Zhang 
17354222ddf1SHong Zhang PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C)
17364222ddf1SHong Zhang {
17374222ddf1SHong Zhang   PetscErrorCode ierr;
17384222ddf1SHong Zhang   Mat_Product    *product = C->product;
17394222ddf1SHong Zhang 
17404222ddf1SHong Zhang   PetscFunctionBegin;
17414222ddf1SHong Zhang   if (product->type == MATPRODUCT_AB) {
17424222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C);CHKERRQ(ierr);
17436718818eSStefano Zampini   }
17444222ddf1SHong Zhang   PetscFunctionReturn(0);
17454222ddf1SHong Zhang }
17464222ddf1SHong Zhang /* ------------------------------------------------------- */
17474222ddf1SHong Zhang 
1748b9af6bddSHong Zhang PetscErrorCode  MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense)
1749c8db22f6SHong Zhang {
1750c8db22f6SHong Zhang   PetscErrorCode ierr;
17512f5f1f90SHong Zhang   Mat_SeqAIJ     *b       = (Mat_SeqAIJ*)B->data;
17522f5f1f90SHong Zhang   Mat_SeqDense   *btdense = (Mat_SeqDense*)Btdense->data;
17532f5f1f90SHong Zhang   PetscInt       *bi      = b->i,*bj=b->j;
17542f5f1f90SHong Zhang   PetscInt       m        = Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns;
17552f5f1f90SHong Zhang   MatScalar      *btval,*btval_den,*ba=b->a;
17562f5f1f90SHong Zhang   PetscInt       *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors;
1757c8db22f6SHong Zhang 
1758c8db22f6SHong Zhang   PetscFunctionBegin;
17592f5f1f90SHong Zhang   btval_den=btdense->v;
1760580bdb30SBarry Smith   ierr     = PetscArrayzero(btval_den,m*n);CHKERRQ(ierr);
17612f5f1f90SHong Zhang   for (k=0; k<ncolors; k++) {
17622f5f1f90SHong Zhang     ncolumns = coloring->ncolumns[k];
17632f5f1f90SHong Zhang     for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */
1764d2853540SHong Zhang       col   = *(columns + colorforcol[k] + l);
17652f5f1f90SHong Zhang       btcol = bj + bi[col];
17662f5f1f90SHong Zhang       btval = ba + bi[col];
17672f5f1f90SHong Zhang       anz   = bi[col+1] - bi[col];
1768d2853540SHong Zhang       for (j=0; j<anz; j++) {
17692f5f1f90SHong Zhang         brow            = btcol[j];
17702f5f1f90SHong Zhang         btval_den[brow] = btval[j];
1771c8db22f6SHong Zhang       }
1772c8db22f6SHong Zhang     }
17732f5f1f90SHong Zhang     btval_den += m;
1774c8db22f6SHong Zhang   }
1775c8db22f6SHong Zhang   PetscFunctionReturn(0);
1776c8db22f6SHong Zhang }
1777c8db22f6SHong Zhang 
1778b9af6bddSHong Zhang PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp)
17798972f759SHong Zhang {
17808972f759SHong Zhang   PetscErrorCode    ierr;
1781b2d2b04fSHong Zhang   Mat_SeqAIJ        *csp = (Mat_SeqAIJ*)Csp->data;
17821683a169SBarry Smith   const PetscScalar *ca_den,*ca_den_ptr;
17831683a169SBarry Smith   PetscScalar       *ca=csp->a;
1784f99a636bSHong Zhang   PetscInt          k,l,m=Cden->rmap->n,ncolors=matcoloring->ncolors;
1785e88777f2SHong Zhang   PetscInt          brows=matcoloring->brows,*den2sp=matcoloring->den2sp;
1786077f23c2SHong Zhang   PetscInt          nrows,*row,*idx;
1787077f23c2SHong Zhang   PetscInt          *rows=matcoloring->rows,*colorforrow=matcoloring->colorforrow;
17888972f759SHong Zhang 
17898972f759SHong Zhang   PetscFunctionBegin;
17901683a169SBarry Smith   ierr   = MatDenseGetArrayRead(Cden,&ca_den);CHKERRQ(ierr);
1791f99a636bSHong Zhang 
1792077f23c2SHong Zhang   if (brows > 0) {
1793077f23c2SHong Zhang     PetscInt *lstart,row_end,row_start;
1794980ae229SHong Zhang     lstart = matcoloring->lstart;
1795580bdb30SBarry Smith     ierr = PetscArrayzero(lstart,ncolors);CHKERRQ(ierr);
1796eeb4fd02SHong Zhang 
1797077f23c2SHong Zhang     row_end = brows;
1798eeb4fd02SHong Zhang     if (row_end > m) row_end = m;
1799077f23c2SHong Zhang     for (row_start=0; row_start<m; row_start+=brows) { /* loop over row blocks of Csp */
1800077f23c2SHong Zhang       ca_den_ptr = ca_den;
1801980ae229SHong Zhang       for (k=0; k<ncolors; k++) { /* loop over colors (columns of Cden) */
1802eeb4fd02SHong Zhang         nrows = matcoloring->nrows[k];
1803eeb4fd02SHong Zhang         row   = rows  + colorforrow[k];
1804eeb4fd02SHong Zhang         idx   = den2sp + colorforrow[k];
1805eeb4fd02SHong Zhang         for (l=lstart[k]; l<nrows; l++) {
1806eeb4fd02SHong Zhang           if (row[l] >= row_end) {
1807eeb4fd02SHong Zhang             lstart[k] = l;
1808eeb4fd02SHong Zhang             break;
1809eeb4fd02SHong Zhang           } else {
1810077f23c2SHong Zhang             ca[idx[l]] = ca_den_ptr[row[l]];
1811eeb4fd02SHong Zhang           }
1812eeb4fd02SHong Zhang         }
1813077f23c2SHong Zhang         ca_den_ptr += m;
1814eeb4fd02SHong Zhang       }
1815077f23c2SHong Zhang       row_end += brows;
1816eeb4fd02SHong Zhang       if (row_end > m) row_end = m;
1817eeb4fd02SHong Zhang     }
1818077f23c2SHong Zhang   } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */
1819077f23c2SHong Zhang     ca_den_ptr = ca_den;
1820077f23c2SHong Zhang     for (k=0; k<ncolors; k++) {
1821077f23c2SHong Zhang       nrows = matcoloring->nrows[k];
1822077f23c2SHong Zhang       row   = rows  + colorforrow[k];
1823077f23c2SHong Zhang       idx   = den2sp + colorforrow[k];
1824077f23c2SHong Zhang       for (l=0; l<nrows; l++) {
1825077f23c2SHong Zhang         ca[idx[l]] = ca_den_ptr[row[l]];
1826077f23c2SHong Zhang       }
1827077f23c2SHong Zhang       ca_den_ptr += m;
1828077f23c2SHong Zhang     }
1829f99a636bSHong Zhang   }
1830f99a636bSHong Zhang 
18311683a169SBarry Smith   ierr = MatDenseRestoreArrayRead(Cden,&ca_den);CHKERRQ(ierr);
1832f99a636bSHong Zhang #if defined(PETSC_USE_INFO)
1833077f23c2SHong Zhang   if (matcoloring->brows > 0) {
1834f99a636bSHong Zhang     ierr = PetscInfo1(Csp,"Loop over %D row blocks for den2sp\n",brows);CHKERRQ(ierr);
1835e88777f2SHong Zhang   } else {
1836077f23c2SHong Zhang     ierr = PetscInfo(Csp,"Loop over colors/columns of Cden, inefficient for large sparse matrix product \n");CHKERRQ(ierr);
1837e88777f2SHong Zhang   }
1838f99a636bSHong Zhang #endif
18398972f759SHong Zhang   PetscFunctionReturn(0);
18408972f759SHong Zhang }
18418972f759SHong Zhang 
1842b9af6bddSHong Zhang PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c)
1843b1683b59SHong Zhang {
1844b1683b59SHong Zhang   PetscErrorCode ierr;
1845e88777f2SHong Zhang   PetscInt       i,n,nrows,Nbs,j,k,m,ncols,col,cm;
18461a83f524SJed Brown   const PetscInt *is,*ci,*cj,*row_idx;
1847b28d80bdSHong Zhang   PetscInt       nis = iscoloring->n,*rowhit,bs = 1;
1848b1683b59SHong Zhang   IS             *isa;
1849b28d80bdSHong Zhang   Mat_SeqAIJ     *csp = (Mat_SeqAIJ*)mat->data;
1850e88777f2SHong Zhang   PetscInt       *colorforrow,*rows,*rows_i,*idxhit,*spidx,*den2sp,*den2sp_i;
1851e88777f2SHong Zhang   PetscInt       *colorforcol,*columns,*columns_i,brows;
1852e88777f2SHong Zhang   PetscBool      flg;
1853b1683b59SHong Zhang 
1854b1683b59SHong Zhang   PetscFunctionBegin;
1855071fcb05SBarry Smith   ierr = ISColoringGetIS(iscoloring,PETSC_USE_POINTER,PETSC_IGNORE,&isa);CHKERRQ(ierr);
1856e99be685SHong Zhang 
18577ecccc15SHong Zhang   /* bs >1 is not being tested yet! */
1858e88777f2SHong Zhang   Nbs       = mat->cmap->N/bs;
1859b1683b59SHong Zhang   c->M      = mat->rmap->N/bs;  /* set total rows, columns and local rows */
1860e88777f2SHong Zhang   c->N      = Nbs;
1861e88777f2SHong Zhang   c->m      = c->M;
1862b1683b59SHong Zhang   c->rstart = 0;
1863e88777f2SHong Zhang   c->brows  = 100;
1864b1683b59SHong Zhang 
1865b1683b59SHong Zhang   c->ncolors = nis;
1866dcca6d9dSJed Brown   ierr = PetscMalloc3(nis,&c->ncolumns,nis,&c->nrows,nis+1,&colorforrow);CHKERRQ(ierr);
1867854ce69bSBarry Smith   ierr = PetscMalloc1(csp->nz+1,&rows);CHKERRQ(ierr);
1868854ce69bSBarry Smith   ierr = PetscMalloc1(csp->nz+1,&den2sp);CHKERRQ(ierr);
1869e88777f2SHong Zhang 
1870e88777f2SHong Zhang   brows = c->brows;
1871c5929fdfSBarry Smith   ierr = PetscOptionsGetInt(NULL,NULL,"-matden2sp_brows",&brows,&flg);CHKERRQ(ierr);
1872e88777f2SHong Zhang   if (flg) c->brows = brows;
1873eeb4fd02SHong Zhang   if (brows > 0) {
1874854ce69bSBarry Smith     ierr = PetscMalloc1(nis+1,&c->lstart);CHKERRQ(ierr);
1875e88777f2SHong Zhang   }
18762205254eSKarl Rupp 
1877d2853540SHong Zhang   colorforrow[0] = 0;
1878d2853540SHong Zhang   rows_i         = rows;
1879f99a636bSHong Zhang   den2sp_i       = den2sp;
1880b1683b59SHong Zhang 
1881854ce69bSBarry Smith   ierr = PetscMalloc1(nis+1,&colorforcol);CHKERRQ(ierr);
1882854ce69bSBarry Smith   ierr = PetscMalloc1(Nbs+1,&columns);CHKERRQ(ierr);
18832205254eSKarl Rupp 
1884d2853540SHong Zhang   colorforcol[0] = 0;
1885d2853540SHong Zhang   columns_i      = columns;
1886d2853540SHong Zhang 
1887077f23c2SHong Zhang   /* get column-wise storage of mat */
1888077f23c2SHong Zhang   ierr = MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr);
1889b1683b59SHong Zhang 
1890b28d80bdSHong Zhang   cm   = c->m;
1891854ce69bSBarry Smith   ierr = PetscMalloc1(cm+1,&rowhit);CHKERRQ(ierr);
1892854ce69bSBarry Smith   ierr = PetscMalloc1(cm+1,&idxhit);CHKERRQ(ierr);
1893f99a636bSHong Zhang   for (i=0; i<nis; i++) { /* loop over color */
1894b1683b59SHong Zhang     ierr = ISGetLocalSize(isa[i],&n);CHKERRQ(ierr);
1895b1683b59SHong Zhang     ierr = ISGetIndices(isa[i],&is);CHKERRQ(ierr);
18962205254eSKarl Rupp 
1897b1683b59SHong Zhang     c->ncolumns[i] = n;
1898b1683b59SHong Zhang     if (n) {
1899580bdb30SBarry Smith       ierr = PetscArraycpy(columns_i,is,n);CHKERRQ(ierr);
1900b1683b59SHong Zhang     }
1901d2853540SHong Zhang     colorforcol[i+1] = colorforcol[i] + n;
1902d2853540SHong Zhang     columns_i       += n;
1903b1683b59SHong Zhang 
1904b1683b59SHong Zhang     /* fast, crude version requires O(N*N) work */
1905580bdb30SBarry Smith     ierr = PetscArrayzero(rowhit,cm);CHKERRQ(ierr);
1906e99be685SHong Zhang 
1907b7caf3d6SHong Zhang     for (j=0; j<n; j++) { /* loop over columns*/
1908b1683b59SHong Zhang       col     = is[j];
1909d2853540SHong Zhang       row_idx = cj + ci[col];
1910b1683b59SHong Zhang       m       = ci[col+1] - ci[col];
1911b7caf3d6SHong Zhang       for (k=0; k<m; k++) { /* loop over columns marking them in rowhit */
1912e99be685SHong Zhang         idxhit[*row_idx]   = spidx[ci[col] + k];
1913d2853540SHong Zhang         rowhit[*row_idx++] = col + 1;
1914b1683b59SHong Zhang       }
1915b1683b59SHong Zhang     }
1916b1683b59SHong Zhang     /* count the number of hits */
1917b1683b59SHong Zhang     nrows = 0;
1918e8354b3eSHong Zhang     for (j=0; j<cm; j++) {
1919b1683b59SHong Zhang       if (rowhit[j]) nrows++;
1920b1683b59SHong Zhang     }
1921b1683b59SHong Zhang     c->nrows[i]      = nrows;
1922d2853540SHong Zhang     colorforrow[i+1] = colorforrow[i] + nrows;
1923d2853540SHong Zhang 
1924b1683b59SHong Zhang     nrows = 0;
1925b7caf3d6SHong Zhang     for (j=0; j<cm; j++) { /* loop over rows */
1926b1683b59SHong Zhang       if (rowhit[j]) {
1927d2853540SHong Zhang         rows_i[nrows]   = j;
192812b89a43SHong Zhang         den2sp_i[nrows] = idxhit[j];
1929b1683b59SHong Zhang         nrows++;
1930b1683b59SHong Zhang       }
1931b1683b59SHong Zhang     }
1932e88777f2SHong Zhang     den2sp_i += nrows;
1933077f23c2SHong Zhang 
1934b1683b59SHong Zhang     ierr    = ISRestoreIndices(isa[i],&is);CHKERRQ(ierr);
1935f99a636bSHong Zhang     rows_i += nrows;
1936b1683b59SHong Zhang   }
19370298fd71SBarry Smith   ierr = MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr);
1938b28d80bdSHong Zhang   ierr = PetscFree(rowhit);CHKERRQ(ierr);
1939071fcb05SBarry Smith   ierr = ISColoringRestoreIS(iscoloring,PETSC_USE_POINTER,&isa);CHKERRQ(ierr);
1940d2853540SHong Zhang   if (csp->nz != colorforrow[nis]) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"csp->nz %d != colorforrow[nis] %d",csp->nz,colorforrow[nis]);
1941b28d80bdSHong Zhang 
1942d2853540SHong Zhang   c->colorforrow = colorforrow;
1943d2853540SHong Zhang   c->rows        = rows;
1944f99a636bSHong Zhang   c->den2sp      = den2sp;
1945d2853540SHong Zhang   c->colorforcol = colorforcol;
1946d2853540SHong Zhang   c->columns     = columns;
19472205254eSKarl Rupp 
1948f94ccd6cSHong Zhang   ierr = PetscFree(idxhit);CHKERRQ(ierr);
1949b1683b59SHong Zhang   PetscFunctionReturn(0);
1950b1683b59SHong Zhang }
1951d013fc79Sandi selinger 
19524222ddf1SHong Zhang /* --------------------------------------------------------------- */
19534222ddf1SHong Zhang static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C)
1954df97dc6dSFande Kong {
19554222ddf1SHong Zhang   PetscErrorCode ierr;
19564222ddf1SHong Zhang   Mat_Product    *product = C->product;
19574222ddf1SHong Zhang   Mat            A=product->A,B=product->B;
19584222ddf1SHong Zhang 
1959df97dc6dSFande Kong   PetscFunctionBegin;
19604222ddf1SHong Zhang   if (C->ops->mattransposemultnumeric) {
19614222ddf1SHong Zhang     /* Alg: "outerproduct" */
19626718818eSStefano Zampini     ierr = (*C->ops->mattransposemultnumeric)(A,B,C);CHKERRQ(ierr);
19634222ddf1SHong Zhang   } else {
19644222ddf1SHong Zhang     /* Alg: "matmatmult" -- C = At*B */
19656718818eSStefano Zampini     Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)product->data;
19666718818eSStefano Zampini     Mat                 At;
19674222ddf1SHong Zhang 
19686718818eSStefano Zampini     if (!atb) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
19696718818eSStefano Zampini     At = atb->At;
1970089a957eSStefano Zampini     if (atb->updateAt && At) { /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ() */
19714222ddf1SHong Zhang       ierr = MatTranspose_SeqAIJ(A,MAT_REUSE_MATRIX,&At);CHKERRQ(ierr);
19724222ddf1SHong Zhang     }
1973089a957eSStefano Zampini     ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ(At ? At : A,B,C);CHKERRQ(ierr);
19744222ddf1SHong Zhang     atb->updateAt = PETSC_TRUE;
19754222ddf1SHong Zhang   }
1976df97dc6dSFande Kong   PetscFunctionReturn(0);
1977df97dc6dSFande Kong }
1978df97dc6dSFande Kong 
19794222ddf1SHong Zhang static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C)
1980d013fc79Sandi selinger {
1981d013fc79Sandi selinger   PetscErrorCode ierr;
19824222ddf1SHong Zhang   Mat_Product    *product = C->product;
19834222ddf1SHong Zhang   Mat            A=product->A,B=product->B;
19844222ddf1SHong Zhang   PetscReal      fill=product->fill;
1985d013fc79Sandi selinger 
1986d013fc79Sandi selinger   PetscFunctionBegin;
19874222ddf1SHong Zhang   ierr = MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
19882869b61bSandi selinger 
19894222ddf1SHong Zhang   C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ;
19904222ddf1SHong Zhang   PetscFunctionReturn(0);
19912869b61bSandi selinger }
1992d013fc79Sandi selinger 
19934222ddf1SHong Zhang /* --------------------------------------------------------------- */
19944222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C)
19954222ddf1SHong Zhang {
19964222ddf1SHong Zhang   PetscErrorCode ierr;
19974222ddf1SHong Zhang   Mat_Product    *product = C->product;
19984222ddf1SHong Zhang   PetscInt       alg = 0; /* default algorithm */
19994222ddf1SHong Zhang   PetscBool      flg = PETSC_FALSE;
20004222ddf1SHong Zhang #if !defined(PETSC_HAVE_HYPRE)
20014222ddf1SHong Zhang   const char     *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"};
20024222ddf1SHong Zhang   PetscInt       nalg = 7;
20034222ddf1SHong Zhang #else
20044222ddf1SHong Zhang   const char     *algTypes[8] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge","hypre"};
20054222ddf1SHong Zhang   PetscInt       nalg = 8;
20064222ddf1SHong Zhang #endif
20074222ddf1SHong Zhang 
20084222ddf1SHong Zhang   PetscFunctionBegin;
20094222ddf1SHong Zhang   /* Set default algorithm */
20104222ddf1SHong Zhang   ierr = PetscStrcmp(C->product->alg,"default",&flg);CHKERRQ(ierr);
20114222ddf1SHong Zhang   if (flg) {
20124222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
2013d013fc79Sandi selinger   }
2014d013fc79Sandi selinger 
20154222ddf1SHong Zhang   /* Get runtime option */
20164222ddf1SHong Zhang   if (product->api_user) {
20174222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
20184222ddf1SHong Zhang     ierr = PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
20194222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
20204222ddf1SHong Zhang   } else {
20214222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr);
20224222ddf1SHong Zhang     ierr = PetscOptionsEList("-matproduct_ab_via","Algorithmic approach","MatProduct_AB",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
20234222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
2024d013fc79Sandi selinger   }
20254222ddf1SHong Zhang   if (flg) {
20264222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
2027d013fc79Sandi selinger   }
2028d013fc79Sandi selinger 
20294222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
20304222ddf1SHong Zhang   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ;
20314222ddf1SHong Zhang   PetscFunctionReturn(0);
20324222ddf1SHong Zhang }
2033d013fc79Sandi selinger 
20344222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C)
20354222ddf1SHong Zhang {
20364222ddf1SHong Zhang   PetscErrorCode ierr;
20374222ddf1SHong Zhang   Mat_Product    *product = C->product;
20384222ddf1SHong Zhang   PetscInt       alg = 0; /* default algorithm */
20394222ddf1SHong Zhang   PetscBool      flg = PETSC_FALSE;
2040089a957eSStefano Zampini   const char     *algTypes[3] = {"default","at*b","outerproduct"};
2041089a957eSStefano Zampini   PetscInt       nalg = 3;
2042d013fc79Sandi selinger 
20434222ddf1SHong Zhang   PetscFunctionBegin;
20444222ddf1SHong Zhang   /* Get runtime option */
20454222ddf1SHong Zhang   if (product->api_user) {
20464222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
20474222ddf1SHong Zhang     ierr = PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
20484222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
20494222ddf1SHong Zhang   } else {
20504222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr);
20514222ddf1SHong Zhang     ierr = PetscOptionsEList("-matproduct_atb_via","Algorithmic approach","MatProduct_AtB",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
20524222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
20534222ddf1SHong Zhang   }
20544222ddf1SHong Zhang   if (flg) {
20554222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
20564222ddf1SHong Zhang   }
2057d013fc79Sandi selinger 
20584222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ;
20594222ddf1SHong Zhang   PetscFunctionReturn(0);
20604222ddf1SHong Zhang }
20614222ddf1SHong Zhang 
20624222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C)
20634222ddf1SHong Zhang {
20644222ddf1SHong Zhang   PetscErrorCode ierr;
20654222ddf1SHong Zhang   Mat_Product    *product = C->product;
20664222ddf1SHong Zhang   PetscInt       alg = 0; /* default algorithm */
20674222ddf1SHong Zhang   PetscBool      flg = PETSC_FALSE;
20684222ddf1SHong Zhang   const char     *algTypes[2] = {"default","color"};
20694222ddf1SHong Zhang   PetscInt       nalg = 2;
20704222ddf1SHong Zhang 
20714222ddf1SHong Zhang   PetscFunctionBegin;
20724222ddf1SHong Zhang   /* Set default algorithm */
20734222ddf1SHong Zhang   ierr = PetscStrcmp(C->product->alg,"default",&flg);CHKERRQ(ierr);
20744222ddf1SHong Zhang   if (!flg) {
20754222ddf1SHong Zhang     alg = 1;
20764222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
20774222ddf1SHong Zhang   }
20784222ddf1SHong Zhang 
20794222ddf1SHong Zhang   /* Get runtime option */
20804222ddf1SHong Zhang   if (product->api_user) {
20814222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatTransposeMult","Mat");CHKERRQ(ierr);
20824222ddf1SHong Zhang     ierr = PetscOptionsEList("-matmattransmult_via","Algorithmic approach","MatMatTransposeMult",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
20834222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
20844222ddf1SHong Zhang   } else {
20854222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABt","Mat");CHKERRQ(ierr);
20864222ddf1SHong Zhang     ierr = PetscOptionsEList("-matproduct_abt_via","Algorithmic approach","MatProduct_ABt",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
20874222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
20884222ddf1SHong Zhang   }
20894222ddf1SHong Zhang   if (flg) {
20904222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
20914222ddf1SHong Zhang   }
20924222ddf1SHong Zhang 
20934222ddf1SHong Zhang   C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ;
20944222ddf1SHong Zhang   C->ops->productsymbolic          = MatProductSymbolic_ABt;
20954222ddf1SHong Zhang   PetscFunctionReturn(0);
20964222ddf1SHong Zhang }
20974222ddf1SHong Zhang 
20984222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C)
20994222ddf1SHong Zhang {
21004222ddf1SHong Zhang   PetscErrorCode ierr;
21014222ddf1SHong Zhang   Mat_Product    *product = C->product;
21024222ddf1SHong Zhang   PetscBool      flg = PETSC_FALSE;
21034222ddf1SHong Zhang   PetscInt       alg = 0; /* default algorithm -- alg=1 should be default!!! */
21044222ddf1SHong Zhang #if !defined(PETSC_HAVE_HYPRE)
21054222ddf1SHong Zhang   const char      *algTypes[2] = {"scalable","rap"};
21064222ddf1SHong Zhang   PetscInt        nalg = 2;
21074222ddf1SHong Zhang #else
21084222ddf1SHong Zhang   const char      *algTypes[3] = {"scalable","rap","hypre"};
21094222ddf1SHong Zhang   PetscInt        nalg = 3;
21104222ddf1SHong Zhang #endif
21114222ddf1SHong Zhang 
21124222ddf1SHong Zhang   PetscFunctionBegin;
21134222ddf1SHong Zhang   /* Set default algorithm */
21144222ddf1SHong Zhang   ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr);
21154222ddf1SHong Zhang   if (flg) {
21164222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
21174222ddf1SHong Zhang   }
21184222ddf1SHong Zhang 
21194222ddf1SHong Zhang   /* Get runtime option */
21204222ddf1SHong Zhang   if (product->api_user) {
21214222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
21224222ddf1SHong Zhang     ierr = PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
21234222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
21244222ddf1SHong Zhang   } else {
21254222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr);
21264222ddf1SHong Zhang     ierr = PetscOptionsEList("-matproduct_ptap_via","Algorithmic approach","MatProduct_PtAP",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
21274222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
21284222ddf1SHong Zhang   }
21294222ddf1SHong Zhang   if (flg) {
21304222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
21314222ddf1SHong Zhang   }
21324222ddf1SHong Zhang 
21334222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ;
21344222ddf1SHong Zhang   PetscFunctionReturn(0);
21354222ddf1SHong Zhang }
21364222ddf1SHong Zhang 
21374222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C)
21384222ddf1SHong Zhang {
21394222ddf1SHong Zhang   PetscErrorCode ierr;
21404222ddf1SHong Zhang   Mat_Product    *product = C->product;
21414222ddf1SHong Zhang   PetscBool      flg = PETSC_FALSE;
21424222ddf1SHong Zhang   PetscInt       alg = 0; /* default algorithm */
21434222ddf1SHong Zhang   const char     *algTypes[3] = {"r*a*rt","r*art","coloring_rart"};
21444222ddf1SHong Zhang   PetscInt        nalg = 3;
21454222ddf1SHong Zhang 
21464222ddf1SHong Zhang   PetscFunctionBegin;
21474222ddf1SHong Zhang   /* Set default algorithm */
21484222ddf1SHong Zhang   ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr);
21494222ddf1SHong Zhang   if (flg) {
21504222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
21514222ddf1SHong Zhang   }
21524222ddf1SHong Zhang 
21534222ddf1SHong Zhang   /* Get runtime option */
21544222ddf1SHong Zhang   if (product->api_user) {
21554222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatRARt","Mat");CHKERRQ(ierr);
21564222ddf1SHong Zhang     ierr = PetscOptionsEList("-matrart_via","Algorithmic approach","MatRARt",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
21574222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
21584222ddf1SHong Zhang   } else {
21594222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr);
21604222ddf1SHong Zhang     ierr = PetscOptionsEList("-matproduct_rart_via","Algorithmic approach","MatProduct_RARt",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
21614222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
21624222ddf1SHong Zhang   }
21634222ddf1SHong Zhang   if (flg) {
21644222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
21654222ddf1SHong Zhang   }
21664222ddf1SHong Zhang 
21674222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ;
21684222ddf1SHong Zhang   PetscFunctionReturn(0);
21694222ddf1SHong Zhang }
21704222ddf1SHong Zhang 
21714222ddf1SHong Zhang /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */
21724222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C)
21734222ddf1SHong Zhang {
21744222ddf1SHong Zhang   PetscErrorCode ierr;
21754222ddf1SHong Zhang   Mat_Product    *product = C->product;
21764222ddf1SHong Zhang   PetscInt       alg = 0; /* default algorithm */
21774222ddf1SHong Zhang   PetscBool      flg = PETSC_FALSE;
21784222ddf1SHong Zhang   const char     *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"};
21794222ddf1SHong Zhang   PetscInt       nalg = 7;
21804222ddf1SHong Zhang 
21814222ddf1SHong Zhang   PetscFunctionBegin;
21824222ddf1SHong Zhang   /* Set default algorithm */
21834222ddf1SHong Zhang   ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr);
21844222ddf1SHong Zhang   if (flg) {
21854222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
21864222ddf1SHong Zhang   }
21874222ddf1SHong Zhang 
21884222ddf1SHong Zhang   /* Get runtime option */
21894222ddf1SHong Zhang   if (product->api_user) {
21904222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr);
21914222ddf1SHong Zhang     ierr = PetscOptionsEList("-matmatmatmult_via","Algorithmic approach","MatMatMatMult",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
21924222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
21934222ddf1SHong Zhang   } else {
21944222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr);
21954222ddf1SHong Zhang     ierr = PetscOptionsEList("-matproduct_abc_via","Algorithmic approach","MatProduct_ABC",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
21964222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
21974222ddf1SHong Zhang   }
21984222ddf1SHong Zhang   if (flg) {
21994222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
22004222ddf1SHong Zhang   }
22014222ddf1SHong Zhang 
22024222ddf1SHong Zhang   C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ;
22034222ddf1SHong Zhang   C->ops->productsymbolic    = MatProductSymbolic_ABC;
22044222ddf1SHong Zhang   PetscFunctionReturn(0);
22054222ddf1SHong Zhang }
22064222ddf1SHong Zhang 
22074222ddf1SHong Zhang PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C)
22084222ddf1SHong Zhang {
22094222ddf1SHong Zhang   PetscErrorCode ierr;
22104222ddf1SHong Zhang   Mat_Product    *product = C->product;
22114222ddf1SHong Zhang 
22124222ddf1SHong Zhang   PetscFunctionBegin;
22134222ddf1SHong Zhang   switch (product->type) {
22144222ddf1SHong Zhang   case MATPRODUCT_AB:
22154222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_AB(C);CHKERRQ(ierr);
22164222ddf1SHong Zhang     break;
22174222ddf1SHong Zhang   case MATPRODUCT_AtB:
22184222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_AtB(C);CHKERRQ(ierr);
22194222ddf1SHong Zhang     break;
22204222ddf1SHong Zhang   case MATPRODUCT_ABt:
22214222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_ABt(C);CHKERRQ(ierr);
22224222ddf1SHong Zhang     break;
22234222ddf1SHong Zhang   case MATPRODUCT_PtAP:
22244222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_PtAP(C);CHKERRQ(ierr);
22254222ddf1SHong Zhang     break;
22264222ddf1SHong Zhang   case MATPRODUCT_RARt:
22274222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_RARt(C);CHKERRQ(ierr);
22284222ddf1SHong Zhang     break;
22294222ddf1SHong Zhang   case MATPRODUCT_ABC:
22304222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_ABC(C);CHKERRQ(ierr);
22314222ddf1SHong Zhang     break;
22326718818eSStefano Zampini   default:
22336718818eSStefano Zampini     break;
22344222ddf1SHong Zhang   }
2235d013fc79Sandi selinger   PetscFunctionReturn(0);
2236d013fc79Sandi selinger }
2237