xref: /petsc/src/mat/impls/aij/seq/matmatmult.c (revision cbc6b2250e380677eada4bbc58a36dc55ca92067)
1be1d678aSKris Buschelman 
2d50806bdSBarry Smith /*
32499ec78SHong Zhang   Defines matrix-matrix product routines for pairs of SeqAIJ matrices
4d50806bdSBarry Smith           C = A * B
5d50806bdSBarry Smith */
6d50806bdSBarry Smith 
7c6db04a5SJed Brown #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
8c6db04a5SJed Brown #include <../src/mat/utils/freespace.h>
9c6db04a5SJed Brown #include <petscbt.h>
10af0996ceSBarry Smith #include <petsc/private/isimpl.h>
1107475bc1SBarry Smith #include <../src/mat/impls/dense/seq/dense.h>
127bab7c10SHong Zhang 
13df97dc6dSFande Kong PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
14df97dc6dSFande Kong {
15df97dc6dSFande Kong   PetscErrorCode ierr;
16df97dc6dSFande Kong 
17df97dc6dSFande Kong   PetscFunctionBegin;
18df97dc6dSFande Kong   if (C->ops->matmultnumeric) {
192c71b3e2SJacob Faibussowitsch     PetscCheckFalse(C->ops->matmultnumeric == MatMatMultNumeric_SeqAIJ_SeqAIJ,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Recursive call");
20df97dc6dSFande Kong     ierr = (*C->ops->matmultnumeric)(A,B,C);CHKERRQ(ierr);
21df97dc6dSFande Kong   } else {
22df97dc6dSFande Kong     ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A,B,C);CHKERRQ(ierr);
23df97dc6dSFande Kong   }
24df97dc6dSFande Kong   PetscFunctionReturn(0);
25df97dc6dSFande Kong }
26df97dc6dSFande Kong 
274222ddf1SHong Zhang /* Modified from MatCreateSeqAIJWithArrays() */
28e4e71118SStefano Zampini PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt i[],PetscInt j[],PetscScalar a[],MatType mtype,Mat mat)
29df97dc6dSFande Kong {
30df97dc6dSFande Kong   PetscErrorCode ierr;
314222ddf1SHong Zhang   PetscInt       ii;
324222ddf1SHong Zhang   Mat_SeqAIJ     *aij;
33*cbc6b225SStefano Zampini   PetscBool      isseqaij, osingle, ofree_a, ofree_ij;
345c66b693SKris Buschelman 
355c66b693SKris Buschelman   PetscFunctionBegin;
362c71b3e2SJacob Faibussowitsch   PetscCheckFalse(m > 0 && i[0],PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"i (row indices) must start with 0");
374222ddf1SHong Zhang   ierr = MatSetSizes(mat,m,n,m,n);CHKERRQ(ierr);
384222ddf1SHong Zhang 
39e4e71118SStefano Zampini   if (!mtype) {
40e4e71118SStefano Zampini     ierr = PetscObjectBaseTypeCompare((PetscObject)mat,MATSEQAIJ,&isseqaij);CHKERRQ(ierr);
41e4e71118SStefano Zampini     if (!isseqaij) { ierr = MatSetType(mat,MATSEQAIJ);CHKERRQ(ierr); }
42e4e71118SStefano Zampini   } else {
43e4e71118SStefano Zampini     ierr = MatSetType(mat,mtype);CHKERRQ(ierr);
44e4e71118SStefano Zampini   }
45*cbc6b225SStefano Zampini 
464222ddf1SHong Zhang   aij  = (Mat_SeqAIJ*)(mat)->data;
47*cbc6b225SStefano Zampini   osingle = aij->singlemalloc;
48*cbc6b225SStefano Zampini   ofree_a = aij->free_a;
49*cbc6b225SStefano Zampini   ofree_ij = aij->free_ij;
50*cbc6b225SStefano Zampini   /* changes the free flags */
51*cbc6b225SStefano Zampini   ierr = MatSeqAIJSetPreallocation_SeqAIJ(mat,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
52*cbc6b225SStefano Zampini 
53*cbc6b225SStefano Zampini   ierr = PetscFree(aij->ilen);CHKERRQ(ierr);
54*cbc6b225SStefano Zampini   ierr = PetscFree(aij->imax);CHKERRQ(ierr);
554222ddf1SHong Zhang   ierr = PetscMalloc1(m,&aij->imax);CHKERRQ(ierr);
564222ddf1SHong Zhang   ierr = PetscMalloc1(m,&aij->ilen);CHKERRQ(ierr);
57*cbc6b225SStefano Zampini   for (ii=0,aij->nonzerorowcnt=0,aij->rmax = 0; ii<m; ii++) {
58*cbc6b225SStefano Zampini     const PetscInt rnz = i[ii+1] - i[ii];
59*cbc6b225SStefano Zampini     aij->nonzerorowcnt += !!rnz;
60*cbc6b225SStefano Zampini     aij->rmax = PetscMax(aij->rmax,rnz);
61*cbc6b225SStefano Zampini     aij->ilen[ii] = aij->imax[ii] = i[ii+1] - i[ii];
62*cbc6b225SStefano Zampini   }
63*cbc6b225SStefano Zampini   aij->maxnz = i[m];
64*cbc6b225SStefano Zampini   aij->nz = i[m];
654222ddf1SHong Zhang 
66*cbc6b225SStefano Zampini   if (osingle) {
67*cbc6b225SStefano Zampini     ierr = PetscFree3(aij->a,aij->j,aij->i);CHKERRQ(ierr);
68*cbc6b225SStefano Zampini   } else {
69*cbc6b225SStefano Zampini     if (ofree_a)  { ierr = PetscFree(aij->a);CHKERRQ(ierr); }
70*cbc6b225SStefano Zampini     if (ofree_ij) { ierr = PetscFree(aij->j);CHKERRQ(ierr); }
71*cbc6b225SStefano Zampini     if (ofree_ij) { ierr = PetscFree(aij->i);CHKERRQ(ierr); }
72*cbc6b225SStefano Zampini   }
734222ddf1SHong Zhang   aij->i            = i;
744222ddf1SHong Zhang   aij->j            = j;
754222ddf1SHong Zhang   aij->a            = a;
764222ddf1SHong Zhang   aij->nonew        = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */
77*cbc6b225SStefano Zampini   /* default to not retain ownership */
78*cbc6b225SStefano Zampini   aij->singlemalloc = PETSC_FALSE;
794222ddf1SHong Zhang   aij->free_a       = PETSC_FALSE;
804222ddf1SHong Zhang   aij->free_ij      = PETSC_FALSE;
81*cbc6b225SStefano Zampini   ierr = MatCheckCompressedRow(mat,aij->nonzerorowcnt,&aij->compressedrow,aij->i,m,0.6);CHKERRQ(ierr);
825c66b693SKris Buschelman   PetscFunctionReturn(0);
835c66b693SKris Buschelman }
841c24bd37SHong Zhang 
854222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
864222ddf1SHong Zhang {
874222ddf1SHong Zhang   PetscErrorCode      ierr;
884222ddf1SHong Zhang   Mat_Product         *product = C->product;
894222ddf1SHong Zhang   MatProductAlgorithm alg;
904222ddf1SHong Zhang   PetscBool           flg;
914222ddf1SHong Zhang 
924222ddf1SHong Zhang   PetscFunctionBegin;
934222ddf1SHong Zhang   if (product) {
944222ddf1SHong Zhang     alg = product->alg;
954222ddf1SHong Zhang   } else {
964222ddf1SHong Zhang     alg = "sorted";
974222ddf1SHong Zhang   }
984222ddf1SHong Zhang   /* sorted */
994222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"sorted",&flg);CHKERRQ(ierr);
1004222ddf1SHong Zhang   if (flg) {
1014222ddf1SHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A,B,fill,C);CHKERRQ(ierr);
1024222ddf1SHong Zhang     PetscFunctionReturn(0);
1034222ddf1SHong Zhang   }
1044222ddf1SHong Zhang 
1054222ddf1SHong Zhang   /* scalable */
1064222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"scalable",&flg);CHKERRQ(ierr);
1074222ddf1SHong Zhang   if (flg) {
1084222ddf1SHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A,B,fill,C);CHKERRQ(ierr);
1094222ddf1SHong Zhang     PetscFunctionReturn(0);
1104222ddf1SHong Zhang   }
1114222ddf1SHong Zhang 
1124222ddf1SHong Zhang   /* scalable_fast */
1134222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"scalable_fast",&flg);CHKERRQ(ierr);
1144222ddf1SHong Zhang   if (flg) {
1154222ddf1SHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A,B,fill,C);CHKERRQ(ierr);
1164222ddf1SHong Zhang     PetscFunctionReturn(0);
1174222ddf1SHong Zhang   }
1184222ddf1SHong Zhang 
1194222ddf1SHong Zhang   /* heap */
1204222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"heap",&flg);CHKERRQ(ierr);
1214222ddf1SHong Zhang   if (flg) {
1224222ddf1SHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A,B,fill,C);CHKERRQ(ierr);
1234222ddf1SHong Zhang     PetscFunctionReturn(0);
1244222ddf1SHong Zhang   }
1254222ddf1SHong Zhang 
1264222ddf1SHong Zhang   /* btheap */
1274222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"btheap",&flg);CHKERRQ(ierr);
1284222ddf1SHong Zhang   if (flg) {
1294222ddf1SHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A,B,fill,C);CHKERRQ(ierr);
1304222ddf1SHong Zhang     PetscFunctionReturn(0);
1314222ddf1SHong Zhang   }
1324222ddf1SHong Zhang 
1334222ddf1SHong Zhang   /* llcondensed */
1344222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"llcondensed",&flg);CHKERRQ(ierr);
1354222ddf1SHong Zhang   if (flg) {
1364222ddf1SHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A,B,fill,C);CHKERRQ(ierr);
1374222ddf1SHong Zhang     PetscFunctionReturn(0);
1384222ddf1SHong Zhang   }
1394222ddf1SHong Zhang 
1404222ddf1SHong Zhang   /* rowmerge */
1414222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"rowmerge",&flg);CHKERRQ(ierr);
1424222ddf1SHong Zhang   if (flg) {
1434222ddf1SHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A,B,fill,C);CHKERRQ(ierr);
1444222ddf1SHong Zhang     PetscFunctionReturn(0);
1454222ddf1SHong Zhang   }
1464222ddf1SHong Zhang 
1474222ddf1SHong Zhang #if defined(PETSC_HAVE_HYPRE)
1484222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"hypre",&flg);CHKERRQ(ierr);
1494222ddf1SHong Zhang   if (flg) {
1504222ddf1SHong Zhang     ierr = MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A,B,fill,C);CHKERRQ(ierr);
1514222ddf1SHong Zhang     PetscFunctionReturn(0);
1524222ddf1SHong Zhang   }
1534222ddf1SHong Zhang #endif
1544222ddf1SHong Zhang 
1554222ddf1SHong Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported");
1564222ddf1SHong Zhang }
1574222ddf1SHong Zhang 
1584222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A,Mat B,PetscReal fill,Mat C)
159b561aa9dSHong Zhang {
160b561aa9dSHong Zhang   PetscErrorCode     ierr;
161b561aa9dSHong Zhang   Mat_SeqAIJ         *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
162971236abSHong Zhang   PetscInt           *ai=a->i,*bi=b->i,*ci,*cj;
163c1ba5575SJed Brown   PetscInt           am =A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
164b561aa9dSHong Zhang   PetscReal          afill;
165eca6b86aSHong Zhang   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
166eca6b86aSHong Zhang   PetscTable         ta;
167fb908031SHong Zhang   PetscBT            lnkbt;
1680298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
169b561aa9dSHong Zhang 
170b561aa9dSHong Zhang   PetscFunctionBegin;
171bd958071SHong Zhang   /* Get ci and cj */
172bd958071SHong Zhang   /*---------------*/
173fb908031SHong Zhang   /* Allocate ci array, arrays for fill computation and */
174fb908031SHong Zhang   /* free space for accumulating nonzero column info */
175854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
176fb908031SHong Zhang   ci[0] = 0;
177fb908031SHong Zhang 
178fb908031SHong Zhang   /* create and initialize a linked list */
179c373ccc6SHong Zhang   ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr);
180c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b,bm,ta);
181eca6b86aSHong Zhang   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
182eca6b86aSHong Zhang   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
183eca6b86aSHong Zhang 
184eca6b86aSHong Zhang   ierr = PetscLLCondensedCreate(Crmax,bn,&lnk,&lnkbt);CHKERRQ(ierr);
185fb908031SHong Zhang 
186fb908031SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
187f91af8c7SBarry Smith   ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
1882205254eSKarl Rupp 
189fb908031SHong Zhang   current_space = free_space;
190fb908031SHong Zhang 
191fb908031SHong Zhang   /* Determine ci and cj */
192fb908031SHong Zhang   for (i=0; i<am; i++) {
193fb908031SHong Zhang     anzi = ai[i+1] - ai[i];
194fb908031SHong Zhang     aj   = a->j + ai[i];
195fb908031SHong Zhang     for (j=0; j<anzi; j++) {
196fb908031SHong Zhang       brow = aj[j];
197fb908031SHong Zhang       bnzj = bi[brow+1] - bi[brow];
198fb908031SHong Zhang       bj   = b->j + bi[brow];
199fb908031SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
200fb908031SHong Zhang       ierr = PetscLLCondensedAddSorted(bnzj,bj,lnk,lnkbt);CHKERRQ(ierr);
201fb908031SHong Zhang     }
2028c78258cSHong Zhang     /* add possible missing diagonal entry */
2038c78258cSHong Zhang     if (C->force_diagonals) {
2048c78258cSHong Zhang       ierr = PetscLLCondensedAddSorted(1,&i,lnk,lnkbt);CHKERRQ(ierr);
2058c78258cSHong Zhang     }
206fb908031SHong Zhang     cnzi = lnk[0];
207fb908031SHong Zhang 
208fb908031SHong Zhang     /* If free space is not available, make more free space */
209fb908031SHong Zhang     /* Double the amount of total space in the list */
210fb908031SHong Zhang     if (current_space->local_remaining<cnzi) {
211f91af8c7SBarry Smith       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);CHKERRQ(ierr);
212fb908031SHong Zhang       ndouble++;
213fb908031SHong Zhang     }
214fb908031SHong Zhang 
215fb908031SHong Zhang     /* Copy data into free space, then initialize lnk */
216fb908031SHong Zhang     ierr = PetscLLCondensedClean(bn,cnzi,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
2172205254eSKarl Rupp 
218fb908031SHong Zhang     current_space->array           += cnzi;
219fb908031SHong Zhang     current_space->local_used      += cnzi;
220fb908031SHong Zhang     current_space->local_remaining -= cnzi;
2212205254eSKarl Rupp 
222fb908031SHong Zhang     ci[i+1] = ci[i] + cnzi;
223fb908031SHong Zhang   }
224fb908031SHong Zhang 
225fb908031SHong Zhang   /* Column indices are in the list of free space */
226fb908031SHong Zhang   /* Allocate space for cj, initialize cj, and */
227fb908031SHong Zhang   /* destroy list of free space and other temporary array(s) */
228854ce69bSBarry Smith   ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr);
229fb908031SHong Zhang   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
230fb908031SHong Zhang   ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr);
231b561aa9dSHong Zhang 
23226be0446SHong Zhang   /* put together the new symbolic matrix */
233e4e71118SStefano Zampini   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
2344222ddf1SHong Zhang   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
23558c24d83SHong Zhang 
23658c24d83SHong Zhang   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
23758c24d83SHong Zhang   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
2384222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
239aa1aec99SHong Zhang   c->free_a  = PETSC_FALSE;
240e6b907acSBarry Smith   c->free_ij = PETSC_TRUE;
24158c24d83SHong Zhang   c->nonew   = 0;
2424222ddf1SHong Zhang 
2434222ddf1SHong Zhang   /* fast, needs non-scalable O(bn) array 'abdense' */
2444222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
2450b7e3e3dSHong Zhang 
2467212da7cSHong Zhang   /* set MatInfo */
2477212da7cSHong Zhang   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
248f2b054eeSHong Zhang   if (afill < 1.0) afill = 1.0;
2494222ddf1SHong Zhang   C->info.mallocs           = ndouble;
2504222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
2514222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
2527212da7cSHong Zhang 
2537212da7cSHong Zhang #if defined(PETSC_USE_INFO)
2547212da7cSHong Zhang   if (ci[am]) {
2557d3de750SJacob Faibussowitsch     ierr = PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
2567d3de750SJacob Faibussowitsch     ierr = PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
257f2b054eeSHong Zhang   } else {
2584222ddf1SHong Zhang     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
259be0fcf8dSHong Zhang   }
260f2b054eeSHong Zhang #endif
26158c24d83SHong Zhang   PetscFunctionReturn(0);
26258c24d83SHong Zhang }
263d50806bdSBarry Smith 
264df97dc6dSFande Kong PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,Mat C)
265d50806bdSBarry Smith {
266dfbe8321SBarry Smith   PetscErrorCode    ierr;
267d13dce4bSSatish Balay   PetscLogDouble    flops=0.0;
268d50806bdSBarry Smith   Mat_SeqAIJ        *a   = (Mat_SeqAIJ*)A->data;
269d50806bdSBarry Smith   Mat_SeqAIJ        *b   = (Mat_SeqAIJ*)B->data;
270d50806bdSBarry Smith   Mat_SeqAIJ        *c   = (Mat_SeqAIJ*)C->data;
27138baddfdSBarry Smith   PetscInt          *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
272c58ca2e3SHong Zhang   PetscInt          am   =A->rmap->n,cm=C->rmap->n;
273519eb980SHong Zhang   PetscInt          i,j,k,anzi,bnzi,cnzi,brow;
2742e5835c6SStefano Zampini   PetscScalar       *ca,valtmp;
275aa1aec99SHong Zhang   PetscScalar       *ab_dense;
2766718818eSStefano Zampini   PetscContainer    cab_dense;
2772e5835c6SStefano Zampini   const PetscScalar *aa,*ba,*baj;
278d50806bdSBarry Smith 
279d50806bdSBarry Smith   PetscFunctionBegin;
2802e5835c6SStefano Zampini   ierr = MatSeqAIJGetArrayRead(A,&aa);CHKERRQ(ierr);
2812e5835c6SStefano Zampini   ierr = MatSeqAIJGetArrayRead(B,&ba);CHKERRQ(ierr);
282aa1aec99SHong Zhang   if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
283854ce69bSBarry Smith     ierr      = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
284aa1aec99SHong Zhang     c->a      = ca;
285aa1aec99SHong Zhang     c->free_a = PETSC_TRUE;
2866718818eSStefano Zampini   } else ca = c->a;
2876718818eSStefano Zampini 
2886718818eSStefano Zampini   /* TODO this should be done in the symbolic phase */
2896718818eSStefano Zampini   /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers
2906718818eSStefano Zampini      that is hard to eradicate) */
2916718818eSStefano Zampini   ierr = PetscObjectQuery((PetscObject)C,"__PETSc__ab_dense",(PetscObject*)&cab_dense);CHKERRQ(ierr);
2926718818eSStefano Zampini   if (!cab_dense) {
2936718818eSStefano Zampini     ierr = PetscMalloc1(B->cmap->N,&ab_dense);CHKERRQ(ierr);
2946718818eSStefano Zampini     ierr = PetscContainerCreate(PETSC_COMM_SELF,&cab_dense);CHKERRQ(ierr);
2956718818eSStefano Zampini     ierr = PetscContainerSetPointer(cab_dense,ab_dense);CHKERRQ(ierr);
2966718818eSStefano Zampini     ierr = PetscContainerSetUserDestroy(cab_dense,PetscContainerUserDestroyDefault);CHKERRQ(ierr);
2976718818eSStefano Zampini     ierr = PetscObjectCompose((PetscObject)C,"__PETSc__ab_dense",(PetscObject)cab_dense);CHKERRQ(ierr);
2986718818eSStefano Zampini     ierr = PetscObjectDereference((PetscObject)cab_dense);CHKERRQ(ierr);
299d90d86d0SMatthew G. Knepley   }
3006718818eSStefano Zampini   ierr = PetscContainerGetPointer(cab_dense,(void**)&ab_dense);CHKERRQ(ierr);
3016718818eSStefano Zampini   ierr = PetscArrayzero(ab_dense,B->cmap->N);CHKERRQ(ierr);
302aa1aec99SHong Zhang 
303c124e916SHong Zhang   /* clean old values in C */
304580bdb30SBarry Smith   ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr);
305d50806bdSBarry Smith   /* Traverse A row-wise. */
306d50806bdSBarry Smith   /* Build the ith row in C by summing over nonzero columns in A, */
307d50806bdSBarry Smith   /* the rows of B corresponding to nonzeros of A. */
308d50806bdSBarry Smith   for (i=0; i<am; i++) {
309d50806bdSBarry Smith     anzi = ai[i+1] - ai[i];
310d50806bdSBarry Smith     for (j=0; j<anzi; j++) {
311519eb980SHong Zhang       brow = aj[j];
312d50806bdSBarry Smith       bnzi = bi[brow+1] - bi[brow];
313d50806bdSBarry Smith       bjj  = bj + bi[brow];
314d50806bdSBarry Smith       baj  = ba + bi[brow];
315519eb980SHong Zhang       /* perform dense axpy */
31636ec6d2dSHong Zhang       valtmp = aa[j];
317519eb980SHong Zhang       for (k=0; k<bnzi; k++) {
31836ec6d2dSHong Zhang         ab_dense[bjj[k]] += valtmp*baj[k];
319519eb980SHong Zhang       }
320519eb980SHong Zhang       flops += 2*bnzi;
321519eb980SHong Zhang     }
322c58ca2e3SHong Zhang     aj += anzi; aa += anzi;
323c58ca2e3SHong Zhang 
324c58ca2e3SHong Zhang     cnzi = ci[i+1] - ci[i];
325519eb980SHong Zhang     for (k=0; k<cnzi; k++) {
326519eb980SHong Zhang       ca[k]          += ab_dense[cj[k]];
327519eb980SHong Zhang       ab_dense[cj[k]] = 0.0; /* zero ab_dense */
328519eb980SHong Zhang     }
329519eb980SHong Zhang     flops += cnzi;
330519eb980SHong Zhang     cj    += cnzi; ca += cnzi;
331519eb980SHong Zhang   }
3322e5835c6SStefano Zampini #if defined(PETSC_HAVE_DEVICE)
3332e5835c6SStefano Zampini   if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
3342e5835c6SStefano Zampini #endif
335c58ca2e3SHong Zhang   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
336c58ca2e3SHong Zhang   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
337c58ca2e3SHong Zhang   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
3382e5835c6SStefano Zampini   ierr = MatSeqAIJRestoreArrayRead(A,&aa);CHKERRQ(ierr);
3392e5835c6SStefano Zampini   ierr = MatSeqAIJRestoreArrayRead(B,&ba);CHKERRQ(ierr);
340c58ca2e3SHong Zhang   PetscFunctionReturn(0);
341c58ca2e3SHong Zhang }
342c58ca2e3SHong Zhang 
34325023028SHong Zhang PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C)
344c58ca2e3SHong Zhang {
345c58ca2e3SHong Zhang   PetscErrorCode    ierr;
346c58ca2e3SHong Zhang   PetscLogDouble    flops=0.0;
347c58ca2e3SHong Zhang   Mat_SeqAIJ        *a   = (Mat_SeqAIJ*)A->data;
348c58ca2e3SHong Zhang   Mat_SeqAIJ        *b   = (Mat_SeqAIJ*)B->data;
349c58ca2e3SHong Zhang   Mat_SeqAIJ        *c   = (Mat_SeqAIJ*)C->data;
350c58ca2e3SHong Zhang   PetscInt          *ai  = a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
351c58ca2e3SHong Zhang   PetscInt          am   = A->rmap->N,cm=C->rmap->N;
352c58ca2e3SHong Zhang   PetscInt          i,j,k,anzi,bnzi,cnzi,brow;
3532e5835c6SStefano Zampini   PetscScalar       *ca=c->a,valtmp;
3542e5835c6SStefano Zampini   const PetscScalar *aa,*ba,*baj;
355c58ca2e3SHong Zhang   PetscInt          nextb;
356c58ca2e3SHong Zhang 
357c58ca2e3SHong Zhang   PetscFunctionBegin;
3582e5835c6SStefano Zampini   ierr = MatSeqAIJGetArrayRead(A,&aa);CHKERRQ(ierr);
3592e5835c6SStefano Zampini   ierr = MatSeqAIJGetArrayRead(B,&ba);CHKERRQ(ierr);
360cf742fccSHong Zhang   if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
361cf742fccSHong Zhang     ierr      = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
362cf742fccSHong Zhang     c->a      = ca;
363cf742fccSHong Zhang     c->free_a = PETSC_TRUE;
364cf742fccSHong Zhang   }
365cf742fccSHong Zhang 
366c58ca2e3SHong Zhang   /* clean old values in C */
367580bdb30SBarry Smith   ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr);
368c58ca2e3SHong Zhang   /* Traverse A row-wise. */
369c58ca2e3SHong Zhang   /* Build the ith row in C by summing over nonzero columns in A, */
370c58ca2e3SHong Zhang   /* the rows of B corresponding to nonzeros of A. */
371519eb980SHong Zhang   for (i=0; i<am; i++) {
372519eb980SHong Zhang     anzi = ai[i+1] - ai[i];
373519eb980SHong Zhang     cnzi = ci[i+1] - ci[i];
374519eb980SHong Zhang     for (j=0; j<anzi; j++) {
375519eb980SHong Zhang       brow = aj[j];
376519eb980SHong Zhang       bnzi = bi[brow+1] - bi[brow];
377519eb980SHong Zhang       bjj  = bj + bi[brow];
378519eb980SHong Zhang       baj  = ba + bi[brow];
379519eb980SHong Zhang       /* perform sparse axpy */
38036ec6d2dSHong Zhang       valtmp = aa[j];
381c124e916SHong Zhang       nextb  = 0;
382c124e916SHong Zhang       for (k=0; nextb<bnzi; k++) {
383c124e916SHong Zhang         if (cj[k] == bjj[nextb]) { /* ccol == bcol */
38436ec6d2dSHong Zhang           ca[k] += valtmp*baj[nextb++];
385c124e916SHong Zhang         }
386d50806bdSBarry Smith       }
387d50806bdSBarry Smith       flops += 2*bnzi;
388d50806bdSBarry Smith     }
389519eb980SHong Zhang     aj += anzi; aa += anzi;
390519eb980SHong Zhang     cj += cnzi; ca += cnzi;
391d50806bdSBarry Smith   }
3922e5835c6SStefano Zampini #if defined(PETSC_HAVE_DEVICE)
3932e5835c6SStefano Zampini   if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
3942e5835c6SStefano Zampini #endif
395716bacf3SKris Buschelman   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
396716bacf3SKris Buschelman   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
397d50806bdSBarry Smith   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
3982e5835c6SStefano Zampini   ierr = MatSeqAIJRestoreArrayRead(A,&aa);CHKERRQ(ierr);
3992e5835c6SStefano Zampini   ierr = MatSeqAIJRestoreArrayRead(B,&ba);CHKERRQ(ierr);
400d50806bdSBarry Smith   PetscFunctionReturn(0);
401d50806bdSBarry Smith }
402bc011b1eSHong Zhang 
4034222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A,Mat B,PetscReal fill,Mat C)
40425296bd5SBarry Smith {
40525296bd5SBarry Smith   PetscErrorCode     ierr;
40625296bd5SBarry Smith   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
40725296bd5SBarry Smith   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
4083c50cad2SHong Zhang   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
40925296bd5SBarry Smith   MatScalar          *ca;
41025296bd5SBarry Smith   PetscReal          afill;
411eca6b86aSHong Zhang   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
412eca6b86aSHong Zhang   PetscTable         ta;
4130298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
41425296bd5SBarry Smith 
41525296bd5SBarry Smith   PetscFunctionBegin;
4163c50cad2SHong Zhang   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
4173c50cad2SHong Zhang   /*-----------------------------------------------------------------------------------------*/
4183c50cad2SHong Zhang   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
419854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
4203c50cad2SHong Zhang   ci[0] = 0;
4213c50cad2SHong Zhang 
4223c50cad2SHong Zhang   /* create and initialize a linked list */
423c373ccc6SHong Zhang   ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr);
424c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b,bm,ta);
425eca6b86aSHong Zhang   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
426eca6b86aSHong Zhang   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
427eca6b86aSHong Zhang 
428eca6b86aSHong Zhang   ierr = PetscLLCondensedCreate_fast(Crmax,&lnk);CHKERRQ(ierr);
4293c50cad2SHong Zhang 
4303c50cad2SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
431f91af8c7SBarry Smith   ierr          = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
4323c50cad2SHong Zhang   current_space = free_space;
4333c50cad2SHong Zhang 
4343c50cad2SHong Zhang   /* Determine ci and cj */
4353c50cad2SHong Zhang   for (i=0; i<am; i++) {
4363c50cad2SHong Zhang     anzi = ai[i+1] - ai[i];
4373c50cad2SHong Zhang     aj   = a->j + ai[i];
4383c50cad2SHong Zhang     for (j=0; j<anzi; j++) {
4393c50cad2SHong Zhang       brow = aj[j];
4403c50cad2SHong Zhang       bnzj = bi[brow+1] - bi[brow];
4413c50cad2SHong Zhang       bj   = b->j + bi[brow];
4423c50cad2SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
4433c50cad2SHong Zhang       ierr = PetscLLCondensedAddSorted_fast(bnzj,bj,lnk);CHKERRQ(ierr);
4443c50cad2SHong Zhang     }
4458c78258cSHong Zhang     /* add possible missing diagonal entry */
4468c78258cSHong Zhang     if (C->force_diagonals) {
4478c78258cSHong Zhang       ierr = PetscLLCondensedAddSorted_fast(1,&i,lnk);CHKERRQ(ierr);
4488c78258cSHong Zhang     }
4493c50cad2SHong Zhang     cnzi = lnk[1];
4503c50cad2SHong Zhang 
4513c50cad2SHong Zhang     /* If free space is not available, make more free space */
4523c50cad2SHong Zhang     /* Double the amount of total space in the list */
4533c50cad2SHong Zhang     if (current_space->local_remaining<cnzi) {
454f91af8c7SBarry Smith       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);CHKERRQ(ierr);
4553c50cad2SHong Zhang       ndouble++;
4563c50cad2SHong Zhang     }
4573c50cad2SHong Zhang 
4583c50cad2SHong Zhang     /* Copy data into free space, then initialize lnk */
4593c50cad2SHong Zhang     ierr = PetscLLCondensedClean_fast(cnzi,current_space->array,lnk);CHKERRQ(ierr);
4602205254eSKarl Rupp 
4613c50cad2SHong Zhang     current_space->array           += cnzi;
4623c50cad2SHong Zhang     current_space->local_used      += cnzi;
4633c50cad2SHong Zhang     current_space->local_remaining -= cnzi;
4642205254eSKarl Rupp 
4653c50cad2SHong Zhang     ci[i+1] = ci[i] + cnzi;
4663c50cad2SHong Zhang   }
4673c50cad2SHong Zhang 
4683c50cad2SHong Zhang   /* Column indices are in the list of free space */
4693c50cad2SHong Zhang   /* Allocate space for cj, initialize cj, and */
4703c50cad2SHong Zhang   /* destroy list of free space and other temporary array(s) */
471854ce69bSBarry Smith   ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr);
4723c50cad2SHong Zhang   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
4733c50cad2SHong Zhang   ierr = PetscLLCondensedDestroy_fast(lnk);CHKERRQ(ierr);
47425296bd5SBarry Smith 
47525296bd5SBarry Smith   /* Allocate space for ca */
476580bdb30SBarry Smith   ierr = PetscCalloc1(ci[am]+1,&ca);CHKERRQ(ierr);
47725296bd5SBarry Smith 
47825296bd5SBarry Smith   /* put together the new symbolic matrix */
479e4e71118SStefano Zampini   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
4804222ddf1SHong Zhang   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
48125296bd5SBarry Smith 
48225296bd5SBarry Smith   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
48325296bd5SBarry Smith   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
4844222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
48525296bd5SBarry Smith   c->free_a  = PETSC_TRUE;
48625296bd5SBarry Smith   c->free_ij = PETSC_TRUE;
48725296bd5SBarry Smith   c->nonew   = 0;
4882205254eSKarl Rupp 
4894222ddf1SHong Zhang   /* slower, less memory */
4904222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
49125296bd5SBarry Smith 
49225296bd5SBarry Smith   /* set MatInfo */
49325296bd5SBarry Smith   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
49425296bd5SBarry Smith   if (afill < 1.0) afill = 1.0;
4954222ddf1SHong Zhang   C->info.mallocs           = ndouble;
4964222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
4974222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
49825296bd5SBarry Smith 
49925296bd5SBarry Smith #if defined(PETSC_USE_INFO)
50025296bd5SBarry Smith   if (ci[am]) {
5017d3de750SJacob Faibussowitsch     ierr = PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
5027d3de750SJacob Faibussowitsch     ierr = PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
50325296bd5SBarry Smith   } else {
5044222ddf1SHong Zhang     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
50525296bd5SBarry Smith   }
50625296bd5SBarry Smith #endif
50725296bd5SBarry Smith   PetscFunctionReturn(0);
50825296bd5SBarry Smith }
50925296bd5SBarry Smith 
5104222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat C)
511e9e4536cSHong Zhang {
512e9e4536cSHong Zhang   PetscErrorCode     ierr;
513e9e4536cSHong Zhang   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
514bf9555e6SHong Zhang   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
51525c41797SHong Zhang   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
516e9e4536cSHong Zhang   MatScalar          *ca;
517e9e4536cSHong Zhang   PetscReal          afill;
518eca6b86aSHong Zhang   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
519eca6b86aSHong Zhang   PetscTable         ta;
5200298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
521e9e4536cSHong Zhang 
522e9e4536cSHong Zhang   PetscFunctionBegin;
523bd958071SHong Zhang   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
524bd958071SHong Zhang   /*---------------------------------------------------------------------------------------------*/
525bd958071SHong Zhang   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
526854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
527bd958071SHong Zhang   ci[0] = 0;
528bd958071SHong Zhang 
529bd958071SHong Zhang   /* create and initialize a linked list */
530c373ccc6SHong Zhang   ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr);
531c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b,bm,ta);
532eca6b86aSHong Zhang   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
533eca6b86aSHong Zhang   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
534eca6b86aSHong Zhang   ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);
535bd958071SHong Zhang 
536bd958071SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
537f91af8c7SBarry Smith   ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
538bd958071SHong Zhang   current_space = free_space;
539bd958071SHong Zhang 
540bd958071SHong Zhang   /* Determine ci and cj */
541bd958071SHong Zhang   for (i=0; i<am; i++) {
542bd958071SHong Zhang     anzi = ai[i+1] - ai[i];
543bd958071SHong Zhang     aj   = a->j + ai[i];
544bd958071SHong Zhang     for (j=0; j<anzi; j++) {
545bd958071SHong Zhang       brow = aj[j];
546bd958071SHong Zhang       bnzj = bi[brow+1] - bi[brow];
547bd958071SHong Zhang       bj   = b->j + bi[brow];
548bd958071SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
549bd958071SHong Zhang       ierr = PetscLLCondensedAddSorted_Scalable(bnzj,bj,lnk);CHKERRQ(ierr);
550bd958071SHong Zhang     }
5518c78258cSHong Zhang     /* add possible missing diagonal entry */
5528c78258cSHong Zhang     if (C->force_diagonals) {
5538c78258cSHong Zhang       ierr = PetscLLCondensedAddSorted_Scalable(1,&i,lnk);CHKERRQ(ierr);
5548c78258cSHong Zhang     }
5558c78258cSHong Zhang 
556bd958071SHong Zhang     cnzi = lnk[0];
557bd958071SHong Zhang 
558bd958071SHong Zhang     /* If free space is not available, make more free space */
559bd958071SHong Zhang     /* Double the amount of total space in the list */
560bd958071SHong Zhang     if (current_space->local_remaining<cnzi) {
561f91af8c7SBarry Smith       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);CHKERRQ(ierr);
562bd958071SHong Zhang       ndouble++;
563bd958071SHong Zhang     }
564bd958071SHong Zhang 
565bd958071SHong Zhang     /* Copy data into free space, then initialize lnk */
566bd958071SHong Zhang     ierr = PetscLLCondensedClean_Scalable(cnzi,current_space->array,lnk);CHKERRQ(ierr);
5672205254eSKarl Rupp 
568bd958071SHong Zhang     current_space->array           += cnzi;
569bd958071SHong Zhang     current_space->local_used      += cnzi;
570bd958071SHong Zhang     current_space->local_remaining -= cnzi;
5712205254eSKarl Rupp 
572bd958071SHong Zhang     ci[i+1] = ci[i] + cnzi;
573bd958071SHong Zhang   }
574bd958071SHong Zhang 
575bd958071SHong Zhang   /* Column indices are in the list of free space */
576bd958071SHong Zhang   /* Allocate space for cj, initialize cj, and */
577bd958071SHong Zhang   /* destroy list of free space and other temporary array(s) */
578854ce69bSBarry Smith   ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr);
579bd958071SHong Zhang   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
580bd958071SHong Zhang   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
581e9e4536cSHong Zhang 
582e9e4536cSHong Zhang   /* Allocate space for ca */
583bd958071SHong Zhang   /*-----------------------*/
584580bdb30SBarry Smith   ierr = PetscCalloc1(ci[am]+1,&ca);CHKERRQ(ierr);
585e9e4536cSHong Zhang 
586e9e4536cSHong Zhang   /* put together the new symbolic matrix */
587e4e71118SStefano Zampini   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
5884222ddf1SHong Zhang   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
589e9e4536cSHong Zhang 
590e9e4536cSHong Zhang   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
591e9e4536cSHong Zhang   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
5924222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
593e9e4536cSHong Zhang   c->free_a  = PETSC_TRUE;
594e9e4536cSHong Zhang   c->free_ij = PETSC_TRUE;
595e9e4536cSHong Zhang   c->nonew   = 0;
5962205254eSKarl Rupp 
5974222ddf1SHong Zhang   /* slower, less memory */
5984222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
599e9e4536cSHong Zhang 
600e9e4536cSHong Zhang   /* set MatInfo */
601e9e4536cSHong Zhang   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
602e9e4536cSHong Zhang   if (afill < 1.0) afill = 1.0;
6034222ddf1SHong Zhang   C->info.mallocs           = ndouble;
6044222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
6054222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
606e9e4536cSHong Zhang 
607e9e4536cSHong Zhang #if defined(PETSC_USE_INFO)
608e9e4536cSHong Zhang   if (ci[am]) {
6097d3de750SJacob Faibussowitsch     ierr = PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
6107d3de750SJacob Faibussowitsch     ierr = PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
611e9e4536cSHong Zhang   } else {
6124222ddf1SHong Zhang     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
613e9e4536cSHong Zhang   }
614e9e4536cSHong Zhang #endif
615e9e4536cSHong Zhang   PetscFunctionReturn(0);
616e9e4536cSHong Zhang }
617e9e4536cSHong Zhang 
6184222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A,Mat B,PetscReal fill,Mat C)
6190ced3a2bSJed Brown {
6200ced3a2bSJed Brown   PetscErrorCode     ierr;
6210ced3a2bSJed Brown   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
6220ced3a2bSJed Brown   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j;
6230ced3a2bSJed Brown   PetscInt           *ci,*cj,*bb;
6240ced3a2bSJed Brown   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
6250ced3a2bSJed Brown   PetscReal          afill;
6260ced3a2bSJed Brown   PetscInt           i,j,col,ndouble = 0;
6270298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
6280ced3a2bSJed Brown   PetscHeap          h;
6290ced3a2bSJed Brown 
6300ced3a2bSJed Brown   PetscFunctionBegin;
631cd093f1dSJed Brown   /* Get ci and cj - by merging sorted rows using a heap */
6320ced3a2bSJed Brown   /*---------------------------------------------------------------------------------------------*/
6330ced3a2bSJed Brown   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
634854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
6350ced3a2bSJed Brown   ci[0] = 0;
6360ced3a2bSJed Brown 
6370ced3a2bSJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
638f91af8c7SBarry Smith   ierr          = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
6390ced3a2bSJed Brown   current_space = free_space;
6400ced3a2bSJed Brown 
6410ced3a2bSJed Brown   ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr);
642785e854fSJed Brown   ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr);
6430ced3a2bSJed Brown 
6440ced3a2bSJed Brown   /* Determine ci and cj */
6450ced3a2bSJed Brown   for (i=0; i<am; i++) {
6460ced3a2bSJed Brown     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
6470ced3a2bSJed Brown     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
6480ced3a2bSJed Brown     ci[i+1] = ci[i];
6490ced3a2bSJed Brown     /* Populate the min heap */
6500ced3a2bSJed Brown     for (j=0; j<anzi; j++) {
6510ced3a2bSJed Brown       bb[j] = bi[acol[j]];         /* bb points at the start of the row */
6520ced3a2bSJed Brown       if (bb[j] < bi[acol[j]+1]) { /* Add if row is nonempty */
6530ced3a2bSJed Brown         ierr = PetscHeapAdd(h,j,bj[bb[j]++]);CHKERRQ(ierr);
6540ced3a2bSJed Brown       }
6550ced3a2bSJed Brown     }
6560ced3a2bSJed Brown     /* Pick off the min element, adding it to free space */
6570ced3a2bSJed Brown     ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
6580ced3a2bSJed Brown     while (j >= 0) {
6590ced3a2bSJed Brown       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
660f91af8c7SBarry Smith         ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space);CHKERRQ(ierr);
6610ced3a2bSJed Brown         ndouble++;
6620ced3a2bSJed Brown       }
6630ced3a2bSJed Brown       *(current_space->array++) = col;
6640ced3a2bSJed Brown       current_space->local_used++;
6650ced3a2bSJed Brown       current_space->local_remaining--;
6660ced3a2bSJed Brown       ci[i+1]++;
6670ced3a2bSJed Brown 
6680ced3a2bSJed Brown       /* stash if anything else remains in this row of B */
6690ced3a2bSJed Brown       if (bb[j] < bi[acol[j]+1]) {ierr = PetscHeapStash(h,j,bj[bb[j]++]);CHKERRQ(ierr);}
6700ced3a2bSJed Brown       while (1) {               /* pop and stash any other rows of B that also had an entry in this column */
6710ced3a2bSJed Brown         PetscInt j2,col2;
6720ced3a2bSJed Brown         ierr = PetscHeapPeek(h,&j2,&col2);CHKERRQ(ierr);
6730ced3a2bSJed Brown         if (col2 != col) break;
6740ced3a2bSJed Brown         ierr = PetscHeapPop(h,&j2,&col2);CHKERRQ(ierr);
6750ced3a2bSJed Brown         if (bb[j2] < bi[acol[j2]+1]) {ierr = PetscHeapStash(h,j2,bj[bb[j2]++]);CHKERRQ(ierr);}
6760ced3a2bSJed Brown       }
6770ced3a2bSJed Brown       /* Put any stashed elements back into the min heap */
6780ced3a2bSJed Brown       ierr = PetscHeapUnstash(h);CHKERRQ(ierr);
6790ced3a2bSJed Brown       ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
6800ced3a2bSJed Brown     }
6810ced3a2bSJed Brown   }
6820ced3a2bSJed Brown   ierr = PetscFree(bb);CHKERRQ(ierr);
6830ced3a2bSJed Brown   ierr = PetscHeapDestroy(&h);CHKERRQ(ierr);
6840ced3a2bSJed Brown 
6850ced3a2bSJed Brown   /* Column indices are in the list of free space */
6860ced3a2bSJed Brown   /* Allocate space for cj, initialize cj, and */
6870ced3a2bSJed Brown   /* destroy list of free space and other temporary array(s) */
688785e854fSJed Brown   ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr);
6890ced3a2bSJed Brown   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
6900ced3a2bSJed Brown 
6910ced3a2bSJed Brown   /* put together the new symbolic matrix */
692e4e71118SStefano Zampini   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
6934222ddf1SHong Zhang   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
6940ced3a2bSJed Brown 
6950ced3a2bSJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
6960ced3a2bSJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
6974222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
6980ced3a2bSJed Brown   c->free_a  = PETSC_TRUE;
6990ced3a2bSJed Brown   c->free_ij = PETSC_TRUE;
7000ced3a2bSJed Brown   c->nonew   = 0;
70126fbe8dcSKarl Rupp 
7024222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
7030ced3a2bSJed Brown 
7040ced3a2bSJed Brown   /* set MatInfo */
7050ced3a2bSJed Brown   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
7060ced3a2bSJed Brown   if (afill < 1.0) afill = 1.0;
7074222ddf1SHong Zhang   C->info.mallocs           = ndouble;
7084222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
7094222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
7100ced3a2bSJed Brown 
7110ced3a2bSJed Brown #if defined(PETSC_USE_INFO)
7120ced3a2bSJed Brown   if (ci[am]) {
7137d3de750SJacob Faibussowitsch     ierr = PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
7147d3de750SJacob Faibussowitsch     ierr = PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
7150ced3a2bSJed Brown   } else {
7164222ddf1SHong Zhang     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
7170ced3a2bSJed Brown   }
7180ced3a2bSJed Brown #endif
7190ced3a2bSJed Brown   PetscFunctionReturn(0);
7200ced3a2bSJed Brown }
721e9e4536cSHong Zhang 
7224222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A,Mat B,PetscReal fill,Mat C)
7238a07c6f1SJed Brown {
7248a07c6f1SJed Brown   PetscErrorCode     ierr;
7258a07c6f1SJed Brown   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
7268a07c6f1SJed Brown   const PetscInt     *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
7278a07c6f1SJed Brown   PetscInt           *ci,*cj,*bb;
7288a07c6f1SJed Brown   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
7298a07c6f1SJed Brown   PetscReal          afill;
7308a07c6f1SJed Brown   PetscInt           i,j,col,ndouble = 0;
7310298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
7328a07c6f1SJed Brown   PetscHeap          h;
7338a07c6f1SJed Brown   PetscBT            bt;
7348a07c6f1SJed Brown 
7358a07c6f1SJed Brown   PetscFunctionBegin;
736cd093f1dSJed Brown   /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */
7378a07c6f1SJed Brown   /*---------------------------------------------------------------------------------------------*/
7388a07c6f1SJed Brown   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
739854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
7408a07c6f1SJed Brown   ci[0] = 0;
7418a07c6f1SJed Brown 
7428a07c6f1SJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
743f91af8c7SBarry Smith   ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
7442205254eSKarl Rupp 
7458a07c6f1SJed Brown   current_space = free_space;
7468a07c6f1SJed Brown 
7478a07c6f1SJed Brown   ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr);
748785e854fSJed Brown   ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr);
7498a07c6f1SJed Brown   ierr = PetscBTCreate(bn,&bt);CHKERRQ(ierr);
7508a07c6f1SJed Brown 
7518a07c6f1SJed Brown   /* Determine ci and cj */
7528a07c6f1SJed Brown   for (i=0; i<am; i++) {
7538a07c6f1SJed Brown     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
7548a07c6f1SJed Brown     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
7558a07c6f1SJed Brown     const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */
7568a07c6f1SJed Brown     ci[i+1] = ci[i];
7578a07c6f1SJed Brown     /* Populate the min heap */
7588a07c6f1SJed Brown     for (j=0; j<anzi; j++) {
7598a07c6f1SJed Brown       PetscInt brow = acol[j];
7608a07c6f1SJed Brown       for (bb[j] = bi[brow]; bb[j] < bi[brow+1]; bb[j]++) {
7618a07c6f1SJed Brown         PetscInt bcol = bj[bb[j]];
7628a07c6f1SJed Brown         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
7638a07c6f1SJed Brown           ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr);
7648a07c6f1SJed Brown           bb[j]++;
7658a07c6f1SJed Brown           break;
7668a07c6f1SJed Brown         }
7678a07c6f1SJed Brown       }
7688a07c6f1SJed Brown     }
7698a07c6f1SJed Brown     /* Pick off the min element, adding it to free space */
7708a07c6f1SJed Brown     ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
7718a07c6f1SJed Brown     while (j >= 0) {
7728a07c6f1SJed Brown       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
7730298fd71SBarry Smith         fptr = NULL;                      /* need PetscBTMemzero */
774f91af8c7SBarry Smith         ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space);CHKERRQ(ierr);
7758a07c6f1SJed Brown         ndouble++;
7768a07c6f1SJed Brown       }
7778a07c6f1SJed Brown       *(current_space->array++) = col;
7788a07c6f1SJed Brown       current_space->local_used++;
7798a07c6f1SJed Brown       current_space->local_remaining--;
7808a07c6f1SJed Brown       ci[i+1]++;
7818a07c6f1SJed Brown 
7828a07c6f1SJed Brown       /* stash if anything else remains in this row of B */
7838a07c6f1SJed Brown       for (; bb[j] < bi[acol[j]+1]; bb[j]++) {
7848a07c6f1SJed Brown         PetscInt bcol = bj[bb[j]];
7858a07c6f1SJed Brown         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
7868a07c6f1SJed Brown           ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr);
7878a07c6f1SJed Brown           bb[j]++;
7888a07c6f1SJed Brown           break;
7898a07c6f1SJed Brown         }
7908a07c6f1SJed Brown       }
7918a07c6f1SJed Brown       ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
7928a07c6f1SJed Brown     }
7938a07c6f1SJed Brown     if (fptr) {                 /* Clear the bits for this row */
7948a07c6f1SJed Brown       for (; fptr<current_space->array; fptr++) {ierr = PetscBTClear(bt,*fptr);CHKERRQ(ierr);}
7958a07c6f1SJed Brown     } else {                    /* We reallocated so we don't remember (easily) how to clear only the bits we changed */
7968a07c6f1SJed Brown       ierr = PetscBTMemzero(bn,bt);CHKERRQ(ierr);
7978a07c6f1SJed Brown     }
7988a07c6f1SJed Brown   }
7998a07c6f1SJed Brown   ierr = PetscFree(bb);CHKERRQ(ierr);
8008a07c6f1SJed Brown   ierr = PetscHeapDestroy(&h);CHKERRQ(ierr);
8018a07c6f1SJed Brown   ierr = PetscBTDestroy(&bt);CHKERRQ(ierr);
8028a07c6f1SJed Brown 
8038a07c6f1SJed Brown   /* Column indices are in the list of free space */
8048a07c6f1SJed Brown   /* Allocate space for cj, initialize cj, and */
8058a07c6f1SJed Brown   /* destroy list of free space and other temporary array(s) */
806785e854fSJed Brown   ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr);
8078a07c6f1SJed Brown   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
8088a07c6f1SJed Brown 
8098a07c6f1SJed Brown   /* put together the new symbolic matrix */
810e4e71118SStefano Zampini   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
8114222ddf1SHong Zhang   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
8128a07c6f1SJed Brown 
8138a07c6f1SJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
8148a07c6f1SJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
8154222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
8168a07c6f1SJed Brown   c->free_a  = PETSC_TRUE;
8178a07c6f1SJed Brown   c->free_ij = PETSC_TRUE;
8188a07c6f1SJed Brown   c->nonew   = 0;
81926fbe8dcSKarl Rupp 
8204222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
8218a07c6f1SJed Brown 
8228a07c6f1SJed Brown   /* set MatInfo */
8238a07c6f1SJed Brown   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
8248a07c6f1SJed Brown   if (afill < 1.0) afill = 1.0;
8254222ddf1SHong Zhang   C->info.mallocs           = ndouble;
8264222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
8274222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
8288a07c6f1SJed Brown 
8298a07c6f1SJed Brown #if defined(PETSC_USE_INFO)
8308a07c6f1SJed Brown   if (ci[am]) {
8317d3de750SJacob Faibussowitsch     ierr = PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
8327d3de750SJacob Faibussowitsch     ierr = PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
8338a07c6f1SJed Brown   } else {
8344222ddf1SHong Zhang     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
8358a07c6f1SJed Brown   }
8368a07c6f1SJed Brown #endif
8378a07c6f1SJed Brown   PetscFunctionReturn(0);
8388a07c6f1SJed Brown }
8398a07c6f1SJed Brown 
8404222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A,Mat B,PetscReal fill,Mat C)
841d7ed1a4dSandi selinger {
842d7ed1a4dSandi selinger   PetscErrorCode     ierr;
843d7ed1a4dSandi selinger   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
844d7ed1a4dSandi selinger   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j,*inputi,*inputj,*inputcol,*inputcol_L1;
845d7ed1a4dSandi selinger   PetscInt           *ci,*cj,*outputj,worki_L1[9],worki_L2[9];
846d7ed1a4dSandi selinger   PetscInt           c_maxmem,a_maxrownnz=0,a_rownnz;
847d7ed1a4dSandi selinger   const PetscInt     workcol[8]={0,1,2,3,4,5,6,7};
848d7ed1a4dSandi selinger   const PetscInt     am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
849d7ed1a4dSandi selinger   const PetscInt     *brow_ptr[8],*brow_end[8];
850d7ed1a4dSandi selinger   PetscInt           window[8];
851d7ed1a4dSandi selinger   PetscInt           window_min,old_window_min,ci_nnz,outputi_nnz=0,L1_nrows,L2_nrows;
852d7ed1a4dSandi selinger   PetscInt           i,k,ndouble=0,L1_rowsleft,rowsleft;
853d7ed1a4dSandi selinger   PetscReal          afill;
854f83700f2Sandi selinger   PetscInt           *workj_L1,*workj_L2,*workj_L3;
8557660c4dbSandi selinger   PetscInt           L1_nnz,L2_nnz;
856d7ed1a4dSandi selinger 
857d7ed1a4dSandi selinger   /* Step 1: Get upper bound on memory required for allocation.
858d7ed1a4dSandi selinger              Because of the way virtual memory works,
859d7ed1a4dSandi selinger              only the memory pages that are actually needed will be physically allocated. */
860d7ed1a4dSandi selinger   PetscFunctionBegin;
861d7ed1a4dSandi selinger   ierr = PetscMalloc1(am+1,&ci);CHKERRQ(ierr);
862d7ed1a4dSandi selinger   for (i=0; i<am; i++) {
863d7ed1a4dSandi selinger     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
864d7ed1a4dSandi selinger     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
865d7ed1a4dSandi selinger     a_rownnz = 0;
866d7ed1a4dSandi selinger     for (k=0; k<anzi; ++k) {
867d7ed1a4dSandi selinger       a_rownnz += bi[acol[k]+1] - bi[acol[k]];
868d7ed1a4dSandi selinger       if (a_rownnz > bn) {
869d7ed1a4dSandi selinger         a_rownnz = bn;
870d7ed1a4dSandi selinger         break;
871d7ed1a4dSandi selinger       }
872d7ed1a4dSandi selinger     }
873d7ed1a4dSandi selinger     a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz);
874d7ed1a4dSandi selinger   }
875d7ed1a4dSandi selinger   /* temporary work areas for merging rows */
876d7ed1a4dSandi selinger   ierr = PetscMalloc1(a_maxrownnz*8,&workj_L1);CHKERRQ(ierr);
877f83700f2Sandi selinger   ierr = PetscMalloc1(a_maxrownnz*8,&workj_L2);CHKERRQ(ierr);
878f83700f2Sandi selinger   ierr = PetscMalloc1(a_maxrownnz,&workj_L3);CHKERRQ(ierr);
879d7ed1a4dSandi selinger 
8807660c4dbSandi selinger   /* This should be enough for almost all matrices. If not, memory is reallocated later. */
8817660c4dbSandi selinger   c_maxmem = 8*(ai[am]+bi[bm]);
882d7ed1a4dSandi selinger   /* Step 2: Populate pattern for C */
883d7ed1a4dSandi selinger   ierr  = PetscMalloc1(c_maxmem,&cj);CHKERRQ(ierr);
884d7ed1a4dSandi selinger 
885d7ed1a4dSandi selinger   ci_nnz       = 0;
886d7ed1a4dSandi selinger   ci[0]        = 0;
887d7ed1a4dSandi selinger   worki_L1[0]  = 0;
888d7ed1a4dSandi selinger   worki_L2[0]  = 0;
889d7ed1a4dSandi selinger   for (i=0; i<am; i++) {
890d7ed1a4dSandi selinger     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
891d7ed1a4dSandi selinger     const PetscInt *acol = aj + ai[i];      /* column indices of nonzero entries in this row */
892d7ed1a4dSandi selinger     rowsleft             = anzi;
893d7ed1a4dSandi selinger     inputcol_L1          = acol;
894d7ed1a4dSandi selinger     L2_nnz               = 0;
8957660c4dbSandi selinger     L2_nrows             = 1;  /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1   */
8967660c4dbSandi selinger     worki_L2[1]          = 0;
89708fe1b3cSKarl Rupp     outputi_nnz          = 0;
898d7ed1a4dSandi selinger 
899d7ed1a4dSandi selinger     /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem  -> allocate more memory */
900d7ed1a4dSandi selinger     while (ci_nnz+a_maxrownnz > c_maxmem) {
901d7ed1a4dSandi selinger       c_maxmem *= 2;
902d7ed1a4dSandi selinger       ndouble++;
903d7ed1a4dSandi selinger       ierr = PetscRealloc(sizeof(PetscInt)*c_maxmem,&cj);CHKERRQ(ierr);
904d7ed1a4dSandi selinger     }
905d7ed1a4dSandi selinger 
906d7ed1a4dSandi selinger     while (rowsleft) {
907d7ed1a4dSandi selinger       L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */
908d7ed1a4dSandi selinger       L1_nrows    = 0;
9097660c4dbSandi selinger       L1_nnz      = 0;
910d7ed1a4dSandi selinger       inputcol    = inputcol_L1;
911d7ed1a4dSandi selinger       inputi      = bi;
912d7ed1a4dSandi selinger       inputj      = bj;
913d7ed1a4dSandi selinger 
914d7ed1a4dSandi selinger       /* The following macro is used to specialize for small rows in A.
915d7ed1a4dSandi selinger          This helps with compiler unrolling, improving performance substantially.
916f83700f2Sandi selinger           Input:  inputj   inputi  inputcol  bn
917d7ed1a4dSandi selinger           Output: outputj  outputi_nnz                       */
918d7ed1a4dSandi selinger        #define MatMatMultSymbolic_RowMergeMacro(ANNZ)                        \
919d7ed1a4dSandi selinger          window_min  = bn;                                                   \
9207660c4dbSandi selinger          outputi_nnz = 0;                                                    \
9217660c4dbSandi selinger          for (k=0; k<ANNZ; ++k) {                                            \
922d7ed1a4dSandi selinger            brow_ptr[k] = inputj + inputi[inputcol[k]];                       \
923d7ed1a4dSandi selinger            brow_end[k] = inputj + inputi[inputcol[k]+1];                     \
924d7ed1a4dSandi selinger            window[k]   = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn;   \
925d7ed1a4dSandi selinger            window_min  = PetscMin(window[k], window_min);                    \
926d7ed1a4dSandi selinger          }                                                                   \
927d7ed1a4dSandi selinger          while (window_min < bn) {                                           \
928d7ed1a4dSandi selinger            outputj[outputi_nnz++] = window_min;                              \
929d7ed1a4dSandi selinger            /* advance front and compute new minimum */                       \
930d7ed1a4dSandi selinger            old_window_min = window_min;                                      \
931d7ed1a4dSandi selinger            window_min = bn;                                                  \
932d7ed1a4dSandi selinger            for (k=0; k<ANNZ; ++k) {                                          \
933d7ed1a4dSandi selinger              if (window[k] == old_window_min) {                              \
934d7ed1a4dSandi selinger                brow_ptr[k]++;                                                \
935d7ed1a4dSandi selinger                window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
936d7ed1a4dSandi selinger              }                                                               \
937d7ed1a4dSandi selinger              window_min = PetscMin(window[k], window_min);                   \
938d7ed1a4dSandi selinger            }                                                                 \
939d7ed1a4dSandi selinger          }
940d7ed1a4dSandi selinger 
941d7ed1a4dSandi selinger       /************** L E V E L  1 ***************/
942d7ed1a4dSandi selinger       /* Merge up to 8 rows of B to L1 work array*/
943d7ed1a4dSandi selinger       while (L1_rowsleft) {
9447660c4dbSandi selinger         outputi_nnz = 0;
9457660c4dbSandi selinger         if (anzi > 8)  outputj = workj_L1 + L1_nnz;     /* Level 1 rowmerge*/
9467660c4dbSandi selinger         else           outputj = cj + ci_nnz;           /* Merge directly to C */
9477660c4dbSandi selinger 
948d7ed1a4dSandi selinger         switch (L1_rowsleft) {
949d7ed1a4dSandi selinger         case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
950d7ed1a4dSandi selinger                  brow_end[0] = inputj + inputi[inputcol[0]+1];
951d7ed1a4dSandi selinger                  for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
952d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
953d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
954d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
955d7ed1a4dSandi selinger                  break;
956d7ed1a4dSandi selinger         case 2:  MatMatMultSymbolic_RowMergeMacro(2);
957d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
958d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
959d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
960d7ed1a4dSandi selinger                  break;
961d7ed1a4dSandi selinger         case 3: MatMatMultSymbolic_RowMergeMacro(3);
962d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
963d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
964d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
965d7ed1a4dSandi selinger                  break;
966d7ed1a4dSandi selinger         case 4:  MatMatMultSymbolic_RowMergeMacro(4);
967d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
968d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
969d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
970d7ed1a4dSandi selinger                  break;
971d7ed1a4dSandi selinger         case 5:  MatMatMultSymbolic_RowMergeMacro(5);
972d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
973d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
974d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
975d7ed1a4dSandi selinger                  break;
976d7ed1a4dSandi selinger         case 6:  MatMatMultSymbolic_RowMergeMacro(6);
977d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
978d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
979d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
980d7ed1a4dSandi selinger                  break;
981d7ed1a4dSandi selinger         case 7:  MatMatMultSymbolic_RowMergeMacro(7);
982d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
983d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
984d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
985d7ed1a4dSandi selinger                  break;
986d7ed1a4dSandi selinger         default: MatMatMultSymbolic_RowMergeMacro(8);
987d7ed1a4dSandi selinger                  inputcol    += 8;
988d7ed1a4dSandi selinger                  rowsleft    -= 8;
989d7ed1a4dSandi selinger                  L1_rowsleft -= 8;
990d7ed1a4dSandi selinger                  break;
991d7ed1a4dSandi selinger         }
992d7ed1a4dSandi selinger         inputcol_L1           = inputcol;
9937660c4dbSandi selinger         L1_nnz               += outputi_nnz;
9947660c4dbSandi selinger         worki_L1[++L1_nrows]  = L1_nnz;
995d7ed1a4dSandi selinger       }
996d7ed1a4dSandi selinger 
997d7ed1a4dSandi selinger       /********************** L E V E L  2 ************************/
998d7ed1a4dSandi selinger       /* Merge from L1 work array to either C or to L2 work array */
999d7ed1a4dSandi selinger       if (anzi > 8) {
1000d7ed1a4dSandi selinger         inputi      = worki_L1;
1001d7ed1a4dSandi selinger         inputj      = workj_L1;
1002d7ed1a4dSandi selinger         inputcol    = workcol;
1003d7ed1a4dSandi selinger         outputi_nnz = 0;
1004d7ed1a4dSandi selinger 
1005d7ed1a4dSandi selinger         if (anzi <= 64) outputj = cj + ci_nnz;        /* Merge from L1 work array to C */
1006d7ed1a4dSandi selinger         else            outputj = workj_L2 + L2_nnz;  /* Merge from L1 work array to L2 work array */
1007d7ed1a4dSandi selinger 
1008d7ed1a4dSandi selinger         switch (L1_nrows) {
1009d7ed1a4dSandi selinger         case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
1010d7ed1a4dSandi selinger                  brow_end[0] = inputj + inputi[inputcol[0]+1];
1011d7ed1a4dSandi selinger                  for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
1012d7ed1a4dSandi selinger                  break;
1013d7ed1a4dSandi selinger         case 2:  MatMatMultSymbolic_RowMergeMacro(2); break;
1014d7ed1a4dSandi selinger         case 3:  MatMatMultSymbolic_RowMergeMacro(3); break;
1015d7ed1a4dSandi selinger         case 4:  MatMatMultSymbolic_RowMergeMacro(4); break;
1016d7ed1a4dSandi selinger         case 5:  MatMatMultSymbolic_RowMergeMacro(5); break;
1017d7ed1a4dSandi selinger         case 6:  MatMatMultSymbolic_RowMergeMacro(6); break;
1018d7ed1a4dSandi selinger         case 7:  MatMatMultSymbolic_RowMergeMacro(7); break;
1019d7ed1a4dSandi selinger         case 8:  MatMatMultSymbolic_RowMergeMacro(8); break;
1020d7ed1a4dSandi selinger         default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L1 work array!");
1021d7ed1a4dSandi selinger         }
1022d7ed1a4dSandi selinger         L2_nnz               += outputi_nnz;
1023d7ed1a4dSandi selinger         worki_L2[++L2_nrows]  = L2_nnz;
1024d7ed1a4dSandi selinger 
10257660c4dbSandi selinger         /************************ L E V E L  3 **********************/
10267660c4dbSandi selinger         /* Merge from L2 work array to either C or to L2 work array */
1027d7ed1a4dSandi selinger         if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) {
1028d7ed1a4dSandi selinger           inputi      = worki_L2;
1029d7ed1a4dSandi selinger           inputj      = workj_L2;
1030d7ed1a4dSandi selinger           inputcol    = workcol;
1031d7ed1a4dSandi selinger           outputi_nnz = 0;
1032f83700f2Sandi selinger           if (rowsleft) outputj = workj_L3;
1033d7ed1a4dSandi selinger           else          outputj = cj + ci_nnz;
1034d7ed1a4dSandi selinger           switch (L2_nrows) {
1035d7ed1a4dSandi selinger           case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
1036d7ed1a4dSandi selinger                    brow_end[0] = inputj + inputi[inputcol[0]+1];
1037d7ed1a4dSandi selinger                    for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
1038d7ed1a4dSandi selinger                    break;
1039d7ed1a4dSandi selinger           case 2:  MatMatMultSymbolic_RowMergeMacro(2); break;
1040d7ed1a4dSandi selinger           case 3:  MatMatMultSymbolic_RowMergeMacro(3); break;
1041d7ed1a4dSandi selinger           case 4:  MatMatMultSymbolic_RowMergeMacro(4); break;
1042d7ed1a4dSandi selinger           case 5:  MatMatMultSymbolic_RowMergeMacro(5); break;
1043d7ed1a4dSandi selinger           case 6:  MatMatMultSymbolic_RowMergeMacro(6); break;
1044d7ed1a4dSandi selinger           case 7:  MatMatMultSymbolic_RowMergeMacro(7); break;
1045d7ed1a4dSandi selinger           case 8:  MatMatMultSymbolic_RowMergeMacro(8); break;
1046d7ed1a4dSandi selinger           default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L2 work array!");
1047d7ed1a4dSandi selinger           }
1048d7ed1a4dSandi selinger           L2_nrows    = 1;
10497660c4dbSandi selinger           L2_nnz      = outputi_nnz;
10507660c4dbSandi selinger           worki_L2[1] = outputi_nnz;
10517660c4dbSandi selinger           /* Copy to workj_L2 */
1052d7ed1a4dSandi selinger           if (rowsleft) {
10537660c4dbSandi selinger             for (k=0; k<outputi_nnz; ++k)  workj_L2[k] = outputj[k];
1054d7ed1a4dSandi selinger           }
1055d7ed1a4dSandi selinger         }
1056d7ed1a4dSandi selinger       }
1057d7ed1a4dSandi selinger     }  /* while (rowsleft) */
1058d7ed1a4dSandi selinger #undef MatMatMultSymbolic_RowMergeMacro
1059d7ed1a4dSandi selinger 
1060d7ed1a4dSandi selinger     /* terminate current row */
1061d7ed1a4dSandi selinger     ci_nnz += outputi_nnz;
1062d7ed1a4dSandi selinger     ci[i+1] = ci_nnz;
1063d7ed1a4dSandi selinger   }
1064d7ed1a4dSandi selinger 
1065d7ed1a4dSandi selinger   /* Step 3: Create the new symbolic matrix */
1066e4e71118SStefano Zampini   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
10674222ddf1SHong Zhang   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
1068d7ed1a4dSandi selinger 
1069d7ed1a4dSandi selinger   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1070d7ed1a4dSandi selinger   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
10714222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
1072d7ed1a4dSandi selinger   c->free_a  = PETSC_TRUE;
1073d7ed1a4dSandi selinger   c->free_ij = PETSC_TRUE;
1074d7ed1a4dSandi selinger   c->nonew   = 0;
1075d7ed1a4dSandi selinger 
10764222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1077d7ed1a4dSandi selinger 
1078d7ed1a4dSandi selinger   /* set MatInfo */
1079d7ed1a4dSandi selinger   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
1080d7ed1a4dSandi selinger   if (afill < 1.0) afill = 1.0;
10814222ddf1SHong Zhang   C->info.mallocs           = ndouble;
10824222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
10834222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
1084d7ed1a4dSandi selinger 
1085d7ed1a4dSandi selinger #if defined(PETSC_USE_INFO)
1086d7ed1a4dSandi selinger   if (ci[am]) {
10877d3de750SJacob Faibussowitsch     ierr = PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
10887d3de750SJacob Faibussowitsch     ierr = PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
1089d7ed1a4dSandi selinger   } else {
10904222ddf1SHong Zhang     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
1091d7ed1a4dSandi selinger   }
1092d7ed1a4dSandi selinger #endif
1093d7ed1a4dSandi selinger 
1094d7ed1a4dSandi selinger   /* Step 4: Free temporary work areas */
1095d7ed1a4dSandi selinger   ierr = PetscFree(workj_L1);CHKERRQ(ierr);
1096d7ed1a4dSandi selinger   ierr = PetscFree(workj_L2);CHKERRQ(ierr);
1097f83700f2Sandi selinger   ierr = PetscFree(workj_L3);CHKERRQ(ierr);
1098d7ed1a4dSandi selinger   PetscFunctionReturn(0);
1099d7ed1a4dSandi selinger }
1100d7ed1a4dSandi selinger 
1101cd093f1dSJed Brown /* concatenate unique entries and then sort */
11024222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,PetscReal fill,Mat C)
1103cd093f1dSJed Brown {
1104cd093f1dSJed Brown   PetscErrorCode ierr;
1105cd093f1dSJed Brown   Mat_SeqAIJ     *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
1106cd093f1dSJed Brown   const PetscInt *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
11078c78258cSHong Zhang   PetscInt       *ci,*cj,bcol;
1108cd093f1dSJed Brown   PetscInt       am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
1109cd093f1dSJed Brown   PetscReal      afill;
1110cd093f1dSJed Brown   PetscInt       i,j,ndouble = 0;
1111cd093f1dSJed Brown   PetscSegBuffer seg,segrow;
1112cd093f1dSJed Brown   char           *seen;
1113cd093f1dSJed Brown 
1114cd093f1dSJed Brown   PetscFunctionBegin;
1115854ce69bSBarry Smith   ierr  = PetscMalloc1(am+1,&ci);CHKERRQ(ierr);
1116cd093f1dSJed Brown   ci[0] = 0;
1117cd093f1dSJed Brown 
1118cd093f1dSJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
1119cd093f1dSJed Brown   ierr = PetscSegBufferCreate(sizeof(PetscInt),(PetscInt)(fill*(ai[am]+bi[bm])),&seg);CHKERRQ(ierr);
1120cd093f1dSJed Brown   ierr = PetscSegBufferCreate(sizeof(PetscInt),100,&segrow);CHKERRQ(ierr);
1121580bdb30SBarry Smith   ierr = PetscCalloc1(bn,&seen);CHKERRQ(ierr);
1122cd093f1dSJed Brown 
1123cd093f1dSJed Brown   /* Determine ci and cj */
1124cd093f1dSJed Brown   for (i=0; i<am; i++) {
1125cd093f1dSJed Brown     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
1126cd093f1dSJed Brown     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
1127cd093f1dSJed Brown     PetscInt packlen = 0,*PETSC_RESTRICT crow;
11288c78258cSHong Zhang 
1129cd093f1dSJed Brown     /* Pack segrow */
1130cd093f1dSJed Brown     for (j=0; j<anzi; j++) {
1131cd093f1dSJed Brown       PetscInt brow = acol[j],bjstart = bi[brow],bjend = bi[brow+1],k;
1132cd093f1dSJed Brown       for (k=bjstart; k<bjend; k++) {
11338c78258cSHong Zhang         bcol = bj[k];
1134cd093f1dSJed Brown         if (!seen[bcol]) { /* new entry */
1135cd093f1dSJed Brown           PetscInt *PETSC_RESTRICT slot;
1136cd093f1dSJed Brown           ierr = PetscSegBufferGetInts(segrow,1,&slot);CHKERRQ(ierr);
1137cd093f1dSJed Brown           *slot = bcol;
1138cd093f1dSJed Brown           seen[bcol] = 1;
1139cd093f1dSJed Brown           packlen++;
1140cd093f1dSJed Brown         }
1141cd093f1dSJed Brown       }
1142cd093f1dSJed Brown     }
11438c78258cSHong Zhang 
11448c78258cSHong Zhang     /* Check i-th diagonal entry */
11458c78258cSHong Zhang     if (C->force_diagonals && !seen[i]) {
11468c78258cSHong Zhang       PetscInt *PETSC_RESTRICT slot;
11478c78258cSHong Zhang       ierr = PetscSegBufferGetInts(segrow,1,&slot);CHKERRQ(ierr);
11488c78258cSHong Zhang       *slot   = i;
11498c78258cSHong Zhang       seen[i] = 1;
11508c78258cSHong Zhang       packlen++;
11518c78258cSHong Zhang     }
11528c78258cSHong Zhang 
1153cd093f1dSJed Brown     ierr = PetscSegBufferGetInts(seg,packlen,&crow);CHKERRQ(ierr);
1154cd093f1dSJed Brown     ierr = PetscSegBufferExtractTo(segrow,crow);CHKERRQ(ierr);
1155cd093f1dSJed Brown     ierr = PetscSortInt(packlen,crow);CHKERRQ(ierr);
1156cd093f1dSJed Brown     ci[i+1] = ci[i] + packlen;
1157cd093f1dSJed Brown     for (j=0; j<packlen; j++) seen[crow[j]] = 0;
1158cd093f1dSJed Brown   }
1159cd093f1dSJed Brown   ierr = PetscSegBufferDestroy(&segrow);CHKERRQ(ierr);
1160cd093f1dSJed Brown   ierr = PetscFree(seen);CHKERRQ(ierr);
1161cd093f1dSJed Brown 
1162cd093f1dSJed Brown   /* Column indices are in the segmented buffer */
1163cd093f1dSJed Brown   ierr = PetscSegBufferExtractAlloc(seg,&cj);CHKERRQ(ierr);
1164cd093f1dSJed Brown   ierr = PetscSegBufferDestroy(&seg);CHKERRQ(ierr);
1165cd093f1dSJed Brown 
1166cd093f1dSJed Brown   /* put together the new symbolic matrix */
1167e4e71118SStefano Zampini   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
11684222ddf1SHong Zhang   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
1169cd093f1dSJed Brown 
1170cd093f1dSJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1171cd093f1dSJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
11724222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
1173cd093f1dSJed Brown   c->free_a  = PETSC_TRUE;
1174cd093f1dSJed Brown   c->free_ij = PETSC_TRUE;
1175cd093f1dSJed Brown   c->nonew   = 0;
1176cd093f1dSJed Brown 
11774222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1178cd093f1dSJed Brown 
1179cd093f1dSJed Brown   /* set MatInfo */
11802a09556fSStefano Zampini   afill = (PetscReal)ci[am]/PetscMax(ai[am]+bi[bm],1) + 1.e-5;
1181cd093f1dSJed Brown   if (afill < 1.0) afill = 1.0;
11824222ddf1SHong Zhang   C->info.mallocs           = ndouble;
11834222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
11844222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
1185cd093f1dSJed Brown 
1186cd093f1dSJed Brown #if defined(PETSC_USE_INFO)
1187cd093f1dSJed Brown   if (ci[am]) {
11887d3de750SJacob Faibussowitsch     ierr = PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
11897d3de750SJacob Faibussowitsch     ierr = PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
1190cd093f1dSJed Brown   } else {
11914222ddf1SHong Zhang     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
1192cd093f1dSJed Brown   }
1193cd093f1dSJed Brown #endif
1194cd093f1dSJed Brown   PetscFunctionReturn(0);
1195cd093f1dSJed Brown }
1196cd093f1dSJed Brown 
11976718818eSStefano Zampini PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(void *data)
11982128a86cSHong Zhang {
11992128a86cSHong Zhang   PetscErrorCode      ierr;
12006718818eSStefano Zampini   Mat_MatMatTransMult *abt=(Mat_MatMatTransMult *)data;
12012128a86cSHong Zhang 
12022128a86cSHong Zhang   PetscFunctionBegin;
120340192850SHong Zhang   ierr = MatTransposeColoringDestroy(&abt->matcoloring);CHKERRQ(ierr);
120440192850SHong Zhang   ierr = MatDestroy(&abt->Bt_den);CHKERRQ(ierr);
120540192850SHong Zhang   ierr = MatDestroy(&abt->ABt_den);CHKERRQ(ierr);
120640192850SHong Zhang   ierr = PetscFree(abt);CHKERRQ(ierr);
12072128a86cSHong Zhang   PetscFunctionReturn(0);
12082128a86cSHong Zhang }
12092128a86cSHong Zhang 
12104222ddf1SHong Zhang PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
1211bc011b1eSHong Zhang {
12125df89d91SHong Zhang   PetscErrorCode      ierr;
121381d82fe4SHong Zhang   Mat                 Bt;
121481d82fe4SHong Zhang   PetscInt            *bti,*btj;
121540192850SHong Zhang   Mat_MatMatTransMult *abt;
12164222ddf1SHong Zhang   Mat_Product         *product = C->product;
12176718818eSStefano Zampini   char                *alg;
1218d2853540SHong Zhang 
121981d82fe4SHong Zhang   PetscFunctionBegin;
12202c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!product,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
12212c71b3e2SJacob Faibussowitsch   PetscCheckFalse(product->data,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Extra product struct not empty");
12226718818eSStefano Zampini 
122381d82fe4SHong Zhang   /* create symbolic Bt */
122481d82fe4SHong Zhang   ierr = MatGetSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
12250298fd71SBarry Smith   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,B->cmap->n,B->rmap->n,bti,btj,NULL,&Bt);CHKERRQ(ierr);
122633d57670SJed Brown   ierr = MatSetBlockSizes(Bt,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr);
122704b858e0SBarry Smith   ierr = MatSetType(Bt,((PetscObject)A)->type_name);CHKERRQ(ierr);
122881d82fe4SHong Zhang 
122981d82fe4SHong Zhang   /* get symbolic C=A*Bt */
12306718818eSStefano Zampini   ierr = PetscStrallocpy(product->alg,&alg);CHKERRQ(ierr);
12314222ddf1SHong Zhang   ierr = MatProductSetAlgorithm(C,"sorted");CHKERRQ(ierr); /* set algorithm for C = A*Bt */
123281d82fe4SHong Zhang   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C);CHKERRQ(ierr);
12334222ddf1SHong Zhang   ierr = MatProductSetAlgorithm(C,alg);CHKERRQ(ierr); /* resume original algorithm for ABt product */
12346718818eSStefano Zampini   ierr = PetscFree(alg);CHKERRQ(ierr);
123581d82fe4SHong Zhang 
1236a5b23f4aSJose E. Roman   /* create a supporting struct for reuse intermediate dense matrices with matcoloring */
1237b00a9115SJed Brown   ierr = PetscNew(&abt);CHKERRQ(ierr);
12382128a86cSHong Zhang 
12396718818eSStefano Zampini   product->data    = abt;
12406718818eSStefano Zampini   product->destroy = MatDestroy_SeqAIJ_MatMatMultTrans;
12416718818eSStefano Zampini 
12424222ddf1SHong Zhang   C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ;
12432128a86cSHong Zhang 
12444222ddf1SHong Zhang   abt->usecoloring = PETSC_FALSE;
12454222ddf1SHong Zhang   ierr = PetscStrcmp(product->alg,"color",&abt->usecoloring);CHKERRQ(ierr);
124640192850SHong Zhang   if (abt->usecoloring) {
1247b9af6bddSHong Zhang     /* Create MatTransposeColoring from symbolic C=A*B^T */
1248b9af6bddSHong Zhang     MatTransposeColoring matcoloring;
1249335efc43SPeter Brune     MatColoring          coloring;
12502128a86cSHong Zhang     ISColoring           iscoloring;
12512128a86cSHong Zhang     Mat                  Bt_dense,C_dense;
1252e8354b3eSHong Zhang 
12534222ddf1SHong Zhang     /* inode causes memory problem */
12544222ddf1SHong Zhang     ierr = MatSetOption(C,MAT_USE_INODES,PETSC_FALSE);CHKERRQ(ierr);
12554222ddf1SHong Zhang 
12564222ddf1SHong Zhang     ierr = MatColoringCreate(C,&coloring);CHKERRQ(ierr);
1257335efc43SPeter Brune     ierr = MatColoringSetDistance(coloring,2);CHKERRQ(ierr);
1258335efc43SPeter Brune     ierr = MatColoringSetType(coloring,MATCOLORINGSL);CHKERRQ(ierr);
1259335efc43SPeter Brune     ierr = MatColoringSetFromOptions(coloring);CHKERRQ(ierr);
1260335efc43SPeter Brune     ierr = MatColoringApply(coloring,&iscoloring);CHKERRQ(ierr);
1261335efc43SPeter Brune     ierr = MatColoringDestroy(&coloring);CHKERRQ(ierr);
12624222ddf1SHong Zhang     ierr = MatTransposeColoringCreate(C,iscoloring,&matcoloring);CHKERRQ(ierr);
12632205254eSKarl Rupp 
126440192850SHong Zhang     abt->matcoloring = matcoloring;
12652205254eSKarl Rupp 
12662128a86cSHong Zhang     ierr = ISColoringDestroy(&iscoloring);CHKERRQ(ierr);
12672128a86cSHong Zhang 
12682128a86cSHong Zhang     /* Create Bt_dense and C_dense = A*Bt_dense */
12692128a86cSHong Zhang     ierr = MatCreate(PETSC_COMM_SELF,&Bt_dense);CHKERRQ(ierr);
12702128a86cSHong Zhang     ierr = MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors);CHKERRQ(ierr);
12712128a86cSHong Zhang     ierr = MatSetType(Bt_dense,MATSEQDENSE);CHKERRQ(ierr);
12720298fd71SBarry Smith     ierr = MatSeqDenseSetPreallocation(Bt_dense,NULL);CHKERRQ(ierr);
12732205254eSKarl Rupp 
12742128a86cSHong Zhang     Bt_dense->assembled = PETSC_TRUE;
127540192850SHong Zhang     abt->Bt_den         = Bt_dense;
12762128a86cSHong Zhang 
12772128a86cSHong Zhang     ierr = MatCreate(PETSC_COMM_SELF,&C_dense);CHKERRQ(ierr);
12782128a86cSHong Zhang     ierr = MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors);CHKERRQ(ierr);
12792128a86cSHong Zhang     ierr = MatSetType(C_dense,MATSEQDENSE);CHKERRQ(ierr);
12800298fd71SBarry Smith     ierr = MatSeqDenseSetPreallocation(C_dense,NULL);CHKERRQ(ierr);
12812205254eSKarl Rupp 
12822128a86cSHong Zhang     Bt_dense->assembled = PETSC_TRUE;
128340192850SHong Zhang     abt->ABt_den  = C_dense;
1284f94ccd6cSHong Zhang 
1285f94ccd6cSHong Zhang #if defined(PETSC_USE_INFO)
12861ce71dffSSatish Balay     {
12874222ddf1SHong Zhang       Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data;
12887d3de750SJacob Faibussowitsch       ierr = PetscInfo(C,"Use coloring of C=A*B^T; B^T: %" PetscInt_FMT " %" PetscInt_FMT ", Bt_dense: %" PetscInt_FMT ",%" PetscInt_FMT "; Cnz %" PetscInt_FMT " / (cm*ncolors %" PetscInt_FMT ") = %g\n",B->cmap->n,B->rmap->n,Bt_dense->rmap->n,Bt_dense->cmap->n,c->nz,A->rmap->n*matcoloring->ncolors,(double)(((PetscReal)(c->nz))/((PetscReal)(A->rmap->n*matcoloring->ncolors))));CHKERRQ(ierr);
12891ce71dffSSatish Balay     }
1290f94ccd6cSHong Zhang #endif
12912128a86cSHong Zhang   }
1292e99be685SHong Zhang   /* clean up */
1293e99be685SHong Zhang   ierr = MatDestroy(&Bt);CHKERRQ(ierr);
1294e99be685SHong Zhang   ierr = MatRestoreSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
12955df89d91SHong Zhang   PetscFunctionReturn(0);
12965df89d91SHong Zhang }
12975df89d91SHong Zhang 
12986fc122caSHong Zhang PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
12995df89d91SHong Zhang {
13005df89d91SHong Zhang   PetscErrorCode      ierr;
13015df89d91SHong Zhang   Mat_SeqAIJ          *a   =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1302e2cac8caSJed Brown   PetscInt            *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow;
130381d82fe4SHong Zhang   PetscInt            cm   =C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol;
13045df89d91SHong Zhang   PetscLogDouble      flops=0.0;
1305aa1aec99SHong Zhang   MatScalar           *aa  =a->a,*aval,*ba=b->a,*bval,*ca,*cval;
13066718818eSStefano Zampini   Mat_MatMatTransMult *abt;
13076718818eSStefano Zampini   Mat_Product         *product = C->product;
13085df89d91SHong Zhang 
13095df89d91SHong Zhang   PetscFunctionBegin;
13102c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!product,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
13116718818eSStefano Zampini   abt = (Mat_MatMatTransMult *)product->data;
13122c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!abt,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
131358ed3ceaSHong Zhang   /* clear old values in C */
131458ed3ceaSHong Zhang   if (!c->a) {
1315580bdb30SBarry Smith     ierr      = PetscCalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
131658ed3ceaSHong Zhang     c->a      = ca;
131758ed3ceaSHong Zhang     c->free_a = PETSC_TRUE;
131858ed3ceaSHong Zhang   } else {
131958ed3ceaSHong Zhang     ca =  c->a;
1320580bdb30SBarry Smith     ierr = PetscArrayzero(ca,ci[cm]+1);CHKERRQ(ierr);
132158ed3ceaSHong Zhang   }
132258ed3ceaSHong Zhang 
132340192850SHong Zhang   if (abt->usecoloring) {
132440192850SHong Zhang     MatTransposeColoring matcoloring = abt->matcoloring;
132540192850SHong Zhang     Mat                  Bt_dense,C_dense = abt->ABt_den;
1326c8db22f6SHong Zhang 
1327b9af6bddSHong Zhang     /* Get Bt_dense by Apply MatTransposeColoring to B */
132840192850SHong Zhang     Bt_dense = abt->Bt_den;
1329b9af6bddSHong Zhang     ierr = MatTransColoringApplySpToDen(matcoloring,B,Bt_dense);CHKERRQ(ierr);
1330c8db22f6SHong Zhang 
1331c8db22f6SHong Zhang     /* C_dense = A*Bt_dense */
13322128a86cSHong Zhang     ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense);CHKERRQ(ierr);
1333c8db22f6SHong Zhang 
13342128a86cSHong Zhang     /* Recover C from C_dense */
1335b9af6bddSHong Zhang     ierr = MatTransColoringApplyDenToSp(matcoloring,C_dense,C);CHKERRQ(ierr);
1336c8db22f6SHong Zhang     PetscFunctionReturn(0);
1337c8db22f6SHong Zhang   }
1338c8db22f6SHong Zhang 
13391fa1209cSHong Zhang   for (i=0; i<cm; i++) {
134081d82fe4SHong Zhang     anzi = ai[i+1] - ai[i];
134181d82fe4SHong Zhang     acol = aj + ai[i];
13426973769bSHong Zhang     aval = aa + ai[i];
13431fa1209cSHong Zhang     cnzi = ci[i+1] - ci[i];
13441fa1209cSHong Zhang     ccol = cj + ci[i];
13456973769bSHong Zhang     cval = ca + ci[i];
13461fa1209cSHong Zhang     for (j=0; j<cnzi; j++) {
134781d82fe4SHong Zhang       brow = ccol[j];
134881d82fe4SHong Zhang       bnzj = bi[brow+1] - bi[brow];
134981d82fe4SHong Zhang       bcol = bj + bi[brow];
13506973769bSHong Zhang       bval = ba + bi[brow];
13516973769bSHong Zhang 
135281d82fe4SHong Zhang       /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
135381d82fe4SHong Zhang       nexta = 0; nextb = 0;
135481d82fe4SHong Zhang       while (nexta<anzi && nextb<bnzj) {
13557b6d5e96SMark Adams         while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++;
135681d82fe4SHong Zhang         if (nexta == anzi) break;
13577b6d5e96SMark Adams         while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++;
135881d82fe4SHong Zhang         if (nextb == bnzj) break;
135981d82fe4SHong Zhang         if (acol[nexta] == bcol[nextb]) {
13606973769bSHong Zhang           cval[j] += aval[nexta]*bval[nextb];
136181d82fe4SHong Zhang           nexta++; nextb++;
136281d82fe4SHong Zhang           flops += 2;
13631fa1209cSHong Zhang         }
13641fa1209cSHong Zhang       }
136581d82fe4SHong Zhang     }
136681d82fe4SHong Zhang   }
136781d82fe4SHong Zhang   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
136881d82fe4SHong Zhang   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
136981d82fe4SHong Zhang   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
13705df89d91SHong Zhang   PetscFunctionReturn(0);
13715df89d91SHong Zhang }
13725df89d91SHong Zhang 
13736718818eSStefano Zampini PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(void *data)
13746d373c3eSHong Zhang {
13756d373c3eSHong Zhang   PetscErrorCode      ierr;
13766718818eSStefano Zampini   Mat_MatTransMatMult *atb = (Mat_MatTransMatMult*)data;
13776d373c3eSHong Zhang 
13786d373c3eSHong Zhang   PetscFunctionBegin;
13796d373c3eSHong Zhang   ierr = MatDestroy(&atb->At);CHKERRQ(ierr);
13806718818eSStefano Zampini   if (atb->destroy) {
13816718818eSStefano Zampini     ierr = (*atb->destroy)(atb->data);CHKERRQ(ierr);
13826473ade5SStefano Zampini   }
13836d373c3eSHong Zhang   ierr = PetscFree(atb);CHKERRQ(ierr);
13846d373c3eSHong Zhang   PetscFunctionReturn(0);
13856d373c3eSHong Zhang }
13866d373c3eSHong Zhang 
13874222ddf1SHong Zhang PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
13885df89d91SHong Zhang {
1389bc011b1eSHong Zhang   PetscErrorCode ierr;
1390089a957eSStefano Zampini   Mat            At = NULL;
139138baddfdSBarry Smith   PetscInt       *ati,*atj;
13924222ddf1SHong Zhang   Mat_Product    *product = C->product;
1393089a957eSStefano Zampini   PetscBool      flg,def,square;
1394bc011b1eSHong Zhang 
1395bc011b1eSHong Zhang   PetscFunctionBegin;
1396089a957eSStefano Zampini   MatCheckProduct(C,4);
1397089a957eSStefano Zampini   square = (PetscBool)(A == B && A->symmetric && A->symmetric_set);
13984222ddf1SHong Zhang   /* outerproduct */
1399089a957eSStefano Zampini   ierr = PetscStrcmp(product->alg,"outerproduct",&flg);CHKERRQ(ierr);
14004222ddf1SHong Zhang   if (flg) {
1401bc011b1eSHong Zhang     /* create symbolic At */
1402089a957eSStefano Zampini     if (!square) {
1403bc011b1eSHong Zhang       ierr = MatGetSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
14040298fd71SBarry Smith       ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,A->cmap->n,A->rmap->n,ati,atj,NULL,&At);CHKERRQ(ierr);
140533d57670SJed Brown       ierr = MatSetBlockSizes(At,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr);
140604b858e0SBarry Smith       ierr = MatSetType(At,((PetscObject)A)->type_name);CHKERRQ(ierr);
1407089a957eSStefano Zampini     }
1408bc011b1eSHong Zhang     /* get symbolic C=At*B */
14097a3c3d58SStefano Zampini     ierr = MatProductSetAlgorithm(C,"sorted");CHKERRQ(ierr);
1410089a957eSStefano Zampini     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At,B,fill,C);CHKERRQ(ierr);
1411bc011b1eSHong Zhang 
1412bc011b1eSHong Zhang     /* clean up */
1413089a957eSStefano Zampini     if (!square) {
14146bf464f9SBarry Smith       ierr = MatDestroy(&At);CHKERRQ(ierr);
1415bc011b1eSHong Zhang       ierr = MatRestoreSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
1416089a957eSStefano Zampini     }
14176d373c3eSHong Zhang 
14184222ddf1SHong Zhang     C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */
14197a3c3d58SStefano Zampini     ierr = MatProductSetAlgorithm(C,"outerproduct");CHKERRQ(ierr);
14204222ddf1SHong Zhang     PetscFunctionReturn(0);
14214222ddf1SHong Zhang   }
14224222ddf1SHong Zhang 
14234222ddf1SHong Zhang   /* matmatmult */
1424089a957eSStefano Zampini   ierr = PetscStrcmp(product->alg,"default",&def);CHKERRQ(ierr);
1425089a957eSStefano Zampini   ierr = PetscStrcmp(product->alg,"at*b",&flg);CHKERRQ(ierr);
1426089a957eSStefano Zampini   if (flg || def) {
14274222ddf1SHong Zhang     Mat_MatTransMatMult *atb;
14284222ddf1SHong Zhang 
14292c71b3e2SJacob Faibussowitsch     PetscCheckFalse(product->data,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Extra product struct not empty");
14304222ddf1SHong Zhang     ierr = PetscNew(&atb);CHKERRQ(ierr);
1431089a957eSStefano Zampini     if (!square) {
14324222ddf1SHong Zhang       ierr = MatTranspose_SeqAIJ(A,MAT_INITIAL_MATRIX,&At);CHKERRQ(ierr);
1433089a957eSStefano Zampini     }
14347a3c3d58SStefano Zampini     ierr = MatProductSetAlgorithm(C,"sorted");CHKERRQ(ierr);
1435089a957eSStefano Zampini     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At,B,fill,C);CHKERRQ(ierr);
14366718818eSStefano Zampini     ierr = MatProductSetAlgorithm(C,"at*b");CHKERRQ(ierr);
14376718818eSStefano Zampini     product->data    = atb;
14386718818eSStefano Zampini     product->destroy = MatDestroy_SeqAIJ_MatTransMatMult;
14394222ddf1SHong Zhang     atb->At          = At;
14404222ddf1SHong Zhang     atb->updateAt    = PETSC_FALSE; /* because At is computed here */
14414222ddf1SHong Zhang 
14424222ddf1SHong Zhang     C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */
14434222ddf1SHong Zhang     PetscFunctionReturn(0);
14444222ddf1SHong Zhang   }
14454222ddf1SHong Zhang 
14464222ddf1SHong Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported");
1447bc011b1eSHong Zhang }
1448bc011b1eSHong Zhang 
144975648e8dSHong Zhang PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
1450bc011b1eSHong Zhang {
1451bc011b1eSHong Zhang   PetscErrorCode ierr;
14520fbc74f4SHong Zhang   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1453d0f46423SBarry Smith   PetscInt       am=A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb;
1454d0f46423SBarry Smith   PetscInt       cm=C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k;
1455d13dce4bSSatish Balay   PetscLogDouble flops=0.0;
1456aa1aec99SHong Zhang   MatScalar      *aa=a->a,*ba,*ca,*caj;
1457bc011b1eSHong Zhang 
1458bc011b1eSHong Zhang   PetscFunctionBegin;
1459aa1aec99SHong Zhang   if (!c->a) {
1460580bdb30SBarry Smith     ierr = PetscCalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
14612205254eSKarl Rupp 
1462aa1aec99SHong Zhang     c->a      = ca;
1463aa1aec99SHong Zhang     c->free_a = PETSC_TRUE;
1464aa1aec99SHong Zhang   } else {
1465aa1aec99SHong Zhang     ca   = c->a;
1466580bdb30SBarry Smith     ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr);
1467aa1aec99SHong Zhang   }
1468bc011b1eSHong Zhang 
1469bc011b1eSHong Zhang   /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
1470bc011b1eSHong Zhang   for (i=0; i<am; i++) {
1471bc011b1eSHong Zhang     bj   = b->j + bi[i];
1472bc011b1eSHong Zhang     ba   = b->a + bi[i];
1473bc011b1eSHong Zhang     bnzi = bi[i+1] - bi[i];
1474bc011b1eSHong Zhang     anzi = ai[i+1] - ai[i];
1475bc011b1eSHong Zhang     for (j=0; j<anzi; j++) {
1476bc011b1eSHong Zhang       nextb = 0;
14770fbc74f4SHong Zhang       crow  = *aj++;
1478bc011b1eSHong Zhang       cjj   = cj + ci[crow];
1479bc011b1eSHong Zhang       caj   = ca + ci[crow];
1480bc011b1eSHong Zhang       /* perform sparse axpy operation.  Note cjj includes bj. */
1481bc011b1eSHong Zhang       for (k=0; nextb<bnzi; k++) {
14820fbc74f4SHong Zhang         if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */
14830fbc74f4SHong Zhang           caj[k] += (*aa)*(*(ba+nextb));
1484bc011b1eSHong Zhang           nextb++;
1485bc011b1eSHong Zhang         }
1486bc011b1eSHong Zhang       }
1487bc011b1eSHong Zhang       flops += 2*bnzi;
14880fbc74f4SHong Zhang       aa++;
1489bc011b1eSHong Zhang     }
1490bc011b1eSHong Zhang   }
1491bc011b1eSHong Zhang 
1492bc011b1eSHong Zhang   /* Assemble the final matrix and clean up */
1493bc011b1eSHong Zhang   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1494bc011b1eSHong Zhang   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1495bc011b1eSHong Zhang   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
1496bc011b1eSHong Zhang   PetscFunctionReturn(0);
1497bc011b1eSHong Zhang }
14989123193aSHong Zhang 
14994222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
15009123193aSHong Zhang {
15019123193aSHong Zhang   PetscErrorCode ierr;
15029123193aSHong Zhang 
15039123193aSHong Zhang   PetscFunctionBegin;
15045a586d82SBarry Smith   ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr);
15054222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense;
15069123193aSHong Zhang   PetscFunctionReturn(0);
15079123193aSHong Zhang }
15089123193aSHong Zhang 
150993aa15f2SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C,const PetscBool add)
15109123193aSHong Zhang {
1511f32d5d43SBarry Smith   Mat_SeqAIJ        *a=(Mat_SeqAIJ*)A->data;
1512612438f1SStefano Zampini   Mat_SeqDense      *bd=(Mat_SeqDense*)B->data;
15131ca9667aSStefano Zampini   Mat_SeqDense      *cd=(Mat_SeqDense*)C->data;
15149123193aSHong Zhang   PetscErrorCode    ierr;
15151ca9667aSStefano Zampini   PetscScalar       *c,r1,r2,r3,r4,*c1,*c2,*c3,*c4;
1516a4af7ca8SStefano Zampini   const PetscScalar *aa,*b,*b1,*b2,*b3,*b4,*av;
1517daf57211SHong Zhang   const PetscInt    *aj;
1518612438f1SStefano Zampini   PetscInt          cm=C->rmap->n,cn=B->cmap->n,bm=bd->lda,am=A->rmap->n;
15191ca9667aSStefano Zampini   PetscInt          clda=cd->lda;
15201ca9667aSStefano Zampini   PetscInt          am4=4*clda,bm4=4*bm,col,i,j,n;
15219123193aSHong Zhang 
15229123193aSHong Zhang   PetscFunctionBegin;
1523f32d5d43SBarry Smith   if (!cm || !cn) PetscFunctionReturn(0);
1524a4af7ca8SStefano Zampini   ierr = MatSeqAIJGetArrayRead(A,&av);CHKERRQ(ierr);
152593aa15f2SStefano Zampini   if (add) {
15268c778c55SBarry Smith     ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr);
152793aa15f2SStefano Zampini   } else {
152893aa15f2SStefano Zampini     ierr = MatDenseGetArrayWrite(C,&c);CHKERRQ(ierr);
152993aa15f2SStefano Zampini   }
1530a4af7ca8SStefano Zampini   ierr = MatDenseGetArrayRead(B,&b);CHKERRQ(ierr);
1531f32d5d43SBarry Smith   b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
15321ca9667aSStefano Zampini   c1 = c; c2 = c1 + clda; c3 = c2 + clda; c4 = c3 + clda;
15331ca9667aSStefano Zampini   for (col=0; col<(cn/4)*4; col += 4) {  /* over columns of C */
15341ca9667aSStefano Zampini     for (i=0; i<am; i++) {        /* over rows of A in those columns */
1535f32d5d43SBarry Smith       r1 = r2 = r3 = r4 = 0.0;
1536f32d5d43SBarry Smith       n  = a->i[i+1] - a->i[i];
1537f32d5d43SBarry Smith       aj = a->j + a->i[i];
1538a4af7ca8SStefano Zampini       aa = av + a->i[i];
1539f32d5d43SBarry Smith       for (j=0; j<n; j++) {
15401ca9667aSStefano Zampini         const PetscScalar aatmp = aa[j];
15411ca9667aSStefano Zampini         const PetscInt    ajtmp = aj[j];
1542730858b9SHong Zhang         r1 += aatmp*b1[ajtmp];
1543730858b9SHong Zhang         r2 += aatmp*b2[ajtmp];
1544730858b9SHong Zhang         r3 += aatmp*b3[ajtmp];
1545730858b9SHong Zhang         r4 += aatmp*b4[ajtmp];
15469123193aSHong Zhang       }
154793aa15f2SStefano Zampini       if (add) {
154887753ddeSHong Zhang         c1[i] += r1;
154987753ddeSHong Zhang         c2[i] += r2;
155087753ddeSHong Zhang         c3[i] += r3;
155187753ddeSHong Zhang         c4[i] += r4;
155293aa15f2SStefano Zampini       } else {
155393aa15f2SStefano Zampini         c1[i] = r1;
155493aa15f2SStefano Zampini         c2[i] = r2;
155593aa15f2SStefano Zampini         c3[i] = r3;
155693aa15f2SStefano Zampini         c4[i] = r4;
155793aa15f2SStefano Zampini       }
1558f32d5d43SBarry Smith     }
1559730858b9SHong Zhang     b1 += bm4; b2 += bm4; b3 += bm4; b4 += bm4;
1560730858b9SHong Zhang     c1 += am4; c2 += am4; c3 += am4; c4 += am4;
1561f32d5d43SBarry Smith   }
156293aa15f2SStefano Zampini   /* process remaining columns */
156393aa15f2SStefano Zampini   if (col != cn) {
156493aa15f2SStefano Zampini     PetscInt rc = cn-col;
156593aa15f2SStefano Zampini 
156693aa15f2SStefano Zampini     if (rc == 1) {
156793aa15f2SStefano Zampini       for (i=0; i<am; i++) {
1568f32d5d43SBarry Smith         r1 = 0.0;
1569f32d5d43SBarry Smith         n  = a->i[i+1] - a->i[i];
1570f32d5d43SBarry Smith         aj = a->j + a->i[i];
1571a4af7ca8SStefano Zampini         aa = av + a->i[i];
157293aa15f2SStefano Zampini         for (j=0; j<n; j++) r1 += aa[j]*b1[aj[j]];
157393aa15f2SStefano Zampini         if (add) c1[i] += r1;
157493aa15f2SStefano Zampini         else c1[i] = r1;
157593aa15f2SStefano Zampini       }
157693aa15f2SStefano Zampini     } else if (rc == 2) {
157793aa15f2SStefano Zampini       for (i=0; i<am; i++) {
157893aa15f2SStefano Zampini         r1 = r2 = 0.0;
157993aa15f2SStefano Zampini         n  = a->i[i+1] - a->i[i];
158093aa15f2SStefano Zampini         aj = a->j + a->i[i];
158193aa15f2SStefano Zampini         aa = av + a->i[i];
1582f32d5d43SBarry Smith         for (j=0; j<n; j++) {
158393aa15f2SStefano Zampini           const PetscScalar aatmp = aa[j];
158493aa15f2SStefano Zampini           const PetscInt    ajtmp = aj[j];
158593aa15f2SStefano Zampini           r1 += aatmp*b1[ajtmp];
158693aa15f2SStefano Zampini           r2 += aatmp*b2[ajtmp];
1587f32d5d43SBarry Smith         }
158893aa15f2SStefano Zampini         if (add) {
158987753ddeSHong Zhang           c1[i] += r1;
159093aa15f2SStefano Zampini           c2[i] += r2;
159193aa15f2SStefano Zampini         } else {
159293aa15f2SStefano Zampini           c1[i] = r1;
159393aa15f2SStefano Zampini           c2[i] = r2;
1594f32d5d43SBarry Smith         }
159593aa15f2SStefano Zampini       }
159693aa15f2SStefano Zampini     } else {
159793aa15f2SStefano Zampini       for (i=0; i<am; i++) {
159893aa15f2SStefano Zampini         r1 = r2 = r3 = 0.0;
159993aa15f2SStefano Zampini         n  = a->i[i+1] - a->i[i];
160093aa15f2SStefano Zampini         aj = a->j + a->i[i];
160193aa15f2SStefano Zampini         aa = av + a->i[i];
160293aa15f2SStefano Zampini         for (j=0; j<n; j++) {
160393aa15f2SStefano Zampini           const PetscScalar aatmp = aa[j];
160493aa15f2SStefano Zampini           const PetscInt    ajtmp = aj[j];
160593aa15f2SStefano Zampini           r1 += aatmp*b1[ajtmp];
160693aa15f2SStefano Zampini           r2 += aatmp*b2[ajtmp];
160793aa15f2SStefano Zampini           r3 += aatmp*b3[ajtmp];
160893aa15f2SStefano Zampini         }
160993aa15f2SStefano Zampini         if (add) {
161093aa15f2SStefano Zampini           c1[i] += r1;
161193aa15f2SStefano Zampini           c2[i] += r2;
161293aa15f2SStefano Zampini           c3[i] += r3;
161393aa15f2SStefano Zampini         } else {
161493aa15f2SStefano Zampini           c1[i] = r1;
161593aa15f2SStefano Zampini           c2[i] = r2;
161693aa15f2SStefano Zampini           c3[i] = r3;
161793aa15f2SStefano Zampini         }
161893aa15f2SStefano Zampini       }
161993aa15f2SStefano Zampini     }
1620f32d5d43SBarry Smith   }
1621dc0b31edSSatish Balay   ierr = PetscLogFlops(cn*(2.0*a->nz));CHKERRQ(ierr);
162293aa15f2SStefano Zampini   if (add) {
16238c778c55SBarry Smith     ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr);
162493aa15f2SStefano Zampini   } else {
162593aa15f2SStefano Zampini     ierr = MatDenseRestoreArrayWrite(C,&c);CHKERRQ(ierr);
162693aa15f2SStefano Zampini   }
1627a4af7ca8SStefano Zampini   ierr = MatDenseRestoreArrayRead(B,&b);CHKERRQ(ierr);
1628a4af7ca8SStefano Zampini   ierr = MatSeqAIJRestoreArrayRead(A,&av);CHKERRQ(ierr);
1629f32d5d43SBarry Smith   PetscFunctionReturn(0);
1630f32d5d43SBarry Smith }
1631f32d5d43SBarry Smith 
163287753ddeSHong Zhang PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1633f32d5d43SBarry Smith {
1634f32d5d43SBarry Smith   PetscErrorCode ierr;
1635f32d5d43SBarry Smith 
1636f32d5d43SBarry Smith   PetscFunctionBegin;
16372c71b3e2SJacob Faibussowitsch   PetscCheckFalse(B->rmap->n != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT,A->cmap->n,B->rmap->n);
16382c71b3e2SJacob Faibussowitsch   PetscCheckFalse(A->rmap->n != C->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT,C->rmap->n,A->rmap->n);
16392c71b3e2SJacob Faibussowitsch   PetscCheckFalse(B->cmap->n != C->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT,B->cmap->n,C->cmap->n);
16404324174eSBarry Smith 
164193aa15f2SStefano Zampini   ierr = MatMatMultNumericAdd_SeqAIJ_SeqDense(A,B,C,PETSC_FALSE);CHKERRQ(ierr);
16429123193aSHong Zhang   PetscFunctionReturn(0);
16439123193aSHong Zhang }
1644b1683b59SHong Zhang 
16454222ddf1SHong Zhang /* ------------------------------------------------------- */
16464222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C)
16474222ddf1SHong Zhang {
16484222ddf1SHong Zhang   PetscFunctionBegin;
16494222ddf1SHong Zhang   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense;
16504222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
16514222ddf1SHong Zhang   PetscFunctionReturn(0);
16524222ddf1SHong Zhang }
16534222ddf1SHong Zhang 
16546718818eSStefano Zampini PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat,Mat,PetscReal,Mat);
16556718818eSStefano Zampini 
16564222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C)
16574222ddf1SHong Zhang {
16584222ddf1SHong Zhang   PetscFunctionBegin;
16596718818eSStefano Zampini   C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
16604222ddf1SHong Zhang   C->ops->productsymbolic          = MatProductSymbolic_AtB;
16616718818eSStefano Zampini   PetscFunctionReturn(0);
16626718818eSStefano Zampini }
16636718818eSStefano Zampini 
16646718818eSStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C)
16656718818eSStefano Zampini {
16666718818eSStefano Zampini   PetscFunctionBegin;
16676718818eSStefano Zampini   C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
16686718818eSStefano Zampini   C->ops->productsymbolic          = MatProductSymbolic_ABt;
16694222ddf1SHong Zhang   PetscFunctionReturn(0);
16704222ddf1SHong Zhang }
16714222ddf1SHong Zhang 
16724222ddf1SHong Zhang PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C)
16734222ddf1SHong Zhang {
16744222ddf1SHong Zhang   PetscErrorCode ierr;
16754222ddf1SHong Zhang   Mat_Product    *product = C->product;
16764222ddf1SHong Zhang 
16774222ddf1SHong Zhang   PetscFunctionBegin;
16784222ddf1SHong Zhang   switch (product->type) {
16794222ddf1SHong Zhang   case MATPRODUCT_AB:
16804222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C);CHKERRQ(ierr);
16814222ddf1SHong Zhang     break;
16824222ddf1SHong Zhang   case MATPRODUCT_AtB:
16834222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C);CHKERRQ(ierr);
16844222ddf1SHong Zhang     break;
16856718818eSStefano Zampini   case MATPRODUCT_ABt:
16866718818eSStefano Zampini     ierr = MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C);CHKERRQ(ierr);
16874222ddf1SHong Zhang     break;
16884222ddf1SHong Zhang   default:
16896718818eSStefano Zampini     break;
16904222ddf1SHong Zhang   }
16914222ddf1SHong Zhang   PetscFunctionReturn(0);
16924222ddf1SHong Zhang }
16934222ddf1SHong Zhang /* ------------------------------------------------------- */
16944222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C)
16954222ddf1SHong Zhang {
16964222ddf1SHong Zhang   PetscErrorCode ierr;
16974222ddf1SHong Zhang   Mat_Product    *product = C->product;
16984222ddf1SHong Zhang   Mat            A = product->A;
16994222ddf1SHong Zhang   PetscBool      baij;
17004222ddf1SHong Zhang 
17014222ddf1SHong Zhang   PetscFunctionBegin;
17024222ddf1SHong Zhang   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQBAIJ,&baij);CHKERRQ(ierr);
17034222ddf1SHong Zhang   if (!baij) { /* A is seqsbaij */
17044222ddf1SHong Zhang     PetscBool sbaij;
17054222ddf1SHong Zhang     ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQSBAIJ,&sbaij);CHKERRQ(ierr);
17062c71b3e2SJacob Faibussowitsch     PetscCheckFalse(!sbaij,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"Mat must be either seqbaij or seqsbaij format");
17074222ddf1SHong Zhang 
17084222ddf1SHong Zhang     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense;
17094222ddf1SHong Zhang   } else { /* A is seqbaij */
17104222ddf1SHong Zhang     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense;
17114222ddf1SHong Zhang   }
17124222ddf1SHong Zhang 
17134222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
17144222ddf1SHong Zhang   PetscFunctionReturn(0);
17154222ddf1SHong Zhang }
17164222ddf1SHong Zhang 
17174222ddf1SHong Zhang PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C)
17184222ddf1SHong Zhang {
17194222ddf1SHong Zhang   PetscErrorCode ierr;
17204222ddf1SHong Zhang   Mat_Product    *product = C->product;
17214222ddf1SHong Zhang 
17224222ddf1SHong Zhang   PetscFunctionBegin;
17236718818eSStefano Zampini   MatCheckProduct(C,1);
17242c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!product->A,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
17256718818eSStefano Zampini   if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric)) {
17264222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C);CHKERRQ(ierr);
17276718818eSStefano Zampini   }
17284222ddf1SHong Zhang   PetscFunctionReturn(0);
17294222ddf1SHong Zhang }
17306718818eSStefano Zampini 
17314222ddf1SHong Zhang /* ------------------------------------------------------- */
17324222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C)
17334222ddf1SHong Zhang {
17344222ddf1SHong Zhang   PetscFunctionBegin;
17354222ddf1SHong Zhang   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ;
17364222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
17374222ddf1SHong Zhang   PetscFunctionReturn(0);
17384222ddf1SHong Zhang }
17394222ddf1SHong Zhang 
17404222ddf1SHong Zhang PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C)
17414222ddf1SHong Zhang {
17424222ddf1SHong Zhang   PetscErrorCode ierr;
17434222ddf1SHong Zhang   Mat_Product    *product = C->product;
17444222ddf1SHong Zhang 
17454222ddf1SHong Zhang   PetscFunctionBegin;
17464222ddf1SHong Zhang   if (product->type == MATPRODUCT_AB) {
17474222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C);CHKERRQ(ierr);
17486718818eSStefano Zampini   }
17494222ddf1SHong Zhang   PetscFunctionReturn(0);
17504222ddf1SHong Zhang }
17514222ddf1SHong Zhang /* ------------------------------------------------------- */
17524222ddf1SHong Zhang 
1753b9af6bddSHong Zhang PetscErrorCode  MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense)
1754c8db22f6SHong Zhang {
1755c8db22f6SHong Zhang   PetscErrorCode ierr;
17562f5f1f90SHong Zhang   Mat_SeqAIJ     *b       = (Mat_SeqAIJ*)B->data;
17572f5f1f90SHong Zhang   Mat_SeqDense   *btdense = (Mat_SeqDense*)Btdense->data;
17582f5f1f90SHong Zhang   PetscInt       *bi      = b->i,*bj=b->j;
17592f5f1f90SHong Zhang   PetscInt       m        = Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns;
17602f5f1f90SHong Zhang   MatScalar      *btval,*btval_den,*ba=b->a;
17612f5f1f90SHong Zhang   PetscInt       *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors;
1762c8db22f6SHong Zhang 
1763c8db22f6SHong Zhang   PetscFunctionBegin;
17642f5f1f90SHong Zhang   btval_den=btdense->v;
1765580bdb30SBarry Smith   ierr     = PetscArrayzero(btval_den,m*n);CHKERRQ(ierr);
17662f5f1f90SHong Zhang   for (k=0; k<ncolors; k++) {
17672f5f1f90SHong Zhang     ncolumns = coloring->ncolumns[k];
17682f5f1f90SHong Zhang     for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */
1769d2853540SHong Zhang       col   = *(columns + colorforcol[k] + l);
17702f5f1f90SHong Zhang       btcol = bj + bi[col];
17712f5f1f90SHong Zhang       btval = ba + bi[col];
17722f5f1f90SHong Zhang       anz   = bi[col+1] - bi[col];
1773d2853540SHong Zhang       for (j=0; j<anz; j++) {
17742f5f1f90SHong Zhang         brow            = btcol[j];
17752f5f1f90SHong Zhang         btval_den[brow] = btval[j];
1776c8db22f6SHong Zhang       }
1777c8db22f6SHong Zhang     }
17782f5f1f90SHong Zhang     btval_den += m;
1779c8db22f6SHong Zhang   }
1780c8db22f6SHong Zhang   PetscFunctionReturn(0);
1781c8db22f6SHong Zhang }
1782c8db22f6SHong Zhang 
1783b9af6bddSHong Zhang PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp)
17848972f759SHong Zhang {
17858972f759SHong Zhang   PetscErrorCode    ierr;
1786b2d2b04fSHong Zhang   Mat_SeqAIJ        *csp = (Mat_SeqAIJ*)Csp->data;
17871683a169SBarry Smith   const PetscScalar *ca_den,*ca_den_ptr;
17881683a169SBarry Smith   PetscScalar       *ca=csp->a;
1789f99a636bSHong Zhang   PetscInt          k,l,m=Cden->rmap->n,ncolors=matcoloring->ncolors;
1790e88777f2SHong Zhang   PetscInt          brows=matcoloring->brows,*den2sp=matcoloring->den2sp;
1791077f23c2SHong Zhang   PetscInt          nrows,*row,*idx;
1792077f23c2SHong Zhang   PetscInt          *rows=matcoloring->rows,*colorforrow=matcoloring->colorforrow;
17938972f759SHong Zhang 
17948972f759SHong Zhang   PetscFunctionBegin;
17951683a169SBarry Smith   ierr   = MatDenseGetArrayRead(Cden,&ca_den);CHKERRQ(ierr);
1796f99a636bSHong Zhang 
1797077f23c2SHong Zhang   if (brows > 0) {
1798077f23c2SHong Zhang     PetscInt *lstart,row_end,row_start;
1799980ae229SHong Zhang     lstart = matcoloring->lstart;
1800580bdb30SBarry Smith     ierr = PetscArrayzero(lstart,ncolors);CHKERRQ(ierr);
1801eeb4fd02SHong Zhang 
1802077f23c2SHong Zhang     row_end = brows;
1803eeb4fd02SHong Zhang     if (row_end > m) row_end = m;
1804077f23c2SHong Zhang     for (row_start=0; row_start<m; row_start+=brows) { /* loop over row blocks of Csp */
1805077f23c2SHong Zhang       ca_den_ptr = ca_den;
1806980ae229SHong Zhang       for (k=0; k<ncolors; k++) { /* loop over colors (columns of Cden) */
1807eeb4fd02SHong Zhang         nrows = matcoloring->nrows[k];
1808eeb4fd02SHong Zhang         row   = rows  + colorforrow[k];
1809eeb4fd02SHong Zhang         idx   = den2sp + colorforrow[k];
1810eeb4fd02SHong Zhang         for (l=lstart[k]; l<nrows; l++) {
1811eeb4fd02SHong Zhang           if (row[l] >= row_end) {
1812eeb4fd02SHong Zhang             lstart[k] = l;
1813eeb4fd02SHong Zhang             break;
1814eeb4fd02SHong Zhang           } else {
1815077f23c2SHong Zhang             ca[idx[l]] = ca_den_ptr[row[l]];
1816eeb4fd02SHong Zhang           }
1817eeb4fd02SHong Zhang         }
1818077f23c2SHong Zhang         ca_den_ptr += m;
1819eeb4fd02SHong Zhang       }
1820077f23c2SHong Zhang       row_end += brows;
1821eeb4fd02SHong Zhang       if (row_end > m) row_end = m;
1822eeb4fd02SHong Zhang     }
1823077f23c2SHong Zhang   } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */
1824077f23c2SHong Zhang     ca_den_ptr = ca_den;
1825077f23c2SHong Zhang     for (k=0; k<ncolors; k++) {
1826077f23c2SHong Zhang       nrows = matcoloring->nrows[k];
1827077f23c2SHong Zhang       row   = rows  + colorforrow[k];
1828077f23c2SHong Zhang       idx   = den2sp + colorforrow[k];
1829077f23c2SHong Zhang       for (l=0; l<nrows; l++) {
1830077f23c2SHong Zhang         ca[idx[l]] = ca_den_ptr[row[l]];
1831077f23c2SHong Zhang       }
1832077f23c2SHong Zhang       ca_den_ptr += m;
1833077f23c2SHong Zhang     }
1834f99a636bSHong Zhang   }
1835f99a636bSHong Zhang 
18361683a169SBarry Smith   ierr = MatDenseRestoreArrayRead(Cden,&ca_den);CHKERRQ(ierr);
1837f99a636bSHong Zhang #if defined(PETSC_USE_INFO)
1838077f23c2SHong Zhang   if (matcoloring->brows > 0) {
18397d3de750SJacob Faibussowitsch     ierr = PetscInfo(Csp,"Loop over %" PetscInt_FMT " row blocks for den2sp\n",brows);CHKERRQ(ierr);
1840e88777f2SHong Zhang   } else {
1841077f23c2SHong Zhang     ierr = PetscInfo(Csp,"Loop over colors/columns of Cden, inefficient for large sparse matrix product \n");CHKERRQ(ierr);
1842e88777f2SHong Zhang   }
1843f99a636bSHong Zhang #endif
18448972f759SHong Zhang   PetscFunctionReturn(0);
18458972f759SHong Zhang }
18468972f759SHong Zhang 
1847b9af6bddSHong Zhang PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c)
1848b1683b59SHong Zhang {
1849b1683b59SHong Zhang   PetscErrorCode ierr;
1850e88777f2SHong Zhang   PetscInt       i,n,nrows,Nbs,j,k,m,ncols,col,cm;
18511a83f524SJed Brown   const PetscInt *is,*ci,*cj,*row_idx;
1852b28d80bdSHong Zhang   PetscInt       nis = iscoloring->n,*rowhit,bs = 1;
1853b1683b59SHong Zhang   IS             *isa;
1854b28d80bdSHong Zhang   Mat_SeqAIJ     *csp = (Mat_SeqAIJ*)mat->data;
1855e88777f2SHong Zhang   PetscInt       *colorforrow,*rows,*rows_i,*idxhit,*spidx,*den2sp,*den2sp_i;
1856e88777f2SHong Zhang   PetscInt       *colorforcol,*columns,*columns_i,brows;
1857e88777f2SHong Zhang   PetscBool      flg;
1858b1683b59SHong Zhang 
1859b1683b59SHong Zhang   PetscFunctionBegin;
1860071fcb05SBarry Smith   ierr = ISColoringGetIS(iscoloring,PETSC_USE_POINTER,PETSC_IGNORE,&isa);CHKERRQ(ierr);
1861e99be685SHong Zhang 
18627ecccc15SHong Zhang   /* bs >1 is not being tested yet! */
1863e88777f2SHong Zhang   Nbs       = mat->cmap->N/bs;
1864b1683b59SHong Zhang   c->M      = mat->rmap->N/bs;  /* set total rows, columns and local rows */
1865e88777f2SHong Zhang   c->N      = Nbs;
1866e88777f2SHong Zhang   c->m      = c->M;
1867b1683b59SHong Zhang   c->rstart = 0;
1868e88777f2SHong Zhang   c->brows  = 100;
1869b1683b59SHong Zhang 
1870b1683b59SHong Zhang   c->ncolors = nis;
1871dcca6d9dSJed Brown   ierr = PetscMalloc3(nis,&c->ncolumns,nis,&c->nrows,nis+1,&colorforrow);CHKERRQ(ierr);
1872854ce69bSBarry Smith   ierr = PetscMalloc1(csp->nz+1,&rows);CHKERRQ(ierr);
1873854ce69bSBarry Smith   ierr = PetscMalloc1(csp->nz+1,&den2sp);CHKERRQ(ierr);
1874e88777f2SHong Zhang 
1875e88777f2SHong Zhang   brows = c->brows;
1876c5929fdfSBarry Smith   ierr = PetscOptionsGetInt(NULL,NULL,"-matden2sp_brows",&brows,&flg);CHKERRQ(ierr);
1877e88777f2SHong Zhang   if (flg) c->brows = brows;
1878eeb4fd02SHong Zhang   if (brows > 0) {
1879854ce69bSBarry Smith     ierr = PetscMalloc1(nis+1,&c->lstart);CHKERRQ(ierr);
1880e88777f2SHong Zhang   }
18812205254eSKarl Rupp 
1882d2853540SHong Zhang   colorforrow[0] = 0;
1883d2853540SHong Zhang   rows_i         = rows;
1884f99a636bSHong Zhang   den2sp_i       = den2sp;
1885b1683b59SHong Zhang 
1886854ce69bSBarry Smith   ierr = PetscMalloc1(nis+1,&colorforcol);CHKERRQ(ierr);
1887854ce69bSBarry Smith   ierr = PetscMalloc1(Nbs+1,&columns);CHKERRQ(ierr);
18882205254eSKarl Rupp 
1889d2853540SHong Zhang   colorforcol[0] = 0;
1890d2853540SHong Zhang   columns_i      = columns;
1891d2853540SHong Zhang 
1892077f23c2SHong Zhang   /* get column-wise storage of mat */
1893077f23c2SHong Zhang   ierr = MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr);
1894b1683b59SHong Zhang 
1895b28d80bdSHong Zhang   cm   = c->m;
1896854ce69bSBarry Smith   ierr = PetscMalloc1(cm+1,&rowhit);CHKERRQ(ierr);
1897854ce69bSBarry Smith   ierr = PetscMalloc1(cm+1,&idxhit);CHKERRQ(ierr);
1898f99a636bSHong Zhang   for (i=0; i<nis; i++) { /* loop over color */
1899b1683b59SHong Zhang     ierr = ISGetLocalSize(isa[i],&n);CHKERRQ(ierr);
1900b1683b59SHong Zhang     ierr = ISGetIndices(isa[i],&is);CHKERRQ(ierr);
19012205254eSKarl Rupp 
1902b1683b59SHong Zhang     c->ncolumns[i] = n;
1903b1683b59SHong Zhang     if (n) {
1904580bdb30SBarry Smith       ierr = PetscArraycpy(columns_i,is,n);CHKERRQ(ierr);
1905b1683b59SHong Zhang     }
1906d2853540SHong Zhang     colorforcol[i+1] = colorforcol[i] + n;
1907d2853540SHong Zhang     columns_i       += n;
1908b1683b59SHong Zhang 
1909b1683b59SHong Zhang     /* fast, crude version requires O(N*N) work */
1910580bdb30SBarry Smith     ierr = PetscArrayzero(rowhit,cm);CHKERRQ(ierr);
1911e99be685SHong Zhang 
1912b7caf3d6SHong Zhang     for (j=0; j<n; j++) { /* loop over columns*/
1913b1683b59SHong Zhang       col     = is[j];
1914d2853540SHong Zhang       row_idx = cj + ci[col];
1915b1683b59SHong Zhang       m       = ci[col+1] - ci[col];
1916b7caf3d6SHong Zhang       for (k=0; k<m; k++) { /* loop over columns marking them in rowhit */
1917e99be685SHong Zhang         idxhit[*row_idx]   = spidx[ci[col] + k];
1918d2853540SHong Zhang         rowhit[*row_idx++] = col + 1;
1919b1683b59SHong Zhang       }
1920b1683b59SHong Zhang     }
1921b1683b59SHong Zhang     /* count the number of hits */
1922b1683b59SHong Zhang     nrows = 0;
1923e8354b3eSHong Zhang     for (j=0; j<cm; j++) {
1924b1683b59SHong Zhang       if (rowhit[j]) nrows++;
1925b1683b59SHong Zhang     }
1926b1683b59SHong Zhang     c->nrows[i]      = nrows;
1927d2853540SHong Zhang     colorforrow[i+1] = colorforrow[i] + nrows;
1928d2853540SHong Zhang 
1929b1683b59SHong Zhang     nrows = 0;
1930b7caf3d6SHong Zhang     for (j=0; j<cm; j++) { /* loop over rows */
1931b1683b59SHong Zhang       if (rowhit[j]) {
1932d2853540SHong Zhang         rows_i[nrows]   = j;
193312b89a43SHong Zhang         den2sp_i[nrows] = idxhit[j];
1934b1683b59SHong Zhang         nrows++;
1935b1683b59SHong Zhang       }
1936b1683b59SHong Zhang     }
1937e88777f2SHong Zhang     den2sp_i += nrows;
1938077f23c2SHong Zhang 
1939b1683b59SHong Zhang     ierr    = ISRestoreIndices(isa[i],&is);CHKERRQ(ierr);
1940f99a636bSHong Zhang     rows_i += nrows;
1941b1683b59SHong Zhang   }
19420298fd71SBarry Smith   ierr = MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr);
1943b28d80bdSHong Zhang   ierr = PetscFree(rowhit);CHKERRQ(ierr);
1944071fcb05SBarry Smith   ierr = ISColoringRestoreIS(iscoloring,PETSC_USE_POINTER,&isa);CHKERRQ(ierr);
19452c71b3e2SJacob Faibussowitsch   PetscCheck(csp->nz == colorforrow[nis],PETSC_COMM_SELF,PETSC_ERR_PLIB,"csp->nz %" PetscInt_FMT " != colorforrow[nis] %" PetscInt_FMT,csp->nz,colorforrow[nis]);
1946b28d80bdSHong Zhang 
1947d2853540SHong Zhang   c->colorforrow = colorforrow;
1948d2853540SHong Zhang   c->rows        = rows;
1949f99a636bSHong Zhang   c->den2sp      = den2sp;
1950d2853540SHong Zhang   c->colorforcol = colorforcol;
1951d2853540SHong Zhang   c->columns     = columns;
19522205254eSKarl Rupp 
1953f94ccd6cSHong Zhang   ierr = PetscFree(idxhit);CHKERRQ(ierr);
1954b1683b59SHong Zhang   PetscFunctionReturn(0);
1955b1683b59SHong Zhang }
1956d013fc79Sandi selinger 
19574222ddf1SHong Zhang /* --------------------------------------------------------------- */
19584222ddf1SHong Zhang static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C)
1959df97dc6dSFande Kong {
19604222ddf1SHong Zhang   PetscErrorCode ierr;
19614222ddf1SHong Zhang   Mat_Product    *product = C->product;
19624222ddf1SHong Zhang   Mat            A=product->A,B=product->B;
19634222ddf1SHong Zhang 
1964df97dc6dSFande Kong   PetscFunctionBegin;
19654222ddf1SHong Zhang   if (C->ops->mattransposemultnumeric) {
19664222ddf1SHong Zhang     /* Alg: "outerproduct" */
19676718818eSStefano Zampini     ierr = (*C->ops->mattransposemultnumeric)(A,B,C);CHKERRQ(ierr);
19684222ddf1SHong Zhang   } else {
19694222ddf1SHong Zhang     /* Alg: "matmatmult" -- C = At*B */
19706718818eSStefano Zampini     Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)product->data;
19716718818eSStefano Zampini     Mat                 At;
19724222ddf1SHong Zhang 
19732c71b3e2SJacob Faibussowitsch     PetscCheckFalse(!atb,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
19746718818eSStefano Zampini     At = atb->At;
1975089a957eSStefano Zampini     if (atb->updateAt && At) { /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ() */
19764222ddf1SHong Zhang       ierr = MatTranspose_SeqAIJ(A,MAT_REUSE_MATRIX,&At);CHKERRQ(ierr);
19774222ddf1SHong Zhang     }
1978089a957eSStefano Zampini     ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ(At ? At : A,B,C);CHKERRQ(ierr);
19794222ddf1SHong Zhang     atb->updateAt = PETSC_TRUE;
19804222ddf1SHong Zhang   }
1981df97dc6dSFande Kong   PetscFunctionReturn(0);
1982df97dc6dSFande Kong }
1983df97dc6dSFande Kong 
19844222ddf1SHong Zhang static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C)
1985d013fc79Sandi selinger {
1986d013fc79Sandi selinger   PetscErrorCode ierr;
19874222ddf1SHong Zhang   Mat_Product    *product = C->product;
19884222ddf1SHong Zhang   Mat            A=product->A,B=product->B;
19894222ddf1SHong Zhang   PetscReal      fill=product->fill;
1990d013fc79Sandi selinger 
1991d013fc79Sandi selinger   PetscFunctionBegin;
19924222ddf1SHong Zhang   ierr = MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
19932869b61bSandi selinger 
19944222ddf1SHong Zhang   C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ;
19954222ddf1SHong Zhang   PetscFunctionReturn(0);
19962869b61bSandi selinger }
1997d013fc79Sandi selinger 
19984222ddf1SHong Zhang /* --------------------------------------------------------------- */
19994222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C)
20004222ddf1SHong Zhang {
20014222ddf1SHong Zhang   PetscErrorCode ierr;
20024222ddf1SHong Zhang   Mat_Product    *product = C->product;
20034222ddf1SHong Zhang   PetscInt       alg = 0; /* default algorithm */
20044222ddf1SHong Zhang   PetscBool      flg = PETSC_FALSE;
20054222ddf1SHong Zhang #if !defined(PETSC_HAVE_HYPRE)
20064222ddf1SHong Zhang   const char     *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"};
20074222ddf1SHong Zhang   PetscInt       nalg = 7;
20084222ddf1SHong Zhang #else
20094222ddf1SHong Zhang   const char     *algTypes[8] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge","hypre"};
20104222ddf1SHong Zhang   PetscInt       nalg = 8;
20114222ddf1SHong Zhang #endif
20124222ddf1SHong Zhang 
20134222ddf1SHong Zhang   PetscFunctionBegin;
20144222ddf1SHong Zhang   /* Set default algorithm */
20154222ddf1SHong Zhang   ierr = PetscStrcmp(C->product->alg,"default",&flg);CHKERRQ(ierr);
20164222ddf1SHong Zhang   if (flg) {
20174222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
2018d013fc79Sandi selinger   }
2019d013fc79Sandi selinger 
20204222ddf1SHong Zhang   /* Get runtime option */
20214222ddf1SHong Zhang   if (product->api_user) {
20224222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
20234222ddf1SHong Zhang     ierr = PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
20244222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
20254222ddf1SHong Zhang   } else {
20264222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr);
20273e662e0bSHong Zhang     ierr = PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_AB",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
20284222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
2029d013fc79Sandi selinger   }
20304222ddf1SHong Zhang   if (flg) {
20314222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
2032d013fc79Sandi selinger   }
2033d013fc79Sandi selinger 
20344222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
20354222ddf1SHong Zhang   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ;
20364222ddf1SHong Zhang   PetscFunctionReturn(0);
20374222ddf1SHong Zhang }
2038d013fc79Sandi selinger 
20394222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C)
20404222ddf1SHong Zhang {
20414222ddf1SHong Zhang   PetscErrorCode ierr;
20424222ddf1SHong Zhang   Mat_Product    *product = C->product;
20434222ddf1SHong Zhang   PetscInt       alg = 0; /* default algorithm */
20444222ddf1SHong Zhang   PetscBool      flg = PETSC_FALSE;
2045089a957eSStefano Zampini   const char     *algTypes[3] = {"default","at*b","outerproduct"};
2046089a957eSStefano Zampini   PetscInt       nalg = 3;
2047d013fc79Sandi selinger 
20484222ddf1SHong Zhang   PetscFunctionBegin;
20494222ddf1SHong Zhang   /* Get runtime option */
20504222ddf1SHong Zhang   if (product->api_user) {
20514222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
20524222ddf1SHong Zhang     ierr = PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
20534222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
20544222ddf1SHong Zhang   } else {
20554222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr);
20563e662e0bSHong Zhang     ierr = PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_AtB",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
20574222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
20584222ddf1SHong Zhang   }
20594222ddf1SHong Zhang   if (flg) {
20604222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
20614222ddf1SHong Zhang   }
2062d013fc79Sandi selinger 
20634222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ;
20644222ddf1SHong Zhang   PetscFunctionReturn(0);
20654222ddf1SHong Zhang }
20664222ddf1SHong Zhang 
20674222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C)
20684222ddf1SHong Zhang {
20694222ddf1SHong Zhang   PetscErrorCode ierr;
20704222ddf1SHong Zhang   Mat_Product    *product = C->product;
20714222ddf1SHong Zhang   PetscInt       alg = 0; /* default algorithm */
20724222ddf1SHong Zhang   PetscBool      flg = PETSC_FALSE;
20734222ddf1SHong Zhang   const char     *algTypes[2] = {"default","color"};
20744222ddf1SHong Zhang   PetscInt       nalg = 2;
20754222ddf1SHong Zhang 
20764222ddf1SHong Zhang   PetscFunctionBegin;
20774222ddf1SHong Zhang   /* Set default algorithm */
20784222ddf1SHong Zhang   ierr = PetscStrcmp(C->product->alg,"default",&flg);CHKERRQ(ierr);
20794222ddf1SHong Zhang   if (!flg) {
20804222ddf1SHong Zhang     alg = 1;
20814222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
20824222ddf1SHong Zhang   }
20834222ddf1SHong Zhang 
20844222ddf1SHong Zhang   /* Get runtime option */
20854222ddf1SHong Zhang   if (product->api_user) {
20864222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatTransposeMult","Mat");CHKERRQ(ierr);
20874222ddf1SHong Zhang     ierr = PetscOptionsEList("-matmattransmult_via","Algorithmic approach","MatMatTransposeMult",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
20884222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
20894222ddf1SHong Zhang   } else {
20904222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABt","Mat");CHKERRQ(ierr);
20913e662e0bSHong Zhang     ierr = PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_ABt",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
20924222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
20934222ddf1SHong Zhang   }
20944222ddf1SHong Zhang   if (flg) {
20954222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
20964222ddf1SHong Zhang   }
20974222ddf1SHong Zhang 
20984222ddf1SHong Zhang   C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ;
20994222ddf1SHong Zhang   C->ops->productsymbolic          = MatProductSymbolic_ABt;
21004222ddf1SHong Zhang   PetscFunctionReturn(0);
21014222ddf1SHong Zhang }
21024222ddf1SHong Zhang 
21034222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C)
21044222ddf1SHong Zhang {
21054222ddf1SHong Zhang   PetscErrorCode ierr;
21064222ddf1SHong Zhang   Mat_Product    *product = C->product;
21074222ddf1SHong Zhang   PetscBool      flg = PETSC_FALSE;
21084222ddf1SHong Zhang   PetscInt       alg = 0; /* default algorithm -- alg=1 should be default!!! */
21094222ddf1SHong Zhang #if !defined(PETSC_HAVE_HYPRE)
21104222ddf1SHong Zhang   const char     *algTypes[2] = {"scalable","rap"};
21114222ddf1SHong Zhang   PetscInt       nalg = 2;
21124222ddf1SHong Zhang #else
21134222ddf1SHong Zhang   const char     *algTypes[3] = {"scalable","rap","hypre"};
21144222ddf1SHong Zhang   PetscInt       nalg = 3;
21154222ddf1SHong Zhang #endif
21164222ddf1SHong Zhang 
21174222ddf1SHong Zhang   PetscFunctionBegin;
21184222ddf1SHong Zhang   /* Set default algorithm */
21194222ddf1SHong Zhang   ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr);
21204222ddf1SHong Zhang   if (flg) {
21214222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
21224222ddf1SHong Zhang   }
21234222ddf1SHong Zhang 
21244222ddf1SHong Zhang   /* Get runtime option */
21254222ddf1SHong Zhang   if (product->api_user) {
21264222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
21274222ddf1SHong Zhang     ierr = PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
21284222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
21294222ddf1SHong Zhang   } else {
21304222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr);
21313e662e0bSHong Zhang     ierr = PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_PtAP",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
21324222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
21334222ddf1SHong Zhang   }
21344222ddf1SHong Zhang   if (flg) {
21354222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
21364222ddf1SHong Zhang   }
21374222ddf1SHong Zhang 
21384222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ;
21394222ddf1SHong Zhang   PetscFunctionReturn(0);
21404222ddf1SHong Zhang }
21414222ddf1SHong Zhang 
21424222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C)
21434222ddf1SHong Zhang {
21444222ddf1SHong Zhang   PetscErrorCode ierr;
21454222ddf1SHong Zhang   Mat_Product    *product = C->product;
21464222ddf1SHong Zhang   PetscBool      flg = PETSC_FALSE;
21474222ddf1SHong Zhang   PetscInt       alg = 0; /* default algorithm */
21484222ddf1SHong Zhang   const char     *algTypes[3] = {"r*a*rt","r*art","coloring_rart"};
21494222ddf1SHong Zhang   PetscInt        nalg = 3;
21504222ddf1SHong Zhang 
21514222ddf1SHong Zhang   PetscFunctionBegin;
21524222ddf1SHong Zhang   /* Set default algorithm */
21534222ddf1SHong Zhang   ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr);
21544222ddf1SHong Zhang   if (flg) {
21554222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
21564222ddf1SHong Zhang   }
21574222ddf1SHong Zhang 
21584222ddf1SHong Zhang   /* Get runtime option */
21594222ddf1SHong Zhang   if (product->api_user) {
21604222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatRARt","Mat");CHKERRQ(ierr);
21614222ddf1SHong Zhang     ierr = PetscOptionsEList("-matrart_via","Algorithmic approach","MatRARt",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
21624222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
21634222ddf1SHong Zhang   } else {
21644222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr);
21653e662e0bSHong Zhang     ierr = PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_RARt",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
21664222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
21674222ddf1SHong Zhang   }
21684222ddf1SHong Zhang   if (flg) {
21694222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
21704222ddf1SHong Zhang   }
21714222ddf1SHong Zhang 
21724222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ;
21734222ddf1SHong Zhang   PetscFunctionReturn(0);
21744222ddf1SHong Zhang }
21754222ddf1SHong Zhang 
21764222ddf1SHong Zhang /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */
21774222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C)
21784222ddf1SHong Zhang {
21794222ddf1SHong Zhang   PetscErrorCode ierr;
21804222ddf1SHong Zhang   Mat_Product    *product = C->product;
21814222ddf1SHong Zhang   PetscInt       alg = 0; /* default algorithm */
21824222ddf1SHong Zhang   PetscBool      flg = PETSC_FALSE;
21834222ddf1SHong Zhang   const char     *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"};
21844222ddf1SHong Zhang   PetscInt       nalg = 7;
21854222ddf1SHong Zhang 
21864222ddf1SHong Zhang   PetscFunctionBegin;
21874222ddf1SHong Zhang   /* Set default algorithm */
21884222ddf1SHong Zhang   ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr);
21894222ddf1SHong Zhang   if (flg) {
21904222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
21914222ddf1SHong Zhang   }
21924222ddf1SHong Zhang 
21934222ddf1SHong Zhang   /* Get runtime option */
21944222ddf1SHong Zhang   if (product->api_user) {
21954222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr);
21964222ddf1SHong Zhang     ierr = PetscOptionsEList("-matmatmatmult_via","Algorithmic approach","MatMatMatMult",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
21974222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
21984222ddf1SHong Zhang   } else {
21994222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr);
22003e662e0bSHong Zhang     ierr = PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_ABC",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
22014222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
22024222ddf1SHong Zhang   }
22034222ddf1SHong Zhang   if (flg) {
22044222ddf1SHong Zhang     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
22054222ddf1SHong Zhang   }
22064222ddf1SHong Zhang 
22074222ddf1SHong Zhang   C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ;
22084222ddf1SHong Zhang   C->ops->productsymbolic    = MatProductSymbolic_ABC;
22094222ddf1SHong Zhang   PetscFunctionReturn(0);
22104222ddf1SHong Zhang }
22114222ddf1SHong Zhang 
22124222ddf1SHong Zhang PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C)
22134222ddf1SHong Zhang {
22144222ddf1SHong Zhang   PetscErrorCode ierr;
22154222ddf1SHong Zhang   Mat_Product    *product = C->product;
22164222ddf1SHong Zhang 
22174222ddf1SHong Zhang   PetscFunctionBegin;
22184222ddf1SHong Zhang   switch (product->type) {
22194222ddf1SHong Zhang   case MATPRODUCT_AB:
22204222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_AB(C);CHKERRQ(ierr);
22214222ddf1SHong Zhang     break;
22224222ddf1SHong Zhang   case MATPRODUCT_AtB:
22234222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_AtB(C);CHKERRQ(ierr);
22244222ddf1SHong Zhang     break;
22254222ddf1SHong Zhang   case MATPRODUCT_ABt:
22264222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_ABt(C);CHKERRQ(ierr);
22274222ddf1SHong Zhang     break;
22284222ddf1SHong Zhang   case MATPRODUCT_PtAP:
22294222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_PtAP(C);CHKERRQ(ierr);
22304222ddf1SHong Zhang     break;
22314222ddf1SHong Zhang   case MATPRODUCT_RARt:
22324222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_RARt(C);CHKERRQ(ierr);
22334222ddf1SHong Zhang     break;
22344222ddf1SHong Zhang   case MATPRODUCT_ABC:
22354222ddf1SHong Zhang     ierr = MatProductSetFromOptions_SeqAIJ_ABC(C);CHKERRQ(ierr);
22364222ddf1SHong Zhang     break;
22376718818eSStefano Zampini   default:
22386718818eSStefano Zampini     break;
22394222ddf1SHong Zhang   }
2240d013fc79Sandi selinger   PetscFunctionReturn(0);
2241d013fc79Sandi selinger }
2242