xref: /petsc/src/mat/impls/aij/seq/matmatmult.c (revision a4af7ca8b2febdbbfac63dec000b7c38d6aff359)
1be1d678aSKris Buschelman 
2d50806bdSBarry Smith /*
32499ec78SHong Zhang   Defines matrix-matrix product routines for pairs of SeqAIJ matrices
4d50806bdSBarry Smith           C = A * B
5d50806bdSBarry Smith */
6d50806bdSBarry Smith 
7c6db04a5SJed Brown #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
8c6db04a5SJed Brown #include <../src/mat/utils/freespace.h>
9c6db04a5SJed Brown #include <petscbt.h>
10af0996ceSBarry Smith #include <petsc/private/isimpl.h>
1107475bc1SBarry Smith #include <../src/mat/impls/dense/seq/dense.h>
127bab7c10SHong Zhang 
135e5acdf2Sstefano_zampini 
14150d2497SBarry Smith PETSC_INTERN PetscErrorCode MatMatMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
1538baddfdSBarry Smith {
16dfbe8321SBarry Smith   PetscErrorCode ierr;
17df97dc6dSFande Kong 
18df97dc6dSFande Kong   PetscFunctionBegin;
19df97dc6dSFande Kong   if (scall == MAT_INITIAL_MATRIX) {
20df97dc6dSFande Kong     ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
21df97dc6dSFande Kong     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
22df97dc6dSFande Kong     ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
23df97dc6dSFande Kong   }
24df97dc6dSFande Kong   ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
25df97dc6dSFande Kong   ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ(A,B,*C);CHKERRQ(ierr);
26df97dc6dSFande Kong   ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
27df97dc6dSFande Kong   PetscFunctionReturn(0);
28df97dc6dSFande Kong }
29df97dc6dSFande Kong 
30df97dc6dSFande Kong PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
31df97dc6dSFande Kong {
32df97dc6dSFande Kong   PetscErrorCode ierr;
33df97dc6dSFande Kong 
34df97dc6dSFande Kong   PetscFunctionBegin;
35df97dc6dSFande Kong   if (C->ops->matmultnumeric) {
36df97dc6dSFande Kong     ierr = (*C->ops->matmultnumeric)(A,B,C);CHKERRQ(ierr);
37df97dc6dSFande Kong   } else {
38df97dc6dSFande Kong     ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A,B,C);CHKERRQ(ierr);
39df97dc6dSFande Kong   }
40df97dc6dSFande Kong   PetscFunctionReturn(0);
41df97dc6dSFande Kong }
42df97dc6dSFande Kong 
43df97dc6dSFande Kong PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
44df97dc6dSFande Kong {
45df97dc6dSFande Kong   PetscErrorCode ierr;
465e5acdf2Sstefano_zampini #if !defined(PETSC_HAVE_HYPRE)
47d7ed1a4dSandi selinger   const char     *algTypes[8] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","combined","rowmerge"};
48d013fc79Sandi selinger   PetscInt       nalg = 8;
49d7ed1a4dSandi selinger #else
50d7ed1a4dSandi selinger   const char     *algTypes[9] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","combined","rowmerge","hypre"};
51d7ed1a4dSandi selinger   PetscInt       nalg = 9;
525e5acdf2Sstefano_zampini #endif
536540a9cdSHong Zhang   PetscInt       alg = 0; /* set default algorithm */
545c66b693SKris Buschelman 
555c66b693SKris Buschelman   PetscFunctionBegin;
56715a5346SHong Zhang   ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
575e5acdf2Sstefano_zampini   ierr = PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,nalg,algTypes[0],&alg,NULL);CHKERRQ(ierr);
58d8bbc50fSBarry Smith   ierr = PetscOptionsEnd();CHKERRQ(ierr);
596540a9cdSHong Zhang   switch (alg) {
606540a9cdSHong Zhang   case 1:
61aacf854cSBarry Smith     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A,B,fill,C);CHKERRQ(ierr);
626540a9cdSHong Zhang     break;
636540a9cdSHong Zhang   case 2:
646540a9cdSHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A,B,fill,C);CHKERRQ(ierr);
656540a9cdSHong Zhang     break;
666540a9cdSHong Zhang   case 3:
670ced3a2bSJed Brown     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A,B,fill,C);CHKERRQ(ierr);
686540a9cdSHong Zhang     break;
696540a9cdSHong Zhang   case 4:
708a07c6f1SJed Brown     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A,B,fill,C);CHKERRQ(ierr);
716540a9cdSHong Zhang     break;
726540a9cdSHong Zhang   case 5:
7358cf0668SJed Brown     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A,B,fill,C);CHKERRQ(ierr);
746540a9cdSHong Zhang     break;
755e5acdf2Sstefano_zampini   case 6:
76d013fc79Sandi selinger     ierr = MatMatMult_SeqAIJ_SeqAIJ_Combined(A,B,fill,C);CHKERRQ(ierr);
77d013fc79Sandi selinger     break;
78d013fc79Sandi selinger   case 7:
79d7ed1a4dSandi selinger     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A,B,fill,C);CHKERRQ(ierr);
80d7ed1a4dSandi selinger     break;
81d7ed1a4dSandi selinger #if defined(PETSC_HAVE_HYPRE)
82d7ed1a4dSandi selinger   case 8:
835e5acdf2Sstefano_zampini     ierr = MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A,B,fill,C);CHKERRQ(ierr);
845e5acdf2Sstefano_zampini     break;
855e5acdf2Sstefano_zampini #endif
866540a9cdSHong Zhang   default:
87df97dc6dSFande Kong     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A,B,fill,C);CHKERRQ(ierr);
886540a9cdSHong Zhang     break;
8925023028SHong Zhang   }
905c66b693SKris Buschelman   PetscFunctionReturn(0);
915c66b693SKris Buschelman }
921c24bd37SHong Zhang 
93df97dc6dSFande Kong PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A,Mat B,PetscReal fill,Mat *C)
94b561aa9dSHong Zhang {
95b561aa9dSHong Zhang   PetscErrorCode     ierr;
96b561aa9dSHong Zhang   Mat_SeqAIJ         *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
97971236abSHong Zhang   PetscInt           *ai=a->i,*bi=b->i,*ci,*cj;
98c1ba5575SJed Brown   PetscInt           am =A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
99b561aa9dSHong Zhang   PetscReal          afill;
100eca6b86aSHong Zhang   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
101eca6b86aSHong Zhang   PetscTable         ta;
102fb908031SHong Zhang   PetscBT            lnkbt;
1030298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
104b561aa9dSHong Zhang 
105b561aa9dSHong Zhang   PetscFunctionBegin;
106bd958071SHong Zhang   /* Get ci and cj */
107bd958071SHong Zhang   /*---------------*/
108fb908031SHong Zhang   /* Allocate ci array, arrays for fill computation and */
109fb908031SHong Zhang   /* free space for accumulating nonzero column info */
110854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
111fb908031SHong Zhang   ci[0] = 0;
112fb908031SHong Zhang 
113fb908031SHong Zhang   /* create and initialize a linked list */
114c373ccc6SHong Zhang   ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr);
115c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b,bm,ta);
116eca6b86aSHong Zhang   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
117eca6b86aSHong Zhang   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
118eca6b86aSHong Zhang 
119eca6b86aSHong Zhang   ierr = PetscLLCondensedCreate(Crmax,bn,&lnk,&lnkbt);CHKERRQ(ierr);
120fb908031SHong Zhang 
121fb908031SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
122f91af8c7SBarry Smith   ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
1232205254eSKarl Rupp 
124fb908031SHong Zhang   current_space = free_space;
125fb908031SHong Zhang 
126fb908031SHong Zhang   /* Determine ci and cj */
127fb908031SHong Zhang   for (i=0; i<am; i++) {
128fb908031SHong Zhang     anzi = ai[i+1] - ai[i];
129fb908031SHong Zhang     aj   = a->j + ai[i];
130fb908031SHong Zhang     for (j=0; j<anzi; j++) {
131fb908031SHong Zhang       brow = aj[j];
132fb908031SHong Zhang       bnzj = bi[brow+1] - bi[brow];
133fb908031SHong Zhang       bj   = b->j + bi[brow];
134fb908031SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
135fb908031SHong Zhang       ierr = PetscLLCondensedAddSorted(bnzj,bj,lnk,lnkbt);CHKERRQ(ierr);
136fb908031SHong Zhang     }
137fb908031SHong Zhang     cnzi = lnk[0];
138fb908031SHong Zhang 
139fb908031SHong Zhang     /* If free space is not available, make more free space */
140fb908031SHong Zhang     /* Double the amount of total space in the list */
141fb908031SHong Zhang     if (current_space->local_remaining<cnzi) {
142f91af8c7SBarry Smith       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);CHKERRQ(ierr);
143fb908031SHong Zhang       ndouble++;
144fb908031SHong Zhang     }
145fb908031SHong Zhang 
146fb908031SHong Zhang     /* Copy data into free space, then initialize lnk */
147fb908031SHong Zhang     ierr = PetscLLCondensedClean(bn,cnzi,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
1482205254eSKarl Rupp 
149fb908031SHong Zhang     current_space->array           += cnzi;
150fb908031SHong Zhang     current_space->local_used      += cnzi;
151fb908031SHong Zhang     current_space->local_remaining -= cnzi;
1522205254eSKarl Rupp 
153fb908031SHong Zhang     ci[i+1] = ci[i] + cnzi;
154fb908031SHong Zhang   }
155fb908031SHong Zhang 
156fb908031SHong Zhang   /* Column indices are in the list of free space */
157fb908031SHong Zhang   /* Allocate space for cj, initialize cj, and */
158fb908031SHong Zhang   /* destroy list of free space and other temporary array(s) */
159854ce69bSBarry Smith   ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr);
160fb908031SHong Zhang   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
161fb908031SHong Zhang   ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr);
162b561aa9dSHong Zhang 
16326be0446SHong Zhang   /* put together the new symbolic matrix */
164ce94432eSBarry Smith   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr);
16533d57670SJed Brown   ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
16602fe1965SBarry Smith   ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr);
16758c24d83SHong Zhang 
16858c24d83SHong Zhang   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
16958c24d83SHong Zhang   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
17058c24d83SHong Zhang   c                         = (Mat_SeqAIJ*)((*C)->data);
171aa1aec99SHong Zhang   c->free_a                 = PETSC_FALSE;
172e6b907acSBarry Smith   c->free_ij                = PETSC_TRUE;
17358c24d83SHong Zhang   c->nonew                  = 0;
174df97dc6dSFande Kong   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; /* fast, needs non-scalable O(bn) array 'abdense' */
1750b7e3e3dSHong Zhang 
1767212da7cSHong Zhang   /* set MatInfo */
1777212da7cSHong Zhang   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
178f2b054eeSHong Zhang   if (afill < 1.0) afill = 1.0;
1797212da7cSHong Zhang   c->maxnz                     = ci[am];
1807212da7cSHong Zhang   c->nz                        = ci[am];
181fb908031SHong Zhang   (*C)->info.mallocs           = ndouble;
1827212da7cSHong Zhang   (*C)->info.fill_ratio_given  = fill;
1837212da7cSHong Zhang   (*C)->info.fill_ratio_needed = afill;
1847212da7cSHong Zhang 
1857212da7cSHong Zhang #if defined(PETSC_USE_INFO)
1867212da7cSHong Zhang   if (ci[am]) {
18757622a8eSBarry Smith     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
18857622a8eSBarry Smith     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
189f2b054eeSHong Zhang   } else {
190f2b054eeSHong Zhang     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
191be0fcf8dSHong Zhang   }
192f2b054eeSHong Zhang #endif
19358c24d83SHong Zhang   PetscFunctionReturn(0);
19458c24d83SHong Zhang }
195d50806bdSBarry Smith 
196df97dc6dSFande Kong PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,Mat C)
197d50806bdSBarry Smith {
198dfbe8321SBarry Smith   PetscErrorCode ierr;
199d13dce4bSSatish Balay   PetscLogDouble flops=0.0;
200d50806bdSBarry Smith   Mat_SeqAIJ     *a   = (Mat_SeqAIJ*)A->data;
201d50806bdSBarry Smith   Mat_SeqAIJ     *b   = (Mat_SeqAIJ*)B->data;
202d50806bdSBarry Smith   Mat_SeqAIJ     *c   = (Mat_SeqAIJ*)C->data;
20338baddfdSBarry Smith   PetscInt       *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
204c58ca2e3SHong Zhang   PetscInt       am   =A->rmap->n,cm=C->rmap->n;
205519eb980SHong Zhang   PetscInt       i,j,k,anzi,bnzi,cnzi,brow;
206aa1aec99SHong Zhang   PetscScalar    *aa=a->a,*ba=b->a,*baj,*ca,valtmp;
207aa1aec99SHong Zhang   PetscScalar    *ab_dense;
208d50806bdSBarry Smith 
209d50806bdSBarry Smith   PetscFunctionBegin;
210aa1aec99SHong Zhang   if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
211854ce69bSBarry Smith     ierr      = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
212aa1aec99SHong Zhang     c->a      = ca;
213aa1aec99SHong Zhang     c->free_a = PETSC_TRUE;
214aa1aec99SHong Zhang   } else {
215aa1aec99SHong Zhang     ca        = c->a;
216d90d86d0SMatthew G. Knepley   }
217d90d86d0SMatthew G. Knepley   if (!c->matmult_abdense) {
2181795a4d1SJed Brown     ierr = PetscCalloc1(B->cmap->N,&ab_dense);CHKERRQ(ierr);
219d90d86d0SMatthew G. Knepley     c->matmult_abdense = ab_dense;
220d90d86d0SMatthew G. Knepley   } else {
221aa1aec99SHong Zhang     ab_dense = c->matmult_abdense;
222aa1aec99SHong Zhang   }
223aa1aec99SHong Zhang 
224c124e916SHong Zhang   /* clean old values in C */
225580bdb30SBarry Smith   ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr);
226d50806bdSBarry Smith   /* Traverse A row-wise. */
227d50806bdSBarry Smith   /* Build the ith row in C by summing over nonzero columns in A, */
228d50806bdSBarry Smith   /* the rows of B corresponding to nonzeros of A. */
229d50806bdSBarry Smith   for (i=0; i<am; i++) {
230d50806bdSBarry Smith     anzi = ai[i+1] - ai[i];
231d50806bdSBarry Smith     for (j=0; j<anzi; j++) {
232519eb980SHong Zhang       brow = aj[j];
233d50806bdSBarry Smith       bnzi = bi[brow+1] - bi[brow];
234d50806bdSBarry Smith       bjj  = bj + bi[brow];
235d50806bdSBarry Smith       baj  = ba + bi[brow];
236519eb980SHong Zhang       /* perform dense axpy */
23736ec6d2dSHong Zhang       valtmp = aa[j];
238519eb980SHong Zhang       for (k=0; k<bnzi; k++) {
23936ec6d2dSHong Zhang         ab_dense[bjj[k]] += valtmp*baj[k];
240519eb980SHong Zhang       }
241519eb980SHong Zhang       flops += 2*bnzi;
242519eb980SHong Zhang     }
243c58ca2e3SHong Zhang     aj += anzi; aa += anzi;
244c58ca2e3SHong Zhang 
245c58ca2e3SHong Zhang     cnzi = ci[i+1] - ci[i];
246519eb980SHong Zhang     for (k=0; k<cnzi; k++) {
247519eb980SHong Zhang       ca[k]          += ab_dense[cj[k]];
248519eb980SHong Zhang       ab_dense[cj[k]] = 0.0; /* zero ab_dense */
249519eb980SHong Zhang     }
250519eb980SHong Zhang     flops += cnzi;
251519eb980SHong Zhang     cj    += cnzi; ca += cnzi;
252519eb980SHong Zhang   }
253c58ca2e3SHong Zhang   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
254c58ca2e3SHong Zhang   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
255c58ca2e3SHong Zhang   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
256c58ca2e3SHong Zhang   PetscFunctionReturn(0);
257c58ca2e3SHong Zhang }
258c58ca2e3SHong Zhang 
25925023028SHong Zhang PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C)
260c58ca2e3SHong Zhang {
261c58ca2e3SHong Zhang   PetscErrorCode ierr;
262c58ca2e3SHong Zhang   PetscLogDouble flops=0.0;
263c58ca2e3SHong Zhang   Mat_SeqAIJ     *a   = (Mat_SeqAIJ*)A->data;
264c58ca2e3SHong Zhang   Mat_SeqAIJ     *b   = (Mat_SeqAIJ*)B->data;
265c58ca2e3SHong Zhang   Mat_SeqAIJ     *c   = (Mat_SeqAIJ*)C->data;
266c58ca2e3SHong Zhang   PetscInt       *ai  = a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
267c58ca2e3SHong Zhang   PetscInt       am   = A->rmap->N,cm=C->rmap->N;
268c58ca2e3SHong Zhang   PetscInt       i,j,k,anzi,bnzi,cnzi,brow;
26936ec6d2dSHong Zhang   PetscScalar    *aa=a->a,*ba=b->a,*baj,*ca=c->a,valtmp;
270c58ca2e3SHong Zhang   PetscInt       nextb;
271c58ca2e3SHong Zhang 
272c58ca2e3SHong Zhang   PetscFunctionBegin;
273cf742fccSHong Zhang   if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
274cf742fccSHong Zhang     ierr      = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
275cf742fccSHong Zhang     c->a      = ca;
276cf742fccSHong Zhang     c->free_a = PETSC_TRUE;
277cf742fccSHong Zhang   }
278cf742fccSHong Zhang 
279c58ca2e3SHong Zhang   /* clean old values in C */
280580bdb30SBarry Smith   ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr);
281c58ca2e3SHong Zhang   /* Traverse A row-wise. */
282c58ca2e3SHong Zhang   /* Build the ith row in C by summing over nonzero columns in A, */
283c58ca2e3SHong Zhang   /* the rows of B corresponding to nonzeros of A. */
284519eb980SHong Zhang   for (i=0; i<am; i++) {
285519eb980SHong Zhang     anzi = ai[i+1] - ai[i];
286519eb980SHong Zhang     cnzi = ci[i+1] - ci[i];
287519eb980SHong Zhang     for (j=0; j<anzi; j++) {
288519eb980SHong Zhang       brow = aj[j];
289519eb980SHong Zhang       bnzi = bi[brow+1] - bi[brow];
290519eb980SHong Zhang       bjj  = bj + bi[brow];
291519eb980SHong Zhang       baj  = ba + bi[brow];
292519eb980SHong Zhang       /* perform sparse axpy */
29336ec6d2dSHong Zhang       valtmp = aa[j];
294c124e916SHong Zhang       nextb  = 0;
295c124e916SHong Zhang       for (k=0; nextb<bnzi; k++) {
296c124e916SHong Zhang         if (cj[k] == bjj[nextb]) { /* ccol == bcol */
29736ec6d2dSHong Zhang           ca[k] += valtmp*baj[nextb++];
298c124e916SHong Zhang         }
299d50806bdSBarry Smith       }
300d50806bdSBarry Smith       flops += 2*bnzi;
301d50806bdSBarry Smith     }
302519eb980SHong Zhang     aj += anzi; aa += anzi;
303519eb980SHong Zhang     cj += cnzi; ca += cnzi;
304d50806bdSBarry Smith   }
305c58ca2e3SHong Zhang 
306716bacf3SKris Buschelman   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
307716bacf3SKris Buschelman   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
308d50806bdSBarry Smith   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
309d50806bdSBarry Smith   PetscFunctionReturn(0);
310d50806bdSBarry Smith }
311bc011b1eSHong Zhang 
3123c50cad2SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A,Mat B,PetscReal fill,Mat *C)
31325296bd5SBarry Smith {
31425296bd5SBarry Smith   PetscErrorCode     ierr;
31525296bd5SBarry Smith   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
31625296bd5SBarry Smith   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
3173c50cad2SHong Zhang   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
31825296bd5SBarry Smith   MatScalar          *ca;
31925296bd5SBarry Smith   PetscReal          afill;
320eca6b86aSHong Zhang   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
321eca6b86aSHong Zhang   PetscTable         ta;
3220298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
32325296bd5SBarry Smith 
32425296bd5SBarry Smith   PetscFunctionBegin;
3253c50cad2SHong Zhang   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
3263c50cad2SHong Zhang   /*-----------------------------------------------------------------------------------------*/
3273c50cad2SHong Zhang   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
328854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
3293c50cad2SHong Zhang   ci[0] = 0;
3303c50cad2SHong Zhang 
3313c50cad2SHong Zhang   /* create and initialize a linked list */
332c373ccc6SHong Zhang   ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr);
333c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b,bm,ta);
334eca6b86aSHong Zhang   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
335eca6b86aSHong Zhang   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
336eca6b86aSHong Zhang 
337eca6b86aSHong Zhang   ierr = PetscLLCondensedCreate_fast(Crmax,&lnk);CHKERRQ(ierr);
3383c50cad2SHong Zhang 
3393c50cad2SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
340f91af8c7SBarry Smith   ierr          = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
3413c50cad2SHong Zhang   current_space = free_space;
3423c50cad2SHong Zhang 
3433c50cad2SHong Zhang   /* Determine ci and cj */
3443c50cad2SHong Zhang   for (i=0; i<am; i++) {
3453c50cad2SHong Zhang     anzi = ai[i+1] - ai[i];
3463c50cad2SHong Zhang     aj   = a->j + ai[i];
3473c50cad2SHong Zhang     for (j=0; j<anzi; j++) {
3483c50cad2SHong Zhang       brow = aj[j];
3493c50cad2SHong Zhang       bnzj = bi[brow+1] - bi[brow];
3503c50cad2SHong Zhang       bj   = b->j + bi[brow];
3513c50cad2SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
3523c50cad2SHong Zhang       ierr = PetscLLCondensedAddSorted_fast(bnzj,bj,lnk);CHKERRQ(ierr);
3533c50cad2SHong Zhang     }
3543c50cad2SHong Zhang     cnzi = lnk[1];
3553c50cad2SHong Zhang 
3563c50cad2SHong Zhang     /* If free space is not available, make more free space */
3573c50cad2SHong Zhang     /* Double the amount of total space in the list */
3583c50cad2SHong Zhang     if (current_space->local_remaining<cnzi) {
359f91af8c7SBarry Smith       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);CHKERRQ(ierr);
3603c50cad2SHong Zhang       ndouble++;
3613c50cad2SHong Zhang     }
3623c50cad2SHong Zhang 
3633c50cad2SHong Zhang     /* Copy data into free space, then initialize lnk */
3643c50cad2SHong Zhang     ierr = PetscLLCondensedClean_fast(cnzi,current_space->array,lnk);CHKERRQ(ierr);
3652205254eSKarl Rupp 
3663c50cad2SHong Zhang     current_space->array           += cnzi;
3673c50cad2SHong Zhang     current_space->local_used      += cnzi;
3683c50cad2SHong Zhang     current_space->local_remaining -= cnzi;
3692205254eSKarl Rupp 
3703c50cad2SHong Zhang     ci[i+1] = ci[i] + cnzi;
3713c50cad2SHong Zhang   }
3723c50cad2SHong Zhang 
3733c50cad2SHong Zhang   /* Column indices are in the list of free space */
3743c50cad2SHong Zhang   /* Allocate space for cj, initialize cj, and */
3753c50cad2SHong Zhang   /* destroy list of free space and other temporary array(s) */
376854ce69bSBarry Smith   ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr);
3773c50cad2SHong Zhang   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
3783c50cad2SHong Zhang   ierr = PetscLLCondensedDestroy_fast(lnk);CHKERRQ(ierr);
37925296bd5SBarry Smith 
38025296bd5SBarry Smith   /* Allocate space for ca */
381580bdb30SBarry Smith   ierr = PetscCalloc1(ci[am]+1,&ca);CHKERRQ(ierr);
38225296bd5SBarry Smith 
38325296bd5SBarry Smith   /* put together the new symbolic matrix */
384ce94432eSBarry Smith   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,C);CHKERRQ(ierr);
38533d57670SJed Brown   ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
38602fe1965SBarry Smith   ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr);
38725296bd5SBarry Smith 
38825296bd5SBarry Smith   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
38925296bd5SBarry Smith   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
39025296bd5SBarry Smith   c          = (Mat_SeqAIJ*)((*C)->data);
39125296bd5SBarry Smith   c->free_a  = PETSC_TRUE;
39225296bd5SBarry Smith   c->free_ij = PETSC_TRUE;
39325296bd5SBarry Smith   c->nonew   = 0;
3942205254eSKarl Rupp 
39525296bd5SBarry Smith   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; /* slower, less memory */
39625296bd5SBarry Smith 
39725296bd5SBarry Smith   /* set MatInfo */
39825296bd5SBarry Smith   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
39925296bd5SBarry Smith   if (afill < 1.0) afill = 1.0;
40025296bd5SBarry Smith   c->maxnz                     = ci[am];
40125296bd5SBarry Smith   c->nz                        = ci[am];
4023c50cad2SHong Zhang   (*C)->info.mallocs           = ndouble;
40325296bd5SBarry Smith   (*C)->info.fill_ratio_given  = fill;
40425296bd5SBarry Smith   (*C)->info.fill_ratio_needed = afill;
40525296bd5SBarry Smith 
40625296bd5SBarry Smith #if defined(PETSC_USE_INFO)
40725296bd5SBarry Smith   if (ci[am]) {
40857622a8eSBarry Smith     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
40957622a8eSBarry Smith     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
41025296bd5SBarry Smith   } else {
41125296bd5SBarry Smith     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
41225296bd5SBarry Smith   }
41325296bd5SBarry Smith #endif
41425296bd5SBarry Smith   PetscFunctionReturn(0);
41525296bd5SBarry Smith }
41625296bd5SBarry Smith 
41725296bd5SBarry Smith 
41825023028SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat *C)
419e9e4536cSHong Zhang {
420e9e4536cSHong Zhang   PetscErrorCode     ierr;
421e9e4536cSHong Zhang   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
422bf9555e6SHong Zhang   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
42325c41797SHong Zhang   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
424e9e4536cSHong Zhang   MatScalar          *ca;
425e9e4536cSHong Zhang   PetscReal          afill;
426eca6b86aSHong Zhang   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
427eca6b86aSHong Zhang   PetscTable         ta;
4280298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
429e9e4536cSHong Zhang 
430e9e4536cSHong Zhang   PetscFunctionBegin;
431bd958071SHong Zhang   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
432bd958071SHong Zhang   /*---------------------------------------------------------------------------------------------*/
433bd958071SHong Zhang   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
434854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
435bd958071SHong Zhang   ci[0] = 0;
436bd958071SHong Zhang 
437bd958071SHong Zhang   /* create and initialize a linked list */
438c373ccc6SHong Zhang   ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr);
439c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b,bm,ta);
440eca6b86aSHong Zhang   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
441eca6b86aSHong Zhang   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
442eca6b86aSHong Zhang   ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);
443bd958071SHong Zhang 
444bd958071SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
445f91af8c7SBarry Smith   ierr          = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
446bd958071SHong Zhang   current_space = free_space;
447bd958071SHong Zhang 
448bd958071SHong Zhang   /* Determine ci and cj */
449bd958071SHong Zhang   for (i=0; i<am; i++) {
450bd958071SHong Zhang     anzi = ai[i+1] - ai[i];
451bd958071SHong Zhang     aj   = a->j + ai[i];
452bd958071SHong Zhang     for (j=0; j<anzi; j++) {
453bd958071SHong Zhang       brow = aj[j];
454bd958071SHong Zhang       bnzj = bi[brow+1] - bi[brow];
455bd958071SHong Zhang       bj   = b->j + bi[brow];
456bd958071SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
457bd958071SHong Zhang       ierr = PetscLLCondensedAddSorted_Scalable(bnzj,bj,lnk);CHKERRQ(ierr);
458bd958071SHong Zhang     }
459bd958071SHong Zhang     cnzi = lnk[0];
460bd958071SHong Zhang 
461bd958071SHong Zhang     /* If free space is not available, make more free space */
462bd958071SHong Zhang     /* Double the amount of total space in the list */
463bd958071SHong Zhang     if (current_space->local_remaining<cnzi) {
464f91af8c7SBarry Smith       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);CHKERRQ(ierr);
465bd958071SHong Zhang       ndouble++;
466bd958071SHong Zhang     }
467bd958071SHong Zhang 
468bd958071SHong Zhang     /* Copy data into free space, then initialize lnk */
469bd958071SHong Zhang     ierr = PetscLLCondensedClean_Scalable(cnzi,current_space->array,lnk);CHKERRQ(ierr);
4702205254eSKarl Rupp 
471bd958071SHong Zhang     current_space->array           += cnzi;
472bd958071SHong Zhang     current_space->local_used      += cnzi;
473bd958071SHong Zhang     current_space->local_remaining -= cnzi;
4742205254eSKarl Rupp 
475bd958071SHong Zhang     ci[i+1] = ci[i] + cnzi;
476bd958071SHong Zhang   }
477bd958071SHong Zhang 
478bd958071SHong Zhang   /* Column indices are in the list of free space */
479bd958071SHong Zhang   /* Allocate space for cj, initialize cj, and */
480bd958071SHong Zhang   /* destroy list of free space and other temporary array(s) */
481854ce69bSBarry Smith   ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr);
482bd958071SHong Zhang   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
483bd958071SHong Zhang   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
484e9e4536cSHong Zhang 
485e9e4536cSHong Zhang   /* Allocate space for ca */
486bd958071SHong Zhang   /*-----------------------*/
487580bdb30SBarry Smith   ierr = PetscCalloc1(ci[am]+1,&ca);CHKERRQ(ierr);
488e9e4536cSHong Zhang 
489e9e4536cSHong Zhang   /* put together the new symbolic matrix */
490ce94432eSBarry Smith   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,C);CHKERRQ(ierr);
49133d57670SJed Brown   ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
49202fe1965SBarry Smith   ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr);
493e9e4536cSHong Zhang 
494e9e4536cSHong Zhang   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
495e9e4536cSHong Zhang   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
496e9e4536cSHong Zhang   c          = (Mat_SeqAIJ*)((*C)->data);
497e9e4536cSHong Zhang   c->free_a  = PETSC_TRUE;
498e9e4536cSHong Zhang   c->free_ij = PETSC_TRUE;
499e9e4536cSHong Zhang   c->nonew   = 0;
5002205254eSKarl Rupp 
50125023028SHong Zhang   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; /* slower, less memory */
502e9e4536cSHong Zhang 
503e9e4536cSHong Zhang   /* set MatInfo */
504e9e4536cSHong Zhang   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
505e9e4536cSHong Zhang   if (afill < 1.0) afill = 1.0;
506e9e4536cSHong Zhang   c->maxnz                     = ci[am];
507e9e4536cSHong Zhang   c->nz                        = ci[am];
508bd958071SHong Zhang   (*C)->info.mallocs           = ndouble;
509e9e4536cSHong Zhang   (*C)->info.fill_ratio_given  = fill;
510e9e4536cSHong Zhang   (*C)->info.fill_ratio_needed = afill;
511e9e4536cSHong Zhang 
512e9e4536cSHong Zhang #if defined(PETSC_USE_INFO)
513e9e4536cSHong Zhang   if (ci[am]) {
51457622a8eSBarry Smith     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
51557622a8eSBarry Smith     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
516e9e4536cSHong Zhang   } else {
517e9e4536cSHong Zhang     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
518e9e4536cSHong Zhang   }
519e9e4536cSHong Zhang #endif
520e9e4536cSHong Zhang   PetscFunctionReturn(0);
521e9e4536cSHong Zhang }
522e9e4536cSHong Zhang 
5230ced3a2bSJed Brown PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A,Mat B,PetscReal fill,Mat *C)
5240ced3a2bSJed Brown {
5250ced3a2bSJed Brown   PetscErrorCode     ierr;
5260ced3a2bSJed Brown   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
5270ced3a2bSJed Brown   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j;
5280ced3a2bSJed Brown   PetscInt           *ci,*cj,*bb;
5290ced3a2bSJed Brown   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
5300ced3a2bSJed Brown   PetscReal          afill;
5310ced3a2bSJed Brown   PetscInt           i,j,col,ndouble = 0;
5320298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
5330ced3a2bSJed Brown   PetscHeap          h;
5340ced3a2bSJed Brown 
5350ced3a2bSJed Brown   PetscFunctionBegin;
536cd093f1dSJed Brown   /* Get ci and cj - by merging sorted rows using a heap */
5370ced3a2bSJed Brown   /*---------------------------------------------------------------------------------------------*/
5380ced3a2bSJed Brown   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
539854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
5400ced3a2bSJed Brown   ci[0] = 0;
5410ced3a2bSJed Brown 
5420ced3a2bSJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
543f91af8c7SBarry Smith   ierr          = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
5440ced3a2bSJed Brown   current_space = free_space;
5450ced3a2bSJed Brown 
5460ced3a2bSJed Brown   ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr);
547785e854fSJed Brown   ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr);
5480ced3a2bSJed Brown 
5490ced3a2bSJed Brown   /* Determine ci and cj */
5500ced3a2bSJed Brown   for (i=0; i<am; i++) {
5510ced3a2bSJed Brown     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
5520ced3a2bSJed Brown     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
5530ced3a2bSJed Brown     ci[i+1] = ci[i];
5540ced3a2bSJed Brown     /* Populate the min heap */
5550ced3a2bSJed Brown     for (j=0; j<anzi; j++) {
5560ced3a2bSJed Brown       bb[j] = bi[acol[j]];         /* bb points at the start of the row */
5570ced3a2bSJed Brown       if (bb[j] < bi[acol[j]+1]) { /* Add if row is nonempty */
5580ced3a2bSJed Brown         ierr = PetscHeapAdd(h,j,bj[bb[j]++]);CHKERRQ(ierr);
5590ced3a2bSJed Brown       }
5600ced3a2bSJed Brown     }
5610ced3a2bSJed Brown     /* Pick off the min element, adding it to free space */
5620ced3a2bSJed Brown     ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
5630ced3a2bSJed Brown     while (j >= 0) {
5640ced3a2bSJed Brown       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
565f91af8c7SBarry Smith         ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space);CHKERRQ(ierr);
5660ced3a2bSJed Brown         ndouble++;
5670ced3a2bSJed Brown       }
5680ced3a2bSJed Brown       *(current_space->array++) = col;
5690ced3a2bSJed Brown       current_space->local_used++;
5700ced3a2bSJed Brown       current_space->local_remaining--;
5710ced3a2bSJed Brown       ci[i+1]++;
5720ced3a2bSJed Brown 
5730ced3a2bSJed Brown       /* stash if anything else remains in this row of B */
5740ced3a2bSJed Brown       if (bb[j] < bi[acol[j]+1]) {ierr = PetscHeapStash(h,j,bj[bb[j]++]);CHKERRQ(ierr);}
5750ced3a2bSJed Brown       while (1) {               /* pop and stash any other rows of B that also had an entry in this column */
5760ced3a2bSJed Brown         PetscInt j2,col2;
5770ced3a2bSJed Brown         ierr = PetscHeapPeek(h,&j2,&col2);CHKERRQ(ierr);
5780ced3a2bSJed Brown         if (col2 != col) break;
5790ced3a2bSJed Brown         ierr = PetscHeapPop(h,&j2,&col2);CHKERRQ(ierr);
5800ced3a2bSJed Brown         if (bb[j2] < bi[acol[j2]+1]) {ierr = PetscHeapStash(h,j2,bj[bb[j2]++]);CHKERRQ(ierr);}
5810ced3a2bSJed Brown       }
5820ced3a2bSJed Brown       /* Put any stashed elements back into the min heap */
5830ced3a2bSJed Brown       ierr = PetscHeapUnstash(h);CHKERRQ(ierr);
5840ced3a2bSJed Brown       ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
5850ced3a2bSJed Brown     }
5860ced3a2bSJed Brown   }
5870ced3a2bSJed Brown   ierr = PetscFree(bb);CHKERRQ(ierr);
5880ced3a2bSJed Brown   ierr = PetscHeapDestroy(&h);CHKERRQ(ierr);
5890ced3a2bSJed Brown 
5900ced3a2bSJed Brown   /* Column indices are in the list of free space */
5910ced3a2bSJed Brown   /* Allocate space for cj, initialize cj, and */
5920ced3a2bSJed Brown   /* destroy list of free space and other temporary array(s) */
593785e854fSJed Brown   ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr);
5940ced3a2bSJed Brown   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
5950ced3a2bSJed Brown 
5960ced3a2bSJed Brown   /* put together the new symbolic matrix */
597ce94432eSBarry Smith   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr);
59833d57670SJed Brown   ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
59902fe1965SBarry Smith   ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr);
6000ced3a2bSJed Brown 
6010ced3a2bSJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
6020ced3a2bSJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
6030ced3a2bSJed Brown   c          = (Mat_SeqAIJ*)((*C)->data);
6040ced3a2bSJed Brown   c->free_a  = PETSC_TRUE;
6050ced3a2bSJed Brown   c->free_ij = PETSC_TRUE;
6060ced3a2bSJed Brown   c->nonew   = 0;
60726fbe8dcSKarl Rupp 
608df97dc6dSFande Kong   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
6090ced3a2bSJed Brown 
6100ced3a2bSJed Brown   /* set MatInfo */
6110ced3a2bSJed Brown   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
6120ced3a2bSJed Brown   if (afill < 1.0) afill = 1.0;
6130ced3a2bSJed Brown   c->maxnz                     = ci[am];
6140ced3a2bSJed Brown   c->nz                        = ci[am];
6150ced3a2bSJed Brown   (*C)->info.mallocs           = ndouble;
6160ced3a2bSJed Brown   (*C)->info.fill_ratio_given  = fill;
6170ced3a2bSJed Brown   (*C)->info.fill_ratio_needed = afill;
6180ced3a2bSJed Brown 
6190ced3a2bSJed Brown #if defined(PETSC_USE_INFO)
6200ced3a2bSJed Brown   if (ci[am]) {
62157622a8eSBarry Smith     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
62257622a8eSBarry Smith     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
6230ced3a2bSJed Brown   } else {
6240ced3a2bSJed Brown     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
6250ced3a2bSJed Brown   }
6260ced3a2bSJed Brown #endif
6270ced3a2bSJed Brown   PetscFunctionReturn(0);
6280ced3a2bSJed Brown }
629e9e4536cSHong Zhang 
6308a07c6f1SJed Brown PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A,Mat B,PetscReal fill,Mat *C)
6318a07c6f1SJed Brown {
6328a07c6f1SJed Brown   PetscErrorCode     ierr;
6338a07c6f1SJed Brown   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
6348a07c6f1SJed Brown   const PetscInt     *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
6358a07c6f1SJed Brown   PetscInt           *ci,*cj,*bb;
6368a07c6f1SJed Brown   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
6378a07c6f1SJed Brown   PetscReal          afill;
6388a07c6f1SJed Brown   PetscInt           i,j,col,ndouble = 0;
6390298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
6408a07c6f1SJed Brown   PetscHeap          h;
6418a07c6f1SJed Brown   PetscBT            bt;
6428a07c6f1SJed Brown 
6438a07c6f1SJed Brown   PetscFunctionBegin;
644cd093f1dSJed Brown   /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */
6458a07c6f1SJed Brown   /*---------------------------------------------------------------------------------------------*/
6468a07c6f1SJed Brown   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
647854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
6488a07c6f1SJed Brown   ci[0] = 0;
6498a07c6f1SJed Brown 
6508a07c6f1SJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
651f91af8c7SBarry Smith   ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
6522205254eSKarl Rupp 
6538a07c6f1SJed Brown   current_space = free_space;
6548a07c6f1SJed Brown 
6558a07c6f1SJed Brown   ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr);
656785e854fSJed Brown   ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr);
6578a07c6f1SJed Brown   ierr = PetscBTCreate(bn,&bt);CHKERRQ(ierr);
6588a07c6f1SJed Brown 
6598a07c6f1SJed Brown   /* Determine ci and cj */
6608a07c6f1SJed Brown   for (i=0; i<am; i++) {
6618a07c6f1SJed Brown     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
6628a07c6f1SJed Brown     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
6638a07c6f1SJed Brown     const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */
6648a07c6f1SJed Brown     ci[i+1] = ci[i];
6658a07c6f1SJed Brown     /* Populate the min heap */
6668a07c6f1SJed Brown     for (j=0; j<anzi; j++) {
6678a07c6f1SJed Brown       PetscInt brow = acol[j];
6688a07c6f1SJed Brown       for (bb[j] = bi[brow]; bb[j] < bi[brow+1]; bb[j]++) {
6698a07c6f1SJed Brown         PetscInt bcol = bj[bb[j]];
6708a07c6f1SJed Brown         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
6718a07c6f1SJed Brown           ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr);
6728a07c6f1SJed Brown           bb[j]++;
6738a07c6f1SJed Brown           break;
6748a07c6f1SJed Brown         }
6758a07c6f1SJed Brown       }
6768a07c6f1SJed Brown     }
6778a07c6f1SJed Brown     /* Pick off the min element, adding it to free space */
6788a07c6f1SJed Brown     ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
6798a07c6f1SJed Brown     while (j >= 0) {
6808a07c6f1SJed Brown       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
6810298fd71SBarry Smith         fptr = NULL;                      /* need PetscBTMemzero */
682f91af8c7SBarry Smith         ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space);CHKERRQ(ierr);
6838a07c6f1SJed Brown         ndouble++;
6848a07c6f1SJed Brown       }
6858a07c6f1SJed Brown       *(current_space->array++) = col;
6868a07c6f1SJed Brown       current_space->local_used++;
6878a07c6f1SJed Brown       current_space->local_remaining--;
6888a07c6f1SJed Brown       ci[i+1]++;
6898a07c6f1SJed Brown 
6908a07c6f1SJed Brown       /* stash if anything else remains in this row of B */
6918a07c6f1SJed Brown       for (; bb[j] < bi[acol[j]+1]; bb[j]++) {
6928a07c6f1SJed Brown         PetscInt bcol = bj[bb[j]];
6938a07c6f1SJed Brown         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
6948a07c6f1SJed Brown           ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr);
6958a07c6f1SJed Brown           bb[j]++;
6968a07c6f1SJed Brown           break;
6978a07c6f1SJed Brown         }
6988a07c6f1SJed Brown       }
6998a07c6f1SJed Brown       ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
7008a07c6f1SJed Brown     }
7018a07c6f1SJed Brown     if (fptr) {                 /* Clear the bits for this row */
7028a07c6f1SJed Brown       for (; fptr<current_space->array; fptr++) {ierr = PetscBTClear(bt,*fptr);CHKERRQ(ierr);}
7038a07c6f1SJed Brown     } else {                    /* We reallocated so we don't remember (easily) how to clear only the bits we changed */
7048a07c6f1SJed Brown       ierr = PetscBTMemzero(bn,bt);CHKERRQ(ierr);
7058a07c6f1SJed Brown     }
7068a07c6f1SJed Brown   }
7078a07c6f1SJed Brown   ierr = PetscFree(bb);CHKERRQ(ierr);
7088a07c6f1SJed Brown   ierr = PetscHeapDestroy(&h);CHKERRQ(ierr);
7098a07c6f1SJed Brown   ierr = PetscBTDestroy(&bt);CHKERRQ(ierr);
7108a07c6f1SJed Brown 
7118a07c6f1SJed Brown   /* Column indices are in the list of free space */
7128a07c6f1SJed Brown   /* Allocate space for cj, initialize cj, and */
7138a07c6f1SJed Brown   /* destroy list of free space and other temporary array(s) */
714785e854fSJed Brown   ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr);
7158a07c6f1SJed Brown   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
7168a07c6f1SJed Brown 
7178a07c6f1SJed Brown   /* put together the new symbolic matrix */
718ce94432eSBarry Smith   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr);
71933d57670SJed Brown   ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
72002fe1965SBarry Smith   ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr);
7218a07c6f1SJed Brown 
7228a07c6f1SJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
7238a07c6f1SJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
7248a07c6f1SJed Brown   c          = (Mat_SeqAIJ*)((*C)->data);
7258a07c6f1SJed Brown   c->free_a  = PETSC_TRUE;
7268a07c6f1SJed Brown   c->free_ij = PETSC_TRUE;
7278a07c6f1SJed Brown   c->nonew   = 0;
72826fbe8dcSKarl Rupp 
729df97dc6dSFande Kong   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
7308a07c6f1SJed Brown 
7318a07c6f1SJed Brown   /* set MatInfo */
7328a07c6f1SJed Brown   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
7338a07c6f1SJed Brown   if (afill < 1.0) afill = 1.0;
7348a07c6f1SJed Brown   c->maxnz                     = ci[am];
7358a07c6f1SJed Brown   c->nz                        = ci[am];
7368a07c6f1SJed Brown   (*C)->info.mallocs           = ndouble;
7378a07c6f1SJed Brown   (*C)->info.fill_ratio_given  = fill;
7388a07c6f1SJed Brown   (*C)->info.fill_ratio_needed = afill;
7398a07c6f1SJed Brown 
7408a07c6f1SJed Brown #if defined(PETSC_USE_INFO)
7418a07c6f1SJed Brown   if (ci[am]) {
74257622a8eSBarry Smith     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
74357622a8eSBarry Smith     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
7448a07c6f1SJed Brown   } else {
7458a07c6f1SJed Brown     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
7468a07c6f1SJed Brown   }
7478a07c6f1SJed Brown #endif
7488a07c6f1SJed Brown   PetscFunctionReturn(0);
7498a07c6f1SJed Brown }
7508a07c6f1SJed Brown 
751d7ed1a4dSandi selinger 
752d7ed1a4dSandi selinger PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A,Mat B,PetscReal fill,Mat *C)
753d7ed1a4dSandi selinger {
754d7ed1a4dSandi selinger   PetscErrorCode     ierr;
755d7ed1a4dSandi selinger   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
756d7ed1a4dSandi selinger   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j,*inputi,*inputj,*inputcol,*inputcol_L1;
757d7ed1a4dSandi selinger   PetscInt           *ci,*cj,*outputj,worki_L1[9],worki_L2[9];
758d7ed1a4dSandi selinger   PetscInt           c_maxmem,a_maxrownnz=0,a_rownnz;
759d7ed1a4dSandi selinger   const PetscInt     workcol[8]={0,1,2,3,4,5,6,7};
760d7ed1a4dSandi selinger   const PetscInt     am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
761d7ed1a4dSandi selinger   const PetscInt     *brow_ptr[8],*brow_end[8];
762d7ed1a4dSandi selinger   PetscInt           window[8];
763d7ed1a4dSandi selinger   PetscInt           window_min,old_window_min,ci_nnz,outputi_nnz=0,L1_nrows,L2_nrows;
764d7ed1a4dSandi selinger   PetscInt           i,k,ndouble=0,L1_rowsleft,rowsleft;
765d7ed1a4dSandi selinger   PetscReal          afill;
766f83700f2Sandi selinger   PetscInt           *workj_L1,*workj_L2,*workj_L3;
7677660c4dbSandi selinger   PetscInt           L1_nnz,L2_nnz;
768d7ed1a4dSandi selinger 
769d7ed1a4dSandi selinger   /* Step 1: Get upper bound on memory required for allocation.
770d7ed1a4dSandi selinger              Because of the way virtual memory works,
771d7ed1a4dSandi selinger              only the memory pages that are actually needed will be physically allocated. */
772d7ed1a4dSandi selinger   PetscFunctionBegin;
773d7ed1a4dSandi selinger   ierr = PetscMalloc1(am+1,&ci);CHKERRQ(ierr);
774d7ed1a4dSandi selinger   for (i=0; i<am; i++) {
775d7ed1a4dSandi selinger     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
776d7ed1a4dSandi selinger     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
777d7ed1a4dSandi selinger     a_rownnz = 0;
778d7ed1a4dSandi selinger     for (k=0; k<anzi; ++k) {
779d7ed1a4dSandi selinger       a_rownnz += bi[acol[k]+1] - bi[acol[k]];
780d7ed1a4dSandi selinger       if (a_rownnz > bn) {
781d7ed1a4dSandi selinger         a_rownnz = bn;
782d7ed1a4dSandi selinger         break;
783d7ed1a4dSandi selinger       }
784d7ed1a4dSandi selinger     }
785d7ed1a4dSandi selinger     a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz);
786d7ed1a4dSandi selinger   }
787d7ed1a4dSandi selinger   /* temporary work areas for merging rows */
788d7ed1a4dSandi selinger   ierr = PetscMalloc1(a_maxrownnz*8,&workj_L1);CHKERRQ(ierr);
789f83700f2Sandi selinger   ierr = PetscMalloc1(a_maxrownnz*8,&workj_L2);CHKERRQ(ierr);
790f83700f2Sandi selinger   ierr = PetscMalloc1(a_maxrownnz,&workj_L3);CHKERRQ(ierr);
791d7ed1a4dSandi selinger 
7927660c4dbSandi selinger   /* This should be enough for almost all matrices. If not, memory is reallocated later. */
7937660c4dbSandi selinger   c_maxmem = 8*(ai[am]+bi[bm]);
794d7ed1a4dSandi selinger   /* Step 2: Populate pattern for C */
795d7ed1a4dSandi selinger   ierr  = PetscMalloc1(c_maxmem,&cj);CHKERRQ(ierr);
796d7ed1a4dSandi selinger 
797d7ed1a4dSandi selinger   ci_nnz       = 0;
798d7ed1a4dSandi selinger   ci[0]        = 0;
799d7ed1a4dSandi selinger   worki_L1[0]  = 0;
800d7ed1a4dSandi selinger   worki_L2[0]  = 0;
801d7ed1a4dSandi selinger   for (i=0; i<am; i++) {
802d7ed1a4dSandi selinger     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
803d7ed1a4dSandi selinger     const PetscInt *acol = aj + ai[i];      /* column indices of nonzero entries in this row */
804d7ed1a4dSandi selinger     rowsleft             = anzi;
805d7ed1a4dSandi selinger     inputcol_L1          = acol;
806d7ed1a4dSandi selinger     L2_nnz               = 0;
8077660c4dbSandi selinger     L2_nrows             = 1;  /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1   */
8087660c4dbSandi selinger     worki_L2[1]          = 0;
80908fe1b3cSKarl Rupp     outputi_nnz          = 0;
810d7ed1a4dSandi selinger 
811d7ed1a4dSandi selinger     /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem  -> allocate more memory */
812d7ed1a4dSandi selinger     while (ci_nnz+a_maxrownnz > c_maxmem) {
813d7ed1a4dSandi selinger       c_maxmem *= 2;
814d7ed1a4dSandi selinger       ndouble++;
815d7ed1a4dSandi selinger       ierr = PetscRealloc(sizeof(PetscInt)*c_maxmem,&cj);CHKERRQ(ierr);
816d7ed1a4dSandi selinger     }
817d7ed1a4dSandi selinger 
818d7ed1a4dSandi selinger     while (rowsleft) {
819d7ed1a4dSandi selinger       L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */
820d7ed1a4dSandi selinger       L1_nrows    = 0;
8217660c4dbSandi selinger       L1_nnz      = 0;
822d7ed1a4dSandi selinger       inputcol    = inputcol_L1;
823d7ed1a4dSandi selinger       inputi      = bi;
824d7ed1a4dSandi selinger       inputj      = bj;
825d7ed1a4dSandi selinger 
826d7ed1a4dSandi selinger       /* The following macro is used to specialize for small rows in A.
827d7ed1a4dSandi selinger          This helps with compiler unrolling, improving performance substantially.
828f83700f2Sandi selinger           Input:  inputj   inputi  inputcol  bn
829d7ed1a4dSandi selinger           Output: outputj  outputi_nnz                       */
830d7ed1a4dSandi selinger        #define MatMatMultSymbolic_RowMergeMacro(ANNZ)                        \
831d7ed1a4dSandi selinger          window_min  = bn;                                                   \
8327660c4dbSandi selinger          outputi_nnz = 0;                                                    \
8337660c4dbSandi selinger          for (k=0; k<ANNZ; ++k) {                                            \
834d7ed1a4dSandi selinger            brow_ptr[k] = inputj + inputi[inputcol[k]];                       \
835d7ed1a4dSandi selinger            brow_end[k] = inputj + inputi[inputcol[k]+1];                     \
836d7ed1a4dSandi selinger            window[k]   = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn;   \
837d7ed1a4dSandi selinger            window_min  = PetscMin(window[k], window_min);                    \
838d7ed1a4dSandi selinger          }                                                                   \
839d7ed1a4dSandi selinger          while (window_min < bn) {                                           \
840d7ed1a4dSandi selinger            outputj[outputi_nnz++] = window_min;                              \
841d7ed1a4dSandi selinger            /* advance front and compute new minimum */                       \
842d7ed1a4dSandi selinger            old_window_min = window_min;                                      \
843d7ed1a4dSandi selinger            window_min = bn;                                                  \
844d7ed1a4dSandi selinger            for (k=0; k<ANNZ; ++k) {                                          \
845d7ed1a4dSandi selinger              if (window[k] == old_window_min) {                              \
846d7ed1a4dSandi selinger                brow_ptr[k]++;                                                \
847d7ed1a4dSandi selinger                window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
848d7ed1a4dSandi selinger              }                                                               \
849d7ed1a4dSandi selinger              window_min = PetscMin(window[k], window_min);                   \
850d7ed1a4dSandi selinger            }                                                                 \
851d7ed1a4dSandi selinger          }
852d7ed1a4dSandi selinger 
853d7ed1a4dSandi selinger       /************** L E V E L  1 ***************/
854d7ed1a4dSandi selinger       /* Merge up to 8 rows of B to L1 work array*/
855d7ed1a4dSandi selinger       while (L1_rowsleft) {
8567660c4dbSandi selinger         outputi_nnz = 0;
8577660c4dbSandi selinger         if (anzi > 8)  outputj = workj_L1 + L1_nnz;     /* Level 1 rowmerge*/
8587660c4dbSandi selinger         else           outputj = cj + ci_nnz;           /* Merge directly to C */
8597660c4dbSandi selinger 
860d7ed1a4dSandi selinger         switch (L1_rowsleft) {
861d7ed1a4dSandi selinger         case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
862d7ed1a4dSandi selinger                  brow_end[0] = inputj + inputi[inputcol[0]+1];
863d7ed1a4dSandi selinger                  for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
864d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
865d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
866d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
867d7ed1a4dSandi selinger                  break;
868d7ed1a4dSandi selinger         case 2:  MatMatMultSymbolic_RowMergeMacro(2);
869d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
870d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
871d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
872d7ed1a4dSandi selinger                  break;
873d7ed1a4dSandi selinger         case 3: MatMatMultSymbolic_RowMergeMacro(3);
874d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
875d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
876d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
877d7ed1a4dSandi selinger                  break;
878d7ed1a4dSandi selinger         case 4:  MatMatMultSymbolic_RowMergeMacro(4);
879d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
880d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
881d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
882d7ed1a4dSandi selinger                  break;
883d7ed1a4dSandi selinger         case 5:  MatMatMultSymbolic_RowMergeMacro(5);
884d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
885d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
886d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
887d7ed1a4dSandi selinger                  break;
888d7ed1a4dSandi selinger         case 6:  MatMatMultSymbolic_RowMergeMacro(6);
889d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
890d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
891d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
892d7ed1a4dSandi selinger                  break;
893d7ed1a4dSandi selinger         case 7:  MatMatMultSymbolic_RowMergeMacro(7);
894d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
895d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
896d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
897d7ed1a4dSandi selinger                  break;
898d7ed1a4dSandi selinger         default: MatMatMultSymbolic_RowMergeMacro(8);
899d7ed1a4dSandi selinger                  inputcol    += 8;
900d7ed1a4dSandi selinger                  rowsleft    -= 8;
901d7ed1a4dSandi selinger                  L1_rowsleft -= 8;
902d7ed1a4dSandi selinger                  break;
903d7ed1a4dSandi selinger         }
904d7ed1a4dSandi selinger         inputcol_L1           = inputcol;
9057660c4dbSandi selinger         L1_nnz               += outputi_nnz;
9067660c4dbSandi selinger         worki_L1[++L1_nrows]  = L1_nnz;
907d7ed1a4dSandi selinger       }
908d7ed1a4dSandi selinger 
909d7ed1a4dSandi selinger       /********************** L E V E L  2 ************************/
910d7ed1a4dSandi selinger       /* Merge from L1 work array to either C or to L2 work array */
911d7ed1a4dSandi selinger       if (anzi > 8) {
912d7ed1a4dSandi selinger         inputi      = worki_L1;
913d7ed1a4dSandi selinger         inputj      = workj_L1;
914d7ed1a4dSandi selinger         inputcol    = workcol;
915d7ed1a4dSandi selinger         outputi_nnz = 0;
916d7ed1a4dSandi selinger 
917d7ed1a4dSandi selinger         if (anzi <= 64) outputj = cj + ci_nnz;        /* Merge from L1 work array to C */
918d7ed1a4dSandi selinger         else            outputj = workj_L2 + L2_nnz;  /* Merge from L1 work array to L2 work array */
919d7ed1a4dSandi selinger 
920d7ed1a4dSandi selinger         switch (L1_nrows) {
921d7ed1a4dSandi selinger         case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
922d7ed1a4dSandi selinger                  brow_end[0] = inputj + inputi[inputcol[0]+1];
923d7ed1a4dSandi selinger                  for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
924d7ed1a4dSandi selinger                  break;
925d7ed1a4dSandi selinger         case 2:  MatMatMultSymbolic_RowMergeMacro(2); break;
926d7ed1a4dSandi selinger         case 3:  MatMatMultSymbolic_RowMergeMacro(3); break;
927d7ed1a4dSandi selinger         case 4:  MatMatMultSymbolic_RowMergeMacro(4); break;
928d7ed1a4dSandi selinger         case 5:  MatMatMultSymbolic_RowMergeMacro(5); break;
929d7ed1a4dSandi selinger         case 6:  MatMatMultSymbolic_RowMergeMacro(6); break;
930d7ed1a4dSandi selinger         case 7:  MatMatMultSymbolic_RowMergeMacro(7); break;
931d7ed1a4dSandi selinger         case 8:  MatMatMultSymbolic_RowMergeMacro(8); break;
932d7ed1a4dSandi selinger         default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L1 work array!");
933d7ed1a4dSandi selinger         }
934d7ed1a4dSandi selinger         L2_nnz               += outputi_nnz;
935d7ed1a4dSandi selinger         worki_L2[++L2_nrows]  = L2_nnz;
936d7ed1a4dSandi selinger 
9377660c4dbSandi selinger         /************************ L E V E L  3 **********************/
9387660c4dbSandi selinger         /* Merge from L2 work array to either C or to L2 work array */
939d7ed1a4dSandi selinger         if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) {
940d7ed1a4dSandi selinger           inputi      = worki_L2;
941d7ed1a4dSandi selinger           inputj      = workj_L2;
942d7ed1a4dSandi selinger           inputcol    = workcol;
943d7ed1a4dSandi selinger           outputi_nnz = 0;
944f83700f2Sandi selinger           if (rowsleft) outputj = workj_L3;
945d7ed1a4dSandi selinger           else          outputj = cj + ci_nnz;
946d7ed1a4dSandi selinger           switch (L2_nrows) {
947d7ed1a4dSandi selinger           case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
948d7ed1a4dSandi selinger                    brow_end[0] = inputj + inputi[inputcol[0]+1];
949d7ed1a4dSandi selinger                    for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
950d7ed1a4dSandi selinger                    break;
951d7ed1a4dSandi selinger           case 2:  MatMatMultSymbolic_RowMergeMacro(2); break;
952d7ed1a4dSandi selinger           case 3:  MatMatMultSymbolic_RowMergeMacro(3); break;
953d7ed1a4dSandi selinger           case 4:  MatMatMultSymbolic_RowMergeMacro(4); break;
954d7ed1a4dSandi selinger           case 5:  MatMatMultSymbolic_RowMergeMacro(5); break;
955d7ed1a4dSandi selinger           case 6:  MatMatMultSymbolic_RowMergeMacro(6); break;
956d7ed1a4dSandi selinger           case 7:  MatMatMultSymbolic_RowMergeMacro(7); break;
957d7ed1a4dSandi selinger           case 8:  MatMatMultSymbolic_RowMergeMacro(8); break;
958d7ed1a4dSandi selinger           default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L2 work array!");
959d7ed1a4dSandi selinger           }
960d7ed1a4dSandi selinger           L2_nrows    = 1;
9617660c4dbSandi selinger           L2_nnz      = outputi_nnz;
9627660c4dbSandi selinger           worki_L2[1] = outputi_nnz;
9637660c4dbSandi selinger           /* Copy to workj_L2 */
964d7ed1a4dSandi selinger           if (rowsleft) {
9657660c4dbSandi selinger             for (k=0; k<outputi_nnz; ++k)  workj_L2[k] = outputj[k];
966d7ed1a4dSandi selinger           }
967d7ed1a4dSandi selinger         }
968d7ed1a4dSandi selinger       }
969d7ed1a4dSandi selinger     }  /* while (rowsleft) */
970d7ed1a4dSandi selinger #undef MatMatMultSymbolic_RowMergeMacro
971d7ed1a4dSandi selinger 
972d7ed1a4dSandi selinger     /* terminate current row */
973d7ed1a4dSandi selinger     ci_nnz += outputi_nnz;
974d7ed1a4dSandi selinger     ci[i+1] = ci_nnz;
975d7ed1a4dSandi selinger   }
976d7ed1a4dSandi selinger 
977d7ed1a4dSandi selinger   /* Step 3: Create the new symbolic matrix */
978d7ed1a4dSandi selinger   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr);
979d7ed1a4dSandi selinger   ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
980f83700f2Sandi selinger   ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr);
981d7ed1a4dSandi selinger 
982d7ed1a4dSandi selinger   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
983d7ed1a4dSandi selinger   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
984d7ed1a4dSandi selinger   c          = (Mat_SeqAIJ*)((*C)->data);
985d7ed1a4dSandi selinger   c->free_a  = PETSC_TRUE;
986d7ed1a4dSandi selinger   c->free_ij = PETSC_TRUE;
987d7ed1a4dSandi selinger   c->nonew   = 0;
988d7ed1a4dSandi selinger 
989df97dc6dSFande Kong   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
990d7ed1a4dSandi selinger 
991d7ed1a4dSandi selinger   /* set MatInfo */
992d7ed1a4dSandi selinger   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
993d7ed1a4dSandi selinger   if (afill < 1.0) afill = 1.0;
994d7ed1a4dSandi selinger   c->maxnz                     = ci[am];
995d7ed1a4dSandi selinger   c->nz                        = ci[am];
996d7ed1a4dSandi selinger   (*C)->info.mallocs           = ndouble;
997d7ed1a4dSandi selinger   (*C)->info.fill_ratio_given  = fill;
998d7ed1a4dSandi selinger   (*C)->info.fill_ratio_needed = afill;
999d7ed1a4dSandi selinger 
1000d7ed1a4dSandi selinger #if defined(PETSC_USE_INFO)
1001d7ed1a4dSandi selinger   if (ci[am]) {
1002d7ed1a4dSandi selinger     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
1003d7ed1a4dSandi selinger     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
1004d7ed1a4dSandi selinger   } else {
1005d7ed1a4dSandi selinger     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
1006d7ed1a4dSandi selinger   }
1007d7ed1a4dSandi selinger #endif
1008d7ed1a4dSandi selinger 
1009d7ed1a4dSandi selinger   /* Step 4: Free temporary work areas */
1010d7ed1a4dSandi selinger   ierr = PetscFree(workj_L1);CHKERRQ(ierr);
1011d7ed1a4dSandi selinger   ierr = PetscFree(workj_L2);CHKERRQ(ierr);
1012f83700f2Sandi selinger   ierr = PetscFree(workj_L3);CHKERRQ(ierr);
1013d7ed1a4dSandi selinger   PetscFunctionReturn(0);
1014d7ed1a4dSandi selinger }
1015d7ed1a4dSandi selinger 
1016cd093f1dSJed Brown /* concatenate unique entries and then sort */
1017df97dc6dSFande Kong PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,PetscReal fill,Mat *C)
1018cd093f1dSJed Brown {
1019cd093f1dSJed Brown   PetscErrorCode     ierr;
1020cd093f1dSJed Brown   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
1021cd093f1dSJed Brown   const PetscInt     *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
1022cd093f1dSJed Brown   PetscInt           *ci,*cj;
1023cd093f1dSJed Brown   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
1024cd093f1dSJed Brown   PetscReal          afill;
1025cd093f1dSJed Brown   PetscInt           i,j,ndouble = 0;
1026cd093f1dSJed Brown   PetscSegBuffer     seg,segrow;
1027cd093f1dSJed Brown   char               *seen;
1028cd093f1dSJed Brown 
1029cd093f1dSJed Brown   PetscFunctionBegin;
1030854ce69bSBarry Smith   ierr  = PetscMalloc1(am+1,&ci);CHKERRQ(ierr);
1031cd093f1dSJed Brown   ci[0] = 0;
1032cd093f1dSJed Brown 
1033cd093f1dSJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
1034cd093f1dSJed Brown   ierr = PetscSegBufferCreate(sizeof(PetscInt),(PetscInt)(fill*(ai[am]+bi[bm])),&seg);CHKERRQ(ierr);
1035cd093f1dSJed Brown   ierr = PetscSegBufferCreate(sizeof(PetscInt),100,&segrow);CHKERRQ(ierr);
1036580bdb30SBarry Smith   ierr = PetscCalloc1(bn,&seen);CHKERRQ(ierr);
1037cd093f1dSJed Brown 
1038cd093f1dSJed Brown   /* Determine ci and cj */
1039cd093f1dSJed Brown   for (i=0; i<am; i++) {
1040cd093f1dSJed Brown     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
1041cd093f1dSJed Brown     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
1042cd093f1dSJed Brown     PetscInt packlen = 0,*PETSC_RESTRICT crow;
1043cd093f1dSJed Brown     /* Pack segrow */
1044cd093f1dSJed Brown     for (j=0; j<anzi; j++) {
1045cd093f1dSJed Brown       PetscInt brow = acol[j],bjstart = bi[brow],bjend = bi[brow+1],k;
1046cd093f1dSJed Brown       for (k=bjstart; k<bjend; k++) {
1047cd093f1dSJed Brown         PetscInt bcol = bj[k];
1048cd093f1dSJed Brown         if (!seen[bcol]) { /* new entry */
1049cd093f1dSJed Brown           PetscInt *PETSC_RESTRICT slot;
1050cd093f1dSJed Brown           ierr = PetscSegBufferGetInts(segrow,1,&slot);CHKERRQ(ierr);
1051cd093f1dSJed Brown           *slot = bcol;
1052cd093f1dSJed Brown           seen[bcol] = 1;
1053cd093f1dSJed Brown           packlen++;
1054cd093f1dSJed Brown         }
1055cd093f1dSJed Brown       }
1056cd093f1dSJed Brown     }
1057cd093f1dSJed Brown     ierr = PetscSegBufferGetInts(seg,packlen,&crow);CHKERRQ(ierr);
1058cd093f1dSJed Brown     ierr = PetscSegBufferExtractTo(segrow,crow);CHKERRQ(ierr);
1059cd093f1dSJed Brown     ierr = PetscSortInt(packlen,crow);CHKERRQ(ierr);
1060cd093f1dSJed Brown     ci[i+1] = ci[i] + packlen;
1061cd093f1dSJed Brown     for (j=0; j<packlen; j++) seen[crow[j]] = 0;
1062cd093f1dSJed Brown   }
1063cd093f1dSJed Brown   ierr = PetscSegBufferDestroy(&segrow);CHKERRQ(ierr);
1064cd093f1dSJed Brown   ierr = PetscFree(seen);CHKERRQ(ierr);
1065cd093f1dSJed Brown 
1066cd093f1dSJed Brown   /* Column indices are in the segmented buffer */
1067cd093f1dSJed Brown   ierr = PetscSegBufferExtractAlloc(seg,&cj);CHKERRQ(ierr);
1068cd093f1dSJed Brown   ierr = PetscSegBufferDestroy(&seg);CHKERRQ(ierr);
1069cd093f1dSJed Brown 
1070cd093f1dSJed Brown   /* put together the new symbolic matrix */
1071cd093f1dSJed Brown   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr);
107233d57670SJed Brown   ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
107302fe1965SBarry Smith   ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr);
1074cd093f1dSJed Brown 
1075cd093f1dSJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1076cd093f1dSJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1077cd093f1dSJed Brown   c          = (Mat_SeqAIJ*)((*C)->data);
1078cd093f1dSJed Brown   c->free_a  = PETSC_TRUE;
1079cd093f1dSJed Brown   c->free_ij = PETSC_TRUE;
1080cd093f1dSJed Brown   c->nonew   = 0;
1081cd093f1dSJed Brown 
1082df97dc6dSFande Kong   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1083cd093f1dSJed Brown 
1084cd093f1dSJed Brown   /* set MatInfo */
1085cd093f1dSJed Brown   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
1086cd093f1dSJed Brown   if (afill < 1.0) afill = 1.0;
1087cd093f1dSJed Brown   c->maxnz                     = ci[am];
1088cd093f1dSJed Brown   c->nz                        = ci[am];
1089cd093f1dSJed Brown   (*C)->info.mallocs           = ndouble;
1090cd093f1dSJed Brown   (*C)->info.fill_ratio_given  = fill;
1091cd093f1dSJed Brown   (*C)->info.fill_ratio_needed = afill;
1092cd093f1dSJed Brown 
1093cd093f1dSJed Brown #if defined(PETSC_USE_INFO)
1094cd093f1dSJed Brown   if (ci[am]) {
109557622a8eSBarry Smith     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
109657622a8eSBarry Smith     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
1097cd093f1dSJed Brown   } else {
1098cd093f1dSJed Brown     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
1099cd093f1dSJed Brown   }
1100cd093f1dSJed Brown #endif
1101cd093f1dSJed Brown   PetscFunctionReturn(0);
1102cd093f1dSJed Brown }
1103cd093f1dSJed Brown 
1104d2853540SHong Zhang /* This routine is not used. Should be removed! */
11056fc122caSHong Zhang PetscErrorCode MatMatTransposeMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
11065df89d91SHong Zhang {
1107bc011b1eSHong Zhang   PetscErrorCode ierr;
1108bc011b1eSHong Zhang 
1109bc011b1eSHong Zhang   PetscFunctionBegin;
1110bc011b1eSHong Zhang   if (scall == MAT_INITIAL_MATRIX) {
11113ff4c91cSHong Zhang     ierr = PetscLogEventBegin(MAT_MatTransposeMultSymbolic,A,B,0,0);CHKERRQ(ierr);
11126fc122caSHong Zhang     ierr = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
11133ff4c91cSHong Zhang     ierr = PetscLogEventEnd(MAT_MatTransposeMultSymbolic,A,B,0,0);CHKERRQ(ierr);
1114bc011b1eSHong Zhang   }
11153ff4c91cSHong Zhang   ierr = PetscLogEventBegin(MAT_MatTransposeMultNumeric,A,B,0,0);CHKERRQ(ierr);
11166fc122caSHong Zhang   ierr = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(A,B,*C);CHKERRQ(ierr);
11173ff4c91cSHong Zhang   ierr = PetscLogEventEnd(MAT_MatTransposeMultNumeric,A,B,0,0);CHKERRQ(ierr);
1118bc011b1eSHong Zhang   PetscFunctionReturn(0);
1119bc011b1eSHong Zhang }
1120bc011b1eSHong Zhang 
11212128a86cSHong Zhang PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(Mat A)
11222128a86cSHong Zhang {
11232128a86cSHong Zhang   PetscErrorCode      ierr;
11244c7df5ccSHong Zhang   Mat_SeqAIJ          *a=(Mat_SeqAIJ*)A->data;
112540192850SHong Zhang   Mat_MatMatTransMult *abt=a->abt;
11262128a86cSHong Zhang 
11272128a86cSHong Zhang   PetscFunctionBegin;
112840192850SHong Zhang   ierr = (abt->destroy)(A);CHKERRQ(ierr);
112940192850SHong Zhang   ierr = MatTransposeColoringDestroy(&abt->matcoloring);CHKERRQ(ierr);
113040192850SHong Zhang   ierr = MatDestroy(&abt->Bt_den);CHKERRQ(ierr);
113140192850SHong Zhang   ierr = MatDestroy(&abt->ABt_den);CHKERRQ(ierr);
113240192850SHong Zhang   ierr = PetscFree(abt);CHKERRQ(ierr);
11332128a86cSHong Zhang   PetscFunctionReturn(0);
11342128a86cSHong Zhang }
11352128a86cSHong Zhang 
11366fc122caSHong Zhang PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
1137bc011b1eSHong Zhang {
11385df89d91SHong Zhang   PetscErrorCode      ierr;
113981d82fe4SHong Zhang   Mat                 Bt;
114081d82fe4SHong Zhang   PetscInt            *bti,*btj;
114140192850SHong Zhang   Mat_MatMatTransMult *abt;
11424c7df5ccSHong Zhang   Mat_SeqAIJ          *c;
1143d2853540SHong Zhang 
114481d82fe4SHong Zhang   PetscFunctionBegin;
114581d82fe4SHong Zhang   /* create symbolic Bt */
114681d82fe4SHong Zhang   ierr = MatGetSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
11470298fd71SBarry Smith   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,B->cmap->n,B->rmap->n,bti,btj,NULL,&Bt);CHKERRQ(ierr);
114833d57670SJed Brown   ierr = MatSetBlockSizes(Bt,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr);
114904b858e0SBarry Smith   ierr = MatSetType(Bt,((PetscObject)A)->type_name);CHKERRQ(ierr);
115081d82fe4SHong Zhang 
115181d82fe4SHong Zhang   /* get symbolic C=A*Bt */
115281d82fe4SHong Zhang   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C);CHKERRQ(ierr);
115381d82fe4SHong Zhang 
11542128a86cSHong Zhang   /* create a supporting struct for reuse intermidiate dense matrices with matcoloring */
1155b00a9115SJed Brown   ierr   = PetscNew(&abt);CHKERRQ(ierr);
11564c7df5ccSHong Zhang   c      = (Mat_SeqAIJ*)(*C)->data;
115740192850SHong Zhang   c->abt = abt;
11582128a86cSHong Zhang 
115940192850SHong Zhang   abt->usecoloring = PETSC_FALSE;
116040192850SHong Zhang   abt->destroy     = (*C)->ops->destroy;
11612128a86cSHong Zhang   (*C)->ops->destroy     = MatDestroy_SeqAIJ_MatMatMultTrans;
11622128a86cSHong Zhang 
1163c5929fdfSBarry Smith   ierr = PetscOptionsGetBool(((PetscObject)A)->options,NULL,"-matmattransmult_color",&abt->usecoloring,NULL);CHKERRQ(ierr);
116440192850SHong Zhang   if (abt->usecoloring) {
1165b9af6bddSHong Zhang     /* Create MatTransposeColoring from symbolic C=A*B^T */
1166b9af6bddSHong Zhang     MatTransposeColoring matcoloring;
1167335efc43SPeter Brune     MatColoring          coloring;
11682128a86cSHong Zhang     ISColoring           iscoloring;
11692128a86cSHong Zhang     Mat                  Bt_dense,C_dense;
11704d478ae7SHong Zhang     Mat_SeqAIJ           *c=(Mat_SeqAIJ*)(*C)->data;
11714d478ae7SHong Zhang     /* inode causes memory problem, don't know why */
11724d478ae7SHong Zhang     if (c->inode.use) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_USE_INODES is not supported. Use '-mat_no_inode'");
1173e8354b3eSHong Zhang 
1174335efc43SPeter Brune     ierr = MatColoringCreate(*C,&coloring);CHKERRQ(ierr);
1175335efc43SPeter Brune     ierr = MatColoringSetDistance(coloring,2);CHKERRQ(ierr);
1176335efc43SPeter Brune     ierr = MatColoringSetType(coloring,MATCOLORINGSL);CHKERRQ(ierr);
1177335efc43SPeter Brune     ierr = MatColoringSetFromOptions(coloring);CHKERRQ(ierr);
1178335efc43SPeter Brune     ierr = MatColoringApply(coloring,&iscoloring);CHKERRQ(ierr);
1179335efc43SPeter Brune     ierr = MatColoringDestroy(&coloring);CHKERRQ(ierr);
1180b9af6bddSHong Zhang     ierr = MatTransposeColoringCreate(*C,iscoloring,&matcoloring);CHKERRQ(ierr);
11812205254eSKarl Rupp 
118240192850SHong Zhang     abt->matcoloring = matcoloring;
11832205254eSKarl Rupp 
11842128a86cSHong Zhang     ierr = ISColoringDestroy(&iscoloring);CHKERRQ(ierr);
11852128a86cSHong Zhang 
11862128a86cSHong Zhang     /* Create Bt_dense and C_dense = A*Bt_dense */
11872128a86cSHong Zhang     ierr = MatCreate(PETSC_COMM_SELF,&Bt_dense);CHKERRQ(ierr);
11882128a86cSHong Zhang     ierr = MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors);CHKERRQ(ierr);
11892128a86cSHong Zhang     ierr = MatSetType(Bt_dense,MATSEQDENSE);CHKERRQ(ierr);
11900298fd71SBarry Smith     ierr = MatSeqDenseSetPreallocation(Bt_dense,NULL);CHKERRQ(ierr);
11912205254eSKarl Rupp 
11922128a86cSHong Zhang     Bt_dense->assembled = PETSC_TRUE;
119340192850SHong Zhang     abt->Bt_den   = Bt_dense;
11942128a86cSHong Zhang 
11952128a86cSHong Zhang     ierr = MatCreate(PETSC_COMM_SELF,&C_dense);CHKERRQ(ierr);
11962128a86cSHong Zhang     ierr = MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors);CHKERRQ(ierr);
11972128a86cSHong Zhang     ierr = MatSetType(C_dense,MATSEQDENSE);CHKERRQ(ierr);
11980298fd71SBarry Smith     ierr = MatSeqDenseSetPreallocation(C_dense,NULL);CHKERRQ(ierr);
11992205254eSKarl Rupp 
12002128a86cSHong Zhang     Bt_dense->assembled = PETSC_TRUE;
120140192850SHong Zhang     abt->ABt_den  = C_dense;
1202f94ccd6cSHong Zhang 
1203f94ccd6cSHong Zhang #if defined(PETSC_USE_INFO)
12041ce71dffSSatish Balay     {
1205f94ccd6cSHong Zhang       Mat_SeqAIJ *c = (Mat_SeqAIJ*)(*C)->data;
1206c40ebe3bSHong Zhang       ierr = PetscInfo7(*C,"Use coloring of C=A*B^T; B^T: %D %D, Bt_dense: %D,%D; Cnz %D / (cm*ncolors %D) = %g\n",B->cmap->n,B->rmap->n,Bt_dense->rmap->n,Bt_dense->cmap->n,c->nz,A->rmap->n*matcoloring->ncolors,(PetscReal)(c->nz)/(A->rmap->n*matcoloring->ncolors));CHKERRQ(ierr);
12071ce71dffSSatish Balay     }
1208f94ccd6cSHong Zhang #endif
12092128a86cSHong Zhang   }
1210e99be685SHong Zhang   /* clean up */
1211e99be685SHong Zhang   ierr = MatDestroy(&Bt);CHKERRQ(ierr);
1212e99be685SHong Zhang   ierr = MatRestoreSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
12135df89d91SHong Zhang   PetscFunctionReturn(0);
12145df89d91SHong Zhang }
12155df89d91SHong Zhang 
12166fc122caSHong Zhang PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
12175df89d91SHong Zhang {
12185df89d91SHong Zhang   PetscErrorCode      ierr;
12195df89d91SHong Zhang   Mat_SeqAIJ          *a   =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1220e2cac8caSJed Brown   PetscInt            *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow;
122181d82fe4SHong Zhang   PetscInt            cm   =C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol;
12225df89d91SHong Zhang   PetscLogDouble      flops=0.0;
1223aa1aec99SHong Zhang   MatScalar           *aa  =a->a,*aval,*ba=b->a,*bval,*ca,*cval;
122440192850SHong Zhang   Mat_MatMatTransMult *abt = c->abt;
12255df89d91SHong Zhang 
12265df89d91SHong Zhang   PetscFunctionBegin;
122758ed3ceaSHong Zhang   /* clear old values in C */
122858ed3ceaSHong Zhang   if (!c->a) {
1229580bdb30SBarry Smith     ierr      = PetscCalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
123058ed3ceaSHong Zhang     c->a      = ca;
123158ed3ceaSHong Zhang     c->free_a = PETSC_TRUE;
123258ed3ceaSHong Zhang   } else {
123358ed3ceaSHong Zhang     ca =  c->a;
1234580bdb30SBarry Smith     ierr = PetscArrayzero(ca,ci[cm]+1);CHKERRQ(ierr);
123558ed3ceaSHong Zhang   }
123658ed3ceaSHong Zhang 
123740192850SHong Zhang   if (abt->usecoloring) {
123840192850SHong Zhang     MatTransposeColoring matcoloring = abt->matcoloring;
123940192850SHong Zhang     Mat                  Bt_dense,C_dense = abt->ABt_den;
1240c8db22f6SHong Zhang 
1241b9af6bddSHong Zhang     /* Get Bt_dense by Apply MatTransposeColoring to B */
124240192850SHong Zhang     Bt_dense = abt->Bt_den;
1243b9af6bddSHong Zhang     ierr = MatTransColoringApplySpToDen(matcoloring,B,Bt_dense);CHKERRQ(ierr);
1244c8db22f6SHong Zhang 
1245c8db22f6SHong Zhang     /* C_dense = A*Bt_dense */
12462128a86cSHong Zhang     ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense);CHKERRQ(ierr);
1247c8db22f6SHong Zhang 
12482128a86cSHong Zhang     /* Recover C from C_dense */
1249b9af6bddSHong Zhang     ierr = MatTransColoringApplyDenToSp(matcoloring,C_dense,C);CHKERRQ(ierr);
1250c8db22f6SHong Zhang     PetscFunctionReturn(0);
1251c8db22f6SHong Zhang   }
1252c8db22f6SHong Zhang 
12531fa1209cSHong Zhang   for (i=0; i<cm; i++) {
125481d82fe4SHong Zhang     anzi = ai[i+1] - ai[i];
125581d82fe4SHong Zhang     acol = aj + ai[i];
12566973769bSHong Zhang     aval = aa + ai[i];
12571fa1209cSHong Zhang     cnzi = ci[i+1] - ci[i];
12581fa1209cSHong Zhang     ccol = cj + ci[i];
12596973769bSHong Zhang     cval = ca + ci[i];
12601fa1209cSHong Zhang     for (j=0; j<cnzi; j++) {
126181d82fe4SHong Zhang       brow = ccol[j];
126281d82fe4SHong Zhang       bnzj = bi[brow+1] - bi[brow];
126381d82fe4SHong Zhang       bcol = bj + bi[brow];
12646973769bSHong Zhang       bval = ba + bi[brow];
12656973769bSHong Zhang 
126681d82fe4SHong Zhang       /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
126781d82fe4SHong Zhang       nexta = 0; nextb = 0;
126881d82fe4SHong Zhang       while (nexta<anzi && nextb<bnzj) {
12697b6d5e96SMark Adams         while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++;
127081d82fe4SHong Zhang         if (nexta == anzi) break;
12717b6d5e96SMark Adams         while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++;
127281d82fe4SHong Zhang         if (nextb == bnzj) break;
127381d82fe4SHong Zhang         if (acol[nexta] == bcol[nextb]) {
12746973769bSHong Zhang           cval[j] += aval[nexta]*bval[nextb];
127581d82fe4SHong Zhang           nexta++; nextb++;
127681d82fe4SHong Zhang           flops += 2;
12771fa1209cSHong Zhang         }
12781fa1209cSHong Zhang       }
127981d82fe4SHong Zhang     }
128081d82fe4SHong Zhang   }
128181d82fe4SHong Zhang   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
128281d82fe4SHong Zhang   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
128381d82fe4SHong Zhang   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
12845df89d91SHong Zhang   PetscFunctionReturn(0);
12855df89d91SHong Zhang }
12865df89d91SHong Zhang 
12876d373c3eSHong Zhang PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(Mat A)
12886d373c3eSHong Zhang {
12896d373c3eSHong Zhang   PetscErrorCode      ierr;
12906d373c3eSHong Zhang   Mat_SeqAIJ          *a = (Mat_SeqAIJ*)A->data;
12916d373c3eSHong Zhang   Mat_MatTransMatMult *atb = a->atb;
12926d373c3eSHong Zhang 
12936d373c3eSHong Zhang   PetscFunctionBegin;
12946473ade5SStefano Zampini   if (atb) {
12956d373c3eSHong Zhang     ierr = MatDestroy(&atb->At);CHKERRQ(ierr);
12966473ade5SStefano Zampini     ierr = (*atb->destroy)(A);CHKERRQ(ierr);
12976473ade5SStefano Zampini   }
12986d373c3eSHong Zhang   ierr = PetscFree(atb);CHKERRQ(ierr);
12996d373c3eSHong Zhang   PetscFunctionReturn(0);
13006d373c3eSHong Zhang }
13016d373c3eSHong Zhang 
13020adebc6cSBarry Smith PetscErrorCode MatTransposeMatMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
13030adebc6cSBarry Smith {
13045df89d91SHong Zhang   PetscErrorCode      ierr;
13056d373c3eSHong Zhang   const char          *algTypes[2] = {"matmatmult","outerproduct"};
13066d373c3eSHong Zhang   PetscInt            alg=0; /* set default algorithm */
13076d373c3eSHong Zhang   Mat                 At;
13086d373c3eSHong Zhang   Mat_MatTransMatMult *atb;
13096d373c3eSHong Zhang   Mat_SeqAIJ          *c;
13105df89d91SHong Zhang 
13115df89d91SHong Zhang   PetscFunctionBegin;
13125df89d91SHong Zhang   if (scall == MAT_INITIAL_MATRIX) {
1313715a5346SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
13146d373c3eSHong Zhang     ierr = PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,2,algTypes[0],&alg,NULL);CHKERRQ(ierr);
13156d373c3eSHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
13166d373c3eSHong Zhang 
13176d373c3eSHong Zhang     switch (alg) {
13186d373c3eSHong Zhang     case 1:
131975648e8dSHong Zhang       ierr = MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
13206d373c3eSHong Zhang       break;
13216d373c3eSHong Zhang     default:
13226d373c3eSHong Zhang       ierr = PetscNew(&atb);CHKERRQ(ierr);
13236d373c3eSHong Zhang       ierr = MatTranspose_SeqAIJ(A,MAT_INITIAL_MATRIX,&At);CHKERRQ(ierr);
13246d373c3eSHong Zhang       ierr = MatMatMult_SeqAIJ_SeqAIJ(At,B,MAT_INITIAL_MATRIX,fill,C);CHKERRQ(ierr);
13256d373c3eSHong Zhang 
1326618cf492SHong Zhang       c                  = (Mat_SeqAIJ*)(*C)->data;
13276d373c3eSHong Zhang       c->atb             = atb;
13286d373c3eSHong Zhang       atb->At            = At;
13296d373c3eSHong Zhang       atb->destroy       = (*C)->ops->destroy;
13306d373c3eSHong Zhang       (*C)->ops->destroy = MatDestroy_SeqAIJ_MatTransMatMult;
13316d373c3eSHong Zhang 
13326d373c3eSHong Zhang       break;
13335df89d91SHong Zhang     }
13346d373c3eSHong Zhang   }
13356d373c3eSHong Zhang   if (alg) {
13366d373c3eSHong Zhang     ierr = (*(*C)->ops->mattransposemultnumeric)(A,B,*C);CHKERRQ(ierr);
13376d373c3eSHong Zhang   } else if (!alg && scall == MAT_REUSE_MATRIX) {
13386d373c3eSHong Zhang     c   = (Mat_SeqAIJ*)(*C)->data;
13396d373c3eSHong Zhang     atb = c->atb;
13406d373c3eSHong Zhang     At  = atb->At;
13416d373c3eSHong Zhang     ierr = MatTranspose_SeqAIJ(A,MAT_REUSE_MATRIX,&At);CHKERRQ(ierr);
13426d373c3eSHong Zhang     ierr = MatMatMult_SeqAIJ_SeqAIJ(At,B,MAT_REUSE_MATRIX,fill,C);CHKERRQ(ierr);
13436d373c3eSHong Zhang   }
13445df89d91SHong Zhang   PetscFunctionReturn(0);
13455df89d91SHong Zhang }
13465df89d91SHong Zhang 
134775648e8dSHong Zhang PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
13485df89d91SHong Zhang {
1349bc011b1eSHong Zhang   PetscErrorCode ierr;
1350bc011b1eSHong Zhang   Mat            At;
135138baddfdSBarry Smith   PetscInt       *ati,*atj;
1352bc011b1eSHong Zhang 
1353bc011b1eSHong Zhang   PetscFunctionBegin;
1354bc011b1eSHong Zhang   /* create symbolic At */
1355bc011b1eSHong Zhang   ierr = MatGetSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
13560298fd71SBarry Smith   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,A->cmap->n,A->rmap->n,ati,atj,NULL,&At);CHKERRQ(ierr);
135733d57670SJed Brown   ierr = MatSetBlockSizes(At,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr);
135804b858e0SBarry Smith   ierr = MatSetType(At,((PetscObject)A)->type_name);CHKERRQ(ierr);
1359bc011b1eSHong Zhang 
1360bc011b1eSHong Zhang   /* get symbolic C=At*B */
1361bc011b1eSHong Zhang   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(At,B,fill,C);CHKERRQ(ierr);
1362bc011b1eSHong Zhang 
1363bc011b1eSHong Zhang   /* clean up */
13646bf464f9SBarry Smith   ierr = MatDestroy(&At);CHKERRQ(ierr);
1365bc011b1eSHong Zhang   ierr = MatRestoreSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
13666d373c3eSHong Zhang 
13676d373c3eSHong Zhang   (*C)->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ;
1368bc011b1eSHong Zhang   PetscFunctionReturn(0);
1369bc011b1eSHong Zhang }
1370bc011b1eSHong Zhang 
137175648e8dSHong Zhang PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
1372bc011b1eSHong Zhang {
1373bc011b1eSHong Zhang   PetscErrorCode ierr;
13740fbc74f4SHong Zhang   Mat_SeqAIJ     *a   =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1375d0f46423SBarry Smith   PetscInt       am   =A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb;
1376d0f46423SBarry Smith   PetscInt       cm   =C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k;
1377d13dce4bSSatish Balay   PetscLogDouble flops=0.0;
1378aa1aec99SHong Zhang   MatScalar      *aa  =a->a,*ba,*ca,*caj;
1379bc011b1eSHong Zhang 
1380bc011b1eSHong Zhang   PetscFunctionBegin;
1381aa1aec99SHong Zhang   if (!c->a) {
1382580bdb30SBarry Smith     ierr = PetscCalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
13832205254eSKarl Rupp 
1384aa1aec99SHong Zhang     c->a      = ca;
1385aa1aec99SHong Zhang     c->free_a = PETSC_TRUE;
1386aa1aec99SHong Zhang   } else {
1387aa1aec99SHong Zhang     ca   = c->a;
1388580bdb30SBarry Smith     ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr);
1389aa1aec99SHong Zhang   }
1390bc011b1eSHong Zhang 
1391bc011b1eSHong Zhang   /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
1392bc011b1eSHong Zhang   for (i=0; i<am; i++) {
1393bc011b1eSHong Zhang     bj   = b->j + bi[i];
1394bc011b1eSHong Zhang     ba   = b->a + bi[i];
1395bc011b1eSHong Zhang     bnzi = bi[i+1] - bi[i];
1396bc011b1eSHong Zhang     anzi = ai[i+1] - ai[i];
1397bc011b1eSHong Zhang     for (j=0; j<anzi; j++) {
1398bc011b1eSHong Zhang       nextb = 0;
13990fbc74f4SHong Zhang       crow  = *aj++;
1400bc011b1eSHong Zhang       cjj   = cj + ci[crow];
1401bc011b1eSHong Zhang       caj   = ca + ci[crow];
1402bc011b1eSHong Zhang       /* perform sparse axpy operation.  Note cjj includes bj. */
1403bc011b1eSHong Zhang       for (k=0; nextb<bnzi; k++) {
14040fbc74f4SHong Zhang         if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */
14050fbc74f4SHong Zhang           caj[k] += (*aa)*(*(ba+nextb));
1406bc011b1eSHong Zhang           nextb++;
1407bc011b1eSHong Zhang         }
1408bc011b1eSHong Zhang       }
1409bc011b1eSHong Zhang       flops += 2*bnzi;
14100fbc74f4SHong Zhang       aa++;
1411bc011b1eSHong Zhang     }
1412bc011b1eSHong Zhang   }
1413bc011b1eSHong Zhang 
1414bc011b1eSHong Zhang   /* Assemble the final matrix and clean up */
1415bc011b1eSHong Zhang   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1416bc011b1eSHong Zhang   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1417bc011b1eSHong Zhang   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
1418bc011b1eSHong Zhang   PetscFunctionReturn(0);
1419bc011b1eSHong Zhang }
14209123193aSHong Zhang 
1421150d2497SBarry Smith PETSC_INTERN PetscErrorCode MatMatMult_SeqAIJ_SeqDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
14229123193aSHong Zhang {
14239123193aSHong Zhang   PetscErrorCode ierr;
14249123193aSHong Zhang 
14259123193aSHong Zhang   PetscFunctionBegin;
14269123193aSHong Zhang   if (scall == MAT_INITIAL_MATRIX) {
14273ff4c91cSHong Zhang     ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
14289123193aSHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqDense(A,B,fill,C);CHKERRQ(ierr);
14293ff4c91cSHong Zhang     ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
14309123193aSHong Zhang   }
14313ff4c91cSHong Zhang   ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
14329123193aSHong Zhang   ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,B,*C);CHKERRQ(ierr);
14334614e006SHong Zhang   ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
14349123193aSHong Zhang   PetscFunctionReturn(0);
14359123193aSHong Zhang }
1436edd81eecSMatthew Knepley 
14379123193aSHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat *C)
14389123193aSHong Zhang {
14399123193aSHong Zhang   PetscErrorCode ierr;
14409123193aSHong Zhang 
14419123193aSHong Zhang   PetscFunctionBegin;
14425a586d82SBarry Smith   ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr);
14432205254eSKarl Rupp 
1444d73949e8SHong Zhang   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense;
14459123193aSHong Zhang   PetscFunctionReturn(0);
14469123193aSHong Zhang }
14479123193aSHong Zhang 
144887753ddeSHong Zhang PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
14499123193aSHong Zhang {
1450f32d5d43SBarry Smith   Mat_SeqAIJ        *a=(Mat_SeqAIJ*)A->data;
1451612438f1SStefano Zampini   Mat_SeqDense      *bd = (Mat_SeqDense*)B->data;
14529123193aSHong Zhang   PetscErrorCode    ierr;
1453*a4af7ca8SStefano Zampini   PetscScalar       *c,r1,r2,r3,r4,*c1,*c2,*c3,*c4,aatmp;
1454*a4af7ca8SStefano Zampini   const PetscScalar *aa,*b,*b1,*b2,*b3,*b4,*av;
1455daf57211SHong Zhang   const PetscInt    *aj;
1456612438f1SStefano Zampini   PetscInt          cm=C->rmap->n,cn=B->cmap->n,bm=bd->lda,am=A->rmap->n;
1457daf57211SHong Zhang   PetscInt          am4=4*am,bm4=4*bm,col,i,j,n,ajtmp;
14589123193aSHong Zhang 
14599123193aSHong Zhang   PetscFunctionBegin;
1460f32d5d43SBarry Smith   if (!cm || !cn) PetscFunctionReturn(0);
1461*a4af7ca8SStefano Zampini   ierr = MatSeqAIJGetArrayRead(A,&av);CHKERRQ(ierr);
14628c778c55SBarry Smith   ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr);
1463*a4af7ca8SStefano Zampini   ierr = MatDenseGetArrayRead(B,&b);CHKERRQ(ierr);
1464f32d5d43SBarry Smith   b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
1465730858b9SHong Zhang   c1 = c; c2 = c1 + am; c3 = c2 + am; c4 = c3 + am;
1466f32d5d43SBarry Smith   for (col=0; col<cn-4; col += 4) {  /* over columns of C */
1467f32d5d43SBarry Smith     for (i=0; i<am; i++) {        /* over rows of C in those columns */
1468f32d5d43SBarry Smith       r1 = r2 = r3 = r4 = 0.0;
1469f32d5d43SBarry Smith       n  = a->i[i+1] - a->i[i];
1470f32d5d43SBarry Smith       aj = a->j + a->i[i];
1471*a4af7ca8SStefano Zampini       aa = av + a->i[i];
1472f32d5d43SBarry Smith       for (j=0; j<n; j++) {
1473730858b9SHong Zhang         aatmp = aa[j]; ajtmp = aj[j];
1474730858b9SHong Zhang         r1 += aatmp*b1[ajtmp];
1475730858b9SHong Zhang         r2 += aatmp*b2[ajtmp];
1476730858b9SHong Zhang         r3 += aatmp*b3[ajtmp];
1477730858b9SHong Zhang         r4 += aatmp*b4[ajtmp];
14789123193aSHong Zhang       }
147987753ddeSHong Zhang       c1[i] += r1;
148087753ddeSHong Zhang       c2[i] += r2;
148187753ddeSHong Zhang       c3[i] += r3;
148287753ddeSHong Zhang       c4[i] += r4;
1483f32d5d43SBarry Smith     }
1484730858b9SHong Zhang     b1 += bm4; b2 += bm4; b3 += bm4; b4 += bm4;
1485730858b9SHong Zhang     c1 += am4; c2 += am4; c3 += am4; c4 += am4;
1486f32d5d43SBarry Smith   }
1487f32d5d43SBarry Smith   for (; col<cn; col++) {   /* over extra columns of C */
1488f32d5d43SBarry Smith     for (i=0; i<am; i++) {  /* over rows of C in those columns */
1489f32d5d43SBarry Smith       r1 = 0.0;
1490f32d5d43SBarry Smith       n  = a->i[i+1] - a->i[i];
1491f32d5d43SBarry Smith       aj = a->j + a->i[i];
1492*a4af7ca8SStefano Zampini       aa = av + a->i[i];
1493f32d5d43SBarry Smith       for (j=0; j<n; j++) {
1494730858b9SHong Zhang         r1 += aa[j]*b1[aj[j]];
1495f32d5d43SBarry Smith       }
149687753ddeSHong Zhang       c1[i] += r1;
1497f32d5d43SBarry Smith     }
1498f32d5d43SBarry Smith     b1 += bm;
1499730858b9SHong Zhang     c1 += am;
1500f32d5d43SBarry Smith   }
1501dc0b31edSSatish Balay   ierr = PetscLogFlops(cn*(2.0*a->nz));CHKERRQ(ierr);
15028c778c55SBarry Smith   ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr);
1503*a4af7ca8SStefano Zampini   ierr = MatDenseRestoreArrayRead(B,&b);CHKERRQ(ierr);
1504*a4af7ca8SStefano Zampini   ierr = MatSeqAIJRestoreArrayRead(A,&av);CHKERRQ(ierr);
1505f32d5d43SBarry Smith   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1506f32d5d43SBarry Smith   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1507f32d5d43SBarry Smith   PetscFunctionReturn(0);
1508f32d5d43SBarry Smith }
1509f32d5d43SBarry Smith 
151087753ddeSHong Zhang PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1511f32d5d43SBarry Smith {
1512f32d5d43SBarry Smith   PetscErrorCode ierr;
1513f32d5d43SBarry Smith 
1514f32d5d43SBarry Smith   PetscFunctionBegin;
151587753ddeSHong Zhang   if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n);
151687753ddeSHong Zhang   if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
151787753ddeSHong Zhang   if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
15184324174eSBarry Smith 
151987753ddeSHong Zhang   ierr = MatZeroEntries(C);CHKERRQ(ierr);
152087753ddeSHong Zhang   ierr = MatMatMultNumericAdd_SeqAIJ_SeqDense(A,B,C);CHKERRQ(ierr);
15219123193aSHong Zhang   PetscFunctionReturn(0);
15229123193aSHong Zhang }
1523b1683b59SHong Zhang 
1524b9af6bddSHong Zhang PetscErrorCode  MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense)
1525c8db22f6SHong Zhang {
1526c8db22f6SHong Zhang   PetscErrorCode ierr;
15272f5f1f90SHong Zhang   Mat_SeqAIJ     *b       = (Mat_SeqAIJ*)B->data;
15282f5f1f90SHong Zhang   Mat_SeqDense   *btdense = (Mat_SeqDense*)Btdense->data;
15292f5f1f90SHong Zhang   PetscInt       *bi      = b->i,*bj=b->j;
15302f5f1f90SHong Zhang   PetscInt       m        = Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns;
15312f5f1f90SHong Zhang   MatScalar      *btval,*btval_den,*ba=b->a;
15322f5f1f90SHong Zhang   PetscInt       *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors;
1533c8db22f6SHong Zhang 
1534c8db22f6SHong Zhang   PetscFunctionBegin;
15352f5f1f90SHong Zhang   btval_den=btdense->v;
1536580bdb30SBarry Smith   ierr     = PetscArrayzero(btval_den,m*n);CHKERRQ(ierr);
15372f5f1f90SHong Zhang   for (k=0; k<ncolors; k++) {
15382f5f1f90SHong Zhang     ncolumns = coloring->ncolumns[k];
15392f5f1f90SHong Zhang     for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */
1540d2853540SHong Zhang       col   = *(columns + colorforcol[k] + l);
15412f5f1f90SHong Zhang       btcol = bj + bi[col];
15422f5f1f90SHong Zhang       btval = ba + bi[col];
15432f5f1f90SHong Zhang       anz   = bi[col+1] - bi[col];
1544d2853540SHong Zhang       for (j=0; j<anz; j++) {
15452f5f1f90SHong Zhang         brow            = btcol[j];
15462f5f1f90SHong Zhang         btval_den[brow] = btval[j];
1547c8db22f6SHong Zhang       }
1548c8db22f6SHong Zhang     }
15492f5f1f90SHong Zhang     btval_den += m;
1550c8db22f6SHong Zhang   }
1551c8db22f6SHong Zhang   PetscFunctionReturn(0);
1552c8db22f6SHong Zhang }
1553c8db22f6SHong Zhang 
1554b9af6bddSHong Zhang PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp)
15558972f759SHong Zhang {
15568972f759SHong Zhang   PetscErrorCode    ierr;
1557b2d2b04fSHong Zhang   Mat_SeqAIJ        *csp = (Mat_SeqAIJ*)Csp->data;
15581683a169SBarry Smith   const PetscScalar *ca_den,*ca_den_ptr;
15591683a169SBarry Smith   PetscScalar       *ca=csp->a;
1560f99a636bSHong Zhang   PetscInt          k,l,m=Cden->rmap->n,ncolors=matcoloring->ncolors;
1561e88777f2SHong Zhang   PetscInt          brows=matcoloring->brows,*den2sp=matcoloring->den2sp;
1562077f23c2SHong Zhang   PetscInt          nrows,*row,*idx;
1563077f23c2SHong Zhang   PetscInt          *rows=matcoloring->rows,*colorforrow=matcoloring->colorforrow;
15648972f759SHong Zhang 
15658972f759SHong Zhang   PetscFunctionBegin;
15661683a169SBarry Smith   ierr   = MatDenseGetArrayRead(Cden,&ca_den);CHKERRQ(ierr);
1567f99a636bSHong Zhang 
1568077f23c2SHong Zhang   if (brows > 0) {
1569077f23c2SHong Zhang     PetscInt *lstart,row_end,row_start;
1570980ae229SHong Zhang     lstart = matcoloring->lstart;
1571580bdb30SBarry Smith     ierr = PetscArrayzero(lstart,ncolors);CHKERRQ(ierr);
1572eeb4fd02SHong Zhang 
1573077f23c2SHong Zhang     row_end = brows;
1574eeb4fd02SHong Zhang     if (row_end > m) row_end = m;
1575077f23c2SHong Zhang     for (row_start=0; row_start<m; row_start+=brows) { /* loop over row blocks of Csp */
1576077f23c2SHong Zhang       ca_den_ptr = ca_den;
1577980ae229SHong Zhang       for (k=0; k<ncolors; k++) { /* loop over colors (columns of Cden) */
1578eeb4fd02SHong Zhang         nrows = matcoloring->nrows[k];
1579eeb4fd02SHong Zhang         row   = rows  + colorforrow[k];
1580eeb4fd02SHong Zhang         idx   = den2sp + colorforrow[k];
1581eeb4fd02SHong Zhang         for (l=lstart[k]; l<nrows; l++) {
1582eeb4fd02SHong Zhang           if (row[l] >= row_end) {
1583eeb4fd02SHong Zhang             lstart[k] = l;
1584eeb4fd02SHong Zhang             break;
1585eeb4fd02SHong Zhang           } else {
1586077f23c2SHong Zhang             ca[idx[l]] = ca_den_ptr[row[l]];
1587eeb4fd02SHong Zhang           }
1588eeb4fd02SHong Zhang         }
1589077f23c2SHong Zhang         ca_den_ptr += m;
1590eeb4fd02SHong Zhang       }
1591077f23c2SHong Zhang       row_end += brows;
1592eeb4fd02SHong Zhang       if (row_end > m) row_end = m;
1593eeb4fd02SHong Zhang     }
1594077f23c2SHong Zhang   } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */
1595077f23c2SHong Zhang     ca_den_ptr = ca_den;
1596077f23c2SHong Zhang     for (k=0; k<ncolors; k++) {
1597077f23c2SHong Zhang       nrows = matcoloring->nrows[k];
1598077f23c2SHong Zhang       row   = rows  + colorforrow[k];
1599077f23c2SHong Zhang       idx   = den2sp + colorforrow[k];
1600077f23c2SHong Zhang       for (l=0; l<nrows; l++) {
1601077f23c2SHong Zhang         ca[idx[l]] = ca_den_ptr[row[l]];
1602077f23c2SHong Zhang       }
1603077f23c2SHong Zhang       ca_den_ptr += m;
1604077f23c2SHong Zhang     }
1605f99a636bSHong Zhang   }
1606f99a636bSHong Zhang 
16071683a169SBarry Smith   ierr = MatDenseRestoreArrayRead(Cden,&ca_den);CHKERRQ(ierr);
1608f99a636bSHong Zhang #if defined(PETSC_USE_INFO)
1609077f23c2SHong Zhang   if (matcoloring->brows > 0) {
1610f99a636bSHong Zhang     ierr = PetscInfo1(Csp,"Loop over %D row blocks for den2sp\n",brows);CHKERRQ(ierr);
1611e88777f2SHong Zhang   } else {
1612077f23c2SHong Zhang     ierr = PetscInfo(Csp,"Loop over colors/columns of Cden, inefficient for large sparse matrix product \n");CHKERRQ(ierr);
1613e88777f2SHong Zhang   }
1614f99a636bSHong Zhang #endif
16158972f759SHong Zhang   PetscFunctionReturn(0);
16168972f759SHong Zhang }
16178972f759SHong Zhang 
1618b9af6bddSHong Zhang PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c)
1619b1683b59SHong Zhang {
1620b1683b59SHong Zhang   PetscErrorCode ierr;
1621e88777f2SHong Zhang   PetscInt       i,n,nrows,Nbs,j,k,m,ncols,col,cm;
16221a83f524SJed Brown   const PetscInt *is,*ci,*cj,*row_idx;
1623b28d80bdSHong Zhang   PetscInt       nis = iscoloring->n,*rowhit,bs = 1;
1624b1683b59SHong Zhang   IS             *isa;
1625b28d80bdSHong Zhang   Mat_SeqAIJ     *csp = (Mat_SeqAIJ*)mat->data;
1626e88777f2SHong Zhang   PetscInt       *colorforrow,*rows,*rows_i,*idxhit,*spidx,*den2sp,*den2sp_i;
1627e88777f2SHong Zhang   PetscInt       *colorforcol,*columns,*columns_i,brows;
1628e88777f2SHong Zhang   PetscBool      flg;
1629b1683b59SHong Zhang 
1630b1683b59SHong Zhang   PetscFunctionBegin;
1631071fcb05SBarry Smith   ierr = ISColoringGetIS(iscoloring,PETSC_USE_POINTER,PETSC_IGNORE,&isa);CHKERRQ(ierr);
1632e99be685SHong Zhang 
16337ecccc15SHong Zhang   /* bs >1 is not being tested yet! */
1634e88777f2SHong Zhang   Nbs       = mat->cmap->N/bs;
1635b1683b59SHong Zhang   c->M      = mat->rmap->N/bs;  /* set total rows, columns and local rows */
1636e88777f2SHong Zhang   c->N      = Nbs;
1637e88777f2SHong Zhang   c->m      = c->M;
1638b1683b59SHong Zhang   c->rstart = 0;
1639e88777f2SHong Zhang   c->brows  = 100;
1640b1683b59SHong Zhang 
1641b1683b59SHong Zhang   c->ncolors = nis;
1642dcca6d9dSJed Brown   ierr = PetscMalloc3(nis,&c->ncolumns,nis,&c->nrows,nis+1,&colorforrow);CHKERRQ(ierr);
1643854ce69bSBarry Smith   ierr = PetscMalloc1(csp->nz+1,&rows);CHKERRQ(ierr);
1644854ce69bSBarry Smith   ierr = PetscMalloc1(csp->nz+1,&den2sp);CHKERRQ(ierr);
1645e88777f2SHong Zhang 
1646e88777f2SHong Zhang   brows = c->brows;
1647c5929fdfSBarry Smith   ierr = PetscOptionsGetInt(NULL,NULL,"-matden2sp_brows",&brows,&flg);CHKERRQ(ierr);
1648e88777f2SHong Zhang   if (flg) c->brows = brows;
1649eeb4fd02SHong Zhang   if (brows > 0) {
1650854ce69bSBarry Smith     ierr = PetscMalloc1(nis+1,&c->lstart);CHKERRQ(ierr);
1651e88777f2SHong Zhang   }
16522205254eSKarl Rupp 
1653d2853540SHong Zhang   colorforrow[0] = 0;
1654d2853540SHong Zhang   rows_i         = rows;
1655f99a636bSHong Zhang   den2sp_i       = den2sp;
1656b1683b59SHong Zhang 
1657854ce69bSBarry Smith   ierr = PetscMalloc1(nis+1,&colorforcol);CHKERRQ(ierr);
1658854ce69bSBarry Smith   ierr = PetscMalloc1(Nbs+1,&columns);CHKERRQ(ierr);
16592205254eSKarl Rupp 
1660d2853540SHong Zhang   colorforcol[0] = 0;
1661d2853540SHong Zhang   columns_i      = columns;
1662d2853540SHong Zhang 
1663077f23c2SHong Zhang   /* get column-wise storage of mat */
1664077f23c2SHong Zhang   ierr = MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr);
1665b1683b59SHong Zhang 
1666b28d80bdSHong Zhang   cm   = c->m;
1667854ce69bSBarry Smith   ierr = PetscMalloc1(cm+1,&rowhit);CHKERRQ(ierr);
1668854ce69bSBarry Smith   ierr = PetscMalloc1(cm+1,&idxhit);CHKERRQ(ierr);
1669f99a636bSHong Zhang   for (i=0; i<nis; i++) { /* loop over color */
1670b1683b59SHong Zhang     ierr = ISGetLocalSize(isa[i],&n);CHKERRQ(ierr);
1671b1683b59SHong Zhang     ierr = ISGetIndices(isa[i],&is);CHKERRQ(ierr);
16722205254eSKarl Rupp 
1673b1683b59SHong Zhang     c->ncolumns[i] = n;
1674b1683b59SHong Zhang     if (n) {
1675580bdb30SBarry Smith       ierr = PetscArraycpy(columns_i,is,n);CHKERRQ(ierr);
1676b1683b59SHong Zhang     }
1677d2853540SHong Zhang     colorforcol[i+1] = colorforcol[i] + n;
1678d2853540SHong Zhang     columns_i       += n;
1679b1683b59SHong Zhang 
1680b1683b59SHong Zhang     /* fast, crude version requires O(N*N) work */
1681580bdb30SBarry Smith     ierr = PetscArrayzero(rowhit,cm);CHKERRQ(ierr);
1682e99be685SHong Zhang 
1683b7caf3d6SHong Zhang     for (j=0; j<n; j++) { /* loop over columns*/
1684b1683b59SHong Zhang       col     = is[j];
1685d2853540SHong Zhang       row_idx = cj + ci[col];
1686b1683b59SHong Zhang       m       = ci[col+1] - ci[col];
1687b7caf3d6SHong Zhang       for (k=0; k<m; k++) { /* loop over columns marking them in rowhit */
1688e99be685SHong Zhang         idxhit[*row_idx]   = spidx[ci[col] + k];
1689d2853540SHong Zhang         rowhit[*row_idx++] = col + 1;
1690b1683b59SHong Zhang       }
1691b1683b59SHong Zhang     }
1692b1683b59SHong Zhang     /* count the number of hits */
1693b1683b59SHong Zhang     nrows = 0;
1694e8354b3eSHong Zhang     for (j=0; j<cm; j++) {
1695b1683b59SHong Zhang       if (rowhit[j]) nrows++;
1696b1683b59SHong Zhang     }
1697b1683b59SHong Zhang     c->nrows[i]      = nrows;
1698d2853540SHong Zhang     colorforrow[i+1] = colorforrow[i] + nrows;
1699d2853540SHong Zhang 
1700b1683b59SHong Zhang     nrows = 0;
1701b7caf3d6SHong Zhang     for (j=0; j<cm; j++) { /* loop over rows */
1702b1683b59SHong Zhang       if (rowhit[j]) {
1703d2853540SHong Zhang         rows_i[nrows]   = j;
170412b89a43SHong Zhang         den2sp_i[nrows] = idxhit[j];
1705b1683b59SHong Zhang         nrows++;
1706b1683b59SHong Zhang       }
1707b1683b59SHong Zhang     }
1708e88777f2SHong Zhang     den2sp_i += nrows;
1709077f23c2SHong Zhang 
1710b1683b59SHong Zhang     ierr    = ISRestoreIndices(isa[i],&is);CHKERRQ(ierr);
1711f99a636bSHong Zhang     rows_i += nrows;
1712b1683b59SHong Zhang   }
17130298fd71SBarry Smith   ierr = MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr);
1714b28d80bdSHong Zhang   ierr = PetscFree(rowhit);CHKERRQ(ierr);
1715071fcb05SBarry Smith   ierr = ISColoringRestoreIS(iscoloring,PETSC_USE_POINTER,&isa);CHKERRQ(ierr);
1716d2853540SHong Zhang   if (csp->nz != colorforrow[nis]) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"csp->nz %d != colorforrow[nis] %d",csp->nz,colorforrow[nis]);
1717b28d80bdSHong Zhang 
1718d2853540SHong Zhang   c->colorforrow = colorforrow;
1719d2853540SHong Zhang   c->rows        = rows;
1720f99a636bSHong Zhang   c->den2sp      = den2sp;
1721d2853540SHong Zhang   c->colorforcol = colorforcol;
1722d2853540SHong Zhang   c->columns     = columns;
17232205254eSKarl Rupp 
1724f94ccd6cSHong Zhang   ierr = PetscFree(idxhit);CHKERRQ(ierr);
1725b1683b59SHong Zhang   PetscFunctionReturn(0);
1726b1683b59SHong Zhang }
1727d013fc79Sandi selinger 
1728df97dc6dSFande Kong /* The combine method has done the symbolic and numeric in the first phase, and so we just return here */
1729df97dc6dSFande Kong PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Combined(Mat A,Mat B,Mat C)
1730df97dc6dSFande Kong {
1731df97dc6dSFande Kong   PetscFunctionBegin;
1732df97dc6dSFande Kong   /* Empty function */
1733df97dc6dSFande Kong   PetscFunctionReturn(0);
1734df97dc6dSFande Kong }
1735df97dc6dSFande Kong 
173673b67375Sandi selinger /* This algorithm combines the symbolic and numeric phase of matrix-matrix multiplication. */
1737d013fc79Sandi selinger PetscErrorCode MatMatMult_SeqAIJ_SeqAIJ_Combined(Mat A,Mat B,PetscReal fill,Mat *C)
1738d013fc79Sandi selinger {
1739d013fc79Sandi selinger   PetscErrorCode     ierr;
1740d013fc79Sandi selinger   PetscLogDouble     flops=0.0;
1741d013fc79Sandi selinger   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
17422869b61bSandi selinger   const PetscInt     *ai=a->i,*bi=b->i;
1743d013fc79Sandi selinger   PetscInt           *ci,*cj,*cj_i;
1744d013fc79Sandi selinger   PetscScalar        *ca,*ca_i;
17452869b61bSandi selinger   PetscInt           b_maxmemrow,c_maxmem,a_col;
1746d013fc79Sandi selinger   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
1747d013fc79Sandi selinger   PetscInt           i,k,ndouble=0;
1748d013fc79Sandi selinger   PetscReal          afill;
1749d013fc79Sandi selinger   PetscScalar        *c_row_val_dense;
1750d013fc79Sandi selinger   PetscBool          *c_row_idx_flags;
1751d013fc79Sandi selinger   PetscInt           *aj_i=a->j;
1752d013fc79Sandi selinger   PetscScalar        *aa_i=a->a;
1753d013fc79Sandi selinger 
1754d013fc79Sandi selinger   PetscFunctionBegin;
17552869b61bSandi selinger 
17562869b61bSandi selinger   /* Step 1: Determine upper bounds on memory for C and allocate memory */
17572869b61bSandi selinger   /* This should be enough for almost all matrices. If still more memory is needed, it is reallocated later. */
17582869b61bSandi selinger   c_maxmem    = 8*(ai[am]+bi[bm]);
17592869b61bSandi selinger   b_maxmemrow = PetscMin(bi[bm],bn);
1760d013fc79Sandi selinger   ierr  = PetscMalloc1(am+1,&ci);CHKERRQ(ierr);
1761580bdb30SBarry Smith   ierr  = PetscCalloc1(bn,&c_row_val_dense);CHKERRQ(ierr);
1762580bdb30SBarry Smith   ierr  = PetscCalloc1(bn,&c_row_idx_flags);CHKERRQ(ierr);
1763d013fc79Sandi selinger   ierr  = PetscMalloc1(c_maxmem,&cj);CHKERRQ(ierr);
1764d013fc79Sandi selinger   ierr  = PetscMalloc1(c_maxmem,&ca);CHKERRQ(ierr);
1765d013fc79Sandi selinger   ca_i  = ca;
1766d013fc79Sandi selinger   cj_i  = cj;
1767d013fc79Sandi selinger   ci[0] = 0;
1768d013fc79Sandi selinger   for (i=0; i<am; i++) {
1769d013fc79Sandi selinger     /* Step 2: Initialize the dense row vector for C  */
1770d013fc79Sandi selinger     const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
1771d013fc79Sandi selinger     PetscInt       cnzi = 0;
1772d013fc79Sandi selinger     PetscInt       *bj_i;
1773d013fc79Sandi selinger     PetscScalar    *ba_i;
17742869b61bSandi selinger     /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem  -> allocate more memory
17752869b61bSandi selinger        Usually, there is enough memory in the first place, so this is not executed. */
17762869b61bSandi selinger     while (ci[i] + b_maxmemrow > c_maxmem) {
17772869b61bSandi selinger       c_maxmem *= 2;
17782869b61bSandi selinger       ndouble++;
1779928bb9adSStefano Zampini       ierr = PetscRealloc(sizeof(PetscInt)*c_maxmem,&cj);CHKERRQ(ierr);
1780928bb9adSStefano Zampini       ierr = PetscRealloc(sizeof(PetscScalar)*c_maxmem,&ca);CHKERRQ(ierr);
17812869b61bSandi selinger     }
1782d013fc79Sandi selinger 
1783d013fc79Sandi selinger     /* Step 3: Do the numerical calculations */
1784d013fc79Sandi selinger     for (a_col=0; a_col<anzi; a_col++) {          /* iterate over all non zero values in a row of A */
1785d013fc79Sandi selinger       PetscInt       a_col_index = aj_i[a_col];
1786d013fc79Sandi selinger       const PetscInt bnzi        = bi[a_col_index+1] - bi[a_col_index];
1787d013fc79Sandi selinger       flops += 2*bnzi;
1788d013fc79Sandi selinger       bj_i   = b->j + bi[a_col_index];   /* points to the current row in bj */
1789d013fc79Sandi selinger       ba_i   = b->a + bi[a_col_index];   /* points to the current row in ba */
1790d013fc79Sandi selinger       for (k=0; k<bnzi; ++k) { /* iterate over all non zeros of this row in B */
1791d013fc79Sandi selinger         if (c_row_idx_flags[bj_i[k]] == PETSC_FALSE) {
17922869b61bSandi selinger           cj_i[cnzi++]             = bj_i[k];
1793d013fc79Sandi selinger           c_row_idx_flags[bj_i[k]] = PETSC_TRUE;
1794d013fc79Sandi selinger         }
1795d013fc79Sandi selinger         c_row_val_dense[bj_i[k]] += aa_i[a_col] * ba_i[k];
1796d013fc79Sandi selinger       }
1797d013fc79Sandi selinger     }
1798d013fc79Sandi selinger 
1799d013fc79Sandi selinger     /* Sort array */
18003353a62bSKarl Rupp     ierr = PetscSortInt(cnzi,cj_i);CHKERRQ(ierr);
1801d013fc79Sandi selinger     /* Step 4 */
1802d013fc79Sandi selinger     for (k=0; k<cnzi; k++) {
1803d013fc79Sandi selinger       ca_i[k]                  = c_row_val_dense[cj_i[k]];
1804d013fc79Sandi selinger       c_row_val_dense[cj_i[k]] = 0.;
1805d013fc79Sandi selinger       c_row_idx_flags[cj_i[k]] = PETSC_FALSE;
1806d013fc79Sandi selinger     }
1807d013fc79Sandi selinger     /* terminate current row */
1808d013fc79Sandi selinger     aa_i   += anzi;
1809d013fc79Sandi selinger     aj_i   += anzi;
1810d013fc79Sandi selinger     ca_i   += cnzi;
1811d013fc79Sandi selinger     cj_i   += cnzi;
1812d013fc79Sandi selinger     ci[i+1] = ci[i] + cnzi;
1813d013fc79Sandi selinger     flops  += cnzi;
1814d013fc79Sandi selinger   }
1815d013fc79Sandi selinger 
1816d013fc79Sandi selinger   /* Step 5 */
1817d013fc79Sandi selinger   /* Create the new matrix */
1818d013fc79Sandi selinger   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr);
1819d013fc79Sandi selinger   ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
182002fe1965SBarry Smith   ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr);
1821d013fc79Sandi selinger 
1822d013fc79Sandi selinger   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1823d013fc79Sandi selinger   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1824d013fc79Sandi selinger   c          = (Mat_SeqAIJ*)((*C)->data);
1825d013fc79Sandi selinger   c->a       = ca;
1826d013fc79Sandi selinger   c->free_a  = PETSC_TRUE;
1827d013fc79Sandi selinger   c->free_ij = PETSC_TRUE;
1828d013fc79Sandi selinger   c->nonew   = 0;
1829d013fc79Sandi selinger 
1830d013fc79Sandi selinger   /* set MatInfo */
1831d013fc79Sandi selinger   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
1832d013fc79Sandi selinger   if (afill < 1.0) afill = 1.0;
1833d013fc79Sandi selinger   c->maxnz                     = ci[am];
1834d013fc79Sandi selinger   c->nz                        = ci[am];
1835d013fc79Sandi selinger   (*C)->info.mallocs           = ndouble;
1836d013fc79Sandi selinger   (*C)->info.fill_ratio_given  = fill;
1837d013fc79Sandi selinger   (*C)->info.fill_ratio_needed = afill;
1838df97dc6dSFande Kong   (*C)->ops->matmultnumeric    = MatMatMultNumeric_SeqAIJ_SeqAIJ_Combined;
1839d013fc79Sandi selinger 
184073b67375Sandi selinger   ierr = PetscFree(c_row_val_dense);CHKERRQ(ierr);
184173b67375Sandi selinger   ierr = PetscFree(c_row_idx_flags);CHKERRQ(ierr);
1842d013fc79Sandi selinger 
1843d013fc79Sandi selinger   ierr = MatAssemblyBegin(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1844d013fc79Sandi selinger   ierr = MatAssemblyEnd(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1845d013fc79Sandi selinger   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
1846d013fc79Sandi selinger   PetscFunctionReturn(0);
1847d013fc79Sandi selinger }
1848