xref: /petsc/src/mat/impls/aij/seq/matmatmult.c (revision b3b4fc5a114eeab53373cded8564dfacad3aa869)
1 
2 /*
3   Defines matrix-matrix product routines for pairs of SeqAIJ matrices
4           C = A * B
5 */
6 
7 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
8 #include <../src/mat/utils/freespace.h>
9 #include <../src/mat/utils/petscheap.h>
10 #include <petscbt.h>
11 #include <../src/mat/impls/dense/seq/dense.h>
12 
13 static PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat,Mat,PetscReal,Mat*);
14 
15 #undef __FUNCT__
16 #define __FUNCT__ "MatMatMult_SeqAIJ_SeqAIJ"
17 PetscErrorCode MatMatMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
18 {
19   PetscErrorCode ierr;
20   PetscBool      scalable=PETSC_FALSE,scalable_fast=PETSC_FALSE,heap = PETSC_FALSE,btheap = PETSC_FALSE,llcondensed = PETSC_FALSE;
21 
22   PetscFunctionBegin;
23   if (scall == MAT_INITIAL_MATRIX) {
24     ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
25     ierr = PetscOptionsBool("-matmatmult_scalable","Use a scalable but slower C=A*B","",scalable,&scalable,NULL);CHKERRQ(ierr);
26     ierr = PetscOptionsBool("-matmatmult_scalable_fast","Use a scalable but slower C=A*B","",scalable_fast,&scalable_fast,NULL);CHKERRQ(ierr);
27     ierr = PetscOptionsBool("-matmatmult_heap","Use heap implementation of symbolic factorization C=A*B","",heap,&heap,NULL);CHKERRQ(ierr);
28     ierr = PetscOptionsBool("-matmatmult_btheap","Use btheap implementation of symbolic factorization C=A*B","",btheap,&btheap,NULL);CHKERRQ(ierr);
29     ierr = PetscOptionsBool("-matmatmult_llcondensed","Use LLCondensed to for symbolic C=A*B","",llcondensed,&llcondensed,NULL);CHKERRQ(ierr);
30     ierr = PetscOptionsEnd();CHKERRQ(ierr);
31     ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
32     if (scalable_fast) {
33       ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A,B,fill,C);CHKERRQ(ierr);
34     } else if (scalable) {
35       ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A,B,fill,C);CHKERRQ(ierr);
36     } else if (heap) {
37       ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A,B,fill,C);CHKERRQ(ierr);
38     } else if (btheap) {
39       ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A,B,fill,C);CHKERRQ(ierr);
40     } else if (llcondensed) {
41       ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A,B,fill,C);CHKERRQ(ierr);
42     } else {
43       ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
44     }
45     ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
46   }
47 
48   ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
49   ierr = (*(*C)->ops->matmultnumeric)(A,B,*C);CHKERRQ(ierr);
50   ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
51   PetscFunctionReturn(0);
52 }
53 
54 #undef __FUNCT__
55 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed"
56 static PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A,Mat B,PetscReal fill,Mat *C)
57 {
58   PetscErrorCode     ierr;
59   Mat_SeqAIJ         *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
60   PetscInt           *ai=a->i,*bi=b->i,*ci,*cj;
61   PetscInt           am =A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
62   PetscReal          afill;
63   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,nlnk_max,*lnk,ndouble=0;
64   PetscBT            lnkbt;
65   PetscFreeSpaceList free_space=NULL,current_space=NULL;
66 
67   PetscFunctionBegin;
68   /* Get ci and cj */
69   /*---------------*/
70   /* Allocate ci array, arrays for fill computation and */
71   /* free space for accumulating nonzero column info */
72   ierr  = PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
73   ci[0] = 0;
74 
75   /* create and initialize a linked list */
76   nlnk_max = a->rmax*b->rmax;
77   if (!nlnk_max || nlnk_max > bn) nlnk_max = bn;
78   ierr = PetscLLCondensedCreate(nlnk_max,bn,&lnk,&lnkbt);CHKERRQ(ierr);
79 
80   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
81   ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);CHKERRQ(ierr);
82 
83   current_space = free_space;
84 
85   /* Determine ci and cj */
86   for (i=0; i<am; i++) {
87     anzi = ai[i+1] - ai[i];
88     aj   = a->j + ai[i];
89     for (j=0; j<anzi; j++) {
90       brow = aj[j];
91       bnzj = bi[brow+1] - bi[brow];
92       bj   = b->j + bi[brow];
93       /* add non-zero cols of B into the sorted linked list lnk */
94       ierr = PetscLLCondensedAddSorted(bnzj,bj,lnk,lnkbt);CHKERRQ(ierr);
95     }
96     cnzi = lnk[0];
97 
98     /* If free space is not available, make more free space */
99     /* Double the amount of total space in the list */
100     if (current_space->local_remaining<cnzi) {
101       ierr = PetscFreeSpaceGet(cnzi+current_space->total_array_size,&current_space);CHKERRQ(ierr);
102       ndouble++;
103     }
104 
105     /* Copy data into free space, then initialize lnk */
106     ierr = PetscLLCondensedClean(bn,cnzi,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
107 
108     current_space->array           += cnzi;
109     current_space->local_used      += cnzi;
110     current_space->local_remaining -= cnzi;
111 
112     ci[i+1] = ci[i] + cnzi;
113   }
114 
115   /* Column indices are in the list of free space */
116   /* Allocate space for cj, initialize cj, and */
117   /* destroy list of free space and other temporary array(s) */
118   ierr = PetscMalloc((ci[am]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
119   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
120   ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr);
121 
122   /* put together the new symbolic matrix */
123   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr);
124 
125   (*C)->rmap->bs = A->rmap->bs;
126   (*C)->cmap->bs = B->cmap->bs;
127 
128   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
129   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
130   c                         = (Mat_SeqAIJ*)((*C)->data);
131   c->free_a                 = PETSC_FALSE;
132   c->free_ij                = PETSC_TRUE;
133   c->nonew                  = 0;
134   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ; /* fast, needs non-scalable O(bn) array 'abdense' */
135 
136   /* set MatInfo */
137   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
138   if (afill < 1.0) afill = 1.0;
139   c->maxnz                     = ci[am];
140   c->nz                        = ci[am];
141   (*C)->info.mallocs           = ndouble;
142   (*C)->info.fill_ratio_given  = fill;
143   (*C)->info.fill_ratio_needed = afill;
144 
145 #if defined(PETSC_USE_INFO)
146   if (ci[am]) {
147     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",ndouble,fill,afill);CHKERRQ(ierr);
148     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);CHKERRQ(ierr);
149   } else {
150     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
151   }
152 #endif
153   PetscFunctionReturn(0);
154 }
155 
156 #undef __FUNCT__
157 #define __FUNCT__ "MatMatMultNumeric_SeqAIJ_SeqAIJ"
158 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
159 {
160   PetscErrorCode ierr;
161   PetscLogDouble flops=0.0;
162   Mat_SeqAIJ     *a   = (Mat_SeqAIJ*)A->data;
163   Mat_SeqAIJ     *b   = (Mat_SeqAIJ*)B->data;
164   Mat_SeqAIJ     *c   = (Mat_SeqAIJ*)C->data;
165   PetscInt       *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
166   PetscInt       am   =A->rmap->n,cm=C->rmap->n;
167   PetscInt       i,j,k,anzi,bnzi,cnzi,brow;
168   PetscScalar    *aa=a->a,*ba=b->a,*baj,*ca,valtmp;
169   PetscScalar    *ab_dense;
170 
171   PetscFunctionBegin;
172   if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
173     ierr      = PetscMalloc((ci[cm]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
174     c->a      = ca;
175     c->free_a = PETSC_TRUE;
176 
177     ierr = PetscMalloc(B->cmap->N*sizeof(PetscScalar),&ab_dense);CHKERRQ(ierr);
178     ierr = PetscMemzero(ab_dense,B->cmap->N*sizeof(PetscScalar));CHKERRQ(ierr);
179 
180     c->matmult_abdense = ab_dense;
181   } else {
182     ca       = c->a;
183     ab_dense = c->matmult_abdense;
184   }
185 
186   /* clean old values in C */
187   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
188   /* Traverse A row-wise. */
189   /* Build the ith row in C by summing over nonzero columns in A, */
190   /* the rows of B corresponding to nonzeros of A. */
191   for (i=0; i<am; i++) {
192     anzi = ai[i+1] - ai[i];
193     for (j=0; j<anzi; j++) {
194       brow = aj[j];
195       bnzi = bi[brow+1] - bi[brow];
196       bjj  = bj + bi[brow];
197       baj  = ba + bi[brow];
198       /* perform dense axpy */
199       valtmp = aa[j];
200       for (k=0; k<bnzi; k++) {
201         ab_dense[bjj[k]] += valtmp*baj[k];
202       }
203       flops += 2*bnzi;
204     }
205     aj += anzi; aa += anzi;
206 
207     cnzi = ci[i+1] - ci[i];
208     for (k=0; k<cnzi; k++) {
209       ca[k]          += ab_dense[cj[k]];
210       ab_dense[cj[k]] = 0.0; /* zero ab_dense */
211     }
212     flops += cnzi;
213     cj    += cnzi; ca += cnzi;
214   }
215   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
216   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
217   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
218   PetscFunctionReturn(0);
219 }
220 
221 #undef __FUNCT__
222 #define __FUNCT__ "MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable"
223 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C)
224 {
225   PetscErrorCode ierr;
226   PetscLogDouble flops=0.0;
227   Mat_SeqAIJ     *a   = (Mat_SeqAIJ*)A->data;
228   Mat_SeqAIJ     *b   = (Mat_SeqAIJ*)B->data;
229   Mat_SeqAIJ     *c   = (Mat_SeqAIJ*)C->data;
230   PetscInt       *ai  = a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
231   PetscInt       am   = A->rmap->N,cm=C->rmap->N;
232   PetscInt       i,j,k,anzi,bnzi,cnzi,brow;
233   PetscScalar    *aa=a->a,*ba=b->a,*baj,*ca=c->a,valtmp;
234   PetscInt       nextb;
235 
236   PetscFunctionBegin;
237   /* clean old values in C */
238   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
239   /* Traverse A row-wise. */
240   /* Build the ith row in C by summing over nonzero columns in A, */
241   /* the rows of B corresponding to nonzeros of A. */
242   for (i=0; i<am; i++) {
243     anzi = ai[i+1] - ai[i];
244     cnzi = ci[i+1] - ci[i];
245     for (j=0; j<anzi; j++) {
246       brow = aj[j];
247       bnzi = bi[brow+1] - bi[brow];
248       bjj  = bj + bi[brow];
249       baj  = ba + bi[brow];
250       /* perform sparse axpy */
251       valtmp = aa[j];
252       nextb  = 0;
253       for (k=0; nextb<bnzi; k++) {
254         if (cj[k] == bjj[nextb]) { /* ccol == bcol */
255           ca[k] += valtmp*baj[nextb++];
256         }
257       }
258       flops += 2*bnzi;
259     }
260     aj += anzi; aa += anzi;
261     cj += cnzi; ca += cnzi;
262   }
263 
264   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
265   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
266   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
267   PetscFunctionReturn(0);
268 }
269 
270 #undef __FUNCT__
271 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast"
272 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A,Mat B,PetscReal fill,Mat *C)
273 {
274   PetscErrorCode     ierr;
275   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
276   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
277   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
278   MatScalar          *ca;
279   PetscReal          afill;
280   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,nlnk_max,*lnk,ndouble=0;
281   PetscFreeSpaceList free_space=NULL,current_space=NULL;
282 
283   PetscFunctionBegin;
284   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
285   /*-----------------------------------------------------------------------------------------*/
286   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
287   ierr  = PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
288   ci[0] = 0;
289 
290   /* create and initialize a linked list */
291   nlnk_max = a->rmax*b->rmax;
292   if (!nlnk_max || nlnk_max > bn) nlnk_max = bn; /* in case rmax is not defined for A or B */
293   ierr = PetscLLCondensedCreate_fast(nlnk_max,&lnk);CHKERRQ(ierr);
294 
295   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
296   ierr          = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);CHKERRQ(ierr);
297   current_space = free_space;
298 
299   /* Determine ci and cj */
300   for (i=0; i<am; i++) {
301     anzi = ai[i+1] - ai[i];
302     aj   = a->j + ai[i];
303     for (j=0; j<anzi; j++) {
304       brow = aj[j];
305       bnzj = bi[brow+1] - bi[brow];
306       bj   = b->j + bi[brow];
307       /* add non-zero cols of B into the sorted linked list lnk */
308       ierr = PetscLLCondensedAddSorted_fast(bnzj,bj,lnk);CHKERRQ(ierr);
309     }
310     cnzi = lnk[1];
311 
312     /* If free space is not available, make more free space */
313     /* Double the amount of total space in the list */
314     if (current_space->local_remaining<cnzi) {
315       ierr = PetscFreeSpaceGet(cnzi+current_space->total_array_size,&current_space);CHKERRQ(ierr);
316       ndouble++;
317     }
318 
319     /* Copy data into free space, then initialize lnk */
320     ierr = PetscLLCondensedClean_fast(cnzi,current_space->array,lnk);CHKERRQ(ierr);
321 
322     current_space->array           += cnzi;
323     current_space->local_used      += cnzi;
324     current_space->local_remaining -= cnzi;
325 
326     ci[i+1] = ci[i] + cnzi;
327   }
328 
329   /* Column indices are in the list of free space */
330   /* Allocate space for cj, initialize cj, and */
331   /* destroy list of free space and other temporary array(s) */
332   ierr = PetscMalloc((ci[am]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
333   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
334   ierr = PetscLLCondensedDestroy_fast(lnk);CHKERRQ(ierr);
335 
336   /* Allocate space for ca */
337   ierr = PetscMalloc((ci[am]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
338   ierr = PetscMemzero(ca,(ci[am]+1)*sizeof(MatScalar));CHKERRQ(ierr);
339 
340   /* put together the new symbolic matrix */
341   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,C);CHKERRQ(ierr);
342 
343   (*C)->rmap->bs = A->rmap->bs;
344   (*C)->cmap->bs = B->cmap->bs;
345 
346   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
347   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
348   c          = (Mat_SeqAIJ*)((*C)->data);
349   c->free_a  = PETSC_TRUE;
350   c->free_ij = PETSC_TRUE;
351   c->nonew   = 0;
352 
353   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; /* slower, less memory */
354 
355   /* set MatInfo */
356   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
357   if (afill < 1.0) afill = 1.0;
358   c->maxnz                     = ci[am];
359   c->nz                        = ci[am];
360   (*C)->info.mallocs           = ndouble;
361   (*C)->info.fill_ratio_given  = fill;
362   (*C)->info.fill_ratio_needed = afill;
363 
364 #if defined(PETSC_USE_INFO)
365   if (ci[am]) {
366     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",ndouble,fill,afill);CHKERRQ(ierr);
367     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);CHKERRQ(ierr);
368   } else {
369     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
370   }
371 #endif
372   PetscFunctionReturn(0);
373 }
374 
375 
376 #undef __FUNCT__
377 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable"
378 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat *C)
379 {
380   PetscErrorCode     ierr;
381   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
382   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
383   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
384   MatScalar          *ca;
385   PetscReal          afill;
386   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,nlnk_max,*lnk,ndouble=0;
387   PetscFreeSpaceList free_space=NULL,current_space=NULL;
388 
389   PetscFunctionBegin;
390   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
391   /*---------------------------------------------------------------------------------------------*/
392   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
393   ierr  = PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
394   ci[0] = 0;
395 
396   /* create and initialize a linked list */
397   nlnk_max = a->rmax*b->rmax;
398   if (!nlnk_max || nlnk_max > bn) nlnk_max = bn; /* in case rmax is not defined for A or B */
399   ierr = PetscLLCondensedCreate_Scalable(nlnk_max,&lnk);CHKERRQ(ierr);
400 
401   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
402   ierr          = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);CHKERRQ(ierr);
403   current_space = free_space;
404 
405   /* Determine ci and cj */
406   for (i=0; i<am; i++) {
407     anzi = ai[i+1] - ai[i];
408     aj   = a->j + ai[i];
409     for (j=0; j<anzi; j++) {
410       brow = aj[j];
411       bnzj = bi[brow+1] - bi[brow];
412       bj   = b->j + bi[brow];
413       /* add non-zero cols of B into the sorted linked list lnk */
414       ierr = PetscLLCondensedAddSorted_Scalable(bnzj,bj,lnk);CHKERRQ(ierr);
415     }
416     cnzi = lnk[0];
417 
418     /* If free space is not available, make more free space */
419     /* Double the amount of total space in the list */
420     if (current_space->local_remaining<cnzi) {
421       ierr = PetscFreeSpaceGet(cnzi+current_space->total_array_size,&current_space);CHKERRQ(ierr);
422       ndouble++;
423     }
424 
425     /* Copy data into free space, then initialize lnk */
426     ierr = PetscLLCondensedClean_Scalable(cnzi,current_space->array,lnk);CHKERRQ(ierr);
427 
428     current_space->array           += cnzi;
429     current_space->local_used      += cnzi;
430     current_space->local_remaining -= cnzi;
431 
432     ci[i+1] = ci[i] + cnzi;
433   }
434 
435   /* Column indices are in the list of free space */
436   /* Allocate space for cj, initialize cj, and */
437   /* destroy list of free space and other temporary array(s) */
438   ierr = PetscMalloc((ci[am]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
439   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
440   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
441 
442   /* Allocate space for ca */
443   /*-----------------------*/
444   ierr = PetscMalloc((ci[am]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
445   ierr = PetscMemzero(ca,(ci[am]+1)*sizeof(MatScalar));CHKERRQ(ierr);
446 
447   /* put together the new symbolic matrix */
448   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,C);CHKERRQ(ierr);
449 
450   (*C)->rmap->bs = A->rmap->bs;
451   (*C)->cmap->bs = B->cmap->bs;
452 
453   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
454   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
455   c          = (Mat_SeqAIJ*)((*C)->data);
456   c->free_a  = PETSC_TRUE;
457   c->free_ij = PETSC_TRUE;
458   c->nonew   = 0;
459 
460   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; /* slower, less memory */
461 
462   /* set MatInfo */
463   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
464   if (afill < 1.0) afill = 1.0;
465   c->maxnz                     = ci[am];
466   c->nz                        = ci[am];
467   (*C)->info.mallocs           = ndouble;
468   (*C)->info.fill_ratio_given  = fill;
469   (*C)->info.fill_ratio_needed = afill;
470 
471 #if defined(PETSC_USE_INFO)
472   if (ci[am]) {
473     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",ndouble,fill,afill);CHKERRQ(ierr);
474     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);CHKERRQ(ierr);
475   } else {
476     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
477   }
478 #endif
479   PetscFunctionReturn(0);
480 }
481 
482 #undef __FUNCT__
483 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap"
484 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A,Mat B,PetscReal fill,Mat *C)
485 {
486   PetscErrorCode     ierr;
487   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
488   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j;
489   PetscInt           *ci,*cj,*bb;
490   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
491   PetscReal          afill;
492   PetscInt           i,j,col,ndouble = 0;
493   PetscFreeSpaceList free_space=NULL,current_space=NULL;
494   PetscHeap          h;
495 
496   PetscFunctionBegin;
497   /* Get ci and cj - by merging sorted rows using a heap */
498   /*---------------------------------------------------------------------------------------------*/
499   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
500   ierr  = PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
501   ci[0] = 0;
502 
503   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
504   ierr          = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);CHKERRQ(ierr);
505   current_space = free_space;
506 
507   ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr);
508   ierr = PetscMalloc(a->rmax*sizeof(PetscInt),&bb);CHKERRQ(ierr);
509 
510   /* Determine ci and cj */
511   for (i=0; i<am; i++) {
512     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
513     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
514     ci[i+1] = ci[i];
515     /* Populate the min heap */
516     for (j=0; j<anzi; j++) {
517       bb[j] = bi[acol[j]];         /* bb points at the start of the row */
518       if (bb[j] < bi[acol[j]+1]) { /* Add if row is nonempty */
519         ierr = PetscHeapAdd(h,j,bj[bb[j]++]);CHKERRQ(ierr);
520       }
521     }
522     /* Pick off the min element, adding it to free space */
523     ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
524     while (j >= 0) {
525       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
526         ierr = PetscFreeSpaceGet(PetscMin(2*current_space->total_array_size,16 << 20),&current_space);CHKERRQ(ierr);
527         ndouble++;
528       }
529       *(current_space->array++) = col;
530       current_space->local_used++;
531       current_space->local_remaining--;
532       ci[i+1]++;
533 
534       /* stash if anything else remains in this row of B */
535       if (bb[j] < bi[acol[j]+1]) {ierr = PetscHeapStash(h,j,bj[bb[j]++]);CHKERRQ(ierr);}
536       while (1) {               /* pop and stash any other rows of B that also had an entry in this column */
537         PetscInt j2,col2;
538         ierr = PetscHeapPeek(h,&j2,&col2);CHKERRQ(ierr);
539         if (col2 != col) break;
540         ierr = PetscHeapPop(h,&j2,&col2);CHKERRQ(ierr);
541         if (bb[j2] < bi[acol[j2]+1]) {ierr = PetscHeapStash(h,j2,bj[bb[j2]++]);CHKERRQ(ierr);}
542       }
543       /* Put any stashed elements back into the min heap */
544       ierr = PetscHeapUnstash(h);CHKERRQ(ierr);
545       ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
546     }
547   }
548   ierr = PetscFree(bb);CHKERRQ(ierr);
549   ierr = PetscHeapDestroy(&h);CHKERRQ(ierr);
550 
551   /* Column indices are in the list of free space */
552   /* Allocate space for cj, initialize cj, and */
553   /* destroy list of free space and other temporary array(s) */
554   ierr = PetscMalloc(ci[am]*sizeof(PetscInt),&cj);CHKERRQ(ierr);
555   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
556 
557   /* put together the new symbolic matrix */
558   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr);
559 
560   (*C)->rmap->bs = A->rmap->bs;
561   (*C)->cmap->bs = B->cmap->bs;
562 
563   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
564   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
565   c          = (Mat_SeqAIJ*)((*C)->data);
566   c->free_a  = PETSC_TRUE;
567   c->free_ij = PETSC_TRUE;
568   c->nonew   = 0;
569 
570   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ;
571 
572   /* set MatInfo */
573   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
574   if (afill < 1.0) afill = 1.0;
575   c->maxnz                     = ci[am];
576   c->nz                        = ci[am];
577   (*C)->info.mallocs           = ndouble;
578   (*C)->info.fill_ratio_given  = fill;
579   (*C)->info.fill_ratio_needed = afill;
580 
581 #if defined(PETSC_USE_INFO)
582   if (ci[am]) {
583     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",ndouble,fill,afill);CHKERRQ(ierr);
584     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);CHKERRQ(ierr);
585   } else {
586     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
587   }
588 #endif
589   PetscFunctionReturn(0);
590 }
591 
592 #undef __FUNCT__
593 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap"
594 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A,Mat B,PetscReal fill,Mat *C)
595 {
596   PetscErrorCode     ierr;
597   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
598   const PetscInt     *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
599   PetscInt           *ci,*cj,*bb;
600   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
601   PetscReal          afill;
602   PetscInt           i,j,col,ndouble = 0;
603   PetscFreeSpaceList free_space=NULL,current_space=NULL;
604   PetscHeap          h;
605   PetscBT            bt;
606 
607   PetscFunctionBegin;
608   /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */
609   /*---------------------------------------------------------------------------------------------*/
610   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
611   ierr  = PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
612   ci[0] = 0;
613 
614   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
615   ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);CHKERRQ(ierr);
616 
617   current_space = free_space;
618 
619   ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr);
620   ierr = PetscMalloc(a->rmax*sizeof(PetscInt),&bb);CHKERRQ(ierr);
621   ierr = PetscBTCreate(bn,&bt);CHKERRQ(ierr);
622 
623   /* Determine ci and cj */
624   for (i=0; i<am; i++) {
625     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
626     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
627     const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */
628     ci[i+1] = ci[i];
629     /* Populate the min heap */
630     for (j=0; j<anzi; j++) {
631       PetscInt brow = acol[j];
632       for (bb[j] = bi[brow]; bb[j] < bi[brow+1]; bb[j]++) {
633         PetscInt bcol = bj[bb[j]];
634         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
635           ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr);
636           bb[j]++;
637           break;
638         }
639       }
640     }
641     /* Pick off the min element, adding it to free space */
642     ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
643     while (j >= 0) {
644       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
645         fptr = NULL;                      /* need PetscBTMemzero */
646         ierr = PetscFreeSpaceGet(PetscMin(2*current_space->total_array_size,16 << 20),&current_space);CHKERRQ(ierr);
647         ndouble++;
648       }
649       *(current_space->array++) = col;
650       current_space->local_used++;
651       current_space->local_remaining--;
652       ci[i+1]++;
653 
654       /* stash if anything else remains in this row of B */
655       for (; bb[j] < bi[acol[j]+1]; bb[j]++) {
656         PetscInt bcol = bj[bb[j]];
657         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
658           ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr);
659           bb[j]++;
660           break;
661         }
662       }
663       ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
664     }
665     if (fptr) {                 /* Clear the bits for this row */
666       for (; fptr<current_space->array; fptr++) {ierr = PetscBTClear(bt,*fptr);CHKERRQ(ierr);}
667     } else {                    /* We reallocated so we don't remember (easily) how to clear only the bits we changed */
668       ierr = PetscBTMemzero(bn,bt);CHKERRQ(ierr);
669     }
670   }
671   ierr = PetscFree(bb);CHKERRQ(ierr);
672   ierr = PetscHeapDestroy(&h);CHKERRQ(ierr);
673   ierr = PetscBTDestroy(&bt);CHKERRQ(ierr);
674 
675   /* Column indices are in the list of free space */
676   /* Allocate space for cj, initialize cj, and */
677   /* destroy list of free space and other temporary array(s) */
678   ierr = PetscMalloc(ci[am]*sizeof(PetscInt),&cj);CHKERRQ(ierr);
679   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
680 
681   /* put together the new symbolic matrix */
682   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr);
683 
684   (*C)->rmap->bs = A->rmap->bs;
685   (*C)->cmap->bs = B->cmap->bs;
686 
687   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
688   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
689   c          = (Mat_SeqAIJ*)((*C)->data);
690   c->free_a  = PETSC_TRUE;
691   c->free_ij = PETSC_TRUE;
692   c->nonew   = 0;
693 
694   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ;
695 
696   /* set MatInfo */
697   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
698   if (afill < 1.0) afill = 1.0;
699   c->maxnz                     = ci[am];
700   c->nz                        = ci[am];
701   (*C)->info.mallocs           = ndouble;
702   (*C)->info.fill_ratio_given  = fill;
703   (*C)->info.fill_ratio_needed = afill;
704 
705 #if defined(PETSC_USE_INFO)
706   if (ci[am]) {
707     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",ndouble,fill,afill);CHKERRQ(ierr);
708     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);CHKERRQ(ierr);
709   } else {
710     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
711   }
712 #endif
713   PetscFunctionReturn(0);
714 }
715 
716 #undef __FUNCT__
717 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqAIJ"
718 /* concatenate unique entries and then sort */
719 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
720 {
721   PetscErrorCode     ierr;
722   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
723   const PetscInt     *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
724   PetscInt           *ci,*cj;
725   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
726   PetscReal          afill;
727   PetscInt           i,j,ndouble = 0;
728   PetscSegBuffer     seg,segrow;
729   char               *seen;
730 
731   PetscFunctionBegin;
732   ierr  = PetscMalloc((am+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
733   ci[0] = 0;
734 
735   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
736   ierr = PetscSegBufferCreate(sizeof(PetscInt),(PetscInt)(fill*(ai[am]+bi[bm])),&seg);CHKERRQ(ierr);
737   ierr = PetscSegBufferCreate(sizeof(PetscInt),100,&segrow);CHKERRQ(ierr);
738   ierr = PetscMalloc(bn*sizeof(char),&seen);CHKERRQ(ierr);
739   ierr = PetscMemzero(seen,bn*sizeof(char));CHKERRQ(ierr);
740 
741   /* Determine ci and cj */
742   for (i=0; i<am; i++) {
743     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
744     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
745     PetscInt packlen = 0,*PETSC_RESTRICT crow;
746     /* Pack segrow */
747     for (j=0; j<anzi; j++) {
748       PetscInt brow = acol[j],bjstart = bi[brow],bjend = bi[brow+1],k;
749       for (k=bjstart; k<bjend; k++) {
750         PetscInt bcol = bj[k];
751         if (!seen[bcol]) { /* new entry */
752           PetscInt *PETSC_RESTRICT slot;
753           ierr = PetscSegBufferGetInts(segrow,1,&slot);CHKERRQ(ierr);
754           *slot = bcol;
755           seen[bcol] = 1;
756           packlen++;
757         }
758       }
759     }
760     ierr = PetscSegBufferGetInts(seg,packlen,&crow);CHKERRQ(ierr);
761     ierr = PetscSegBufferExtractTo(segrow,crow);CHKERRQ(ierr);
762     ierr = PetscSortInt(packlen,crow);CHKERRQ(ierr);
763     ci[i+1] = ci[i] + packlen;
764     for (j=0; j<packlen; j++) seen[crow[j]] = 0;
765   }
766   ierr = PetscSegBufferDestroy(&segrow);CHKERRQ(ierr);
767   ierr = PetscFree(seen);CHKERRQ(ierr);
768 
769   /* Column indices are in the segmented buffer */
770   ierr = PetscSegBufferExtractAlloc(seg,&cj);CHKERRQ(ierr);
771   ierr = PetscSegBufferDestroy(&seg);CHKERRQ(ierr);
772 
773   /* put together the new symbolic matrix */
774   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr);
775 
776   (*C)->rmap->bs = A->rmap->bs;
777   (*C)->cmap->bs = B->cmap->bs;
778 
779   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
780   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
781   c          = (Mat_SeqAIJ*)((*C)->data);
782   c->free_a  = PETSC_TRUE;
783   c->free_ij = PETSC_TRUE;
784   c->nonew   = 0;
785 
786   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ;
787 
788   /* set MatInfo */
789   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
790   if (afill < 1.0) afill = 1.0;
791   c->maxnz                     = ci[am];
792   c->nz                        = ci[am];
793   (*C)->info.mallocs           = ndouble;
794   (*C)->info.fill_ratio_given  = fill;
795   (*C)->info.fill_ratio_needed = afill;
796 
797 #if defined(PETSC_USE_INFO)
798   if (ci[am]) {
799     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",ndouble,fill,afill);CHKERRQ(ierr);
800     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);CHKERRQ(ierr);
801   } else {
802     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
803   }
804 #endif
805   PetscFunctionReturn(0);
806 }
807 
808 /* This routine is not used. Should be removed! */
809 #undef __FUNCT__
810 #define __FUNCT__ "MatMatTransposeMult_SeqAIJ_SeqAIJ"
811 PetscErrorCode MatMatTransposeMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
812 {
813   PetscErrorCode ierr;
814 
815   PetscFunctionBegin;
816   if (scall == MAT_INITIAL_MATRIX) {
817     ierr = PetscLogEventBegin(MAT_MatTransposeMultSymbolic,A,B,0,0);CHKERRQ(ierr);
818     ierr = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
819     ierr = PetscLogEventEnd(MAT_MatTransposeMultSymbolic,A,B,0,0);CHKERRQ(ierr);
820   }
821   ierr = PetscLogEventBegin(MAT_MatTransposeMultNumeric,A,B,0,0);CHKERRQ(ierr);
822   ierr = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(A,B,*C);CHKERRQ(ierr);
823   ierr = PetscLogEventEnd(MAT_MatTransposeMultNumeric,A,B,0,0);CHKERRQ(ierr);
824   PetscFunctionReturn(0);
825 }
826 
827 #undef __FUNCT__
828 #define __FUNCT__ "PetscContainerDestroy_Mat_MatMatTransMult"
829 PetscErrorCode PetscContainerDestroy_Mat_MatMatTransMult(void *ptr)
830 {
831   PetscErrorCode      ierr;
832   Mat_MatMatTransMult *multtrans=(Mat_MatMatTransMult*)ptr;
833 
834   PetscFunctionBegin;
835   ierr = MatTransposeColoringDestroy(&multtrans->matcoloring);CHKERRQ(ierr);
836   ierr = MatDestroy(&multtrans->Bt_den);CHKERRQ(ierr);
837   ierr = MatDestroy(&multtrans->ABt_den);CHKERRQ(ierr);
838   ierr = PetscFree(multtrans);CHKERRQ(ierr);
839   PetscFunctionReturn(0);
840 }
841 
842 #undef __FUNCT__
843 #define __FUNCT__ "MatDestroy_SeqAIJ_MatMatMultTrans"
844 PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(Mat A)
845 {
846   PetscErrorCode      ierr;
847   PetscContainer      container;
848   Mat_MatMatTransMult *multtrans=NULL;
849 
850   PetscFunctionBegin;
851   ierr = PetscObjectQuery((PetscObject)A,"Mat_MatMatTransMult",(PetscObject*)&container);CHKERRQ(ierr);
852   if (!container) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Container does not exit");
853   ierr = PetscContainerGetPointer(container,(void**)&multtrans);CHKERRQ(ierr);
854 
855   A->ops->destroy = multtrans->destroy;
856   if (A->ops->destroy) {
857     ierr = (*A->ops->destroy)(A);CHKERRQ(ierr);
858   }
859   ierr = PetscObjectCompose((PetscObject)A,"Mat_MatMatTransMult",0);CHKERRQ(ierr);
860   PetscFunctionReturn(0);
861 }
862 
863 #undef __FUNCT__
864 #define __FUNCT__ "MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ"
865 PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
866 {
867   PetscErrorCode      ierr;
868   Mat                 Bt;
869   PetscInt            *bti,*btj;
870   Mat_MatMatTransMult *multtrans;
871   PetscContainer      container;
872 
873   PetscFunctionBegin;
874   /* create symbolic Bt */
875   ierr = MatGetSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
876   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,B->cmap->n,B->rmap->n,bti,btj,NULL,&Bt);CHKERRQ(ierr);
877 
878   Bt->rmap->bs = A->cmap->bs;
879   Bt->cmap->bs = B->cmap->bs;
880 
881   /* get symbolic C=A*Bt */
882   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C);CHKERRQ(ierr);
883 
884   /* create a supporting struct for reuse intermidiate dense matrices with matcoloring */
885   ierr = PetscNew(Mat_MatMatTransMult,&multtrans);CHKERRQ(ierr);
886 
887   /* attach the supporting struct to C */
888   ierr = PetscContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
889   ierr = PetscContainerSetPointer(container,multtrans);CHKERRQ(ierr);
890   ierr = PetscContainerSetUserDestroy(container,PetscContainerDestroy_Mat_MatMatTransMult);CHKERRQ(ierr);
891   ierr = PetscObjectCompose((PetscObject)(*C),"Mat_MatMatTransMult",(PetscObject)container);CHKERRQ(ierr);
892   ierr = PetscContainerDestroy(&container);CHKERRQ(ierr);
893 
894   multtrans->usecoloring = PETSC_FALSE;
895   multtrans->destroy     = (*C)->ops->destroy;
896   (*C)->ops->destroy     = MatDestroy_SeqAIJ_MatMatMultTrans;
897 
898   ierr = PetscOptionsGetBool(NULL,"-matmattransmult_color",&multtrans->usecoloring,NULL);CHKERRQ(ierr);
899   if (multtrans->usecoloring) {
900     /* Create MatTransposeColoring from symbolic C=A*B^T */
901     MatTransposeColoring matcoloring;
902     ISColoring           iscoloring;
903     Mat                  Bt_dense,C_dense;
904 
905     ierr = MatGetColoring(*C,MATCOLORINGLF,&iscoloring);CHKERRQ(ierr);
906     ierr = MatTransposeColoringCreate(*C,iscoloring,&matcoloring);CHKERRQ(ierr);
907 
908     multtrans->matcoloring = matcoloring;
909 
910     ierr = ISColoringDestroy(&iscoloring);CHKERRQ(ierr);
911 
912     /* Create Bt_dense and C_dense = A*Bt_dense */
913     ierr = MatCreate(PETSC_COMM_SELF,&Bt_dense);CHKERRQ(ierr);
914     ierr = MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors);CHKERRQ(ierr);
915     ierr = MatSetType(Bt_dense,MATSEQDENSE);CHKERRQ(ierr);
916     ierr = MatSeqDenseSetPreallocation(Bt_dense,NULL);CHKERRQ(ierr);
917 
918     Bt_dense->assembled = PETSC_TRUE;
919     multtrans->Bt_den   = Bt_dense;
920 
921     ierr = MatCreate(PETSC_COMM_SELF,&C_dense);CHKERRQ(ierr);
922     ierr = MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors);CHKERRQ(ierr);
923     ierr = MatSetType(C_dense,MATSEQDENSE);CHKERRQ(ierr);
924     ierr = MatSeqDenseSetPreallocation(C_dense,NULL);CHKERRQ(ierr);
925 
926     Bt_dense->assembled = PETSC_TRUE;
927     multtrans->ABt_den  = C_dense;
928 
929 #if defined(PETSC_USE_INFO)
930     {
931       Mat_SeqAIJ *c = (Mat_SeqAIJ*)(*C)->data;
932       ierr = PetscInfo5(*C,"Bt_dense: %D,%D; Cnz %D / (cm*ncolors %D) = %g\n",A->cmap->n,matcoloring->ncolors,c->nz,A->rmap->n*matcoloring->ncolors,(PetscReal)(c->nz)/(A->rmap->n*matcoloring->ncolors));CHKERRQ(ierr);
933     }
934 #endif
935   }
936   /* clean up */
937   ierr = MatDestroy(&Bt);CHKERRQ(ierr);
938   ierr = MatRestoreSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
939   PetscFunctionReturn(0);
940 }
941 
942 /* #define USE_ARRAY - for sparse dot product. Slower than !USE_ARRAY */
943 #undef __FUNCT__
944 #define __FUNCT__ "MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ"
945 PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
946 {
947   PetscErrorCode      ierr;
948   Mat_SeqAIJ          *a   =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
949   PetscInt            *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow;
950   PetscInt            cm   =C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol;
951   PetscLogDouble      flops=0.0;
952   MatScalar           *aa  =a->a,*aval,*ba=b->a,*bval,*ca,*cval;
953   Mat_MatMatTransMult *multtrans;
954   PetscContainer      container;
955 #if defined(USE_ARRAY)
956   MatScalar *spdot;
957 #endif
958 
959   PetscFunctionBegin;
960   /* clear old values in C */
961   if (!c->a) {
962     ierr      = PetscMalloc((ci[cm]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
963     c->a      = ca;
964     c->free_a = PETSC_TRUE;
965   } else {
966     ca =  c->a;
967   }
968   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
969 
970   ierr = PetscObjectQuery((PetscObject)C,"Mat_MatMatTransMult",(PetscObject*)&container);CHKERRQ(ierr);
971   if (!container) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Container does not exit");
972   ierr = PetscContainerGetPointer(container,(void**)&multtrans);CHKERRQ(ierr);
973   if (multtrans->usecoloring) {
974     MatTransposeColoring matcoloring = multtrans->matcoloring;
975     Mat                  Bt_dense;
976     PetscInt             m,n;
977     Mat                  C_dense = multtrans->ABt_den;
978     PetscLogDouble       t0,t1,t2,t3,Bt_den,C_den,C_sp;
979 
980     /* Get Bt_dense by Apply MatTransposeColoring to B */
981     ierr = PetscGetTime(&t0);CHKERRQ(ierr);
982     Bt_dense = multtrans->Bt_den;
983     ierr = MatTransColoringApplySpToDen(matcoloring,B,Bt_dense);CHKERRQ(ierr);
984     ierr = PetscGetTime(&t1);CHKERRQ(ierr);
985     Bt_den = t1 - t0;
986 
987     /* C_dense = A*Bt_dense */
988     ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense);CHKERRQ(ierr);
989     ierr = PetscGetTime(&t2);CHKERRQ(ierr);
990     C_den = t2 - t1;
991 
992     /* Recover C from C_dense */
993     ierr = MatTransColoringApplyDenToSp(matcoloring,C_dense,C);CHKERRQ(ierr);
994     ierr = PetscGetTime(&t3);CHKERRQ(ierr);
995     C_sp = t3 - t2;
996 #if defined(PETSC_USE_INFO)
997     ierr = PetscInfo4(C,"Coloring A*B^T = Bt_den %g + C_den %g + C_sp %g = %g;\n",Bt_den,C_den,C_sp,Bt_den+C_den+C_sp);CHKERRQ(ierr);
998     ierr     = MatGetLocalSize(Bt_dense,&m,&n);CHKERRQ(ierr);
999     ierr = PetscInfo4(C,"Bt_den: %d x %d, B: %d x %d\n",m,n,m,C->cmap->n);CHKERRQ(ierr);
1000 #endif
1001     PetscFunctionReturn(0);
1002   }
1003 
1004 #if defined(USE_ARRAY)
1005   /* allocate an array for implementing sparse inner-product */
1006   ierr = PetscMalloc((A->cmap->n+1)*sizeof(MatScalar),&spdot);CHKERRQ(ierr);
1007   ierr = PetscMemzero(spdot,(A->cmap->n+1)*sizeof(MatScalar));CHKERRQ(ierr);
1008 #endif
1009 
1010   for (i=0; i<cm; i++) {
1011     anzi = ai[i+1] - ai[i];
1012     acol = aj + ai[i];
1013     aval = aa + ai[i];
1014     cnzi = ci[i+1] - ci[i];
1015     ccol = cj + ci[i];
1016     cval = ca + ci[i];
1017     for (j=0; j<cnzi; j++) {
1018       brow = ccol[j];
1019       bnzj = bi[brow+1] - bi[brow];
1020       bcol = bj + bi[brow];
1021       bval = ba + bi[brow];
1022 
1023       /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
1024 #if defined(USE_ARRAY)
1025       /* put ba to spdot array */
1026       for (nextb=0; nextb<bnzj; nextb++) spdot[bcol[nextb]] = bval[nextb];
1027       /* c(i,j)=A[i,:]*B[j,:]^T */
1028       for (nexta=0; nexta<anzi; nexta++) {
1029         cval[j] += spdot[acol[nexta]]*aval[nexta];
1030       }
1031       /* zero spdot array */
1032       for (nextb=0; nextb<bnzj; nextb++) spdot[bcol[nextb]] = 0.0;
1033 #else
1034       nexta = 0; nextb = 0;
1035       while (nexta<anzi && nextb<bnzj) {
1036         while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++;
1037         if (nexta == anzi) break;
1038         while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++;
1039         if (nextb == bnzj) break;
1040         if (acol[nexta] == bcol[nextb]) {
1041           cval[j] += aval[nexta]*bval[nextb];
1042           nexta++; nextb++;
1043           flops += 2;
1044         }
1045       }
1046 #endif
1047     }
1048   }
1049   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1050   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1051   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
1052 #if defined(USE_ARRAY)
1053   ierr = PetscFree(spdot);
1054 #endif
1055   PetscFunctionReturn(0);
1056 }
1057 
1058 #undef __FUNCT__
1059 #define __FUNCT__ "MatTransposeMatMult_SeqAIJ_SeqAIJ"
1060 PetscErrorCode MatTransposeMatMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
1061 {
1062   PetscErrorCode ierr;
1063 
1064   PetscFunctionBegin;
1065   if (scall == MAT_INITIAL_MATRIX) {
1066     ierr = MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
1067   }
1068   ierr = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(A,B,*C);CHKERRQ(ierr);
1069   PetscFunctionReturn(0);
1070 }
1071 
1072 #undef __FUNCT__
1073 #define __FUNCT__ "MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ"
1074 PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
1075 {
1076   PetscErrorCode ierr;
1077   Mat            At;
1078   PetscInt       *ati,*atj;
1079 
1080   PetscFunctionBegin;
1081   /* create symbolic At */
1082   ierr = MatGetSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
1083   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,A->cmap->n,A->rmap->n,ati,atj,NULL,&At);CHKERRQ(ierr);
1084 
1085   At->rmap->bs = A->cmap->bs;
1086   At->cmap->bs = B->cmap->bs;
1087 
1088   /* get symbolic C=At*B */
1089   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(At,B,fill,C);CHKERRQ(ierr);
1090 
1091   /* clean up */
1092   ierr = MatDestroy(&At);CHKERRQ(ierr);
1093   ierr = MatRestoreSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
1094   PetscFunctionReturn(0);
1095 }
1096 
1097 #undef __FUNCT__
1098 #define __FUNCT__ "MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ"
1099 PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
1100 {
1101   PetscErrorCode ierr;
1102   Mat_SeqAIJ     *a   =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1103   PetscInt       am   =A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb;
1104   PetscInt       cm   =C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k;
1105   PetscLogDouble flops=0.0;
1106   MatScalar      *aa  =a->a,*ba,*ca,*caj;
1107 
1108   PetscFunctionBegin;
1109   if (!c->a) {
1110     ierr = PetscMalloc((ci[cm]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
1111 
1112     c->a      = ca;
1113     c->free_a = PETSC_TRUE;
1114   } else {
1115     ca = c->a;
1116   }
1117   /* clear old values in C */
1118   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
1119 
1120   /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
1121   for (i=0; i<am; i++) {
1122     bj   = b->j + bi[i];
1123     ba   = b->a + bi[i];
1124     bnzi = bi[i+1] - bi[i];
1125     anzi = ai[i+1] - ai[i];
1126     for (j=0; j<anzi; j++) {
1127       nextb = 0;
1128       crow  = *aj++;
1129       cjj   = cj + ci[crow];
1130       caj   = ca + ci[crow];
1131       /* perform sparse axpy operation.  Note cjj includes bj. */
1132       for (k=0; nextb<bnzi; k++) {
1133         if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */
1134           caj[k] += (*aa)*(*(ba+nextb));
1135           nextb++;
1136         }
1137       }
1138       flops += 2*bnzi;
1139       aa++;
1140     }
1141   }
1142 
1143   /* Assemble the final matrix and clean up */
1144   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1145   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1146   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
1147   PetscFunctionReturn(0);
1148 }
1149 
1150 #undef __FUNCT__
1151 #define __FUNCT__ "MatMatMult_SeqAIJ_SeqDense"
1152 PetscErrorCode MatMatMult_SeqAIJ_SeqDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
1153 {
1154   PetscErrorCode ierr;
1155 
1156   PetscFunctionBegin;
1157   if (scall == MAT_INITIAL_MATRIX) {
1158     ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
1159     ierr = MatMatMultSymbolic_SeqAIJ_SeqDense(A,B,fill,C);CHKERRQ(ierr);
1160     ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
1161   }
1162   ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
1163   ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,B,*C);CHKERRQ(ierr);
1164   ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
1165   PetscFunctionReturn(0);
1166 }
1167 
1168 #undef __FUNCT__
1169 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqDense"
1170 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat *C)
1171 {
1172   PetscErrorCode ierr;
1173 
1174   PetscFunctionBegin;
1175   ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr);
1176 
1177   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense;
1178   PetscFunctionReturn(0);
1179 }
1180 
1181 #undef __FUNCT__
1182 #define __FUNCT__ "MatMatMultNumeric_SeqAIJ_SeqDense"
1183 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1184 {
1185   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
1186   PetscErrorCode ierr;
1187   PetscScalar    *b,*c,r1,r2,r3,r4,*b1,*b2,*b3,*b4;
1188   MatScalar      *aa;
1189   PetscInt       cm  = C->rmap->n, cn=B->cmap->n, bm=B->rmap->n, col, i,j,n,*aj, am = A->rmap->n;
1190   PetscInt       am2 = 2*am, am3 = 3*am,  bm4 = 4*bm,colam;
1191 
1192   PetscFunctionBegin;
1193   if (!cm || !cn) PetscFunctionReturn(0);
1194   if (bm != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,bm);
1195   if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
1196   if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
1197   ierr = MatDenseGetArray(B,&b);CHKERRQ(ierr);
1198   ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr);
1199   b1   = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
1200   for (col=0; col<cn-4; col += 4) {  /* over columns of C */
1201     colam = col*am;
1202     for (i=0; i<am; i++) {        /* over rows of C in those columns */
1203       r1 = r2 = r3 = r4 = 0.0;
1204       n  = a->i[i+1] - a->i[i];
1205       aj = a->j + a->i[i];
1206       aa = a->a + a->i[i];
1207       for (j=0; j<n; j++) {
1208         r1 += (*aa)*b1[*aj];
1209         r2 += (*aa)*b2[*aj];
1210         r3 += (*aa)*b3[*aj];
1211         r4 += (*aa++)*b4[*aj++];
1212       }
1213       c[colam + i]       = r1;
1214       c[colam + am + i]  = r2;
1215       c[colam + am2 + i] = r3;
1216       c[colam + am3 + i] = r4;
1217     }
1218     b1 += bm4;
1219     b2 += bm4;
1220     b3 += bm4;
1221     b4 += bm4;
1222   }
1223   for (; col<cn; col++) {     /* over extra columns of C */
1224     for (i=0; i<am; i++) {  /* over rows of C in those columns */
1225       r1 = 0.0;
1226       n  = a->i[i+1] - a->i[i];
1227       aj = a->j + a->i[i];
1228       aa = a->a + a->i[i];
1229 
1230       for (j=0; j<n; j++) {
1231         r1 += (*aa++)*b1[*aj++];
1232       }
1233       c[col*am + i] = r1;
1234     }
1235     b1 += bm;
1236   }
1237   ierr = PetscLogFlops(cn*(2.0*a->nz));CHKERRQ(ierr);
1238   ierr = MatDenseRestoreArray(B,&b);CHKERRQ(ierr);
1239   ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr);
1240   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1241   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1242   PetscFunctionReturn(0);
1243 }
1244 
1245 /*
1246    Note very similar to MatMult_SeqAIJ(), should generate both codes from same base
1247 */
1248 #undef __FUNCT__
1249 #define __FUNCT__ "MatMatMultNumericAdd_SeqAIJ_SeqDense"
1250 PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1251 {
1252   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
1253   PetscErrorCode ierr;
1254   PetscScalar    *b,*c,r1,r2,r3,r4,*b1,*b2,*b3,*b4;
1255   MatScalar      *aa;
1256   PetscInt       cm  = C->rmap->n, cn=B->cmap->n, bm=B->rmap->n, col, i,j,n,*aj, am = A->rmap->n,*ii,arm;
1257   PetscInt       am2 = 2*am, am3 = 3*am,  bm4 = 4*bm,colam,*ridx;
1258 
1259   PetscFunctionBegin;
1260   if (!cm || !cn) PetscFunctionReturn(0);
1261   ierr = MatDenseGetArray(B,&b);CHKERRQ(ierr);
1262   ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr);
1263   b1   = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
1264 
1265   if (a->compressedrow.use) { /* use compressed row format */
1266     for (col=0; col<cn-4; col += 4) {  /* over columns of C */
1267       colam = col*am;
1268       arm   = a->compressedrow.nrows;
1269       ii    = a->compressedrow.i;
1270       ridx  = a->compressedrow.rindex;
1271       for (i=0; i<arm; i++) {        /* over rows of C in those columns */
1272         r1 = r2 = r3 = r4 = 0.0;
1273         n  = ii[i+1] - ii[i];
1274         aj = a->j + ii[i];
1275         aa = a->a + ii[i];
1276         for (j=0; j<n; j++) {
1277           r1 += (*aa)*b1[*aj];
1278           r2 += (*aa)*b2[*aj];
1279           r3 += (*aa)*b3[*aj];
1280           r4 += (*aa++)*b4[*aj++];
1281         }
1282         c[colam       + ridx[i]] += r1;
1283         c[colam + am  + ridx[i]] += r2;
1284         c[colam + am2 + ridx[i]] += r3;
1285         c[colam + am3 + ridx[i]] += r4;
1286       }
1287       b1 += bm4;
1288       b2 += bm4;
1289       b3 += bm4;
1290       b4 += bm4;
1291     }
1292     for (; col<cn; col++) {     /* over extra columns of C */
1293       colam = col*am;
1294       arm   = a->compressedrow.nrows;
1295       ii    = a->compressedrow.i;
1296       ridx  = a->compressedrow.rindex;
1297       for (i=0; i<arm; i++) {  /* over rows of C in those columns */
1298         r1 = 0.0;
1299         n  = ii[i+1] - ii[i];
1300         aj = a->j + ii[i];
1301         aa = a->a + ii[i];
1302 
1303         for (j=0; j<n; j++) {
1304           r1 += (*aa++)*b1[*aj++];
1305         }
1306         c[colam + ridx[i]] += r1;
1307       }
1308       b1 += bm;
1309     }
1310   } else {
1311     for (col=0; col<cn-4; col += 4) {  /* over columns of C */
1312       colam = col*am;
1313       for (i=0; i<am; i++) {        /* over rows of C in those columns */
1314         r1 = r2 = r3 = r4 = 0.0;
1315         n  = a->i[i+1] - a->i[i];
1316         aj = a->j + a->i[i];
1317         aa = a->a + a->i[i];
1318         for (j=0; j<n; j++) {
1319           r1 += (*aa)*b1[*aj];
1320           r2 += (*aa)*b2[*aj];
1321           r3 += (*aa)*b3[*aj];
1322           r4 += (*aa++)*b4[*aj++];
1323         }
1324         c[colam + i]       += r1;
1325         c[colam + am + i]  += r2;
1326         c[colam + am2 + i] += r3;
1327         c[colam + am3 + i] += r4;
1328       }
1329       b1 += bm4;
1330       b2 += bm4;
1331       b3 += bm4;
1332       b4 += bm4;
1333     }
1334     for (; col<cn; col++) {     /* over extra columns of C */
1335       colam = col*am;
1336       for (i=0; i<am; i++) {  /* over rows of C in those columns */
1337         r1 = 0.0;
1338         n  = a->i[i+1] - a->i[i];
1339         aj = a->j + a->i[i];
1340         aa = a->a + a->i[i];
1341 
1342         for (j=0; j<n; j++) {
1343           r1 += (*aa++)*b1[*aj++];
1344         }
1345         c[colam + i] += r1;
1346       }
1347       b1 += bm;
1348     }
1349   }
1350   ierr = PetscLogFlops(cn*2.0*a->nz);CHKERRQ(ierr);
1351   ierr = MatDenseRestoreArray(B,&b);CHKERRQ(ierr);
1352   ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr);
1353   PetscFunctionReturn(0);
1354 }
1355 
1356 #undef __FUNCT__
1357 #define __FUNCT__ "MatTransColoringApplySpToDen_SeqAIJ"
1358 PetscErrorCode  MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense)
1359 {
1360   PetscErrorCode ierr;
1361   Mat_SeqAIJ     *b       = (Mat_SeqAIJ*)B->data;
1362   Mat_SeqDense   *btdense = (Mat_SeqDense*)Btdense->data;
1363   PetscInt       *bi      = b->i,*bj=b->j;
1364   PetscInt       m        = Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns;
1365   MatScalar      *btval,*btval_den,*ba=b->a;
1366   PetscInt       *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors;
1367 
1368   PetscFunctionBegin;
1369   btval_den=btdense->v;
1370   ierr     = PetscMemzero(btval_den,(m*n)*sizeof(MatScalar));CHKERRQ(ierr);
1371   for (k=0; k<ncolors; k++) {
1372     ncolumns = coloring->ncolumns[k];
1373     for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */
1374       col   = *(columns + colorforcol[k] + l);
1375       btcol = bj + bi[col];
1376       btval = ba + bi[col];
1377       anz   = bi[col+1] - bi[col];
1378       for (j=0; j<anz; j++) {
1379         brow            = btcol[j];
1380         btval_den[brow] = btval[j];
1381       }
1382     }
1383     btval_den += m;
1384   }
1385   PetscFunctionReturn(0);
1386 }
1387 
1388 #undef __FUNCT__
1389 #define __FUNCT__ "MatTransColoringApplyDenToSp_SeqAIJ"
1390 PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp)
1391 {
1392   PetscErrorCode ierr;
1393   Mat_SeqAIJ     *csp = (Mat_SeqAIJ*)Csp->data;
1394   PetscInt       k,l,*row,*idx,m,ncolors=matcoloring->ncolors,nrows;
1395   PetscScalar    *ca_den,*cp_den,*ca=csp->a;
1396   PetscInt       *rows=matcoloring->rows,*spidx=matcoloring->columnsforspidx,*colorforrow=matcoloring->colorforrow;
1397 
1398   PetscFunctionBegin;
1399   ierr   = MatGetLocalSize(Csp,&m,NULL);CHKERRQ(ierr);
1400   ierr   = MatDenseGetArray(Cden,&ca_den);CHKERRQ(ierr);
1401   cp_den = ca_den;
1402   for (k=0; k<ncolors; k++) {
1403     nrows = matcoloring->nrows[k];
1404     row   = rows  + colorforrow[k];
1405     idx   = spidx + colorforrow[k];
1406     for (l=0; l<nrows; l++) {
1407       ca[idx[l]] = cp_den[row[l]];
1408     }
1409     cp_den += m;
1410   }
1411   ierr = MatDenseRestoreArray(Cden,&ca_den);CHKERRQ(ierr);
1412   PetscFunctionReturn(0);
1413 }
1414 
1415 /*
1416  MatGetColumnIJ_SeqAIJ_Color() and MatRestoreColumnIJ_SeqAIJ_Color() are customized from
1417  MatGetColumnIJ_SeqAIJ() and MatRestoreColumnIJ_SeqAIJ() by adding an output
1418  spidx[], index of a->j, to be used for setting 'columnsforspidx' in MatTransposeColoringCreate_SeqAIJ().
1419  */
1420 #undef __FUNCT__
1421 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Color"
1422 PetscErrorCode MatGetColumnIJ_SeqAIJ_Color(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool inodecompressed,PetscInt *nn,const PetscInt *ia[],const PetscInt *ja[],PetscInt *spidx[],PetscBool  *done)
1423 {
1424   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
1425   PetscErrorCode ierr;
1426   PetscInt       i,*collengths,*cia,*cja,n = A->cmap->n,m = A->rmap->n;
1427   PetscInt       nz = a->i[m],row,*jj,mr,col;
1428   PetscInt       *cspidx;
1429 
1430   PetscFunctionBegin;
1431   *nn = n;
1432   if (!ia) PetscFunctionReturn(0);
1433   if (symmetric) {
1434     SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatGetColumnIJ_SeqAIJ_Color() not supported for the case symmetric");
1435     ierr = MatToSymmetricIJ_SeqAIJ(A->rmap->n,a->i,a->j,0,oshift,(PetscInt**)ia,(PetscInt**)ja);CHKERRQ(ierr);
1436   } else {
1437     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&collengths);CHKERRQ(ierr);
1438     ierr = PetscMemzero(collengths,n*sizeof(PetscInt));CHKERRQ(ierr);
1439     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&cia);CHKERRQ(ierr);
1440     ierr = PetscMalloc((nz+1)*sizeof(PetscInt),&cja);CHKERRQ(ierr);
1441     ierr = PetscMalloc((nz+1)*sizeof(PetscInt),&cspidx);CHKERRQ(ierr);
1442     jj   = a->j;
1443     for (i=0; i<nz; i++) {
1444       collengths[jj[i]]++;
1445     }
1446     cia[0] = oshift;
1447     for (i=0; i<n; i++) {
1448       cia[i+1] = cia[i] + collengths[i];
1449     }
1450     ierr = PetscMemzero(collengths,n*sizeof(PetscInt));CHKERRQ(ierr);
1451     jj   = a->j;
1452     for (row=0; row<m; row++) {
1453       mr = a->i[row+1] - a->i[row];
1454       for (i=0; i<mr; i++) {
1455         col = *jj++;
1456 
1457         cspidx[cia[col] + collengths[col] - oshift] = a->i[row] + i; /* index of a->j */
1458         cja[cia[col] + collengths[col]++ - oshift]  = row + oshift;
1459       }
1460     }
1461     ierr   = PetscFree(collengths);CHKERRQ(ierr);
1462     *ia    = cia; *ja = cja;
1463     *spidx = cspidx;
1464   }
1465   PetscFunctionReturn(0);
1466 }
1467 
1468 #undef __FUNCT__
1469 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Color"
1470 PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Color(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool inodecompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscInt *spidx[],PetscBool  *done)
1471 {
1472   PetscErrorCode ierr;
1473 
1474   PetscFunctionBegin;
1475   if (!ia) PetscFunctionReturn(0);
1476 
1477   ierr = PetscFree(*ia);CHKERRQ(ierr);
1478   ierr = PetscFree(*ja);CHKERRQ(ierr);
1479   ierr = PetscFree(*spidx);CHKERRQ(ierr);
1480   PetscFunctionReturn(0);
1481 }
1482 
1483 #undef __FUNCT__
1484 #define __FUNCT__ "MatTransposeColoringCreate_SeqAIJ"
1485 PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c)
1486 {
1487   PetscErrorCode ierr;
1488   PetscInt       i,n,nrows,N,j,k,m,ncols,col,cm;
1489   const PetscInt *is,*ci,*cj,*row_idx;
1490   PetscInt       nis = iscoloring->n,*rowhit,bs = 1;
1491   IS             *isa;
1492   PetscBool      flg1,flg2;
1493   Mat_SeqAIJ     *csp = (Mat_SeqAIJ*)mat->data;
1494   PetscInt       *colorforrow,*rows,*rows_i,*columnsforspidx,*columnsforspidx_i,*idxhit,*spidx;
1495   PetscInt       *colorforcol,*columns,*columns_i;
1496 
1497   PetscFunctionBegin;
1498   ierr = ISColoringGetIS(iscoloring,PETSC_IGNORE,&isa);CHKERRQ(ierr);
1499 
1500   /* this is ugly way to get blocksize but cannot call MatGetBlockSize() because AIJ can have bs > 1 */
1501   ierr = PetscObjectTypeCompare((PetscObject)mat,MATSEQBAIJ,&flg1);CHKERRQ(ierr);
1502   ierr = PetscObjectTypeCompare((PetscObject)mat,MATMPIBAIJ,&flg2);CHKERRQ(ierr);
1503   if (flg1 || flg2) {
1504     ierr = MatGetBlockSize(mat,&bs);CHKERRQ(ierr);
1505   }
1506 
1507   N         = mat->cmap->N/bs;
1508   c->M      = mat->rmap->N/bs;  /* set total rows, columns and local rows */
1509   c->N      = mat->cmap->N/bs;
1510   c->m      = mat->rmap->N/bs;
1511   c->rstart = 0;
1512 
1513   c->ncolors = nis;
1514   ierr       = PetscMalloc(nis*sizeof(PetscInt),&c->ncolumns);CHKERRQ(ierr);
1515   ierr       = PetscMalloc(nis*sizeof(PetscInt),&c->nrows);CHKERRQ(ierr);
1516   ierr       = PetscMalloc2(csp->nz+1,PetscInt,&rows,csp->nz+1,PetscInt,&columnsforspidx);CHKERRQ(ierr);
1517   ierr       = PetscMalloc((nis+1)*sizeof(PetscInt),&colorforrow);CHKERRQ(ierr);
1518 
1519   colorforrow[0]    = 0;
1520   rows_i            = rows;
1521   columnsforspidx_i = columnsforspidx;
1522 
1523   ierr = PetscMalloc((nis+1)*sizeof(PetscInt),&colorforcol);CHKERRQ(ierr);
1524   ierr = PetscMalloc((N+1)*sizeof(PetscInt),&columns);CHKERRQ(ierr);
1525 
1526   colorforcol[0] = 0;
1527   columns_i      = columns;
1528 
1529   ierr = MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr); /* column-wise storage of mat */
1530 
1531   cm   = c->m;
1532   ierr = PetscMalloc((cm+1)*sizeof(PetscInt),&rowhit);CHKERRQ(ierr);
1533   ierr = PetscMalloc((cm+1)*sizeof(PetscInt),&idxhit);CHKERRQ(ierr);
1534   for (i=0; i<nis; i++) {
1535     ierr = ISGetLocalSize(isa[i],&n);CHKERRQ(ierr);
1536     ierr = ISGetIndices(isa[i],&is);CHKERRQ(ierr);
1537 
1538     c->ncolumns[i] = n;
1539     if (n) {
1540       ierr = PetscMemcpy(columns_i,is,n*sizeof(PetscInt));CHKERRQ(ierr);
1541     }
1542     colorforcol[i+1] = colorforcol[i] + n;
1543     columns_i       += n;
1544 
1545     /* fast, crude version requires O(N*N) work */
1546     ierr = PetscMemzero(rowhit,cm*sizeof(PetscInt));CHKERRQ(ierr);
1547 
1548     /* loop over columns*/
1549     for (j=0; j<n; j++) {
1550       col     = is[j];
1551       row_idx = cj + ci[col];
1552       m       = ci[col+1] - ci[col];
1553       /* loop over columns marking them in rowhit */
1554       for (k=0; k<m; k++) {
1555         idxhit[*row_idx]   = spidx[ci[col] + k];
1556         rowhit[*row_idx++] = col + 1;
1557       }
1558     }
1559     /* count the number of hits */
1560     nrows = 0;
1561     for (j=0; j<cm; j++) {
1562       if (rowhit[j]) nrows++;
1563     }
1564     c->nrows[i]      = nrows;
1565     colorforrow[i+1] = colorforrow[i] + nrows;
1566 
1567     nrows = 0;
1568     for (j=0; j<cm; j++) {
1569       if (rowhit[j]) {
1570         rows_i[nrows]            = j;
1571         columnsforspidx_i[nrows] = idxhit[j];
1572         nrows++;
1573       }
1574     }
1575     ierr    = ISRestoreIndices(isa[i],&is);CHKERRQ(ierr);
1576     rows_i += nrows; columnsforspidx_i += nrows;
1577   }
1578   ierr = MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr);
1579   ierr = PetscFree(rowhit);CHKERRQ(ierr);
1580   ierr = ISColoringRestoreIS(iscoloring,&isa);CHKERRQ(ierr);
1581 #if defined(PETSC_USE_DEBUG)
1582   if (csp->nz != colorforrow[nis]) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"csp->nz %d != colorforrow[nis] %d",csp->nz,colorforrow[nis]);
1583 #endif
1584 
1585   c->colorforrow     = colorforrow;
1586   c->rows            = rows;
1587   c->columnsforspidx = columnsforspidx;
1588   c->colorforcol     = colorforcol;
1589   c->columns         = columns;
1590 
1591   ierr = PetscFree(idxhit);CHKERRQ(ierr);
1592   PetscFunctionReturn(0);
1593 }
1594