xref: /petsc/src/mat/impls/aij/seq/matmatmult.c (revision 0ced3a2b8485496f9daf05a2cd5bf7bba7af8937)
1 
2 /*
3   Defines matrix-matrix product routines for pairs of SeqAIJ matrices
4           C = A * B
5 */
6 
7 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
8 #include <../src/mat/utils/freespace.h>
9 #include <../src/mat/utils/petscheap.h>
10 #include <petscbt.h>
11 #include <../src/mat/impls/dense/seq/dense.h> /*I "petscmat.h" I*/
12 /*
13 #define DEBUG_MATMATMULT
14  */
15 EXTERN_C_BEGIN
16 #undef __FUNCT__
17 #define __FUNCT__ "MatMatMult_SeqAIJ_SeqAIJ"
18 PetscErrorCode MatMatMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
19 {
20   PetscErrorCode ierr;
21   PetscBool      scalable=PETSC_FALSE,scalable_fast=PETSC_FALSE,heap = PETSC_FALSE;
22 
23   PetscFunctionBegin;
24   if (scall == MAT_INITIAL_MATRIX){
25     ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
26     ierr = PetscOptionsBool("-matmatmult_scalable","Use a scalable but slower C=A*B","",scalable,&scalable,PETSC_NULL);CHKERRQ(ierr);
27     ierr = PetscOptionsBool("-matmatmult_scalable_fast","Use a scalable but slower C=A*B","",scalable_fast,&scalable_fast,PETSC_NULL);CHKERRQ(ierr);
28     ierr = PetscOptionsBool("-matmatmult_heap","Use heap implementation of symbolic factorization C=A*B","",heap,&heap,PETSC_NULL);CHKERRQ(ierr);
29     ierr = PetscOptionsEnd();CHKERRQ(ierr);
30     ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
31     if (scalable_fast){
32       ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A,B,fill,C);CHKERRQ(ierr);
33     } else if (scalable){
34       ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A,B,fill,C);CHKERRQ(ierr);
35     } else if (heap) {
36       ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A,B,fill,C);CHKERRQ(ierr);
37     } else {
38       ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
39     }
40     ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
41   }
42   ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
43   ierr = (*(*C)->ops->matmultnumeric)(A,B,*C);CHKERRQ(ierr);
44   ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
45   PetscFunctionReturn(0);
46 }
47 EXTERN_C_END
48 
49 #undef __FUNCT__
50 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqAIJ"
51 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
52 {
53   PetscErrorCode     ierr;
54   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
55   PetscInt           *ai=a->i,*bi=b->i,*ci,*cj;
56   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
57   PetscReal          afill;
58   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,nlnk_max,*lnk,ndouble=0;
59   PetscBT            lnkbt;
60   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
61 
62   PetscFunctionBegin;
63   /* Get ci and cj */
64   /*---------------*/
65   /* Allocate ci array, arrays for fill computation and */
66   /* free space for accumulating nonzero column info */
67   ierr = PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
68   ci[0] = 0;
69 
70   /* create and initialize a linked list */
71   nlnk_max = a->rmax*b->rmax;
72   if (!nlnk_max || nlnk_max > bn) nlnk_max = bn;
73   ierr = PetscLLCondensedCreate(nlnk_max,bn,&lnk,&lnkbt);CHKERRQ(ierr);
74 
75   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
76   ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);CHKERRQ(ierr);
77   current_space = free_space;
78 
79   /* Determine ci and cj */
80   for (i=0; i<am; i++) {
81     anzi = ai[i+1] - ai[i];
82     aj   = a->j + ai[i];
83     for (j=0; j<anzi; j++){
84       brow = aj[j];
85       bnzj = bi[brow+1] - bi[brow];
86       bj   = b->j + bi[brow];
87       /* add non-zero cols of B into the sorted linked list lnk */
88       ierr = PetscLLCondensedAddSorted(bnzj,bj,lnk,lnkbt);CHKERRQ(ierr);
89     }
90     cnzi = lnk[0];
91 
92     /* If free space is not available, make more free space */
93     /* Double the amount of total space in the list */
94     if (current_space->local_remaining<cnzi) {
95       ierr = PetscFreeSpaceGet(cnzi+current_space->total_array_size,&current_space);CHKERRQ(ierr);
96       ndouble++;
97     }
98 
99     /* Copy data into free space, then initialize lnk */
100     ierr = PetscLLCondensedClean(bn,cnzi,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
101     current_space->array           += cnzi;
102     current_space->local_used      += cnzi;
103     current_space->local_remaining -= cnzi;
104     ci[i+1] = ci[i] + cnzi;
105   }
106 
107   /* Column indices are in the list of free space */
108   /* Allocate space for cj, initialize cj, and */
109   /* destroy list of free space and other temporary array(s) */
110   ierr = PetscMalloc((ci[am]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
111   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
112   ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr);
113 
114   /* put together the new symbolic matrix */
115   ierr = MatCreateSeqAIJWithArrays(((PetscObject)A)->comm,am,bn,ci,cj,PETSC_NULL,C);CHKERRQ(ierr);
116   (*C)->rmap->bs = A->rmap->bs;
117   (*C)->cmap->bs = B->cmap->bs;
118 
119   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
120   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
121   c = (Mat_SeqAIJ *)((*C)->data);
122   c->free_a  = PETSC_FALSE;
123   c->free_ij = PETSC_TRUE;
124   c->nonew   = 0;
125   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ; /* fast, needs non-scalable O(bn) array 'abdense' */
126 
127   /* set MatInfo */
128   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
129   if (afill < 1.0) afill = 1.0;
130   c->maxnz                     = ci[am];
131   c->nz                        = ci[am];
132   (*C)->info.mallocs           = ndouble;
133   (*C)->info.fill_ratio_given  = fill;
134   (*C)->info.fill_ratio_needed = afill;
135 
136 #if defined(PETSC_USE_INFO)
137   if (ci[am]) {
138     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",ndouble,fill,afill);CHKERRQ(ierr);
139     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);CHKERRQ(ierr);
140   } else {
141     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
142   }
143 #endif
144   PetscFunctionReturn(0);
145 }
146 
147 #undef __FUNCT__
148 #define __FUNCT__ "MatMatMultNumeric_SeqAIJ_SeqAIJ"
149 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
150 {
151   PetscErrorCode ierr;
152   PetscLogDouble flops=0.0;
153   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
154   Mat_SeqAIJ     *b = (Mat_SeqAIJ *)B->data;
155   Mat_SeqAIJ     *c = (Mat_SeqAIJ *)C->data;
156   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
157   PetscInt       am=A->rmap->n,cm=C->rmap->n;
158   PetscInt       i,j,k,anzi,bnzi,cnzi,brow;
159   PetscScalar    *aa=a->a,*ba=b->a,*baj,*ca,valtmp;
160   PetscScalar    *ab_dense;
161 
162   PetscFunctionBegin;
163   /* printf("MatMatMultNumeric_SeqAIJ_SeqAIJ...ca %p\n",c->a); */
164   if (!c->a){ /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
165     ierr = PetscMalloc((ci[cm]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
166     c->a      = ca;
167     c->free_a = PETSC_TRUE;
168 
169     ierr = PetscMalloc(B->cmap->N*sizeof(PetscScalar),&ab_dense);CHKERRQ(ierr);
170     ierr = PetscMemzero(ab_dense,B->cmap->N*sizeof(PetscScalar));CHKERRQ(ierr);
171     c->matmult_abdense = ab_dense;
172   } else {
173     ca       = c->a;
174     ab_dense = c->matmult_abdense;
175   }
176 
177   /* clean old values in C */
178   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
179   /* Traverse A row-wise. */
180   /* Build the ith row in C by summing over nonzero columns in A, */
181   /* the rows of B corresponding to nonzeros of A. */
182   for (i=0; i<am; i++) {
183     anzi = ai[i+1] - ai[i];
184     for (j=0; j<anzi; j++) {
185       brow = aj[j];
186       bnzi = bi[brow+1] - bi[brow];
187       bjj  = bj + bi[brow];
188       baj  = ba + bi[brow];
189       /* perform dense axpy */
190       valtmp = aa[j];
191       for (k=0; k<bnzi; k++) {
192         ab_dense[bjj[k]] += valtmp*baj[k];
193       }
194       flops += 2*bnzi;
195     }
196     aj += anzi; aa += anzi;
197 
198     cnzi = ci[i+1] - ci[i];
199     for (k=0; k<cnzi; k++) {
200       ca[k]          += ab_dense[cj[k]];
201       ab_dense[cj[k]] = 0.0; /* zero ab_dense */
202     }
203     flops += cnzi;
204     cj += cnzi; ca += cnzi;
205   }
206   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
207   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
208   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
209   PetscFunctionReturn(0);
210 }
211 
212 #undef __FUNCT__
213 #define __FUNCT__ "MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable"
214 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C)
215 {
216   PetscErrorCode ierr;
217   PetscLogDouble flops=0.0;
218   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
219   Mat_SeqAIJ     *b = (Mat_SeqAIJ *)B->data;
220   Mat_SeqAIJ     *c = (Mat_SeqAIJ *)C->data;
221   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
222   PetscInt       am=A->rmap->N,cm=C->rmap->N;
223   PetscInt       i,j,k,anzi,bnzi,cnzi,brow;
224   PetscScalar    *aa=a->a,*ba=b->a,*baj,*ca=c->a,valtmp;
225   PetscInt       nextb;
226 
227   PetscFunctionBegin;
228 #if defined(DEBUG_MATMATMULT)
229   ierr = PetscPrintf(PETSC_COMM_SELF,"MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable...\n");CHKERRQ(ierr);
230 #endif
231   /* clean old values in C */
232   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
233   /* Traverse A row-wise. */
234   /* Build the ith row in C by summing over nonzero columns in A, */
235   /* the rows of B corresponding to nonzeros of A. */
236   for (i=0;i<am;i++) {
237     anzi = ai[i+1] - ai[i];
238     cnzi = ci[i+1] - ci[i];
239     for (j=0;j<anzi;j++) {
240       brow = aj[j];
241       bnzi = bi[brow+1] - bi[brow];
242       bjj  = bj + bi[brow];
243       baj  = ba + bi[brow];
244       /* perform sparse axpy */
245       valtmp = aa[j];
246       nextb  = 0;
247       for (k=0; nextb<bnzi; k++) {
248         if (cj[k] == bjj[nextb]){ /* ccol == bcol */
249           ca[k] += valtmp*baj[nextb++];
250         }
251       }
252       flops += 2*bnzi;
253     }
254     aj += anzi; aa += anzi;
255     cj += cnzi; ca += cnzi;
256   }
257 
258   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
259   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
260   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
261   PetscFunctionReturn(0);
262 }
263 
264 #undef __FUNCT__
265 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast"
266 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A,Mat B,PetscReal fill,Mat *C)
267 {
268   PetscErrorCode     ierr;
269   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
270   PetscInt           *ai=a->i,*bi=b->i,*ci,*cj;
271   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
272   MatScalar          *ca;
273   PetscReal          afill;
274   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,nlnk_max,*lnk,ndouble=0;
275   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
276 
277   PetscFunctionBegin;
278   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
279   /*-----------------------------------------------------------------------------------------*/
280   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
281   ierr = PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
282   ci[0] = 0;
283 
284   /* create and initialize a linked list */
285   nlnk_max = a->rmax*b->rmax;
286   if (!nlnk_max || nlnk_max > bn) nlnk_max = bn; /* in case rmax is not defined for A or B */
287   ierr = PetscLLCondensedCreate_fast(nlnk_max,&lnk);CHKERRQ(ierr);
288 
289   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
290   ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);CHKERRQ(ierr);
291   current_space = free_space;
292 
293   /* Determine ci and cj */
294   for (i=0; i<am; i++) {
295     anzi = ai[i+1] - ai[i];
296     aj   = a->j + ai[i];
297     for (j=0; j<anzi; j++){
298       brow = aj[j];
299       bnzj = bi[brow+1] - bi[brow];
300       bj   = b->j + bi[brow];
301       /* add non-zero cols of B into the sorted linked list lnk */
302       ierr = PetscLLCondensedAddSorted_fast(bnzj,bj,lnk);CHKERRQ(ierr);
303     }
304     cnzi = lnk[1];
305 
306     /* If free space is not available, make more free space */
307     /* Double the amount of total space in the list */
308     if (current_space->local_remaining<cnzi) {
309       ierr = PetscFreeSpaceGet(cnzi+current_space->total_array_size,&current_space);CHKERRQ(ierr);
310       ndouble++;
311     }
312 
313     /* Copy data into free space, then initialize lnk */
314     ierr = PetscLLCondensedClean_fast(cnzi,current_space->array,lnk);CHKERRQ(ierr);
315     current_space->array           += cnzi;
316     current_space->local_used      += cnzi;
317     current_space->local_remaining -= cnzi;
318     ci[i+1] = ci[i] + cnzi;
319   }
320 
321   /* Column indices are in the list of free space */
322   /* Allocate space for cj, initialize cj, and */
323   /* destroy list of free space and other temporary array(s) */
324   ierr = PetscMalloc((ci[am]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
325   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
326   ierr = PetscLLCondensedDestroy_fast(lnk);CHKERRQ(ierr);
327 
328   /* Allocate space for ca */
329   ierr = PetscMalloc((ci[am]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
330   ierr = PetscMemzero(ca,(ci[am]+1)*sizeof(MatScalar));CHKERRQ(ierr);
331 
332   /* put together the new symbolic matrix */
333   ierr = MatCreateSeqAIJWithArrays(((PetscObject)A)->comm,am,bn,ci,cj,ca,C);CHKERRQ(ierr);
334   (*C)->rmap->bs = A->rmap->bs;
335   (*C)->cmap->bs = B->cmap->bs;
336 
337   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
338   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
339   c = (Mat_SeqAIJ *)((*C)->data);
340   c->free_a   = PETSC_TRUE;
341   c->free_ij  = PETSC_TRUE;
342   c->nonew    = 0;
343   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; /* slower, less memory */
344 
345   /* set MatInfo */
346   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
347   if (afill < 1.0) afill = 1.0;
348   c->maxnz                     = ci[am];
349   c->nz                        = ci[am];
350   (*C)->info.mallocs           = ndouble;
351   (*C)->info.fill_ratio_given  = fill;
352   (*C)->info.fill_ratio_needed = afill;
353 
354 #if defined(PETSC_USE_INFO)
355   if (ci[am]) {
356     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",ndouble,fill,afill);CHKERRQ(ierr);
357     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);CHKERRQ(ierr);
358   } else {
359     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
360   }
361 #endif
362   PetscFunctionReturn(0);
363 }
364 
365 
366 #undef __FUNCT__
367 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable"
368 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat *C)
369 {
370   PetscErrorCode     ierr;
371   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
372   PetscInt           *ai=a->i,*bi=b->i,*ci,*cj;
373   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
374   MatScalar          *ca;
375   PetscReal          afill;
376   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,nlnk_max,*lnk,ndouble=0;
377   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
378 
379   PetscFunctionBegin;
380   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
381   /*---------------------------------------------------------------------------------------------*/
382   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
383   ierr = PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
384   ci[0] = 0;
385 
386   /* create and initialize a linked list */
387   nlnk_max = a->rmax*b->rmax;
388   if (!nlnk_max || nlnk_max > bn) nlnk_max = bn; /* in case rmax is not defined for A or B */
389   ierr = PetscLLCondensedCreate_Scalable(nlnk_max,&lnk);CHKERRQ(ierr);
390 
391   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
392   ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);CHKERRQ(ierr);
393   current_space = free_space;
394 
395   /* Determine ci and cj */
396   for (i=0; i<am; i++) {
397     anzi = ai[i+1] - ai[i];
398     aj   = a->j + ai[i];
399     for (j=0; j<anzi; j++){
400       brow = aj[j];
401       bnzj = bi[brow+1] - bi[brow];
402       bj   = b->j + bi[brow];
403       /* add non-zero cols of B into the sorted linked list lnk */
404       ierr = PetscLLCondensedAddSorted_Scalable(bnzj,bj,lnk);CHKERRQ(ierr);
405     }
406     cnzi = lnk[0];
407 
408     /* If free space is not available, make more free space */
409     /* Double the amount of total space in the list */
410     if (current_space->local_remaining<cnzi) {
411       ierr = PetscFreeSpaceGet(cnzi+current_space->total_array_size,&current_space);CHKERRQ(ierr);
412       ndouble++;
413     }
414 
415     /* Copy data into free space, then initialize lnk */
416     ierr = PetscLLCondensedClean_Scalable(cnzi,current_space->array,lnk);CHKERRQ(ierr);
417     current_space->array           += cnzi;
418     current_space->local_used      += cnzi;
419     current_space->local_remaining -= cnzi;
420     ci[i+1] = ci[i] + cnzi;
421   }
422 
423   /* Column indices are in the list of free space */
424   /* Allocate space for cj, initialize cj, and */
425   /* destroy list of free space and other temporary array(s) */
426   ierr = PetscMalloc((ci[am]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
427   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
428   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
429 
430   /* Allocate space for ca */
431   /*-----------------------*/
432   ierr = PetscMalloc((ci[am]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
433   ierr = PetscMemzero(ca,(ci[am]+1)*sizeof(MatScalar));CHKERRQ(ierr);
434 
435   /* put together the new symbolic matrix */
436   ierr = MatCreateSeqAIJWithArrays(((PetscObject)A)->comm,am,bn,ci,cj,ca,C);CHKERRQ(ierr);
437   (*C)->rmap->bs = A->rmap->bs;
438   (*C)->cmap->bs = B->cmap->bs;
439 
440   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
441   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
442   c = (Mat_SeqAIJ *)((*C)->data);
443   c->free_a   = PETSC_TRUE;
444   c->free_ij  = PETSC_TRUE;
445   c->nonew    = 0;
446   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; /* slower, less memory */
447 
448   /* set MatInfo */
449   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
450   if (afill < 1.0) afill = 1.0;
451   c->maxnz                     = ci[am];
452   c->nz                        = ci[am];
453   (*C)->info.mallocs           = ndouble;
454   (*C)->info.fill_ratio_given  = fill;
455   (*C)->info.fill_ratio_needed = afill;
456 
457 #if defined(PETSC_USE_INFO)
458   if (ci[am]) {
459     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",ndouble,fill,afill);CHKERRQ(ierr);
460     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);CHKERRQ(ierr);
461   } else {
462     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
463   }
464 #endif
465   PetscFunctionReturn(0);
466 }
467 
468 #undef __FUNCT__
469 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap"
470 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A,Mat B,PetscReal fill,Mat *C)
471 {
472   PetscErrorCode     ierr;
473   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
474   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j;
475   PetscInt           *ci,*cj,*bb;
476   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
477   MatScalar          *ca;
478   PetscReal          afill;
479   PetscInt           i,j,col,ndouble = 0;
480   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
481   PetscHeap          h;
482 
483   PetscFunctionBegin;
484   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
485   /*---------------------------------------------------------------------------------------------*/
486   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
487   ierr = PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
488   ci[0] = 0;
489 
490   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
491   ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);CHKERRQ(ierr);
492   current_space = free_space;
493 
494   ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr);
495   ierr = PetscMalloc(a->rmax*sizeof(PetscInt),&bb);CHKERRQ(ierr);
496 
497   /* Determine ci and cj */
498   for (i=0; i<am; i++) {
499     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
500     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
501     ci[i+1] = ci[i];
502     /* Populate the min heap */
503     for (j=0; j<anzi; j++) {
504       bb[j] = bi[acol[j]];         /* bb points at the start of the row */
505       if (bb[j] < bi[acol[j]+1]) { /* Add if row is nonempty */
506         ierr = PetscHeapAdd(h,j,bj[bb[j]++]);CHKERRQ(ierr);
507       }
508     }
509     /* Pick off the min element, adding it to free space */
510     ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
511     while (j >= 0) {
512       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
513         ierr = PetscFreeSpaceGet(PetscMin(2*current_space->total_array_size,16 << 20),&current_space);CHKERRQ(ierr);
514         ndouble++;
515       }
516       *(current_space->array++) = col;
517       current_space->local_used++;
518       current_space->local_remaining--;
519       ci[i+1]++;
520 
521       /* stash if anything else remains in this row of B */
522       if (bb[j] < bi[acol[j]+1]) {ierr = PetscHeapStash(h,j,bj[bb[j]++]);CHKERRQ(ierr);}
523       while (1) {               /* pop and stash any other rows of B that also had an entry in this column */
524         PetscInt j2,col2;
525         ierr = PetscHeapPeek(h,&j2,&col2);CHKERRQ(ierr);
526         if (col2 != col) break;
527         ierr = PetscHeapPop(h,&j2,&col2);CHKERRQ(ierr);
528         if (bb[j2] < bi[acol[j2]+1]) {ierr = PetscHeapStash(h,j2,bj[bb[j2]++]);CHKERRQ(ierr);}
529       }
530       /* Put any stashed elements back into the min heap */
531       ierr = PetscHeapUnstash(h);CHKERRQ(ierr);
532       ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
533     }
534   }
535   ierr = PetscFree(bb);CHKERRQ(ierr);
536   ierr = PetscHeapDestroy(&h);CHKERRQ(ierr);
537 
538   /* Column indices are in the list of free space */
539   /* Allocate space for cj, initialize cj, and */
540   /* destroy list of free space and other temporary array(s) */
541   ierr = PetscMalloc(ci[am]*sizeof(PetscInt),&cj);CHKERRQ(ierr);
542   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
543 
544   /* Allocate space for ca */
545   /*-----------------------*/
546   ierr = PetscMalloc(ci[am]*sizeof(MatScalar),&ca);CHKERRQ(ierr);
547   ierr = PetscMemzero(ca,ci[am]*sizeof(MatScalar));CHKERRQ(ierr);
548 
549   /* put together the new symbolic matrix */
550   ierr = MatCreateSeqAIJWithArrays(((PetscObject)A)->comm,am,bn,ci,cj,ca,C);CHKERRQ(ierr);
551   (*C)->rmap->bs = A->rmap->bs;
552   (*C)->cmap->bs = B->cmap->bs;
553 
554   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
555   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
556   c = (Mat_SeqAIJ *)((*C)->data);
557   c->free_a   = PETSC_TRUE;
558   c->free_ij  = PETSC_TRUE;
559   c->nonew    = 0;
560   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; /* slower, less memory */
561 
562   /* set MatInfo */
563   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
564   if (afill < 1.0) afill = 1.0;
565   c->maxnz                     = ci[am];
566   c->nz                        = ci[am];
567   (*C)->info.mallocs           = ndouble;
568   (*C)->info.fill_ratio_given  = fill;
569   (*C)->info.fill_ratio_needed = afill;
570 
571 #if defined(PETSC_USE_INFO)
572   if (ci[am]) {
573     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",ndouble,fill,afill);CHKERRQ(ierr);
574     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);CHKERRQ(ierr);
575   } else {
576     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
577   }
578 #endif
579   PetscFunctionReturn(0);
580 }
581 
582 /* This routine is not used. Should be removed! */
583 #undef __FUNCT__
584 #define __FUNCT__ "MatMatTransposeMult_SeqAIJ_SeqAIJ"
585 PetscErrorCode MatMatTransposeMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
586 {
587   PetscErrorCode ierr;
588 
589   PetscFunctionBegin;
590   if (scall == MAT_INITIAL_MATRIX){
591     ierr = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
592   }
593   ierr = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(A,B,*C);CHKERRQ(ierr);
594   PetscFunctionReturn(0);
595 }
596 
597 #undef __FUNCT__
598 #define __FUNCT__ "PetscContainerDestroy_Mat_MatMatTransMult"
599 PetscErrorCode PetscContainerDestroy_Mat_MatMatTransMult(void *ptr)
600 {
601   PetscErrorCode      ierr;
602   Mat_MatMatTransMult *multtrans=(Mat_MatMatTransMult*)ptr;
603 
604   PetscFunctionBegin;
605   ierr = MatTransposeColoringDestroy(&multtrans->matcoloring);CHKERRQ(ierr);
606   ierr = MatDestroy(&multtrans->Bt_den);CHKERRQ(ierr);
607   ierr = MatDestroy(&multtrans->ABt_den);CHKERRQ(ierr);
608   ierr = PetscFree(multtrans);CHKERRQ(ierr);
609   PetscFunctionReturn(0);
610 }
611 
612 #undef __FUNCT__
613 #define __FUNCT__ "MatDestroy_SeqAIJ_MatMatMultTrans"
614 PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(Mat A)
615 {
616   PetscErrorCode      ierr;
617   PetscContainer      container;
618   Mat_MatMatTransMult *multtrans=PETSC_NULL;
619 
620   PetscFunctionBegin;
621   ierr = PetscObjectQuery((PetscObject)A,"Mat_MatMatTransMult",(PetscObject *)&container);CHKERRQ(ierr);
622   if (!container) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Container does not exit");
623   ierr = PetscContainerGetPointer(container,(void **)&multtrans);CHKERRQ(ierr);
624   A->ops->destroy   = multtrans->destroy;
625   if (A->ops->destroy) {
626     ierr = (*A->ops->destroy)(A);CHKERRQ(ierr);
627   }
628   ierr = PetscObjectCompose((PetscObject)A,"Mat_MatMatTransMult",0);CHKERRQ(ierr);
629   PetscFunctionReturn(0);
630 }
631 
632 #undef __FUNCT__
633 #define __FUNCT__ "MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ"
634 PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
635 {
636   PetscErrorCode      ierr;
637   Mat                 Bt;
638   PetscInt            *bti,*btj;
639   Mat_MatMatTransMult *multtrans;
640   PetscContainer      container;
641   PetscLogDouble      t0,tf,etime2=0.0;
642 
643   PetscFunctionBegin;
644   ierr = PetscGetTime(&t0);CHKERRQ(ierr);
645    /* create symbolic Bt */
646   ierr = MatGetSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
647   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,B->cmap->n,B->rmap->n,bti,btj,PETSC_NULL,&Bt);CHKERRQ(ierr);
648   Bt->rmap->bs = A->cmap->bs;
649   Bt->cmap->bs = B->cmap->bs;
650 
651   /* get symbolic C=A*Bt */
652   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C);CHKERRQ(ierr);
653 
654   /* create a supporting struct for reuse intermidiate dense matrices with matcoloring */
655   ierr = PetscNew(Mat_MatMatTransMult,&multtrans);CHKERRQ(ierr);
656 
657   /* attach the supporting struct to C */
658   ierr = PetscContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
659   ierr = PetscContainerSetPointer(container,multtrans);CHKERRQ(ierr);
660   ierr = PetscContainerSetUserDestroy(container,PetscContainerDestroy_Mat_MatMatTransMult);CHKERRQ(ierr);
661   ierr = PetscObjectCompose((PetscObject)(*C),"Mat_MatMatTransMult",(PetscObject)container);CHKERRQ(ierr);
662   ierr = PetscContainerDestroy(&container);CHKERRQ(ierr);
663 
664   multtrans->usecoloring = PETSC_FALSE;
665   multtrans->destroy = (*C)->ops->destroy;
666   (*C)->ops->destroy = MatDestroy_SeqAIJ_MatMatMultTrans;
667 
668   ierr = PetscGetTime(&tf);CHKERRQ(ierr);
669   etime2 += tf - t0;
670 
671   ierr = PetscOptionsGetBool(PETSC_NULL,"-matmattransmult_color",&multtrans->usecoloring,PETSC_NULL);CHKERRQ(ierr);
672   if (multtrans->usecoloring){
673     /* Create MatTransposeColoring from symbolic C=A*B^T */
674     MatTransposeColoring matcoloring;
675     ISColoring           iscoloring;
676     Mat                  Bt_dense,C_dense;
677     PetscLogDouble       etime0=0.0,etime01=0.0,etime1=0.0;
678 
679     ierr = PetscGetTime(&t0);CHKERRQ(ierr);
680     ierr = MatGetColoring(*C,MATCOLORINGLF,&iscoloring);CHKERRQ(ierr);
681     ierr = PetscGetTime(&tf);CHKERRQ(ierr);
682     etime0 += tf - t0;
683 
684     ierr = PetscGetTime(&t0);CHKERRQ(ierr);
685     ierr = MatTransposeColoringCreate(*C,iscoloring,&matcoloring);CHKERRQ(ierr);
686     multtrans->matcoloring = matcoloring;
687     ierr = ISColoringDestroy(&iscoloring);CHKERRQ(ierr);
688     ierr = PetscGetTime(&tf);CHKERRQ(ierr);
689     etime01 += tf - t0;
690 
691     ierr = PetscGetTime(&t0);CHKERRQ(ierr);
692     /* Create Bt_dense and C_dense = A*Bt_dense */
693     ierr = MatCreate(PETSC_COMM_SELF,&Bt_dense);CHKERRQ(ierr);
694     ierr = MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors);CHKERRQ(ierr);
695     ierr = MatSetType(Bt_dense,MATSEQDENSE);CHKERRQ(ierr);
696     ierr = MatSeqDenseSetPreallocation(Bt_dense,PETSC_NULL);CHKERRQ(ierr);
697     Bt_dense->assembled = PETSC_TRUE;
698     multtrans->Bt_den = Bt_dense;
699 
700     ierr = MatCreate(PETSC_COMM_SELF,&C_dense);CHKERRQ(ierr);
701     ierr = MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors);CHKERRQ(ierr);
702     ierr = MatSetType(C_dense,MATSEQDENSE);CHKERRQ(ierr);
703     ierr = MatSeqDenseSetPreallocation(C_dense,PETSC_NULL);CHKERRQ(ierr);
704     Bt_dense->assembled = PETSC_TRUE;
705     multtrans->ABt_den = C_dense;
706     ierr = PetscGetTime(&tf);CHKERRQ(ierr);
707     etime1 += tf - t0;
708 
709 #if defined(PETSC_USE_INFO)
710     {
711     Mat_SeqAIJ *c=(Mat_SeqAIJ*)(*C)->data;
712     ierr = PetscInfo5(*C,"Bt_dense: %D,%D; Cnz %D / (cm*ncolors %D) = %g\n",A->cmap->n,matcoloring->ncolors,c->nz,A->rmap->n*matcoloring->ncolors,(PetscReal)(c->nz)/(A->rmap->n*matcoloring->ncolors));
713     ierr = PetscInfo5(*C,"Sym = GetColor %g + ColorCreate %g + MatDenseCreate %g + non-colorSym %g = %g\n",etime0,etime01,etime1,etime2,etime0+etime01+etime1+etime2);
714     }
715 #endif
716   }
717   /* clean up */
718   ierr = MatDestroy(&Bt);CHKERRQ(ierr);
719   ierr = MatRestoreSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
720 
721 
722 
723 #if defined(INEFFICIENT_ALGORITHM)
724   /* The algorithm below computes am*bm sparse inner-product - inefficient! It will be deleted later. */
725   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
726   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
727   PetscInt           *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ci,*cj,*acol,*bcol;
728   PetscInt           am=A->rmap->N,bm=B->rmap->N;
729   PetscInt           i,j,anzi,bnzj,cnzi,nlnk,*lnk,nspacedouble=0,ka,kb,index[1];
730   MatScalar          *ca;
731   PetscBT            lnkbt;
732   PetscReal          afill;
733 
734   /* Allocate row pointer array ci  */
735   ierr = PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
736   ci[0] = 0;
737 
738   /* Create and initialize a linked list for C columns */
739   nlnk = bm+1;
740   ierr = PetscLLCreate(bm,bm,nlnk,lnk,lnkbt);CHKERRQ(ierr);
741 
742   /* Initial FreeSpace with size fill*(nnz(A)+nnz(B)) */
743   ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);CHKERRQ(ierr);
744   current_space = free_space;
745 
746   /* Determine symbolic info for each row of the product A*B^T: */
747   for (i=0; i<am; i++) {
748     anzi = ai[i+1] - ai[i];
749     cnzi = 0;
750     acol = aj + ai[i];
751     for (j=0; j<bm; j++){
752       bnzj = bi[j+1] - bi[j];
753       bcol= bj + bi[j];
754       /* sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
755       ka = 0; kb = 0;
756       while (ka < anzi && kb < bnzj){
757         while (acol[ka] < bcol[kb] && ka < anzi) ka++;
758         if (ka == anzi) break;
759         while (acol[ka] > bcol[kb] && kb < bnzj) kb++;
760         if (kb == bnzj) break;
761         if (acol[ka] == bcol[kb]){ /* add nonzero c(i,j) to lnk */
762           index[0] = j;
763           ierr = PetscLLAdd(1,index,bm,nlnk,lnk,lnkbt);CHKERRQ(ierr);
764           cnzi++;
765           break;
766         }
767       }
768     }
769 
770     /* If free space is not available, make more free space */
771     /* Double the amount of total space in the list */
772     if (current_space->local_remaining<cnzi) {
773       ierr = PetscFreeSpaceGet(cnzi+current_space->total_array_size,&current_space);CHKERRQ(ierr);
774       nspacedouble++;
775     }
776 
777     /* Copy data into free space, then initialize lnk */
778     ierr = PetscLLClean(bm,bm,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
779     current_space->array           += cnzi;
780     current_space->local_used      += cnzi;
781     current_space->local_remaining -= cnzi;
782 
783     ci[i+1] = ci[i] + cnzi;
784   }
785 
786 
787   /* Column indices are in the list of free space.
788      Allocate array cj, copy column indices to cj, and destroy list of free space */
789   ierr = PetscMalloc((ci[am]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
790   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
791   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
792 
793   /* Allocate space for ca */
794   ierr = PetscMalloc((ci[am]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
795   ierr = PetscMemzero(ca,(ci[am]+1)*sizeof(MatScalar));CHKERRQ(ierr);
796 
797   /* put together the new symbolic matrix */
798   ierr = MatCreateSeqAIJWithArrays(((PetscObject)A)->comm,am,bm,ci,cj,ca,C);CHKERRQ(ierr);
799   (*C)->rmap->bs = A->cmap->bs;
800   (*C)->cmap->bs = B->cmap->bs;
801 
802   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
803   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
804   c = (Mat_SeqAIJ *)((*C)->data);
805   c->free_a   = PETSC_TRUE;
806   c->free_ij  = PETSC_TRUE;
807   c->nonew    = 0;
808 
809   /* set MatInfo */
810   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
811   if (afill < 1.0) afill = 1.0;
812   c->maxnz                     = ci[am];
813   c->nz                        = ci[am];
814   (*C)->info.mallocs           = nspacedouble;
815   (*C)->info.fill_ratio_given  = fill;
816   (*C)->info.fill_ratio_needed = afill;
817 
818 #if defined(PETSC_USE_INFO)
819   if (ci[am]) {
820     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);CHKERRQ(ierr);
821     ierr = PetscInfo1((*C),"Use MatMatTransposeMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);CHKERRQ(ierr);
822   } else {
823     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
824   }
825 #endif
826 #endif
827   PetscFunctionReturn(0);
828 }
829 
830 /* #define USE_ARRAY - for sparse dot product. Slower than !USE_ARRAY */
831 #undef __FUNCT__
832 #define __FUNCT__ "MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ"
833 PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
834 {
835   PetscErrorCode ierr;
836   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
837   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow;
838   PetscInt       cm=C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol;
839   PetscLogDouble flops=0.0;
840   MatScalar      *aa=a->a,*aval,*ba=b->a,*bval,*ca,*cval;
841   Mat_MatMatTransMult *multtrans;
842   PetscContainer      container;
843 #if defined(USE_ARRAY)
844   MatScalar      *spdot;
845 #endif
846 
847   PetscFunctionBegin;
848   ierr = PetscObjectQuery((PetscObject)C,"Mat_MatMatTransMult",(PetscObject *)&container);CHKERRQ(ierr);
849   if (!container) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Container does not exit");
850   ierr  = PetscContainerGetPointer(container,(void **)&multtrans);CHKERRQ(ierr);
851   if (multtrans->usecoloring){
852     MatTransposeColoring  matcoloring = multtrans->matcoloring;
853     Mat                   Bt_dense;
854     PetscInt              m,n;
855     PetscLogDouble t0,tf,etime0=0.0,etime1=0.0,etime2=0.0;
856     Mat C_dense = multtrans->ABt_den;
857 
858     Bt_dense = multtrans->Bt_den;
859     ierr = MatGetLocalSize(Bt_dense,&m,&n);CHKERRQ(ierr);
860 
861     /* Get Bt_dense by Apply MatTransposeColoring to B */
862     ierr = PetscGetTime(&t0);CHKERRQ(ierr);
863     ierr = MatTransColoringApplySpToDen(matcoloring,B,Bt_dense);CHKERRQ(ierr);
864     ierr = PetscGetTime(&tf);CHKERRQ(ierr);
865     etime0 += tf - t0;
866 
867     /* C_dense = A*Bt_dense */
868     ierr = PetscGetTime(&t0);CHKERRQ(ierr);
869     ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense);CHKERRQ(ierr);
870     ierr = PetscGetTime(&tf);CHKERRQ(ierr);
871     etime2 += tf - t0;
872 
873     /* Recover C from C_dense */
874     ierr = PetscGetTime(&t0);CHKERRQ(ierr);
875     ierr = MatTransColoringApplyDenToSp(matcoloring,C_dense,C);CHKERRQ(ierr);
876     ierr = PetscGetTime(&tf);CHKERRQ(ierr);
877     etime1 += tf - t0;
878 #if defined(PETSC_USE_INFO)
879     ierr = PetscInfo4(C,"Num = ColoringApply: %g %g + Mult_sp_dense %g = %g\n",etime0,etime1,etime2,etime0+etime1+etime2);
880 #endif
881     PetscFunctionReturn(0);
882   }
883 
884 #if defined(USE_ARRAY)
885   /* allocate an array for implementing sparse inner-product */
886   ierr = PetscMalloc((A->cmap->n+1)*sizeof(MatScalar),&spdot);CHKERRQ(ierr);
887   ierr = PetscMemzero(spdot,(A->cmap->n+1)*sizeof(MatScalar));CHKERRQ(ierr);
888 #endif
889 
890   /* clear old values in C */
891   if (!c->a){
892     ierr = PetscMalloc((ci[cm]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
893     c->a      = ca;
894     c->free_a = PETSC_TRUE;
895   } else {
896     ca =  c->a;
897   }
898   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
899 
900   for (i=0; i<cm; i++) {
901     anzi = ai[i+1] - ai[i];
902     acol = aj + ai[i];
903     aval = aa + ai[i];
904     cnzi = ci[i+1] - ci[i];
905     ccol = cj + ci[i];
906     cval = ca + ci[i];
907     for (j=0; j<cnzi; j++){
908       brow = ccol[j];
909       bnzj = bi[brow+1] - bi[brow];
910       bcol = bj + bi[brow];
911       bval = ba + bi[brow];
912 
913       /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
914 #if defined(USE_ARRAY)
915       /* put ba to spdot array */
916       for (nextb=0; nextb<bnzj; nextb++) spdot[bcol[nextb]] = bval[nextb];
917       /* c(i,j)=A[i,:]*B[j,:]^T */
918       for (nexta=0; nexta<anzi; nexta++){
919         cval[j] += spdot[acol[nexta]]*aval[nexta];
920       }
921       /* zero spdot array */
922       for (nextb=0; nextb<bnzj; nextb++) spdot[bcol[nextb]] = 0.0;
923 #else
924       nexta = 0; nextb = 0;
925       while (nexta<anzi && nextb<bnzj){
926         while (acol[nexta] < bcol[nextb] && nexta < anzi) nexta++;
927         if (nexta == anzi) break;
928         while (acol[nexta] > bcol[nextb] && nextb < bnzj) nextb++;
929         if (nextb == bnzj) break;
930         if (acol[nexta] == bcol[nextb]){
931           cval[j] += aval[nexta]*bval[nextb];
932           nexta++; nextb++;
933           flops += 2;
934         }
935       }
936 #endif
937     }
938   }
939   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
940   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
941   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
942 #if defined(USE_ARRAY)
943   ierr = PetscFree(spdot);
944 #endif
945   PetscFunctionReturn(0);
946 }
947 
948 #undef __FUNCT__
949 #define __FUNCT__ "MatTransposeMatMult_SeqAIJ_SeqAIJ"
950 PetscErrorCode MatTransposeMatMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C) {
951   PetscErrorCode ierr;
952 
953   PetscFunctionBegin;
954   if (scall == MAT_INITIAL_MATRIX){
955     ierr = MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
956   }
957   ierr = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(A,B,*C);CHKERRQ(ierr);
958   PetscFunctionReturn(0);
959 }
960 
961 #undef __FUNCT__
962 #define __FUNCT__ "MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ"
963 PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
964 {
965   PetscErrorCode ierr;
966   Mat            At;
967   PetscInt       *ati,*atj;
968 
969   PetscFunctionBegin;
970   /* create symbolic At */
971   ierr = MatGetSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
972   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,A->cmap->n,A->rmap->n,ati,atj,PETSC_NULL,&At);CHKERRQ(ierr);
973   At->rmap->bs = A->cmap->bs;
974   At->cmap->bs = B->cmap->bs;
975 
976   /* get symbolic C=At*B */
977   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(At,B,fill,C);CHKERRQ(ierr);
978 
979   /* clean up */
980   ierr = MatDestroy(&At);CHKERRQ(ierr);
981   ierr = MatRestoreSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
982   PetscFunctionReturn(0);
983 }
984 
985 #undef __FUNCT__
986 #define __FUNCT__ "MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ"
987 PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
988 {
989   PetscErrorCode ierr;
990   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
991   PetscInt       am=A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb;
992   PetscInt       cm=C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k;
993   PetscLogDouble flops=0.0;
994   MatScalar      *aa=a->a,*ba,*ca,*caj;
995 
996   PetscFunctionBegin;
997   if (!c->a){
998     ierr = PetscMalloc((ci[cm]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
999     c->a      = ca;
1000     c->free_a = PETSC_TRUE;
1001   } else {
1002     ca = c->a;
1003   }
1004   /* clear old values in C */
1005   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
1006 
1007   /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
1008   for (i=0;i<am;i++) {
1009     bj   = b->j + bi[i];
1010     ba   = b->a + bi[i];
1011     bnzi = bi[i+1] - bi[i];
1012     anzi = ai[i+1] - ai[i];
1013     for (j=0; j<anzi; j++) {
1014       nextb = 0;
1015       crow  = *aj++;
1016       cjj   = cj + ci[crow];
1017       caj   = ca + ci[crow];
1018       /* perform sparse axpy operation.  Note cjj includes bj. */
1019       for (k=0; nextb<bnzi; k++) {
1020         if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */
1021           caj[k] += (*aa)*(*(ba+nextb));
1022           nextb++;
1023         }
1024       }
1025       flops += 2*bnzi;
1026       aa++;
1027     }
1028   }
1029 
1030   /* Assemble the final matrix and clean up */
1031   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1032   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1033   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
1034   PetscFunctionReturn(0);
1035 }
1036 
1037 EXTERN_C_BEGIN
1038 #undef __FUNCT__
1039 #define __FUNCT__ "MatMatMult_SeqAIJ_SeqDense"
1040 PetscErrorCode MatMatMult_SeqAIJ_SeqDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
1041 {
1042   PetscErrorCode ierr;
1043 
1044   PetscFunctionBegin;
1045   if (scall == MAT_INITIAL_MATRIX){
1046     ierr = MatMatMultSymbolic_SeqAIJ_SeqDense(A,B,fill,C);CHKERRQ(ierr);
1047   }
1048   ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,B,*C);CHKERRQ(ierr);
1049   PetscFunctionReturn(0);
1050 }
1051 EXTERN_C_END
1052 
1053 #undef __FUNCT__
1054 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqDense"
1055 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat *C)
1056 {
1057   PetscErrorCode ierr;
1058 
1059   PetscFunctionBegin;
1060   ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr);
1061   (*C)->ops->matmult = MatMatMult_SeqAIJ_SeqDense;
1062   PetscFunctionReturn(0);
1063 }
1064 
1065 #undef __FUNCT__
1066 #define __FUNCT__ "MatMatMultNumeric_SeqAIJ_SeqDense"
1067 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1068 {
1069   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
1070   PetscErrorCode ierr;
1071   PetscScalar    *b,*c,r1,r2,r3,r4,*b1,*b2,*b3,*b4;
1072   MatScalar      *aa;
1073   PetscInt       cm=C->rmap->n, cn=B->cmap->n, bm=B->rmap->n, col, i,j,n,*aj, am = A->rmap->n;
1074   PetscInt       am2 = 2*am, am3 = 3*am,  bm4 = 4*bm,colam;
1075 
1076   PetscFunctionBegin;
1077   if (!cm || !cn) PetscFunctionReturn(0);
1078   if (bm != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,bm);
1079   if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
1080   if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
1081   ierr = MatGetArray(B,&b);CHKERRQ(ierr);
1082   ierr = MatGetArray(C,&c);CHKERRQ(ierr);
1083   b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
1084   for (col=0; col<cn-4; col += 4){  /* over columns of C */
1085     colam = col*am;
1086     for (i=0; i<am; i++) {        /* over rows of C in those columns */
1087       r1 = r2 = r3 = r4 = 0.0;
1088       n   = a->i[i+1] - a->i[i];
1089       aj  = a->j + a->i[i];
1090       aa  = a->a + a->i[i];
1091       for (j=0; j<n; j++) {
1092         r1 += (*aa)*b1[*aj];
1093         r2 += (*aa)*b2[*aj];
1094         r3 += (*aa)*b3[*aj];
1095         r4 += (*aa++)*b4[*aj++];
1096       }
1097       c[colam + i]       = r1;
1098       c[colam + am + i]  = r2;
1099       c[colam + am2 + i] = r3;
1100       c[colam + am3 + i] = r4;
1101     }
1102     b1 += bm4;
1103     b2 += bm4;
1104     b3 += bm4;
1105     b4 += bm4;
1106   }
1107   for (;col<cn; col++){     /* over extra columns of C */
1108     for (i=0; i<am; i++) {  /* over rows of C in those columns */
1109       r1 = 0.0;
1110       n   = a->i[i+1] - a->i[i];
1111       aj  = a->j + a->i[i];
1112       aa  = a->a + a->i[i];
1113 
1114       for (j=0; j<n; j++) {
1115         r1 += (*aa++)*b1[*aj++];
1116       }
1117       c[col*am + i]     = r1;
1118     }
1119     b1 += bm;
1120   }
1121   ierr = PetscLogFlops(cn*(2.0*a->nz));CHKERRQ(ierr);
1122   ierr = MatRestoreArray(B,&b);CHKERRQ(ierr);
1123   ierr = MatRestoreArray(C,&c);CHKERRQ(ierr);
1124   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1125   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1126   PetscFunctionReturn(0);
1127 }
1128 
1129 /*
1130    Note very similar to MatMult_SeqAIJ(), should generate both codes from same base
1131 */
1132 #undef __FUNCT__
1133 #define __FUNCT__ "MatMatMultNumericAdd_SeqAIJ_SeqDense"
1134 PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1135 {
1136   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
1137   PetscErrorCode ierr;
1138   PetscScalar    *b,*c,r1,r2,r3,r4,*b1,*b2,*b3,*b4;
1139   MatScalar      *aa;
1140   PetscInt       cm=C->rmap->n, cn=B->cmap->n, bm=B->rmap->n, col, i,j,n,*aj, am = A->rmap->n,*ii,arm;
1141   PetscInt       am2 = 2*am, am3 = 3*am,  bm4 = 4*bm,colam,*ridx;
1142 
1143   PetscFunctionBegin;
1144   if (!cm || !cn) PetscFunctionReturn(0);
1145   ierr = MatGetArray(B,&b);CHKERRQ(ierr);
1146   ierr = MatGetArray(C,&c);CHKERRQ(ierr);
1147   b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
1148 
1149   if (a->compressedrow.use){ /* use compressed row format */
1150     for (col=0; col<cn-4; col += 4){  /* over columns of C */
1151       colam = col*am;
1152       arm   = a->compressedrow.nrows;
1153       ii    = a->compressedrow.i;
1154       ridx  = a->compressedrow.rindex;
1155       for (i=0; i<arm; i++) {        /* over rows of C in those columns */
1156 	r1 = r2 = r3 = r4 = 0.0;
1157 	n   = ii[i+1] - ii[i];
1158 	aj  = a->j + ii[i];
1159 	aa  = a->a + ii[i];
1160 	for (j=0; j<n; j++) {
1161 	  r1 += (*aa)*b1[*aj];
1162 	  r2 += (*aa)*b2[*aj];
1163 	  r3 += (*aa)*b3[*aj];
1164 	  r4 += (*aa++)*b4[*aj++];
1165 	}
1166 	c[colam       + ridx[i]] += r1;
1167 	c[colam + am  + ridx[i]] += r2;
1168 	c[colam + am2 + ridx[i]] += r3;
1169 	c[colam + am3 + ridx[i]] += r4;
1170       }
1171       b1 += bm4;
1172       b2 += bm4;
1173       b3 += bm4;
1174       b4 += bm4;
1175     }
1176     for (;col<cn; col++){     /* over extra columns of C */
1177       colam = col*am;
1178       arm   = a->compressedrow.nrows;
1179       ii    = a->compressedrow.i;
1180       ridx  = a->compressedrow.rindex;
1181       for (i=0; i<arm; i++) {  /* over rows of C in those columns */
1182 	r1 = 0.0;
1183 	n   = ii[i+1] - ii[i];
1184 	aj  = a->j + ii[i];
1185 	aa  = a->a + ii[i];
1186 
1187 	for (j=0; j<n; j++) {
1188 	  r1 += (*aa++)*b1[*aj++];
1189 	}
1190 	c[col*am + ridx[i]] += r1;
1191       }
1192       b1 += bm;
1193     }
1194   } else {
1195     for (col=0; col<cn-4; col += 4){  /* over columns of C */
1196       colam = col*am;
1197       for (i=0; i<am; i++) {        /* over rows of C in those columns */
1198 	r1 = r2 = r3 = r4 = 0.0;
1199 	n   = a->i[i+1] - a->i[i];
1200 	aj  = a->j + a->i[i];
1201 	aa  = a->a + a->i[i];
1202 	for (j=0; j<n; j++) {
1203 	  r1 += (*aa)*b1[*aj];
1204 	  r2 += (*aa)*b2[*aj];
1205 	  r3 += (*aa)*b3[*aj];
1206 	  r4 += (*aa++)*b4[*aj++];
1207 	}
1208 	c[colam + i]       += r1;
1209 	c[colam + am + i]  += r2;
1210 	c[colam + am2 + i] += r3;
1211 	c[colam + am3 + i] += r4;
1212       }
1213       b1 += bm4;
1214       b2 += bm4;
1215       b3 += bm4;
1216       b4 += bm4;
1217     }
1218     for (;col<cn; col++){     /* over extra columns of C */
1219       for (i=0; i<am; i++) {  /* over rows of C in those columns */
1220 	r1 = 0.0;
1221 	n   = a->i[i+1] - a->i[i];
1222 	aj  = a->j + a->i[i];
1223 	aa  = a->a + a->i[i];
1224 
1225 	for (j=0; j<n; j++) {
1226 	  r1 += (*aa++)*b1[*aj++];
1227 	}
1228 	c[col*am + i]     += r1;
1229       }
1230       b1 += bm;
1231     }
1232   }
1233   ierr = PetscLogFlops(cn*2.0*a->nz);CHKERRQ(ierr);
1234   ierr = MatRestoreArray(B,&b);CHKERRQ(ierr);
1235   ierr = MatRestoreArray(C,&c);CHKERRQ(ierr);
1236   PetscFunctionReturn(0);
1237 }
1238 
1239 #undef __FUNCT__
1240 #define __FUNCT__ "MatTransColoringApplySpToDen_SeqAIJ"
1241 PetscErrorCode  MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense)
1242 {
1243   PetscErrorCode ierr;
1244   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
1245   Mat_SeqDense   *btdense = (Mat_SeqDense*)Btdense->data;
1246   PetscInt       *bi=b->i,*bj=b->j;
1247   PetscInt       m=Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns;
1248   MatScalar      *btval,*btval_den,*ba=b->a;
1249   PetscInt       *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors;
1250 
1251   PetscFunctionBegin;
1252   btval_den=btdense->v;
1253   ierr = PetscMemzero(btval_den,(m*n)*sizeof(MatScalar));CHKERRQ(ierr);
1254   for (k=0; k<ncolors; k++) {
1255     ncolumns = coloring->ncolumns[k];
1256     for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */
1257       col   = *(columns + colorforcol[k] + l);
1258       btcol = bj + bi[col];
1259       btval = ba + bi[col];
1260       anz   = bi[col+1] - bi[col];
1261       for (j=0; j<anz; j++){
1262         brow            = btcol[j];
1263         btval_den[brow] = btval[j];
1264       }
1265     }
1266     btval_den += m;
1267   }
1268   PetscFunctionReturn(0);
1269 }
1270 
1271 #undef __FUNCT__
1272 #define __FUNCT__ "MatTransColoringApplyDenToSp_SeqAIJ"
1273 PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp)
1274 {
1275   PetscErrorCode ierr;
1276   Mat_SeqAIJ     *csp = (Mat_SeqAIJ*)Csp->data;
1277   PetscInt       k,l,*row,*idx,m,ncolors=matcoloring->ncolors,nrows;
1278   PetscScalar    *ca_den,*cp_den,*ca=csp->a;
1279   PetscInt       *rows=matcoloring->rows,*spidx=matcoloring->columnsforspidx,*colorforrow=matcoloring->colorforrow;
1280 
1281   PetscFunctionBegin;
1282   ierr = MatGetLocalSize(Csp,&m,PETSC_NULL);CHKERRQ(ierr);
1283   ierr = MatGetArray(Cden,&ca_den);CHKERRQ(ierr);
1284   cp_den = ca_den;
1285   for (k=0; k<ncolors; k++) {
1286     nrows = matcoloring->nrows[k];
1287     row   = rows  + colorforrow[k];
1288     idx   = spidx + colorforrow[k];
1289     for (l=0; l<nrows; l++){
1290       ca[idx[l]] = cp_den[row[l]];
1291     }
1292     cp_den += m;
1293   }
1294   ierr = MatRestoreArray(Cden,&ca_den);CHKERRQ(ierr);
1295   PetscFunctionReturn(0);
1296 }
1297 
1298 /*
1299  MatGetColumnIJ_SeqAIJ_Color() and MatRestoreColumnIJ_SeqAIJ_Color() are customized from
1300  MatGetColumnIJ_SeqAIJ() and MatRestoreColumnIJ_SeqAIJ() by adding an output
1301  spidx[], index of a->j, to be used for setting 'columnsforspidx' in MatTransposeColoringCreate_SeqAIJ().
1302  */
1303 #undef __FUNCT__
1304 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Color"
1305 PetscErrorCode MatGetColumnIJ_SeqAIJ_Color(Mat A,PetscInt oshift,PetscBool  symmetric,PetscBool  inodecompressed,PetscInt *nn,PetscInt *ia[],PetscInt *ja[],PetscInt *spidx[],PetscBool  *done)
1306 {
1307   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
1308   PetscErrorCode ierr;
1309   PetscInt       i,*collengths,*cia,*cja,n = A->cmap->n,m = A->rmap->n;
1310   PetscInt       nz = a->i[m],row,*jj,mr,col;
1311   PetscInt       *cspidx;
1312 
1313   PetscFunctionBegin;
1314   *nn = n;
1315   if (!ia) PetscFunctionReturn(0);
1316   if (symmetric) {
1317     SETERRQ(((PetscObject)A)->comm,PETSC_ERR_SUP,"MatGetColumnIJ_SeqAIJ_Color() not supported for the case symmetric");
1318     ierr = MatToSymmetricIJ_SeqAIJ(A->rmap->n,a->i,a->j,0,oshift,ia,ja);CHKERRQ(ierr);
1319   } else {
1320     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&collengths);CHKERRQ(ierr);
1321     ierr = PetscMemzero(collengths,n*sizeof(PetscInt));CHKERRQ(ierr);
1322     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&cia);CHKERRQ(ierr);
1323     ierr = PetscMalloc((nz+1)*sizeof(PetscInt),&cja);CHKERRQ(ierr);
1324     ierr = PetscMalloc((nz+1)*sizeof(PetscInt),&cspidx);CHKERRQ(ierr);
1325     jj = a->j;
1326     for (i=0; i<nz; i++) {
1327       collengths[jj[i]]++;
1328     }
1329     cia[0] = oshift;
1330     for (i=0; i<n; i++) {
1331       cia[i+1] = cia[i] + collengths[i];
1332     }
1333     ierr = PetscMemzero(collengths,n*sizeof(PetscInt));CHKERRQ(ierr);
1334     jj   = a->j;
1335     for (row=0; row<m; row++) {
1336       mr = a->i[row+1] - a->i[row];
1337       for (i=0; i<mr; i++) {
1338         col = *jj++;
1339         cspidx[cia[col] + collengths[col] - oshift] = a->i[row] + i; /* index of a->j */
1340         cja[cia[col] + collengths[col]++ - oshift] = row + oshift;
1341       }
1342     }
1343     ierr = PetscFree(collengths);CHKERRQ(ierr);
1344     *ia = cia; *ja = cja;
1345     *spidx = cspidx;
1346   }
1347   PetscFunctionReturn(0);
1348 }
1349 
1350 #undef __FUNCT__
1351 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Color"
1352 PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Color(Mat A,PetscInt oshift,PetscBool  symmetric,PetscBool  inodecompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscInt *spidx[],PetscBool  *done)
1353 {
1354   PetscErrorCode ierr;
1355 
1356   PetscFunctionBegin;
1357   if (!ia) PetscFunctionReturn(0);
1358 
1359   ierr = PetscFree(*ia);CHKERRQ(ierr);
1360   ierr = PetscFree(*ja);CHKERRQ(ierr);
1361   ierr = PetscFree(*spidx);CHKERRQ(ierr);
1362   PetscFunctionReturn(0);
1363 }
1364 
1365 #undef __FUNCT__
1366 #define __FUNCT__ "MatTransposeColoringCreate_SeqAIJ"
1367 PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c)
1368 {
1369   PetscErrorCode ierr;
1370   PetscInt       i,n,nrows,N,j,k,m,*row_idx,*ci,*cj,ncols,col,cm;
1371   const PetscInt *is;
1372   PetscInt       nis = iscoloring->n,*rowhit,bs = 1;
1373   IS             *isa;
1374   PetscBool      done;
1375   PetscBool      flg1,flg2;
1376   Mat_SeqAIJ     *csp = (Mat_SeqAIJ*)mat->data;
1377   PetscInt       *colorforrow,*rows,*rows_i,*columnsforspidx,*columnsforspidx_i,*idxhit,*spidx;
1378   PetscInt       *colorforcol,*columns,*columns_i;
1379 
1380   PetscFunctionBegin;
1381   ierr = ISColoringGetIS(iscoloring,PETSC_IGNORE,&isa);CHKERRQ(ierr);
1382 
1383   /* this is ugly way to get blocksize but cannot call MatGetBlockSize() because AIJ can have bs > 1 */
1384   ierr = PetscObjectTypeCompare((PetscObject)mat,MATSEQBAIJ,&flg1);CHKERRQ(ierr);
1385   ierr = PetscObjectTypeCompare((PetscObject)mat,MATMPIBAIJ,&flg2);CHKERRQ(ierr);
1386   if (flg1 || flg2) {
1387     ierr = MatGetBlockSize(mat,&bs);CHKERRQ(ierr);
1388   }
1389 
1390   N         = mat->cmap->N/bs;
1391   c->M      = mat->rmap->N/bs;  /* set total rows, columns and local rows */
1392   c->N      = mat->cmap->N/bs;
1393   c->m      = mat->rmap->N/bs;
1394   c->rstart = 0;
1395 
1396   c->ncolors = nis;
1397   ierr       = PetscMalloc(nis*sizeof(PetscInt),&c->ncolumns);CHKERRQ(ierr);
1398   ierr       = PetscMalloc(nis*sizeof(PetscInt),&c->nrows);CHKERRQ(ierr);
1399   ierr       = PetscMalloc2(csp->nz+1,PetscInt,&rows,csp->nz+1,PetscInt,&columnsforspidx);CHKERRQ(ierr);
1400   ierr       = PetscMalloc((nis+1)*sizeof(PetscInt),&colorforrow);CHKERRQ(ierr);
1401   colorforrow[0]    = 0;
1402   rows_i            = rows;
1403   columnsforspidx_i = columnsforspidx;
1404 
1405   ierr = PetscMalloc((nis+1)*sizeof(PetscInt),&colorforcol);CHKERRQ(ierr);
1406   ierr = PetscMalloc((N+1)*sizeof(PetscInt),&columns);CHKERRQ(ierr);
1407   colorforcol[0] = 0;
1408   columns_i      = columns;
1409 
1410   ierr = MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,&done);CHKERRQ(ierr); /* column-wise storage of mat */
1411   if (!done) SETERRQ1(((PetscObject)mat)->comm,PETSC_ERR_SUP,"MatGetColumnIJ() not supported for matrix type %s",((PetscObject)mat)->type_name);
1412 
1413   cm = c->m;
1414   ierr = PetscMalloc((cm+1)*sizeof(PetscInt),&rowhit);CHKERRQ(ierr);
1415   ierr = PetscMalloc((cm+1)*sizeof(PetscInt),&idxhit);CHKERRQ(ierr);
1416   for (i=0; i<nis; i++) {
1417     ierr = ISGetLocalSize(isa[i],&n);CHKERRQ(ierr);
1418     ierr = ISGetIndices(isa[i],&is);CHKERRQ(ierr);
1419     c->ncolumns[i] = n;
1420     if (n) {
1421       ierr = PetscMemcpy(columns_i,is,n*sizeof(PetscInt));CHKERRQ(ierr);
1422     }
1423     colorforcol[i+1] = colorforcol[i] + n;
1424     columns_i       += n;
1425 
1426     /* fast, crude version requires O(N*N) work */
1427     ierr = PetscMemzero(rowhit,cm*sizeof(PetscInt));CHKERRQ(ierr);
1428 
1429     /* loop over columns*/
1430     for (j=0; j<n; j++) {
1431       col     = is[j];
1432       row_idx = cj + ci[col];
1433       m       = ci[col+1] - ci[col];
1434       /* loop over columns marking them in rowhit */
1435       for (k=0; k<m; k++) {
1436         idxhit[*row_idx]   = spidx[ci[col] + k];
1437         rowhit[*row_idx++] = col + 1;
1438       }
1439     }
1440     /* count the number of hits */
1441     nrows = 0;
1442     for (j=0; j<cm; j++) {
1443       if (rowhit[j]) nrows++;
1444     }
1445     c->nrows[i]      = nrows;
1446     colorforrow[i+1] = colorforrow[i] + nrows;
1447 
1448     nrows       = 0;
1449     for (j=0; j<cm; j++) {
1450       if (rowhit[j]) {
1451         rows_i[nrows]            = j;
1452         columnsforspidx_i[nrows] = idxhit[j];
1453         nrows++;
1454       }
1455     }
1456     ierr = ISRestoreIndices(isa[i],&is);CHKERRQ(ierr);
1457     rows_i += nrows; columnsforspidx_i += nrows;
1458   }
1459   ierr = MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,&done);CHKERRQ(ierr);
1460   ierr = PetscFree(rowhit);CHKERRQ(ierr);
1461   ierr = ISColoringRestoreIS(iscoloring,&isa);CHKERRQ(ierr);
1462 #if defined(PETSC_USE_DEBUG)
1463   if (csp->nz != colorforrow[nis]) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"csp->nz %d != colorforrow[nis] %d",csp->nz,colorforrow[nis]);
1464 #endif
1465 
1466   c->colorforrow     = colorforrow;
1467   c->rows            = rows;
1468   c->columnsforspidx = columnsforspidx;
1469   c->colorforcol     = colorforcol;
1470   c->columns         = columns;
1471   ierr = PetscFree(idxhit);CHKERRQ(ierr);
1472   PetscFunctionReturn(0);
1473 }
1474