xref: /petsc/src/mat/impls/aij/seq/matmatmult.c (revision fb908031b23ca3865a4bbf2103ced40dab521c61)
1 
2 /*
3   Defines matrix-matrix product routines for pairs of SeqAIJ matrices
4           C = A * B
5 */
6 
7 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
8 #include <../src/mat/utils/freespace.h>
9 #include <petscbt.h>
10 #include <../src/mat/impls/dense/seq/dense.h> /*I "petscmat.h" I*/
11 /*
12 #define DEBUG_MATMATMULT
13  */
14 EXTERN_C_BEGIN
15 #undef __FUNCT__
16 #define __FUNCT__ "MatMatMult_SeqAIJ_SeqAIJ"
17 PetscErrorCode MatMatMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
18 {
19   PetscErrorCode ierr;
20   PetscBool      scalable=PETSC_FALSE,scalable_new=PETSC_FALSE;
21 
22   PetscFunctionBegin;
23   if (scall == MAT_INITIAL_MATRIX){
24     ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
25     ierr = PetscOptionsBool("-matmatmult_scalable","Use a scalable but slower C=A*B","",scalable,&scalable,PETSC_NULL);CHKERRQ(ierr);
26     ierr = PetscOptionsBool("-matmatmult_scalable_new","Use a scalable but slower C=A*B","",scalable_new,&scalable_new,PETSC_NULL);CHKERRQ(ierr);
27     ierr = PetscOptionsEnd();CHKERRQ(ierr);
28     ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
29     if (scalable_new){
30       ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_new(A,B,fill,C);CHKERRQ(ierr);
31     } else if (scalable){
32       ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A,B,fill,C);CHKERRQ(ierr);
33     } else {
34       ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
35     }
36     ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
37   }
38   ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
39   ierr = (*(*C)->ops->matmultnumeric)(A,B,*C);CHKERRQ(ierr);
40   ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
41   PetscFunctionReturn(0);
42 }
43 EXTERN_C_END
44 
45 /*
46  MatGetSymbolicMatMatMult_SeqAIJ_SeqAIJ - Get symbolic structure of C=A*B
47   Input Parameter:
48 .    am, Ai, Aj - number of rows and structure of A
49 .    bm, bn, Bi, Bj - number of rows, columns, and structure of B
50 .    fill - filll ratio See MatMatMult()
51 
52   Output Parameter:
53 .    Ci, Cj - structure of C = A*B
54 .    nspacedouble - number of extra mallocs
55  */
56 #undef __FUNCT__
57 #define __FUNCT__ "MatGetSymbolicMatMatMult_SeqAIJ_SeqAIJ_Scalable"
58 PetscErrorCode MatGetSymbolicMatMatMult_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,PetscInt *Ci[],PetscInt *Cj[],PetscInt *nspacedouble)
59 {
60   PetscErrorCode     ierr;
61   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
62   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data;
63   const PetscInt     *Ai=a->i,*Aj=a->j,*Bi=b->i,*Bj=b->j,am=A->rmap->N,bm=B->rmap->N,bn=B->cmap->N;
64   const PetscInt     *aj=Aj,*bi=Bi,*bj;
65   PetscInt           *ci,*cj;
66   PetscInt           i,j,anzi,brow,bnzj,cnzi,crmax,ndouble=0;
67   PetscInt           nlnk_max,*lnk;
68 
69   PetscFunctionBegin;
70   /* Allocate ci array, arrays for fill computation and */
71   /* free space for accumulating nonzero column info */
72   ierr = PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
73   ci[0] = 0;
74 
75   /* create and initialize a condensed linked list */
76   crmax = a->rmax*b->rmax;
77   nlnk_max = PetscMin(crmax,bn);
78   if (!nlnk_max && bn) {
79     nlnk_max = bn; /* in case rmax is not defined for A or B */
80 #if defined(PETSC_USE_DEBUGGING)
81     ierr = PetscPrintf(PETSC_COMM_SELF,"Warning: Armax or Brmax is not defined in MatGetSymbolicMatMatMult_SeqAIJ_SeqAIJ_Scalable!\n");
82 #endif
83   }
84 #if defined(DEBUG_MATMATMULT)
85   ierr = (PETSC_COMM_SELF,"LLCondensedCreate nlnk_max=%d, bn %d, crmax %d\n",nlnk_max,bn,crmax);
86 #endif
87 
88   ierr = PetscLLCondensedCreate_Scalable(nlnk_max,&lnk);CHKERRQ(ierr);
89 
90   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
91   ierr = PetscFreeSpaceGet((PetscInt)(fill*(Ai[am]+Bi[bm])),&free_space);CHKERRQ(ierr);
92   current_space = free_space;
93 
94   /* Determine ci and cj */
95   for (i=0; i<am; i++) {
96     anzi = Ai[i+1] - Ai[i];
97     cnzi = 0;
98     aj   = Aj + Ai[i];
99     for (j=0; j<anzi; j++){
100       brow = aj[j];
101       bnzj = bi[brow+1] - bi[brow];
102       bj   = Bj + bi[brow];
103       /* add non-zero cols of B into the condensed sorted linked list lnk */
104       ierr = PetscLLCondensedAddSorted_Scalable(bnzj,bj,lnk);CHKERRQ(ierr);
105     }
106     cnzi    = lnk[0];
107     ci[i+1] = ci[i] + cnzi;
108 
109     /* If free space is not available, make more free space */
110     /* Double the amount of total space in the list */
111     if (current_space->local_remaining<cnzi) {
112       ierr = PetscFreeSpaceGet(cnzi+current_space->total_array_size,&current_space);CHKERRQ(ierr);
113       ndouble++;
114     }
115 
116     /* Copy data into free space, then initialize lnk */
117     ierr = PetscLLCondensedClean_Scalable(cnzi,current_space->array,lnk);CHKERRQ(ierr);
118     current_space->array           += cnzi;
119     current_space->local_used      += cnzi;
120     current_space->local_remaining -= cnzi;
121   }
122 
123   /* Column indices are in the list of free space */
124   /* Allocate and copy column indices to cj, then destroy list of free space, lnk and bt */
125   ierr = PetscMalloc((ci[am]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
126   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
127   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
128 
129   *Ci           = ci;
130   *Cj           = cj;
131   *nspacedouble = ndouble;
132   PetscFunctionReturn(0);
133 }
134 
135 /*
136      Does not use bitarray
137  */
138 #undef __FUNCT__
139 #define __FUNCT__ "MatGetSymbolicMatMatMult_SeqAIJ_SeqAIJ_Scalable_new"
140 PetscErrorCode MatGetSymbolicMatMatMult_SeqAIJ_SeqAIJ_Scalable_new(Mat A,Mat B,PetscReal fill,PetscInt *Ci[],PetscInt *Cj[],PetscInt *nspacedouble)
141 {
142   PetscErrorCode     ierr;
143   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
144   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data;
145   const PetscInt     *Ai=a->i,*Aj=a->j,*Bi=b->i,*Bj=b->j,am=A->rmap->N,bm=B->rmap->N,bn=B->cmap->N;
146   const PetscInt     *aj=Aj,*bi=Bi,*bj;
147   PetscInt           *ci,*cj;
148   PetscInt           i,j,anzi,brow,bnzj,cnzi,crmax,ndouble=0;
149   PetscInt           lnk_max=bn,*lnk;
150 
151   PetscFunctionBegin;
152   /* Allocate ci array, arrays for fill computation and */
153   /* free space for accumulating nonzero column info */
154   ierr = PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
155   ci[0] = 0;
156 
157   /* create and initialize a condensed linked list */
158   crmax = a->rmax*b->rmax;
159   lnk_max = PetscMin(crmax,bn);
160   if (!lnk_max && bn) {
161     lnk_max = bn; /* in case rmax is not defined for A or B */
162 #if defined(PETSC_USE_DEBUGGING)
163     ierr = PetscPrintf(PETSC_COMM_SELF,"Warning: Armax or Brmax is not defined in MatGetSymbolicMatMatMult_SeqAIJ_SeqAIJ_Scalable!\n");
164 #endif
165   }
166 #if defined(DEBUG_MATMATMULT)
167   ierr = (PETSC_COMM_SELF,"LLCondensedCreate lnk_max=%d, bn %d, crmax %d\n",lnk_max,bn,crmax);
168 #endif
169   ierr = PetscLLCondensedCreate_fast(lnk_max,&lnk);CHKERRQ(ierr);
170 
171   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
172   ierr = PetscFreeSpaceGet((PetscInt)(fill*(Ai[am]+Bi[bm])),&free_space);CHKERRQ(ierr);
173   current_space = free_space;
174 
175   /* Determine ci and cj */
176   for (i=0; i<am; i++) {
177     anzi = Ai[i+1] - Ai[i];
178     cnzi = 0;
179     aj   = Aj + Ai[i];
180     for (j=0; j<anzi; j++){
181       brow = aj[j];
182       bnzj = bi[brow+1] - bi[brow];
183       bj   = Bj + bi[brow];
184       /* add non-zero cols of B into the condensed sorted linked list lnk */
185       /* ierr = PetscLLCondensedView(lnk_max,lnk_max,lnk,lnk);CHKERRQ(ierr); */
186       ierr = PetscLLCondensedAddSorted_fast(bnzj,bj,lnk);CHKERRQ(ierr);
187     }
188     cnzi    = lnk[1];
189     ci[i+1] = ci[i] + cnzi;
190 
191     /* If free space is not available, make more free space */
192     /* Double the amount of total space in the list */
193     if (current_space->local_remaining<cnzi) {
194       ierr = PetscFreeSpaceGet(cnzi+current_space->total_array_size,&current_space);CHKERRQ(ierr);
195       ndouble++;
196     }
197 
198     /* Copy data into free space, then initialize lnk */
199     /* ierr = PetscLLCondensedView_fast(lnk);CHKERRQ(ierr);   */
200     ierr = PetscLLCondensedClean_fast(cnzi,current_space->array,lnk);CHKERRQ(ierr);
201     /* PetscIntView(cnzi,current_space->array,0); */
202     current_space->array           += cnzi;
203     current_space->local_used      += cnzi;
204     current_space->local_remaining -= cnzi;
205   }
206 
207   /* Column indices are in the list of free space */
208   /* Allocate and copy column indices to cj, then destroy list of free space, lnk and bt */
209   ierr = PetscMalloc((ci[am]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
210   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
211   ierr = PetscLLCondensedDestroy_fast(lnk);CHKERRQ(ierr);
212 
213   *Ci           = ci;
214   *Cj           = cj;
215   *nspacedouble = ndouble;
216   PetscFunctionReturn(0);
217 }
218 
219 #undef __FUNCT__
220 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqAIJ"
221 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
222 {
223   PetscErrorCode     ierr;
224   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
225   PetscInt           *ai=a->i,*bi=b->i,*ci,*cj;
226   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N,nspacedouble;
227   PetscReal          afill;
228   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,nlnk_max,*lnk,ndouble=0;
229   PetscBT            lnkbt;
230   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
231 
232   PetscFunctionBegin;
233   /* Allocate ci array, arrays for fill computation and */
234   /* free space for accumulating nonzero column info */
235   ierr = PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
236   ci[0] = 0;
237 
238   /* create and initialize a linked list */
239   nlnk_max = a->rmax*b->rmax;
240   if (!nlnk_max || nlnk_max > bn) nlnk_max = bn;
241   ierr = PetscLLCondensedCreate(nlnk_max,bn,&lnk,&lnkbt);CHKERRQ(ierr);
242 
243   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
244   ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);CHKERRQ(ierr);
245   current_space = free_space;
246 
247   /* Determine ci and cj */
248   for (i=0; i<am; i++) {
249     anzi = ai[i+1] - ai[i];
250     aj   = a->j + ai[i];
251     for (j=0; j<anzi; j++){
252       brow = aj[j];
253       bnzj = bi[brow+1] - bi[brow];
254       bj   = b->j + bi[brow];
255       /* add non-zero cols of B into the sorted linked list lnk */
256       ierr = PetscLLCondensedAddSorted(bnzj,bj,lnk,lnkbt);CHKERRQ(ierr);
257     }
258     cnzi = lnk[0];
259 
260     /* If free space is not available, make more free space */
261     /* Double the amount of total space in the list */
262     if (current_space->local_remaining<cnzi) {
263       ierr = PetscFreeSpaceGet(cnzi+current_space->total_array_size,&current_space);CHKERRQ(ierr);
264       ndouble++;
265     }
266 
267     /* Copy data into free space, then initialize lnk */
268     ierr = PetscLLCondensedClean(bn,cnzi,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
269     current_space->array           += cnzi;
270     current_space->local_used      += cnzi;
271     current_space->local_remaining -= cnzi;
272     ci[i+1] = ci[i] + cnzi;
273   }
274 
275   /* Column indices are in the list of free space */
276   /* Allocate space for cj, initialize cj, and */
277   /* destroy list of free space and other temporary array(s) */
278   ierr = PetscMalloc((ci[am]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
279   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
280   ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr);
281 
282   /* put together the new symbolic matrix */
283   ierr = MatCreateSeqAIJWithArrays(((PetscObject)A)->comm,am,bn,ci,cj,PETSC_NULL,C);CHKERRQ(ierr);
284 
285   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
286   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
287   c = (Mat_SeqAIJ *)((*C)->data);
288   c->free_a  = PETSC_FALSE;
289   c->free_ij = PETSC_TRUE;
290   c->nonew   = 0;
291   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ; /* fast, needs non-scalable O(bn) array 'abdense' */
292 
293   /* set MatInfo */
294   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
295   if (afill < 1.0) afill = 1.0;
296   c->maxnz                     = ci[am];
297   c->nz                        = ci[am];
298   (*C)->info.mallocs           = ndouble;
299   (*C)->info.fill_ratio_given  = fill;
300   (*C)->info.fill_ratio_needed = afill;
301 
302 #if defined(PETSC_USE_INFO)
303   if (ci[am]) {
304     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",ndouble,fill,afill);CHKERRQ(ierr);
305     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);CHKERRQ(ierr);
306   } else {
307     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
308   }
309 #endif
310   PetscFunctionReturn(0);
311 }
312 
313 #undef __FUNCT__
314 #define __FUNCT__ "MatMatMultNumeric_SeqAIJ_SeqAIJ"
315 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
316 {
317   PetscErrorCode ierr;
318   PetscLogDouble flops=0.0;
319   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
320   Mat_SeqAIJ     *b = (Mat_SeqAIJ *)B->data;
321   Mat_SeqAIJ     *c = (Mat_SeqAIJ *)C->data;
322   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
323   PetscInt       am=A->rmap->n,cm=C->rmap->n;
324   PetscInt       i,j,k,anzi,bnzi,cnzi,brow;
325   PetscScalar    *aa=a->a,*ba=b->a,*baj,*ca,valtmp;
326   PetscScalar    *ab_dense;
327 
328   PetscFunctionBegin;
329   /* printf("MatMatMultNumeric_SeqAIJ_SeqAIJ...ca %p\n",c->a); */
330   if (!c->a){ /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
331     ierr = PetscMalloc((ci[cm]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
332     c->a      = ca;
333     c->free_a = PETSC_TRUE;
334 
335     ierr = PetscMalloc(B->cmap->N*sizeof(PetscScalar),&ab_dense);CHKERRQ(ierr);
336     ierr = PetscMemzero(ab_dense,B->cmap->N*sizeof(PetscScalar));CHKERRQ(ierr);
337     c->matmult_abdense = ab_dense;
338   } else {
339     ca       = c->a;
340     ab_dense = c->matmult_abdense;
341   }
342 
343   /* clean old values in C */
344   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
345   /* Traverse A row-wise. */
346   /* Build the ith row in C by summing over nonzero columns in A, */
347   /* the rows of B corresponding to nonzeros of A. */
348   for (i=0; i<am; i++) {
349     anzi = ai[i+1] - ai[i];
350     for (j=0; j<anzi; j++) {
351       brow = aj[j];
352       bnzi = bi[brow+1] - bi[brow];
353       bjj  = bj + bi[brow];
354       baj  = ba + bi[brow];
355       /* perform dense axpy */
356       valtmp = aa[j];
357       for (k=0; k<bnzi; k++) {
358         ab_dense[bjj[k]] += valtmp*baj[k];
359       }
360       flops += 2*bnzi;
361     }
362     aj += anzi; aa += anzi;
363 
364     cnzi = ci[i+1] - ci[i];
365     for (k=0; k<cnzi; k++) {
366       ca[k]          += ab_dense[cj[k]];
367       ab_dense[cj[k]] = 0.0; /* zero ab_dense */
368     }
369     flops += cnzi;
370     cj += cnzi; ca += cnzi;
371   }
372   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
373   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
374   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
375   PetscFunctionReturn(0);
376 }
377 
378 #undef __FUNCT__
379 #define __FUNCT__ "MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable"
380 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C)
381 {
382   PetscErrorCode ierr;
383   PetscLogDouble flops=0.0;
384   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
385   Mat_SeqAIJ     *b = (Mat_SeqAIJ *)B->data;
386   Mat_SeqAIJ     *c = (Mat_SeqAIJ *)C->data;
387   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
388   PetscInt       am=A->rmap->N,cm=C->rmap->N;
389   PetscInt       i,j,k,anzi,bnzi,cnzi,brow;
390   PetscScalar    *aa=a->a,*ba=b->a,*baj,*ca=c->a,valtmp;
391   PetscInt       nextb;
392 
393   PetscFunctionBegin;
394 #if defined(DEBUG_MATMATMULT)
395   ierr = PetscPrintf(PETSC_COMM_SELF,"MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable...\n");CHKERRQ(ierr);
396 #endif
397   /* clean old values in C */
398   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
399   /* Traverse A row-wise. */
400   /* Build the ith row in C by summing over nonzero columns in A, */
401   /* the rows of B corresponding to nonzeros of A. */
402   for (i=0;i<am;i++) {
403     anzi = ai[i+1] - ai[i];
404     cnzi = ci[i+1] - ci[i];
405     for (j=0;j<anzi;j++) {
406       brow = aj[j];
407       bnzi = bi[brow+1] - bi[brow];
408       bjj  = bj + bi[brow];
409       baj  = ba + bi[brow];
410       /* perform sparse axpy */
411       valtmp = aa[j];
412       nextb  = 0;
413       for (k=0; nextb<bnzi; k++) {
414         if (cj[k] == bjj[nextb]){ /* ccol == bcol */
415           ca[k] += valtmp*baj[nextb++];
416         }
417       }
418       flops += 2*bnzi;
419     }
420     aj += anzi; aa += anzi;
421     cj += cnzi; ca += cnzi;
422   }
423 
424   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
425   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
426   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
427   PetscFunctionReturn(0);
428 }
429 
430 #undef __FUNCT__
431 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_new"
432 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_new(Mat A,Mat B,PetscReal fill,Mat *C)
433 {
434   PetscErrorCode ierr;
435   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
436   PetscInt       *ai=a->i,*bi=b->i,*ci,*cj;
437   PetscInt       am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N,nspacedouble;
438   MatScalar      *ca;
439   PetscReal      afill;
440 
441   PetscFunctionBegin;
442 #if defined(DEBUG_MATMATMULT)
443   ierr = PetscPrintf(PETSC_COMM_SELF,"MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable Armax %d, Brmax %d\n",a->rmax,b->rmax);CHKERRQ(ierr);
444 #endif
445   /* Get ci and cj */
446   ierr = MatGetSymbolicMatMatMult_SeqAIJ_SeqAIJ_Scalable_new(A,B,fill,&ci,&cj,&nspacedouble);CHKERRQ(ierr);
447 
448   /* Allocate space for ca */
449   ierr = PetscMalloc((ci[am]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
450   ierr = PetscMemzero(ca,(ci[am]+1)*sizeof(MatScalar));CHKERRQ(ierr);
451 
452   /* put together the new symbolic matrix */
453   ierr = MatCreateSeqAIJWithArrays(((PetscObject)A)->comm,am,bn,ci,cj,ca,C);CHKERRQ(ierr);
454 
455   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
456   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
457   c = (Mat_SeqAIJ *)((*C)->data);
458   c->free_a   = PETSC_TRUE;
459   c->free_ij  = PETSC_TRUE;
460   c->nonew    = 0;
461   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; /* slower, less memory */
462 
463   /* set MatInfo */
464   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
465   if (afill < 1.0) afill = 1.0;
466   c->maxnz                     = ci[am];
467   c->nz                        = ci[am];
468   (*C)->info.mallocs           = nspacedouble;
469   (*C)->info.fill_ratio_given  = fill;
470   (*C)->info.fill_ratio_needed = afill;
471 
472 #if defined(PETSC_USE_INFO)
473   if (ci[am]) {
474     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);CHKERRQ(ierr);
475     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);CHKERRQ(ierr);
476   } else {
477     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
478   }
479 #endif
480   PetscFunctionReturn(0);
481 }
482 
483 
484 #undef __FUNCT__
485 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable"
486 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat *C)
487 {
488   PetscErrorCode ierr;
489   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
490   PetscInt       *ai=a->i,*bi=b->i,*ci,*cj;
491   PetscInt       am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N,nspacedouble;
492   MatScalar      *ca;
493   PetscReal      afill;
494 
495   PetscFunctionBegin;
496   /* Get ci and cj */
497   ierr = MatGetSymbolicMatMatMult_SeqAIJ_SeqAIJ_Scalable(A,B,fill,&ci,&cj,&nspacedouble);CHKERRQ(ierr);
498 
499   /* Allocate space for ca */
500   ierr = PetscMalloc((ci[am]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
501   ierr = PetscMemzero(ca,(ci[am]+1)*sizeof(MatScalar));CHKERRQ(ierr);
502 
503   /* put together the new symbolic matrix */
504   ierr = MatCreateSeqAIJWithArrays(((PetscObject)A)->comm,am,bn,ci,cj,ca,C);CHKERRQ(ierr);
505 
506   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
507   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
508   c = (Mat_SeqAIJ *)((*C)->data);
509   c->free_a   = PETSC_TRUE;
510   c->free_ij  = PETSC_TRUE;
511   c->nonew    = 0;
512   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; /* slower, less memory */
513 
514   /* set MatInfo */
515   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
516   if (afill < 1.0) afill = 1.0;
517   c->maxnz                     = ci[am];
518   c->nz                        = ci[am];
519   (*C)->info.mallocs           = nspacedouble;
520   (*C)->info.fill_ratio_given  = fill;
521   (*C)->info.fill_ratio_needed = afill;
522 
523 #if defined(PETSC_USE_INFO)
524   if (ci[am]) {
525     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);CHKERRQ(ierr);
526     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);CHKERRQ(ierr);
527   } else {
528     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
529   }
530 #endif
531   PetscFunctionReturn(0);
532 }
533 
534 
535 /* This routine is not used. Should be removed! */
536 #undef __FUNCT__
537 #define __FUNCT__ "MatMatTransposeMult_SeqAIJ_SeqAIJ"
538 PetscErrorCode MatMatTransposeMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
539 {
540   PetscErrorCode ierr;
541 
542   PetscFunctionBegin;
543   if (scall == MAT_INITIAL_MATRIX){
544     ierr = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
545   }
546   ierr = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(A,B,*C);CHKERRQ(ierr);
547   PetscFunctionReturn(0);
548 }
549 
550 #undef __FUNCT__
551 #define __FUNCT__ "PetscContainerDestroy_Mat_MatMatTransMult"
552 PetscErrorCode PetscContainerDestroy_Mat_MatMatTransMult(void *ptr)
553 {
554   PetscErrorCode      ierr;
555   Mat_MatMatTransMult *multtrans=(Mat_MatMatTransMult*)ptr;
556 
557   PetscFunctionBegin;
558   ierr = MatTransposeColoringDestroy(&multtrans->matcoloring);CHKERRQ(ierr);
559   ierr = MatDestroy(&multtrans->Bt_den);CHKERRQ(ierr);
560   ierr = MatDestroy(&multtrans->ABt_den);CHKERRQ(ierr);
561   ierr = PetscFree(multtrans);CHKERRQ(ierr);
562   PetscFunctionReturn(0);
563 }
564 
565 #undef __FUNCT__
566 #define __FUNCT__ "MatDestroy_SeqAIJ_MatMatMultTrans"
567 PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(Mat A)
568 {
569   PetscErrorCode      ierr;
570   PetscContainer      container;
571   Mat_MatMatTransMult *multtrans=PETSC_NULL;
572 
573   PetscFunctionBegin;
574   ierr = PetscObjectQuery((PetscObject)A,"Mat_MatMatTransMult",(PetscObject *)&container);CHKERRQ(ierr);
575   if (!container) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Container does not exit");
576   ierr = PetscContainerGetPointer(container,(void **)&multtrans);CHKERRQ(ierr);
577   A->ops->destroy   = multtrans->destroy;
578   if (A->ops->destroy) {
579     ierr = (*A->ops->destroy)(A);CHKERRQ(ierr);
580   }
581   ierr = PetscObjectCompose((PetscObject)A,"Mat_MatMatTransMult",0);CHKERRQ(ierr);
582   PetscFunctionReturn(0);
583 }
584 
585 #undef __FUNCT__
586 #define __FUNCT__ "MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ"
587 PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
588 {
589   PetscErrorCode      ierr;
590   Mat                 Bt;
591   PetscInt            *bti,*btj;
592   Mat_MatMatTransMult *multtrans;
593   PetscContainer      container;
594   PetscLogDouble      t0,tf,etime2=0.0;
595 
596   PetscFunctionBegin;
597   ierr = PetscGetTime(&t0);CHKERRQ(ierr);
598    /* create symbolic Bt */
599   ierr = MatGetSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
600   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,B->cmap->n,B->rmap->n,bti,btj,PETSC_NULL,&Bt);CHKERRQ(ierr);
601 
602   /* get symbolic C=A*Bt */
603   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C);CHKERRQ(ierr);
604 
605   /* create a supporting struct for reuse intermidiate dense matrices with matcoloring */
606   ierr = PetscNew(Mat_MatMatTransMult,&multtrans);CHKERRQ(ierr);
607 
608   /* attach the supporting struct to C */
609   ierr = PetscContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
610   ierr = PetscContainerSetPointer(container,multtrans);CHKERRQ(ierr);
611   ierr = PetscContainerSetUserDestroy(container,PetscContainerDestroy_Mat_MatMatTransMult);CHKERRQ(ierr);
612   ierr = PetscObjectCompose((PetscObject)(*C),"Mat_MatMatTransMult",(PetscObject)container);CHKERRQ(ierr);
613   ierr = PetscContainerDestroy(&container);CHKERRQ(ierr);
614 
615   multtrans->usecoloring = PETSC_FALSE;
616   multtrans->destroy = (*C)->ops->destroy;
617   (*C)->ops->destroy = MatDestroy_SeqAIJ_MatMatMultTrans;
618 
619   ierr = PetscGetTime(&tf);CHKERRQ(ierr);
620   etime2 += tf - t0;
621 
622   ierr = PetscOptionsGetBool(PETSC_NULL,"-matmattransmult_color",&multtrans->usecoloring,PETSC_NULL);CHKERRQ(ierr);
623   if (multtrans->usecoloring){
624     /* Create MatTransposeColoring from symbolic C=A*B^T */
625     MatTransposeColoring matcoloring;
626     ISColoring           iscoloring;
627     Mat                  Bt_dense,C_dense;
628     PetscLogDouble       etime0=0.0,etime01=0.0,etime1=0.0;
629 
630     ierr = PetscGetTime(&t0);CHKERRQ(ierr);
631     ierr = MatGetColoring(*C,MATCOLORINGLF,&iscoloring);CHKERRQ(ierr);
632     ierr = PetscGetTime(&tf);CHKERRQ(ierr);
633     etime0 += tf - t0;
634 
635     ierr = PetscGetTime(&t0);CHKERRQ(ierr);
636     ierr = MatTransposeColoringCreate(*C,iscoloring,&matcoloring);CHKERRQ(ierr);
637     multtrans->matcoloring = matcoloring;
638     ierr = ISColoringDestroy(&iscoloring);CHKERRQ(ierr);
639     ierr = PetscGetTime(&tf);CHKERRQ(ierr);
640     etime01 += tf - t0;
641 
642     ierr = PetscGetTime(&t0);CHKERRQ(ierr);
643     /* Create Bt_dense and C_dense = A*Bt_dense */
644     ierr = MatCreate(PETSC_COMM_SELF,&Bt_dense);CHKERRQ(ierr);
645     ierr = MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors);CHKERRQ(ierr);
646     ierr = MatSetType(Bt_dense,MATSEQDENSE);CHKERRQ(ierr);
647     ierr = MatSeqDenseSetPreallocation(Bt_dense,PETSC_NULL);CHKERRQ(ierr);
648     Bt_dense->assembled = PETSC_TRUE;
649     multtrans->Bt_den = Bt_dense;
650 
651     ierr = MatCreate(PETSC_COMM_SELF,&C_dense);CHKERRQ(ierr);
652     ierr = MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors);CHKERRQ(ierr);
653     ierr = MatSetType(C_dense,MATSEQDENSE);CHKERRQ(ierr);
654     ierr = MatSeqDenseSetPreallocation(C_dense,PETSC_NULL);CHKERRQ(ierr);
655     Bt_dense->assembled = PETSC_TRUE;
656     multtrans->ABt_den = C_dense;
657     ierr = PetscGetTime(&tf);CHKERRQ(ierr);
658     etime1 += tf - t0;
659 
660 #if defined(PETSC_USE_INFO)
661     {
662     Mat_SeqAIJ *c=(Mat_SeqAIJ*)(*C)->data;
663     ierr = PetscInfo5(*C,"Bt_dense: %D,%D; Cnz %D / (cm*ncolors %D) = %g\n",A->cmap->n,matcoloring->ncolors,c->nz,A->rmap->n*matcoloring->ncolors,(PetscReal)(c->nz)/(A->rmap->n*matcoloring->ncolors));
664     ierr = PetscInfo5(*C,"Sym = GetColor %g + ColorCreate %g + MatDenseCreate %g + non-colorSym %g = %g\n",etime0,etime01,etime1,etime2,etime0+etime01+etime1+etime2);
665     }
666 #endif
667   }
668   /* clean up */
669   ierr = MatDestroy(&Bt);CHKERRQ(ierr);
670   ierr = MatRestoreSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
671 
672 
673 
674 #if defined(INEFFICIENT_ALGORITHM)
675   /* The algorithm below computes am*bm sparse inner-product - inefficient! It will be deleted later. */
676   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
677   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
678   PetscInt           *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ci,*cj,*acol,*bcol;
679   PetscInt           am=A->rmap->N,bm=B->rmap->N;
680   PetscInt           i,j,anzi,bnzj,cnzi,nlnk,*lnk,nspacedouble=0,ka,kb,index[1];
681   MatScalar          *ca;
682   PetscBT            lnkbt;
683   PetscReal          afill;
684 
685   /* Allocate row pointer array ci  */
686   ierr = PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
687   ci[0] = 0;
688 
689   /* Create and initialize a linked list for C columns */
690   nlnk = bm+1;
691   ierr = PetscLLCreate(bm,bm,nlnk,lnk,lnkbt);CHKERRQ(ierr);
692 
693   /* Initial FreeSpace with size fill*(nnz(A)+nnz(B)) */
694   ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);CHKERRQ(ierr);
695   current_space = free_space;
696 
697   /* Determine symbolic info for each row of the product A*B^T: */
698   for (i=0; i<am; i++) {
699     anzi = ai[i+1] - ai[i];
700     cnzi = 0;
701     acol = aj + ai[i];
702     for (j=0; j<bm; j++){
703       bnzj = bi[j+1] - bi[j];
704       bcol= bj + bi[j];
705       /* sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
706       ka = 0; kb = 0;
707       while (ka < anzi && kb < bnzj){
708         while (acol[ka] < bcol[kb] && ka < anzi) ka++;
709         if (ka == anzi) break;
710         while (acol[ka] > bcol[kb] && kb < bnzj) kb++;
711         if (kb == bnzj) break;
712         if (acol[ka] == bcol[kb]){ /* add nonzero c(i,j) to lnk */
713           index[0] = j;
714           ierr = PetscLLAdd(1,index,bm,nlnk,lnk,lnkbt);CHKERRQ(ierr);
715           cnzi++;
716           break;
717         }
718       }
719     }
720 
721     /* If free space is not available, make more free space */
722     /* Double the amount of total space in the list */
723     if (current_space->local_remaining<cnzi) {
724       ierr = PetscFreeSpaceGet(cnzi+current_space->total_array_size,&current_space);CHKERRQ(ierr);
725       nspacedouble++;
726     }
727 
728     /* Copy data into free space, then initialize lnk */
729     ierr = PetscLLClean(bm,bm,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
730     current_space->array           += cnzi;
731     current_space->local_used      += cnzi;
732     current_space->local_remaining -= cnzi;
733 
734     ci[i+1] = ci[i] + cnzi;
735   }
736 
737 
738   /* Column indices are in the list of free space.
739      Allocate array cj, copy column indices to cj, and destroy list of free space */
740   ierr = PetscMalloc((ci[am]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
741   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
742   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
743 
744   /* Allocate space for ca */
745   ierr = PetscMalloc((ci[am]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
746   ierr = PetscMemzero(ca,(ci[am]+1)*sizeof(MatScalar));CHKERRQ(ierr);
747 
748   /* put together the new symbolic matrix */
749   ierr = MatCreateSeqAIJWithArrays(((PetscObject)A)->comm,am,bm,ci,cj,ca,C);CHKERRQ(ierr);
750 
751   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
752   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
753   c = (Mat_SeqAIJ *)((*C)->data);
754   c->free_a   = PETSC_TRUE;
755   c->free_ij  = PETSC_TRUE;
756   c->nonew    = 0;
757 
758   /* set MatInfo */
759   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
760   if (afill < 1.0) afill = 1.0;
761   c->maxnz                     = ci[am];
762   c->nz                        = ci[am];
763   (*C)->info.mallocs           = nspacedouble;
764   (*C)->info.fill_ratio_given  = fill;
765   (*C)->info.fill_ratio_needed = afill;
766 
767 #if defined(PETSC_USE_INFO)
768   if (ci[am]) {
769     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);CHKERRQ(ierr);
770     ierr = PetscInfo1((*C),"Use MatMatTransposeMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);CHKERRQ(ierr);
771   } else {
772     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
773   }
774 #endif
775 #endif
776   PetscFunctionReturn(0);
777 }
778 
779 /* #define USE_ARRAY - for sparse dot product. Slower than !USE_ARRAY */
780 #undef __FUNCT__
781 #define __FUNCT__ "MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ"
782 PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
783 {
784   PetscErrorCode ierr;
785   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
786   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow;
787   PetscInt       cm=C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol;
788   PetscLogDouble flops=0.0;
789   MatScalar      *aa=a->a,*aval,*ba=b->a,*bval,*ca,*cval;
790   Mat_MatMatTransMult *multtrans;
791   PetscContainer      container;
792 #if defined(USE_ARRAY)
793   MatScalar      *spdot;
794 #endif
795 
796   PetscFunctionBegin;
797   ierr = PetscObjectQuery((PetscObject)C,"Mat_MatMatTransMult",(PetscObject *)&container);CHKERRQ(ierr);
798   if (!container) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Container does not exit");
799   ierr  = PetscContainerGetPointer(container,(void **)&multtrans);CHKERRQ(ierr);
800   if (multtrans->usecoloring){
801     MatTransposeColoring  matcoloring = multtrans->matcoloring;
802     Mat                   Bt_dense;
803     PetscInt              m,n;
804     PetscLogDouble t0,tf,etime0=0.0,etime1=0.0,etime2=0.0;
805     Mat C_dense = multtrans->ABt_den;
806 
807     Bt_dense = multtrans->Bt_den;
808     ierr = MatGetLocalSize(Bt_dense,&m,&n);CHKERRQ(ierr);
809 
810     /* Get Bt_dense by Apply MatTransposeColoring to B */
811     ierr = PetscGetTime(&t0);CHKERRQ(ierr);
812     ierr = MatTransColoringApplySpToDen(matcoloring,B,Bt_dense);CHKERRQ(ierr);
813     ierr = PetscGetTime(&tf);CHKERRQ(ierr);
814     etime0 += tf - t0;
815 
816     /* C_dense = A*Bt_dense */
817     ierr = PetscGetTime(&t0);CHKERRQ(ierr);
818     ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense);CHKERRQ(ierr);
819     ierr = PetscGetTime(&tf);CHKERRQ(ierr);
820     etime2 += tf - t0;
821 
822     /* Recover C from C_dense */
823     ierr = PetscGetTime(&t0);CHKERRQ(ierr);
824     ierr = MatTransColoringApplyDenToSp(matcoloring,C_dense,C);CHKERRQ(ierr);
825     ierr = PetscGetTime(&tf);CHKERRQ(ierr);
826     etime1 += tf - t0;
827 #if defined(PETSC_USE_INFO)
828     ierr = PetscInfo4(C,"Num = ColoringApply: %g %g + Mult_sp_dense %g = %g\n",etime0,etime1,etime2,etime0+etime1+etime2);
829 #endif
830     PetscFunctionReturn(0);
831   }
832 
833 #if defined(USE_ARRAY)
834   /* allocate an array for implementing sparse inner-product */
835   ierr = PetscMalloc((A->cmap->n+1)*sizeof(MatScalar),&spdot);CHKERRQ(ierr);
836   ierr = PetscMemzero(spdot,(A->cmap->n+1)*sizeof(MatScalar));CHKERRQ(ierr);
837 #endif
838 
839   /* clear old values in C */
840   if (!c->a){
841     ierr = PetscMalloc((ci[cm]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
842     c->a      = ca;
843     c->free_a = PETSC_TRUE;
844   } else {
845     ca =  c->a;
846   }
847   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
848 
849   for (i=0; i<cm; i++) {
850     anzi = ai[i+1] - ai[i];
851     acol = aj + ai[i];
852     aval = aa + ai[i];
853     cnzi = ci[i+1] - ci[i];
854     ccol = cj + ci[i];
855     cval = ca + ci[i];
856     for (j=0; j<cnzi; j++){
857       brow = ccol[j];
858       bnzj = bi[brow+1] - bi[brow];
859       bcol = bj + bi[brow];
860       bval = ba + bi[brow];
861 
862       /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
863 #if defined(USE_ARRAY)
864       /* put ba to spdot array */
865       for (nextb=0; nextb<bnzj; nextb++) spdot[bcol[nextb]] = bval[nextb];
866       /* c(i,j)=A[i,:]*B[j,:]^T */
867       for (nexta=0; nexta<anzi; nexta++){
868         cval[j] += spdot[acol[nexta]]*aval[nexta];
869       }
870       /* zero spdot array */
871       for (nextb=0; nextb<bnzj; nextb++) spdot[bcol[nextb]] = 0.0;
872 #else
873       nexta = 0; nextb = 0;
874       while (nexta<anzi && nextb<bnzj){
875         while (acol[nexta] < bcol[nextb] && nexta < anzi) nexta++;
876         if (nexta == anzi) break;
877         while (acol[nexta] > bcol[nextb] && nextb < bnzj) nextb++;
878         if (nextb == bnzj) break;
879         if (acol[nexta] == bcol[nextb]){
880           cval[j] += aval[nexta]*bval[nextb];
881           nexta++; nextb++;
882           flops += 2;
883         }
884       }
885 #endif
886     }
887   }
888   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
889   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
890   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
891 #if defined(USE_ARRAY)
892   ierr = PetscFree(spdot);
893 #endif
894   PetscFunctionReturn(0);
895 }
896 
897 #undef __FUNCT__
898 #define __FUNCT__ "MatTransposeMatMult_SeqAIJ_SeqAIJ"
899 PetscErrorCode MatTransposeMatMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C) {
900   PetscErrorCode ierr;
901 
902   PetscFunctionBegin;
903   if (scall == MAT_INITIAL_MATRIX){
904     ierr = MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
905   }
906   ierr = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(A,B,*C);CHKERRQ(ierr);
907   PetscFunctionReturn(0);
908 }
909 
910 #undef __FUNCT__
911 #define __FUNCT__ "MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ"
912 PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
913 {
914   PetscErrorCode ierr;
915   Mat            At;
916   PetscInt       *ati,*atj;
917 
918   PetscFunctionBegin;
919   /* create symbolic At */
920   ierr = MatGetSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
921   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,A->cmap->n,A->rmap->n,ati,atj,PETSC_NULL,&At);CHKERRQ(ierr);
922 
923   /* get symbolic C=At*B */
924   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(At,B,fill,C);CHKERRQ(ierr);
925 
926   /* clean up */
927   ierr = MatDestroy(&At);CHKERRQ(ierr);
928   ierr = MatRestoreSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
929   PetscFunctionReturn(0);
930 }
931 
932 #undef __FUNCT__
933 #define __FUNCT__ "MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ"
934 PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
935 {
936   PetscErrorCode ierr;
937   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
938   PetscInt       am=A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb;
939   PetscInt       cm=C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k;
940   PetscLogDouble flops=0.0;
941   MatScalar      *aa=a->a,*ba,*ca,*caj;
942 
943   PetscFunctionBegin;
944   if (!c->a){
945     ierr = PetscMalloc((ci[cm]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
946     c->a      = ca;
947     c->free_a = PETSC_TRUE;
948   } else {
949     ca = c->a;
950   }
951   /* clear old values in C */
952   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
953 
954   /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
955   for (i=0;i<am;i++) {
956     bj   = b->j + bi[i];
957     ba   = b->a + bi[i];
958     bnzi = bi[i+1] - bi[i];
959     anzi = ai[i+1] - ai[i];
960     for (j=0; j<anzi; j++) {
961       nextb = 0;
962       crow  = *aj++;
963       cjj   = cj + ci[crow];
964       caj   = ca + ci[crow];
965       /* perform sparse axpy operation.  Note cjj includes bj. */
966       for (k=0; nextb<bnzi; k++) {
967         if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */
968           caj[k] += (*aa)*(*(ba+nextb));
969           nextb++;
970         }
971       }
972       flops += 2*bnzi;
973       aa++;
974     }
975   }
976 
977   /* Assemble the final matrix and clean up */
978   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
979   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
980   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
981   PetscFunctionReturn(0);
982 }
983 
984 EXTERN_C_BEGIN
985 #undef __FUNCT__
986 #define __FUNCT__ "MatMatMult_SeqAIJ_SeqDense"
987 PetscErrorCode MatMatMult_SeqAIJ_SeqDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
988 {
989   PetscErrorCode ierr;
990 
991   PetscFunctionBegin;
992   if (scall == MAT_INITIAL_MATRIX){
993     ierr = MatMatMultSymbolic_SeqAIJ_SeqDense(A,B,fill,C);CHKERRQ(ierr);
994   }
995   ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,B,*C);CHKERRQ(ierr);
996   PetscFunctionReturn(0);
997 }
998 EXTERN_C_END
999 
1000 #undef __FUNCT__
1001 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqDense"
1002 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat *C)
1003 {
1004   PetscErrorCode ierr;
1005 
1006   PetscFunctionBegin;
1007   ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr);
1008   (*C)->ops->matmult = MatMatMult_SeqAIJ_SeqDense;
1009   PetscFunctionReturn(0);
1010 }
1011 
1012 #undef __FUNCT__
1013 #define __FUNCT__ "MatMatMultNumeric_SeqAIJ_SeqDense"
1014 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1015 {
1016   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
1017   PetscErrorCode ierr;
1018   PetscScalar    *b,*c,r1,r2,r3,r4,*b1,*b2,*b3,*b4;
1019   MatScalar      *aa;
1020   PetscInt       cm=C->rmap->n, cn=B->cmap->n, bm=B->rmap->n, col, i,j,n,*aj, am = A->rmap->n;
1021   PetscInt       am2 = 2*am, am3 = 3*am,  bm4 = 4*bm,colam;
1022 
1023   PetscFunctionBegin;
1024   if (!cm || !cn) PetscFunctionReturn(0);
1025   if (bm != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,bm);
1026   if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
1027   if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
1028   ierr = MatGetArray(B,&b);CHKERRQ(ierr);
1029   ierr = MatGetArray(C,&c);CHKERRQ(ierr);
1030   b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
1031   for (col=0; col<cn-4; col += 4){  /* over columns of C */
1032     colam = col*am;
1033     for (i=0; i<am; i++) {        /* over rows of C in those columns */
1034       r1 = r2 = r3 = r4 = 0.0;
1035       n   = a->i[i+1] - a->i[i];
1036       aj  = a->j + a->i[i];
1037       aa  = a->a + a->i[i];
1038       for (j=0; j<n; j++) {
1039         r1 += (*aa)*b1[*aj];
1040         r2 += (*aa)*b2[*aj];
1041         r3 += (*aa)*b3[*aj];
1042         r4 += (*aa++)*b4[*aj++];
1043       }
1044       c[colam + i]       = r1;
1045       c[colam + am + i]  = r2;
1046       c[colam + am2 + i] = r3;
1047       c[colam + am3 + i] = r4;
1048     }
1049     b1 += bm4;
1050     b2 += bm4;
1051     b3 += bm4;
1052     b4 += bm4;
1053   }
1054   for (;col<cn; col++){     /* over extra columns of C */
1055     for (i=0; i<am; i++) {  /* over rows of C in those columns */
1056       r1 = 0.0;
1057       n   = a->i[i+1] - a->i[i];
1058       aj  = a->j + a->i[i];
1059       aa  = a->a + a->i[i];
1060 
1061       for (j=0; j<n; j++) {
1062         r1 += (*aa++)*b1[*aj++];
1063       }
1064       c[col*am + i]     = r1;
1065     }
1066     b1 += bm;
1067   }
1068   ierr = PetscLogFlops(cn*(2.0*a->nz));CHKERRQ(ierr);
1069   ierr = MatRestoreArray(B,&b);CHKERRQ(ierr);
1070   ierr = MatRestoreArray(C,&c);CHKERRQ(ierr);
1071   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1072   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1073   PetscFunctionReturn(0);
1074 }
1075 
1076 /*
1077    Note very similar to MatMult_SeqAIJ(), should generate both codes from same base
1078 */
1079 #undef __FUNCT__
1080 #define __FUNCT__ "MatMatMultNumericAdd_SeqAIJ_SeqDense"
1081 PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1082 {
1083   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
1084   PetscErrorCode ierr;
1085   PetscScalar    *b,*c,r1,r2,r3,r4,*b1,*b2,*b3,*b4;
1086   MatScalar      *aa;
1087   PetscInt       cm=C->rmap->n, cn=B->cmap->n, bm=B->rmap->n, col, i,j,n,*aj, am = A->rmap->n,*ii,arm;
1088   PetscInt       am2 = 2*am, am3 = 3*am,  bm4 = 4*bm,colam,*ridx;
1089 
1090   PetscFunctionBegin;
1091   if (!cm || !cn) PetscFunctionReturn(0);
1092   ierr = MatGetArray(B,&b);CHKERRQ(ierr);
1093   ierr = MatGetArray(C,&c);CHKERRQ(ierr);
1094   b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
1095 
1096   if (a->compressedrow.use){ /* use compressed row format */
1097     for (col=0; col<cn-4; col += 4){  /* over columns of C */
1098       colam = col*am;
1099       arm   = a->compressedrow.nrows;
1100       ii    = a->compressedrow.i;
1101       ridx  = a->compressedrow.rindex;
1102       for (i=0; i<arm; i++) {        /* over rows of C in those columns */
1103 	r1 = r2 = r3 = r4 = 0.0;
1104 	n   = ii[i+1] - ii[i];
1105 	aj  = a->j + ii[i];
1106 	aa  = a->a + ii[i];
1107 	for (j=0; j<n; j++) {
1108 	  r1 += (*aa)*b1[*aj];
1109 	  r2 += (*aa)*b2[*aj];
1110 	  r3 += (*aa)*b3[*aj];
1111 	  r4 += (*aa++)*b4[*aj++];
1112 	}
1113 	c[colam       + ridx[i]] += r1;
1114 	c[colam + am  + ridx[i]] += r2;
1115 	c[colam + am2 + ridx[i]] += r3;
1116 	c[colam + am3 + ridx[i]] += r4;
1117       }
1118       b1 += bm4;
1119       b2 += bm4;
1120       b3 += bm4;
1121       b4 += bm4;
1122     }
1123     for (;col<cn; col++){     /* over extra columns of C */
1124       colam = col*am;
1125       arm   = a->compressedrow.nrows;
1126       ii    = a->compressedrow.i;
1127       ridx  = a->compressedrow.rindex;
1128       for (i=0; i<arm; i++) {  /* over rows of C in those columns */
1129 	r1 = 0.0;
1130 	n   = ii[i+1] - ii[i];
1131 	aj  = a->j + ii[i];
1132 	aa  = a->a + ii[i];
1133 
1134 	for (j=0; j<n; j++) {
1135 	  r1 += (*aa++)*b1[*aj++];
1136 	}
1137 	c[col*am + ridx[i]] += r1;
1138       }
1139       b1 += bm;
1140     }
1141   } else {
1142     for (col=0; col<cn-4; col += 4){  /* over columns of C */
1143       colam = col*am;
1144       for (i=0; i<am; i++) {        /* over rows of C in those columns */
1145 	r1 = r2 = r3 = r4 = 0.0;
1146 	n   = a->i[i+1] - a->i[i];
1147 	aj  = a->j + a->i[i];
1148 	aa  = a->a + a->i[i];
1149 	for (j=0; j<n; j++) {
1150 	  r1 += (*aa)*b1[*aj];
1151 	  r2 += (*aa)*b2[*aj];
1152 	  r3 += (*aa)*b3[*aj];
1153 	  r4 += (*aa++)*b4[*aj++];
1154 	}
1155 	c[colam + i]       += r1;
1156 	c[colam + am + i]  += r2;
1157 	c[colam + am2 + i] += r3;
1158 	c[colam + am3 + i] += r4;
1159       }
1160       b1 += bm4;
1161       b2 += bm4;
1162       b3 += bm4;
1163       b4 += bm4;
1164     }
1165     for (;col<cn; col++){     /* over extra columns of C */
1166       for (i=0; i<am; i++) {  /* over rows of C in those columns */
1167 	r1 = 0.0;
1168 	n   = a->i[i+1] - a->i[i];
1169 	aj  = a->j + a->i[i];
1170 	aa  = a->a + a->i[i];
1171 
1172 	for (j=0; j<n; j++) {
1173 	  r1 += (*aa++)*b1[*aj++];
1174 	}
1175 	c[col*am + i]     += r1;
1176       }
1177       b1 += bm;
1178     }
1179   }
1180   ierr = PetscLogFlops(cn*2.0*a->nz);CHKERRQ(ierr);
1181   ierr = MatRestoreArray(B,&b);CHKERRQ(ierr);
1182   ierr = MatRestoreArray(C,&c);CHKERRQ(ierr);
1183   PetscFunctionReturn(0);
1184 }
1185 
1186 #undef __FUNCT__
1187 #define __FUNCT__ "MatTransColoringApplySpToDen_SeqAIJ"
1188 PetscErrorCode  MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense)
1189 {
1190   PetscErrorCode ierr;
1191   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
1192   Mat_SeqDense   *btdense = (Mat_SeqDense*)Btdense->data;
1193   PetscInt       *bi=b->i,*bj=b->j;
1194   PetscInt       m=Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns;
1195   MatScalar      *btval,*btval_den,*ba=b->a;
1196   PetscInt       *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors;
1197 
1198   PetscFunctionBegin;
1199   btval_den=btdense->v;
1200   ierr = PetscMemzero(btval_den,(m*n)*sizeof(MatScalar));CHKERRQ(ierr);
1201   for (k=0; k<ncolors; k++) {
1202     ncolumns = coloring->ncolumns[k];
1203     for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */
1204       col   = *(columns + colorforcol[k] + l);
1205       btcol = bj + bi[col];
1206       btval = ba + bi[col];
1207       anz   = bi[col+1] - bi[col];
1208       for (j=0; j<anz; j++){
1209         brow            = btcol[j];
1210         btval_den[brow] = btval[j];
1211       }
1212     }
1213     btval_den += m;
1214   }
1215   PetscFunctionReturn(0);
1216 }
1217 
1218 #undef __FUNCT__
1219 #define __FUNCT__ "MatTransColoringApplyDenToSp_SeqAIJ"
1220 PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp)
1221 {
1222   PetscErrorCode ierr;
1223   Mat_SeqAIJ     *csp = (Mat_SeqAIJ*)Csp->data;
1224   PetscInt       k,l,*row,*idx,m,ncolors=matcoloring->ncolors,nrows;
1225   PetscScalar    *ca_den,*cp_den,*ca=csp->a;
1226   PetscInt       *rows=matcoloring->rows,*spidx=matcoloring->columnsforspidx,*colorforrow=matcoloring->colorforrow;
1227 
1228   PetscFunctionBegin;
1229   ierr = MatGetLocalSize(Csp,&m,PETSC_NULL);CHKERRQ(ierr);
1230   ierr = MatGetArray(Cden,&ca_den);CHKERRQ(ierr);
1231   cp_den = ca_den;
1232   for (k=0; k<ncolors; k++) {
1233     nrows = matcoloring->nrows[k];
1234     row   = rows  + colorforrow[k];
1235     idx   = spidx + colorforrow[k];
1236     for (l=0; l<nrows; l++){
1237       ca[idx[l]] = cp_den[row[l]];
1238     }
1239     cp_den += m;
1240   }
1241   ierr = MatRestoreArray(Cden,&ca_den);CHKERRQ(ierr);
1242   PetscFunctionReturn(0);
1243 }
1244 
1245 /*
1246  MatGetColumnIJ_SeqAIJ_Color() and MatRestoreColumnIJ_SeqAIJ_Color() are customized from
1247  MatGetColumnIJ_SeqAIJ() and MatRestoreColumnIJ_SeqAIJ() by adding an output
1248  spidx[], index of a->j, to be used for setting 'columnsforspidx' in MatTransposeColoringCreate_SeqAIJ().
1249  */
1250 #undef __FUNCT__
1251 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Color"
1252 PetscErrorCode MatGetColumnIJ_SeqAIJ_Color(Mat A,PetscInt oshift,PetscBool  symmetric,PetscBool  inodecompressed,PetscInt *nn,PetscInt *ia[],PetscInt *ja[],PetscInt *spidx[],PetscBool  *done)
1253 {
1254   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
1255   PetscErrorCode ierr;
1256   PetscInt       i,*collengths,*cia,*cja,n = A->cmap->n,m = A->rmap->n;
1257   PetscInt       nz = a->i[m],row,*jj,mr,col;
1258   PetscInt       *cspidx;
1259 
1260   PetscFunctionBegin;
1261   *nn = n;
1262   if (!ia) PetscFunctionReturn(0);
1263   if (symmetric) {
1264     SETERRQ(((PetscObject)A)->comm,PETSC_ERR_SUP,"MatGetColumnIJ_SeqAIJ_Color() not supported for the case symmetric");
1265     ierr = MatToSymmetricIJ_SeqAIJ(A->rmap->n,a->i,a->j,0,oshift,ia,ja);CHKERRQ(ierr);
1266   } else {
1267     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&collengths);CHKERRQ(ierr);
1268     ierr = PetscMemzero(collengths,n*sizeof(PetscInt));CHKERRQ(ierr);
1269     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&cia);CHKERRQ(ierr);
1270     ierr = PetscMalloc((nz+1)*sizeof(PetscInt),&cja);CHKERRQ(ierr);
1271     ierr = PetscMalloc((nz+1)*sizeof(PetscInt),&cspidx);CHKERRQ(ierr);
1272     jj = a->j;
1273     for (i=0; i<nz; i++) {
1274       collengths[jj[i]]++;
1275     }
1276     cia[0] = oshift;
1277     for (i=0; i<n; i++) {
1278       cia[i+1] = cia[i] + collengths[i];
1279     }
1280     ierr = PetscMemzero(collengths,n*sizeof(PetscInt));CHKERRQ(ierr);
1281     jj   = a->j;
1282     for (row=0; row<m; row++) {
1283       mr = a->i[row+1] - a->i[row];
1284       for (i=0; i<mr; i++) {
1285         col = *jj++;
1286         cspidx[cia[col] + collengths[col] - oshift] = a->i[row] + i; /* index of a->j */
1287         cja[cia[col] + collengths[col]++ - oshift] = row + oshift;
1288       }
1289     }
1290     ierr = PetscFree(collengths);CHKERRQ(ierr);
1291     *ia = cia; *ja = cja;
1292     *spidx = cspidx;
1293   }
1294   PetscFunctionReturn(0);
1295 }
1296 
1297 #undef __FUNCT__
1298 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Color"
1299 PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Color(Mat A,PetscInt oshift,PetscBool  symmetric,PetscBool  inodecompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscInt *spidx[],PetscBool  *done)
1300 {
1301   PetscErrorCode ierr;
1302 
1303   PetscFunctionBegin;
1304   if (!ia) PetscFunctionReturn(0);
1305 
1306   ierr = PetscFree(*ia);CHKERRQ(ierr);
1307   ierr = PetscFree(*ja);CHKERRQ(ierr);
1308   ierr = PetscFree(*spidx);CHKERRQ(ierr);
1309   PetscFunctionReturn(0);
1310 }
1311 
1312 #undef __FUNCT__
1313 #define __FUNCT__ "MatTransposeColoringCreate_SeqAIJ"
1314 PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c)
1315 {
1316   PetscErrorCode ierr;
1317   PetscInt       i,n,nrows,N,j,k,m,*row_idx,*ci,*cj,ncols,col,cm;
1318   const PetscInt *is;
1319   PetscInt       nis = iscoloring->n,*rowhit,bs = 1;
1320   IS             *isa;
1321   PetscBool      done;
1322   PetscBool      flg1,flg2;
1323   Mat_SeqAIJ     *csp = (Mat_SeqAIJ*)mat->data;
1324   PetscInt       *colorforrow,*rows,*rows_i,*columnsforspidx,*columnsforspidx_i,*idxhit,*spidx;
1325   PetscInt       *colorforcol,*columns,*columns_i;
1326 
1327   PetscFunctionBegin;
1328   ierr = ISColoringGetIS(iscoloring,PETSC_IGNORE,&isa);CHKERRQ(ierr);
1329 
1330   /* this is ugly way to get blocksize but cannot call MatGetBlockSize() because AIJ can have bs > 1 */
1331   ierr = PetscTypeCompare((PetscObject)mat,MATSEQBAIJ,&flg1);CHKERRQ(ierr);
1332   ierr = PetscTypeCompare((PetscObject)mat,MATMPIBAIJ,&flg2);CHKERRQ(ierr);
1333   if (flg1 || flg2) {
1334     ierr = MatGetBlockSize(mat,&bs);CHKERRQ(ierr);
1335   }
1336 
1337   N         = mat->cmap->N/bs;
1338   c->M      = mat->rmap->N/bs;  /* set total rows, columns and local rows */
1339   c->N      = mat->cmap->N/bs;
1340   c->m      = mat->rmap->N/bs;
1341   c->rstart = 0;
1342 
1343   c->ncolors = nis;
1344   ierr       = PetscMalloc(nis*sizeof(PetscInt),&c->ncolumns);CHKERRQ(ierr);
1345   ierr       = PetscMalloc(nis*sizeof(PetscInt),&c->nrows);CHKERRQ(ierr);
1346   ierr       = PetscMalloc2(csp->nz+1,PetscInt,&rows,csp->nz+1,PetscInt,&columnsforspidx);CHKERRQ(ierr);
1347   ierr       = PetscMalloc((nis+1)*sizeof(PetscInt),&colorforrow);CHKERRQ(ierr);
1348   colorforrow[0]    = 0;
1349   rows_i            = rows;
1350   columnsforspidx_i = columnsforspidx;
1351 
1352   ierr = PetscMalloc((nis+1)*sizeof(PetscInt),&colorforcol);CHKERRQ(ierr);
1353   ierr = PetscMalloc((N+1)*sizeof(PetscInt),&columns);CHKERRQ(ierr);
1354   colorforcol[0] = 0;
1355   columns_i      = columns;
1356 
1357   ierr = MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,&done);CHKERRQ(ierr); /* column-wise storage of mat */
1358   if (!done) SETERRQ1(((PetscObject)mat)->comm,PETSC_ERR_SUP,"MatGetColumnIJ() not supported for matrix type %s",((PetscObject)mat)->type_name);
1359 
1360   cm = c->m;
1361   ierr = PetscMalloc((cm+1)*sizeof(PetscInt),&rowhit);CHKERRQ(ierr);
1362   ierr = PetscMalloc((cm+1)*sizeof(PetscInt),&idxhit);CHKERRQ(ierr);
1363   for (i=0; i<nis; i++) {
1364     ierr = ISGetLocalSize(isa[i],&n);CHKERRQ(ierr);
1365     ierr = ISGetIndices(isa[i],&is);CHKERRQ(ierr);
1366     c->ncolumns[i] = n;
1367     if (n) {
1368       ierr = PetscMemcpy(columns_i,is,n*sizeof(PetscInt));CHKERRQ(ierr);
1369     }
1370     colorforcol[i+1] = colorforcol[i] + n;
1371     columns_i       += n;
1372 
1373     /* fast, crude version requires O(N*N) work */
1374     ierr = PetscMemzero(rowhit,cm*sizeof(PetscInt));CHKERRQ(ierr);
1375 
1376     /* loop over columns*/
1377     for (j=0; j<n; j++) {
1378       col     = is[j];
1379       row_idx = cj + ci[col];
1380       m       = ci[col+1] - ci[col];
1381       /* loop over columns marking them in rowhit */
1382       for (k=0; k<m; k++) {
1383         idxhit[*row_idx]   = spidx[ci[col] + k];
1384         rowhit[*row_idx++] = col + 1;
1385       }
1386     }
1387     /* count the number of hits */
1388     nrows = 0;
1389     for (j=0; j<cm; j++) {
1390       if (rowhit[j]) nrows++;
1391     }
1392     c->nrows[i]      = nrows;
1393     colorforrow[i+1] = colorforrow[i] + nrows;
1394 
1395     nrows       = 0;
1396     for (j=0; j<cm; j++) {
1397       if (rowhit[j]) {
1398         rows_i[nrows]            = j;
1399         columnsforspidx_i[nrows] = idxhit[j];
1400         nrows++;
1401       }
1402     }
1403     ierr = ISRestoreIndices(isa[i],&is);CHKERRQ(ierr);
1404     rows_i += nrows; columnsforspidx_i += nrows;
1405   }
1406   ierr = MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,&done);CHKERRQ(ierr);
1407   ierr = PetscFree(rowhit);CHKERRQ(ierr);
1408   ierr = ISColoringRestoreIS(iscoloring,&isa);CHKERRQ(ierr);
1409 #if defined(PETSC_USE_DEBUG)
1410   if (csp->nz != colorforrow[nis]) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"csp->nz %d != colorforrow[nis] %d",csp->nz,colorforrow[nis]);
1411 #endif
1412 
1413   c->colorforrow     = colorforrow;
1414   c->rows            = rows;
1415   c->columnsforspidx = columnsforspidx;
1416   c->colorforcol     = colorforcol;
1417   c->columns         = columns;
1418   ierr = PetscFree(idxhit);CHKERRQ(ierr);
1419   PetscFunctionReturn(0);
1420 }
1421