xref: /petsc/src/mat/impls/aij/seq/matmatmult.c (revision 75784686dc399bb300d07717fa3a89d6195cd2fa)
1 
2 /*
3   Defines matrix-matrix product routines for pairs of SeqAIJ matrices
4           C = A * B
5 */
6 
7 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
8 #include <../src/mat/utils/freespace.h>
9 #include <petscbt.h>
10 #include <../src/mat/impls/dense/seq/dense.h> /*I "petscmat.h" I*/
11 /*
12 #define DEBUG_MATMATMULT
13  */
14 EXTERN_C_BEGIN
15 #undef __FUNCT__
16 #define __FUNCT__ "MatMatMult_SeqAIJ_SeqAIJ"
17 PetscErrorCode MatMatMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
18 {
19   PetscErrorCode ierr;
20   PetscBool      scalable=PETSC_FALSE,scalable_new=PETSC_FALSE;
21 
22   PetscFunctionBegin;
23   if (scall == MAT_INITIAL_MATRIX){
24     ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
25     ierr = PetscOptionsBool("-matmatmult_scalable","Use a scalable but slower C=A*B","",scalable,&scalable,PETSC_NULL);CHKERRQ(ierr);
26     ierr = PetscOptionsBool("-matmatmult_scalable_new","Use a scalable but slower C=A*B","",scalable_new,&scalable_new,PETSC_NULL);CHKERRQ(ierr);
27     ierr = PetscOptionsEnd();CHKERRQ(ierr);
28     ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
29     if (scalable_new){
30       ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_new(A,B,fill,C);CHKERRQ(ierr);
31     } else if (scalable){
32       ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A,B,fill,C);CHKERRQ(ierr);
33     } else {
34       ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
35     }
36     ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
37   }
38   ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
39   ierr = (*(*C)->ops->matmultnumeric)(A,B,*C);CHKERRQ(ierr);
40   ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
41   PetscFunctionReturn(0);
42 }
43 EXTERN_C_END
44 
45 /*
46  MatGetSymbolicMatMatMult_SeqAIJ_SeqAIJ - Get symbolic structure of C=A*B
47   Input Parameter:
48 .    am, Ai, Aj - number of rows and structure of A
49 .    bm, bn, Bi, Bj - number of rows, columns, and structure of B
50 .    fill - filll ratio See MatMatMult()
51 
52   Output Parameter:
53 .    Ci, Cj - structure of C = A*B
54 .    nspacedouble - number of extra mallocs
55  */
56 #undef __FUNCT__
57 #define __FUNCT__ "MatGetSymbolicMatMatMult_SeqAIJ_SeqAIJ"
58 PetscErrorCode MatGetSymbolicMatMatMult_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,PetscInt *Ci[],PetscInt *Cj[],PetscInt *nspacedouble)
59 {
60   PetscErrorCode     ierr;
61   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
62   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data;
63   const PetscInt     *Ai=a->i,*Aj=a->j,*Bi=b->i,*Bj=b->j,am=A->rmap->N,bm=B->rmap->N,bn=B->cmap->N;
64   const PetscInt     *aj=Aj,*bi=Bi,*bj=Bj,*bjj;
65   PetscInt           *ci,*cj;
66   PetscInt           i,j,anzi,brow,bnzj,cnzi,nlnk,*lnk,ndouble=0;
67   PetscBT            lnkbt;
68 
69   PetscFunctionBegin;
70   /* Allocate ci array, arrays for fill computation and */
71   /* free space for accumulating nonzero column info */
72   ierr = PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
73   ci[0] = 0;
74 
75   /* create and initialize a linked list */
76   nlnk = bn+1;
77   ierr = PetscLLCreate(bn,bn,nlnk,lnk,lnkbt);CHKERRQ(ierr);
78 
79   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
80   ierr = PetscFreeSpaceGet((PetscInt)(fill*(Ai[am]+Bi[bm])),&free_space);CHKERRQ(ierr);
81   current_space = free_space;
82 
83   /* Determine ci and cj */
84   for (i=0; i<am; i++) {
85     anzi = Ai[i+1] - Ai[i];
86     cnzi = 0;
87     aj   = Aj + Ai[i];
88     for (j=0; j<anzi; j++){
89       brow = aj[j];
90       bnzj = bi[brow+1] - bi[brow];
91       bjj  = bj + bi[brow];
92       /* add non-zero cols of B into the sorted linked list lnk */
93       ierr = PetscLLAddSorted(bnzj,bjj,bn,nlnk,lnk,lnkbt);CHKERRQ(ierr);
94       cnzi += nlnk;
95     }
96 
97     /* If free space is not available, make more free space */
98     /* Double the amount of total space in the list */
99     if (current_space->local_remaining<cnzi) {
100       ierr = PetscFreeSpaceGet(cnzi+current_space->total_array_size,&current_space);CHKERRQ(ierr);
101       ndouble++;
102     }
103 
104     /* Copy data into free space, then initialize lnk */
105     ierr = PetscLLClean(bn,bn,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
106     current_space->array           += cnzi;
107     current_space->local_used      += cnzi;
108     current_space->local_remaining -= cnzi;
109     ci[i+1] = ci[i] + cnzi;
110   }
111 
112   /* Column indices are in the list of free space */
113   /* Allocate space for cj, initialize cj, and */
114   /* destroy list of free space and other temporary array(s) */
115   ierr = PetscMalloc((ci[am]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
116   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
117   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
118 
119   *Ci           = ci;
120   *Cj           = cj;
121   *nspacedouble = ndouble;
122   PetscFunctionReturn(0);
123 }
124 
125 #undef __FUNCT__
126 #define __FUNCT__ "MatGetSymbolicMatMatMult_SeqAIJ_SeqAIJ_Scalable"
127 PetscErrorCode MatGetSymbolicMatMatMult_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,PetscInt *Ci[],PetscInt *Cj[],PetscInt *nspacedouble)
128 {
129   PetscErrorCode     ierr;
130   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
131   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data;
132   const PetscInt     *Ai=a->i,*Aj=a->j,*Bi=b->i,*Bj=b->j,am=A->rmap->N,bm=B->rmap->N,bn=B->cmap->N;
133   const PetscInt     *aj=Aj,*bi=Bi,*bj;
134   PetscInt           *ci,*cj;
135   PetscInt           i,j,anzi,brow,bnzj,cnzi,crmax,ndouble=0;
136   PetscBT            bt;
137   PetscInt           nlnk_max,lnk_max=bn,*lnk;
138 
139   PetscFunctionBegin;
140   /* Allocate ci array, arrays for fill computation and */
141   /* free space for accumulating nonzero column info */
142   ierr = PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
143   ci[0] = 0;
144 
145   /* create and initialize a condensed linked list */
146   crmax = a->rmax*b->rmax;
147   nlnk_max = PetscMin(crmax,bn);
148   if (!nlnk_max && bn) {
149     nlnk_max = bn; /* in case rmax is not defined for A or B */
150 #if defined(PETSC_USE_DEBUGGING)
151     ierr = PetscPrintf(PETSC_COMM_SELF,"Warning: Armax or Brmax is not defined in MatGetSymbolicMatMatMult_SeqAIJ_SeqAIJ_Scalable!\n");
152 #endif
153   }
154 #if defined(DEBUG_MATMATMULT)
155   ierr = (PETSC_COMM_SELF,"LLCondensedCreate nlnk_max=%d, bn %d, crmax %d\n",nlnk_max,bn,crmax);
156 #endif
157   ierr = PetscLLCondensedCreate(nlnk_max,lnk_max,lnk,bt);CHKERRQ(ierr);
158 
159   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
160   ierr = PetscFreeSpaceGet((PetscInt)(fill*(Ai[am]+Bi[bm])),&free_space);CHKERRQ(ierr);
161   current_space = free_space;
162 
163   /* Determine ci and cj */
164   for (i=0; i<am; i++) {
165     anzi = Ai[i+1] - Ai[i];
166     cnzi = 0;
167     aj   = Aj + Ai[i];
168     for (j=0; j<anzi; j++){
169       brow = aj[j];
170       bnzj = bi[brow+1] - bi[brow];
171       bj   = Bj + bi[brow];
172       /* add non-zero cols of B into the condensed sorted linked list lnk */
173       ierr = PetscLLCondensedAddSorted(nlnk_max,lnk_max,bnzj,bj,lnk,bt);CHKERRQ(ierr);
174     }
175     cnzi    = lnk[0];
176     ci[i+1] = ci[i] + cnzi;
177 
178     /* If free space is not available, make more free space */
179     /* Double the amount of total space in the list */
180     if (current_space->local_remaining<cnzi) {
181       ierr = PetscFreeSpaceGet(cnzi+current_space->total_array_size,&current_space);CHKERRQ(ierr);
182       ndouble++;
183     }
184 
185     /* Copy data into free space, then initialize lnk */
186     ierr = PetscLLCondensedClean(nlnk_max,lnk_max,cnzi,current_space->array,lnk,bt);CHKERRQ(ierr);
187     current_space->array           += cnzi;
188     current_space->local_used      += cnzi;
189     current_space->local_remaining -= cnzi;
190   }
191 
192   /* Column indices are in the list of free space */
193   /* Allocate and copy column indices to cj, then destroy list of free space, lnk and bt */
194   ierr = PetscMalloc((ci[am]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
195   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
196   ierr = PetscLLCondensedDestroy(lnk,bt);CHKERRQ(ierr);
197 
198   *Ci           = ci;
199   *Cj           = cj;
200   *nspacedouble = ndouble;
201   PetscFunctionReturn(0);
202 }
203 
204 /*
205      Does not use bitarray
206  */
207 #undef __FUNCT__
208 #define __FUNCT__ "MatGetSymbolicMatMatMult_SeqAIJ_SeqAIJ_Scalable_new"
209 PetscErrorCode MatGetSymbolicMatMatMult_SeqAIJ_SeqAIJ_Scalable_new(Mat A,Mat B,PetscReal fill,PetscInt *Ci[],PetscInt *Cj[],PetscInt *nspacedouble)
210 {
211   PetscErrorCode     ierr;
212   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
213   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data;
214   const PetscInt     *Ai=a->i,*Aj=a->j,*Bi=b->i,*Bj=b->j,am=A->rmap->N,bm=B->rmap->N,bn=B->cmap->N;
215   const PetscInt     *aj=Aj,*bi=Bi,*bj;
216   PetscInt           *ci,*cj;
217   PetscInt           i,j,anzi,brow,bnzj,cnzi,crmax,ndouble=0;
218   PetscInt           lnk_max=bn,*lnk;
219 
220   PetscFunctionBegin;
221   /* Allocate ci array, arrays for fill computation and */
222   /* free space for accumulating nonzero column info */
223   ierr = PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
224   ci[0] = 0;
225 
226   /* create and initialize a condensed linked list */
227   crmax = a->rmax*b->rmax;
228   lnk_max = PetscMin(crmax,bn);
229   if (!lnk_max && bn) {
230     lnk_max = bn; /* in case rmax is not defined for A or B */
231 #if defined(PETSC_USE_DEBUGGING)
232     ierr = PetscPrintf(PETSC_COMM_SELF,"Warning: Armax or Brmax is not defined in MatGetSymbolicMatMatMult_SeqAIJ_SeqAIJ_Scalable!\n");
233 #endif
234   }
235 #if defined(DEBUG_MATMATMULT)
236   ierr = (PETSC_COMM_SELF,"LLCondensedCreate lnk_max=%d, bn %d, crmax %d\n",lnk_max,bn,crmax);
237 #endif
238   ierr = PetscLLCondensedCreate_fast(lnk_max,&lnk);CHKERRQ(ierr);
239 
240   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
241   ierr = PetscFreeSpaceGet((PetscInt)(fill*(Ai[am]+Bi[bm])),&free_space);CHKERRQ(ierr);
242   current_space = free_space;
243 
244   /* Determine ci and cj */
245   for (i=0; i<am; i++) {
246     anzi = Ai[i+1] - Ai[i];
247     cnzi = 0;
248     aj   = Aj + Ai[i];
249     for (j=0; j<anzi; j++){
250       brow = aj[j];
251       bnzj = bi[brow+1] - bi[brow];
252       bj   = Bj + bi[brow];
253       /* add non-zero cols of B into the condensed sorted linked list lnk */
254       /* ierr = PetscLLCondensedView(lnk_max,lnk_max,lnk,lnk);CHKERRQ(ierr); */
255       ierr = PetscLLCondensedAddSorted_fast(bnzj,bj,lnk);CHKERRQ(ierr);
256     }
257     cnzi    = lnk[1];
258     ci[i+1] = ci[i] + cnzi;
259 
260     /* If free space is not available, make more free space */
261     /* Double the amount of total space in the list */
262     if (current_space->local_remaining<cnzi) {
263       ierr = PetscFreeSpaceGet(cnzi+current_space->total_array_size,&current_space);CHKERRQ(ierr);
264       ndouble++;
265     }
266 
267     /* Copy data into free space, then initialize lnk */
268     /* ierr = PetscLLCondensedView_fast(lnk);CHKERRQ(ierr);   */
269     ierr = PetscLLCondensedClean_fast(cnzi,current_space->array,lnk);CHKERRQ(ierr);
270     /* PetscIntView(cnzi,current_space->array,0); */
271     current_space->array           += cnzi;
272     current_space->local_used      += cnzi;
273     current_space->local_remaining -= cnzi;
274   }
275 
276   /* Column indices are in the list of free space */
277   /* Allocate and copy column indices to cj, then destroy list of free space, lnk and bt */
278   ierr = PetscMalloc((ci[am]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
279   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
280   ierr = PetscLLCondensedDestroy_fast(lnk);CHKERRQ(ierr);
281 
282   *Ci           = ci;
283   *Cj           = cj;
284   *nspacedouble = ndouble;
285   PetscFunctionReturn(0);
286 }
287 
288 #undef __FUNCT__
289 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqAIJ"
290 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
291 {
292   PetscErrorCode ierr;
293   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
294   PetscInt       *ai=a->i,*bi=b->i,*ci,*cj;
295   PetscInt       am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N,nspacedouble;
296   MatScalar      *ca;
297   PetscReal      afill;
298 
299   PetscFunctionBegin;
300 #if defined(DEBUG_MATMATMULT)
301   ierr = PetscPrintf(PETSC_COMM_SELF,"MatMatMultSymbolic_SeqAIJ_SeqAIJ...\n");CHKERRQ(ierr);
302 #endif
303   /* Get ci and cj */
304   ierr = MatGetSymbolicMatMatMult_SeqAIJ_SeqAIJ(A,B,fill,&ci,&cj,&nspacedouble);CHKERRQ(ierr);
305 
306   /* Allocate space for ca */
307   ierr = PetscMalloc((ci[am]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
308   ierr = PetscMemzero(ca,(ci[am]+1)*sizeof(MatScalar));CHKERRQ(ierr);
309 
310   /* put together the new symbolic matrix */
311   ierr = MatCreateSeqAIJWithArrays(((PetscObject)A)->comm,am,bn,ci,cj,ca,C);CHKERRQ(ierr);
312 
313   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
314   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
315   c = (Mat_SeqAIJ *)((*C)->data);
316   c->free_a   = PETSC_TRUE;
317   c->free_ij  = PETSC_TRUE;
318   c->nonew    = 0;
319   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ; /* fast, needs non-scalable O(bn) array 'abdense' */
320   ierr = PetscMalloc(bn*sizeof(PetscScalar),&c->matmult_abdense);CHKERRQ(ierr);
321   ierr = PetscMemzero(c->matmult_abdense,bn*sizeof(PetscScalar));CHKERRQ(ierr);
322 
323   /* set MatInfo */
324   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
325   if (afill < 1.0) afill = 1.0;
326   c->maxnz                     = ci[am];
327   c->nz                        = ci[am];
328   (*C)->info.mallocs           = nspacedouble;
329   (*C)->info.fill_ratio_given  = fill;
330   (*C)->info.fill_ratio_needed = afill;
331 
332 #if defined(PETSC_USE_INFO)
333   if (ci[am]) {
334     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);CHKERRQ(ierr);
335     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);CHKERRQ(ierr);
336   } else {
337     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
338   }
339 #endif
340   PetscFunctionReturn(0);
341 }
342 
343 #undef __FUNCT__
344 #define __FUNCT__ "MatMatMultNumeric_SeqAIJ_SeqAIJ"
345 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
346 {
347   PetscErrorCode ierr;
348   PetscLogDouble flops=0.0;
349   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
350   Mat_SeqAIJ     *b = (Mat_SeqAIJ *)B->data;
351   Mat_SeqAIJ     *c = (Mat_SeqAIJ *)C->data;
352   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
353   PetscInt       am=A->rmap->n,cm=C->rmap->n;
354   PetscInt       i,j,k,anzi,bnzi,cnzi,brow;
355   PetscScalar    *aa=a->a,*ba=b->a,*baj,*ca=c->a,valtmp;
356   PetscScalar    *ab_dense=c->matmult_abdense;
357 
358   PetscFunctionBegin;
359   /* clean old values in C */
360   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
361   /* Traverse A row-wise. */
362   /* Build the ith row in C by summing over nonzero columns in A, */
363   /* the rows of B corresponding to nonzeros of A. */
364   for (i=0; i<am; i++) {
365     anzi = ai[i+1] - ai[i];
366     for (j=0; j<anzi; j++) {
367       brow = aj[j];
368       bnzi = bi[brow+1] - bi[brow];
369       bjj  = bj + bi[brow];
370       baj  = ba + bi[brow];
371       /* perform dense axpy */
372       valtmp = aa[j];
373       for (k=0; k<bnzi; k++) {
374         ab_dense[bjj[k]] += valtmp*baj[k];
375       }
376       flops += 2*bnzi;
377     }
378     aj += anzi; aa += anzi;
379 
380     cnzi = ci[i+1] - ci[i];
381     for (k=0; k<cnzi; k++) {
382       ca[k]          += ab_dense[cj[k]];
383       ab_dense[cj[k]] = 0.0; /* zero ab_dense */
384     }
385     flops += cnzi;
386     cj += cnzi; ca += cnzi;
387   }
388   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
389   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
390   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
391   PetscFunctionReturn(0);
392 }
393 
394 #undef __FUNCT__
395 #define __FUNCT__ "MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable"
396 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C)
397 {
398   PetscErrorCode ierr;
399   PetscLogDouble flops=0.0;
400   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
401   Mat_SeqAIJ     *b = (Mat_SeqAIJ *)B->data;
402   Mat_SeqAIJ     *c = (Mat_SeqAIJ *)C->data;
403   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
404   PetscInt       am=A->rmap->N,cm=C->rmap->N;
405   PetscInt       i,j,k,anzi,bnzi,cnzi,brow;
406   PetscScalar    *aa=a->a,*ba=b->a,*baj,*ca=c->a,valtmp;
407   PetscInt       nextb;
408 
409   PetscFunctionBegin;
410 #if defined(DEBUG_MATMATMULT)
411   ierr = PetscPrintf(PETSC_COMM_SELF,"MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable...\n");CHKERRQ(ierr);
412 #endif
413   /* clean old values in C */
414   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
415   /* Traverse A row-wise. */
416   /* Build the ith row in C by summing over nonzero columns in A, */
417   /* the rows of B corresponding to nonzeros of A. */
418   for (i=0;i<am;i++) {
419     anzi = ai[i+1] - ai[i];
420     cnzi = ci[i+1] - ci[i];
421     for (j=0;j<anzi;j++) {
422       brow = aj[j];
423       bnzi = bi[brow+1] - bi[brow];
424       bjj  = bj + bi[brow];
425       baj  = ba + bi[brow];
426       /* perform sparse axpy */
427       valtmp = aa[j];
428       nextb  = 0;
429       for (k=0; nextb<bnzi; k++) {
430         if (cj[k] == bjj[nextb]){ /* ccol == bcol */
431           ca[k] += valtmp*baj[nextb++];
432         }
433       }
434       flops += 2*bnzi;
435     }
436     aj += anzi; aa += anzi;
437     cj += cnzi; ca += cnzi;
438   }
439 
440   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
441   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
442   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
443   PetscFunctionReturn(0);
444 }
445 
446 #undef __FUNCT__
447 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_new"
448 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_new(Mat A,Mat B,PetscReal fill,Mat *C)
449 {
450   PetscErrorCode ierr;
451   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
452   PetscInt       *ai=a->i,*bi=b->i,*ci,*cj;
453   PetscInt       am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N,nspacedouble;
454   MatScalar      *ca;
455   PetscReal      afill;
456 
457   PetscFunctionBegin;
458 #if defined(DEBUG_MATMATMULT)
459   ierr = PetscPrintf(PETSC_COMM_SELF,"MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable Armax %d, Brmax %d\n",a->rmax,b->rmax);CHKERRQ(ierr);
460 #endif
461   /* Get ci and cj */
462   ierr = MatGetSymbolicMatMatMult_SeqAIJ_SeqAIJ_Scalable_new(A,B,fill,&ci,&cj,&nspacedouble);CHKERRQ(ierr);
463 
464   /* Allocate space for ca */
465   ierr = PetscMalloc((ci[am]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
466   ierr = PetscMemzero(ca,(ci[am]+1)*sizeof(MatScalar));CHKERRQ(ierr);
467 
468   /* put together the new symbolic matrix */
469   ierr = MatCreateSeqAIJWithArrays(((PetscObject)A)->comm,am,bn,ci,cj,ca,C);CHKERRQ(ierr);
470 
471   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
472   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
473   c = (Mat_SeqAIJ *)((*C)->data);
474   c->free_a   = PETSC_TRUE;
475   c->free_ij  = PETSC_TRUE;
476   c->nonew    = 0;
477   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; /* slower, less memory */
478 
479   /* set MatInfo */
480   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
481   if (afill < 1.0) afill = 1.0;
482   c->maxnz                     = ci[am];
483   c->nz                        = ci[am];
484   (*C)->info.mallocs           = nspacedouble;
485   (*C)->info.fill_ratio_given  = fill;
486   (*C)->info.fill_ratio_needed = afill;
487 
488 #if defined(PETSC_USE_INFO)
489   if (ci[am]) {
490     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);CHKERRQ(ierr);
491     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);CHKERRQ(ierr);
492   } else {
493     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
494   }
495 #endif
496   PetscFunctionReturn(0);
497 }
498 
499 
500 #undef __FUNCT__
501 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable"
502 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat *C)
503 {
504   PetscErrorCode ierr;
505   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
506   PetscInt       *ai=a->i,*bi=b->i,*ci,*cj;
507   PetscInt       am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N,nspacedouble;
508   MatScalar      *ca;
509   PetscReal      afill;
510 
511   PetscFunctionBegin;
512 #if defined(DEBUG_MATMATMULT)
513   ierr = PetscPrintf(PETSC_COMM_SELF,"MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable Armax %d, Brmax %d\n",a->rmax,b->rmax);CHKERRQ(ierr);
514 #endif
515   /* Get ci and cj */
516   ierr = MatGetSymbolicMatMatMult_SeqAIJ_SeqAIJ_Scalable(A,B,fill,&ci,&cj,&nspacedouble);CHKERRQ(ierr);
517 
518   /* Allocate space for ca */
519   ierr = PetscMalloc((ci[am]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
520   ierr = PetscMemzero(ca,(ci[am]+1)*sizeof(MatScalar));CHKERRQ(ierr);
521 
522   /* put together the new symbolic matrix */
523   ierr = MatCreateSeqAIJWithArrays(((PetscObject)A)->comm,am,bn,ci,cj,ca,C);CHKERRQ(ierr);
524 
525   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
526   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
527   c = (Mat_SeqAIJ *)((*C)->data);
528   c->free_a   = PETSC_TRUE;
529   c->free_ij  = PETSC_TRUE;
530   c->nonew    = 0;
531   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; /* slower, less memory */
532 
533   /* set MatInfo */
534   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
535   if (afill < 1.0) afill = 1.0;
536   c->maxnz                     = ci[am];
537   c->nz                        = ci[am];
538   (*C)->info.mallocs           = nspacedouble;
539   (*C)->info.fill_ratio_given  = fill;
540   (*C)->info.fill_ratio_needed = afill;
541 
542 #if defined(PETSC_USE_INFO)
543   if (ci[am]) {
544     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);CHKERRQ(ierr);
545     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);CHKERRQ(ierr);
546   } else {
547     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
548   }
549 #endif
550   PetscFunctionReturn(0);
551 }
552 
553 
554 /* This routine is not used. Should be removed! */
555 #undef __FUNCT__
556 #define __FUNCT__ "MatMatTransposeMult_SeqAIJ_SeqAIJ"
557 PetscErrorCode MatMatTransposeMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
558 {
559   PetscErrorCode ierr;
560 
561   PetscFunctionBegin;
562   if (scall == MAT_INITIAL_MATRIX){
563     ierr = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
564   }
565   ierr = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(A,B,*C);CHKERRQ(ierr);
566   PetscFunctionReturn(0);
567 }
568 
569 #undef __FUNCT__
570 #define __FUNCT__ "PetscContainerDestroy_Mat_MatMatTransMult"
571 PetscErrorCode PetscContainerDestroy_Mat_MatMatTransMult(void *ptr)
572 {
573   PetscErrorCode      ierr;
574   Mat_MatMatTransMult *multtrans=(Mat_MatMatTransMult*)ptr;
575 
576   PetscFunctionBegin;
577   ierr = MatTransposeColoringDestroy(&multtrans->matcoloring);CHKERRQ(ierr);
578   ierr = MatDestroy(&multtrans->Bt_den);CHKERRQ(ierr);
579   ierr = MatDestroy(&multtrans->ABt_den);CHKERRQ(ierr);
580   ierr = PetscFree(multtrans);CHKERRQ(ierr);
581   PetscFunctionReturn(0);
582 }
583 
584 #undef __FUNCT__
585 #define __FUNCT__ "MatDestroy_SeqAIJ_MatMatMultTrans"
586 PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(Mat A)
587 {
588   PetscErrorCode      ierr;
589   PetscContainer      container;
590   Mat_MatMatTransMult *multtrans=PETSC_NULL;
591 
592   PetscFunctionBegin;
593   ierr = PetscObjectQuery((PetscObject)A,"Mat_MatMatTransMult",(PetscObject *)&container);CHKERRQ(ierr);
594   if (!container) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Container does not exit");
595   ierr = PetscContainerGetPointer(container,(void **)&multtrans);CHKERRQ(ierr);
596   A->ops->destroy   = multtrans->destroy;
597   if (A->ops->destroy) {
598     ierr = (*A->ops->destroy)(A);CHKERRQ(ierr);
599   }
600   ierr = PetscObjectCompose((PetscObject)A,"Mat_MatMatTransMult",0);CHKERRQ(ierr);
601   PetscFunctionReturn(0);
602 }
603 
604 #undef __FUNCT__
605 #define __FUNCT__ "MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ"
606 PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
607 {
608   PetscErrorCode      ierr;
609   Mat                 Bt;
610   PetscInt            *bti,*btj;
611   Mat_MatMatTransMult *multtrans;
612   PetscContainer      container;
613   PetscLogDouble      t0,tf,etime2=0.0;
614 
615   PetscFunctionBegin;
616   ierr = PetscGetTime(&t0);CHKERRQ(ierr);
617    /* create symbolic Bt */
618   ierr = MatGetSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
619   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,B->cmap->n,B->rmap->n,bti,btj,PETSC_NULL,&Bt);CHKERRQ(ierr);
620 
621   /* get symbolic C=A*Bt */
622   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C);CHKERRQ(ierr);
623 
624   /* create a supporting struct for reuse intermidiate dense matrices with matcoloring */
625   ierr = PetscNew(Mat_MatMatTransMult,&multtrans);CHKERRQ(ierr);
626 
627   /* attach the supporting struct to C */
628   ierr = PetscContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
629   ierr = PetscContainerSetPointer(container,multtrans);CHKERRQ(ierr);
630   ierr = PetscContainerSetUserDestroy(container,PetscContainerDestroy_Mat_MatMatTransMult);CHKERRQ(ierr);
631   ierr = PetscObjectCompose((PetscObject)(*C),"Mat_MatMatTransMult",(PetscObject)container);CHKERRQ(ierr);
632   ierr = PetscContainerDestroy(&container);CHKERRQ(ierr);
633 
634   multtrans->usecoloring = PETSC_FALSE;
635   multtrans->destroy = (*C)->ops->destroy;
636   (*C)->ops->destroy = MatDestroy_SeqAIJ_MatMatMultTrans;
637 
638   ierr = PetscGetTime(&tf);CHKERRQ(ierr);
639   etime2 += tf - t0;
640 
641   ierr = PetscOptionsGetBool(PETSC_NULL,"-matmattransmult_color",&multtrans->usecoloring,PETSC_NULL);CHKERRQ(ierr);
642   if (multtrans->usecoloring){
643     /* Create MatTransposeColoring from symbolic C=A*B^T */
644     MatTransposeColoring matcoloring;
645     ISColoring           iscoloring;
646     Mat                  Bt_dense,C_dense;
647     PetscLogDouble       etime0=0.0,etime01=0.0,etime1=0.0;
648 
649     ierr = PetscGetTime(&t0);CHKERRQ(ierr);
650     ierr = MatGetColoring(*C,MATCOLORINGLF,&iscoloring);CHKERRQ(ierr);
651     ierr = PetscGetTime(&tf);CHKERRQ(ierr);
652     etime0 += tf - t0;
653 
654     ierr = PetscGetTime(&t0);CHKERRQ(ierr);
655     ierr = MatTransposeColoringCreate(*C,iscoloring,&matcoloring);CHKERRQ(ierr);
656     multtrans->matcoloring = matcoloring;
657     ierr = ISColoringDestroy(&iscoloring);CHKERRQ(ierr);
658     ierr = PetscGetTime(&tf);CHKERRQ(ierr);
659     etime01 += tf - t0;
660 
661     ierr = PetscGetTime(&t0);CHKERRQ(ierr);
662     /* Create Bt_dense and C_dense = A*Bt_dense */
663     ierr = MatCreate(PETSC_COMM_SELF,&Bt_dense);CHKERRQ(ierr);
664     ierr = MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors);CHKERRQ(ierr);
665     ierr = MatSetType(Bt_dense,MATSEQDENSE);CHKERRQ(ierr);
666     ierr = MatSeqDenseSetPreallocation(Bt_dense,PETSC_NULL);CHKERRQ(ierr);
667     Bt_dense->assembled = PETSC_TRUE;
668     multtrans->Bt_den = Bt_dense;
669 
670     ierr = MatCreate(PETSC_COMM_SELF,&C_dense);CHKERRQ(ierr);
671     ierr = MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors);CHKERRQ(ierr);
672     ierr = MatSetType(C_dense,MATSEQDENSE);CHKERRQ(ierr);
673     ierr = MatSeqDenseSetPreallocation(C_dense,PETSC_NULL);CHKERRQ(ierr);
674     Bt_dense->assembled = PETSC_TRUE;
675     multtrans->ABt_den = C_dense;
676     ierr = PetscGetTime(&tf);CHKERRQ(ierr);
677     etime1 += tf - t0;
678 
679 #if defined(PETSC_USE_INFO)
680     {
681     Mat_SeqAIJ *c=(Mat_SeqAIJ*)(*C)->data;
682     ierr = PetscInfo5(*C,"Bt_dense: %D,%D; Cnz %D / (cm*ncolors %D) = %g\n",A->cmap->n,matcoloring->ncolors,c->nz,A->rmap->n*matcoloring->ncolors,(PetscReal)(c->nz)/(A->rmap->n*matcoloring->ncolors));
683     ierr = PetscInfo5(*C,"Sym = GetColor %g + ColorCreate %g + MatDenseCreate %g + non-colorSym %g = %g\n",etime0,etime01,etime1,etime2,etime0+etime01+etime1+etime2);
684     }
685 #endif
686   }
687   /* clean up */
688   ierr = MatDestroy(&Bt);CHKERRQ(ierr);
689   ierr = MatRestoreSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
690 
691 
692 
693 #if defined(INEFFICIENT_ALGORITHM)
694   /* The algorithm below computes am*bm sparse inner-product - inefficient! It will be deleted later. */
695   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
696   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
697   PetscInt           *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ci,*cj,*acol,*bcol;
698   PetscInt           am=A->rmap->N,bm=B->rmap->N;
699   PetscInt           i,j,anzi,bnzj,cnzi,nlnk,*lnk,nspacedouble=0,ka,kb,index[1];
700   MatScalar          *ca;
701   PetscBT            lnkbt;
702   PetscReal          afill;
703 
704   /* Allocate row pointer array ci  */
705   ierr = PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
706   ci[0] = 0;
707 
708   /* Create and initialize a linked list for C columns */
709   nlnk = bm+1;
710   ierr = PetscLLCreate(bm,bm,nlnk,lnk,lnkbt);CHKERRQ(ierr);
711 
712   /* Initial FreeSpace with size fill*(nnz(A)+nnz(B)) */
713   ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);CHKERRQ(ierr);
714   current_space = free_space;
715 
716   /* Determine symbolic info for each row of the product A*B^T: */
717   for (i=0; i<am; i++) {
718     anzi = ai[i+1] - ai[i];
719     cnzi = 0;
720     acol = aj + ai[i];
721     for (j=0; j<bm; j++){
722       bnzj = bi[j+1] - bi[j];
723       bcol= bj + bi[j];
724       /* sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
725       ka = 0; kb = 0;
726       while (ka < anzi && kb < bnzj){
727         while (acol[ka] < bcol[kb] && ka < anzi) ka++;
728         if (ka == anzi) break;
729         while (acol[ka] > bcol[kb] && kb < bnzj) kb++;
730         if (kb == bnzj) break;
731         if (acol[ka] == bcol[kb]){ /* add nonzero c(i,j) to lnk */
732           index[0] = j;
733           ierr = PetscLLAdd(1,index,bm,nlnk,lnk,lnkbt);CHKERRQ(ierr);
734           cnzi++;
735           break;
736         }
737       }
738     }
739 
740     /* If free space is not available, make more free space */
741     /* Double the amount of total space in the list */
742     if (current_space->local_remaining<cnzi) {
743       ierr = PetscFreeSpaceGet(cnzi+current_space->total_array_size,&current_space);CHKERRQ(ierr);
744       nspacedouble++;
745     }
746 
747     /* Copy data into free space, then initialize lnk */
748     ierr = PetscLLClean(bm,bm,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
749     current_space->array           += cnzi;
750     current_space->local_used      += cnzi;
751     current_space->local_remaining -= cnzi;
752 
753     ci[i+1] = ci[i] + cnzi;
754   }
755 
756 
757   /* Column indices are in the list of free space.
758      Allocate array cj, copy column indices to cj, and destroy list of free space */
759   ierr = PetscMalloc((ci[am]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
760   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
761   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
762 
763   /* Allocate space for ca */
764   ierr = PetscMalloc((ci[am]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
765   ierr = PetscMemzero(ca,(ci[am]+1)*sizeof(MatScalar));CHKERRQ(ierr);
766 
767   /* put together the new symbolic matrix */
768   ierr = MatCreateSeqAIJWithArrays(((PetscObject)A)->comm,am,bm,ci,cj,ca,C);CHKERRQ(ierr);
769 
770   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
771   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
772   c = (Mat_SeqAIJ *)((*C)->data);
773   c->free_a   = PETSC_TRUE;
774   c->free_ij  = PETSC_TRUE;
775   c->nonew    = 0;
776 
777   /* set MatInfo */
778   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
779   if (afill < 1.0) afill = 1.0;
780   c->maxnz                     = ci[am];
781   c->nz                        = ci[am];
782   (*C)->info.mallocs           = nspacedouble;
783   (*C)->info.fill_ratio_given  = fill;
784   (*C)->info.fill_ratio_needed = afill;
785 
786 #if defined(PETSC_USE_INFO)
787   if (ci[am]) {
788     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);CHKERRQ(ierr);
789     ierr = PetscInfo1((*C),"Use MatMatTransposeMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);CHKERRQ(ierr);
790   } else {
791     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
792   }
793 #endif
794 #endif
795   PetscFunctionReturn(0);
796 }
797 
798 /* #define USE_ARRAY - for sparse dot product. Slower than !USE_ARRAY */
799 #undef __FUNCT__
800 #define __FUNCT__ "MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ"
801 PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
802 {
803   PetscErrorCode ierr;
804   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
805   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow;
806   PetscInt       cm=C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol;
807   PetscLogDouble flops=0.0;
808   MatScalar      *aa=a->a,*aval,*ba=b->a,*bval,*ca=c->a,*cval;
809   Mat_MatMatTransMult *multtrans;
810   PetscContainer      container;
811 #if defined(USE_ARRAY)
812   MatScalar      *spdot;
813 #endif
814 
815   PetscFunctionBegin;
816   ierr = PetscObjectQuery((PetscObject)C,"Mat_MatMatTransMult",(PetscObject *)&container);CHKERRQ(ierr);
817   if (!container) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Container does not exit");
818   ierr  = PetscContainerGetPointer(container,(void **)&multtrans);CHKERRQ(ierr);
819   if (multtrans->usecoloring){
820     MatTransposeColoring  matcoloring = multtrans->matcoloring;
821     Mat                   Bt_dense;
822     PetscInt              m,n;
823     PetscLogDouble t0,tf,etime0=0.0,etime1=0.0,etime2=0.0;
824     Mat C_dense = multtrans->ABt_den;
825 
826     Bt_dense = multtrans->Bt_den;
827     ierr = MatGetLocalSize(Bt_dense,&m,&n);CHKERRQ(ierr);
828 
829     /* Get Bt_dense by Apply MatTransposeColoring to B */
830     ierr = PetscGetTime(&t0);CHKERRQ(ierr);
831     ierr = MatTransColoringApplySpToDen(matcoloring,B,Bt_dense);CHKERRQ(ierr);
832     ierr = PetscGetTime(&tf);CHKERRQ(ierr);
833     etime0 += tf - t0;
834 
835     /* C_dense = A*Bt_dense */
836     ierr = PetscGetTime(&t0);CHKERRQ(ierr);
837     ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense);CHKERRQ(ierr);
838     ierr = PetscGetTime(&tf);CHKERRQ(ierr);
839     etime2 += tf - t0;
840 
841     /* Recover C from C_dense */
842     ierr = PetscGetTime(&t0);CHKERRQ(ierr);
843     ierr = MatTransColoringApplyDenToSp(matcoloring,C_dense,C);CHKERRQ(ierr);
844     ierr = PetscGetTime(&tf);CHKERRQ(ierr);
845     etime1 += tf - t0;
846 #if defined(PETSC_USE_INFO)
847     ierr = PetscInfo4(C,"Num = ColoringApply: %g %g + Mult_sp_dense %g = %g\n",etime0,etime1,etime2,etime0+etime1+etime2);
848 #endif
849     PetscFunctionReturn(0);
850   }
851 
852 #if defined(USE_ARRAY)
853   /* allocate an array for implementing sparse inner-product */
854   ierr = PetscMalloc((A->cmap->n+1)*sizeof(MatScalar),&spdot);CHKERRQ(ierr);
855   ierr = PetscMemzero(spdot,(A->cmap->n+1)*sizeof(MatScalar));CHKERRQ(ierr);
856 #endif
857 
858   /* clear old values in C */
859   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
860 
861   for (i=0; i<cm; i++) {
862     anzi = ai[i+1] - ai[i];
863     acol = aj + ai[i];
864     aval = aa + ai[i];
865     cnzi = ci[i+1] - ci[i];
866     ccol = cj + ci[i];
867     cval = ca + ci[i];
868     for (j=0; j<cnzi; j++){
869       brow = ccol[j];
870       bnzj = bi[brow+1] - bi[brow];
871       bcol = bj + bi[brow];
872       bval = ba + bi[brow];
873 
874       /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
875 #if defined(USE_ARRAY)
876       /* put ba to spdot array */
877       for (nextb=0; nextb<bnzj; nextb++) spdot[bcol[nextb]] = bval[nextb];
878       /* c(i,j)=A[i,:]*B[j,:]^T */
879       for (nexta=0; nexta<anzi; nexta++){
880         cval[j] += spdot[acol[nexta]]*aval[nexta];
881       }
882       /* zero spdot array */
883       for (nextb=0; nextb<bnzj; nextb++) spdot[bcol[nextb]] = 0.0;
884 #else
885       nexta = 0; nextb = 0;
886       while (nexta<anzi && nextb<bnzj){
887         while (acol[nexta] < bcol[nextb] && nexta < anzi) nexta++;
888         if (nexta == anzi) break;
889         while (acol[nexta] > bcol[nextb] && nextb < bnzj) nextb++;
890         if (nextb == bnzj) break;
891         if (acol[nexta] == bcol[nextb]){
892           cval[j] += aval[nexta]*bval[nextb];
893           nexta++; nextb++;
894           flops += 2;
895         }
896       }
897 #endif
898     }
899   }
900   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
901   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
902   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
903 #if defined(USE_ARRAY)
904   ierr = PetscFree(spdot);
905 #endif
906   PetscFunctionReturn(0);
907 }
908 
909 #undef __FUNCT__
910 #define __FUNCT__ "MatTransposeMatMult_SeqAIJ_SeqAIJ"
911 PetscErrorCode MatTransposeMatMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C) {
912   PetscErrorCode ierr;
913 
914   PetscFunctionBegin;
915   if (scall == MAT_INITIAL_MATRIX){
916     ierr = MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
917   }
918   ierr = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(A,B,*C);CHKERRQ(ierr);
919   PetscFunctionReturn(0);
920 }
921 
922 #undef __FUNCT__
923 #define __FUNCT__ "MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ"
924 PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
925 {
926   PetscErrorCode ierr;
927   Mat            At;
928   PetscInt       *ati,*atj;
929 
930   PetscFunctionBegin;
931   /* create symbolic At */
932   ierr = MatGetSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
933   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,A->cmap->n,A->rmap->n,ati,atj,PETSC_NULL,&At);CHKERRQ(ierr);
934 
935   /* get symbolic C=At*B */
936   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(At,B,fill,C);CHKERRQ(ierr);
937 
938   /* clean up */
939   ierr = MatDestroy(&At);CHKERRQ(ierr);
940   ierr = MatRestoreSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
941   PetscFunctionReturn(0);
942 }
943 
944 #undef __FUNCT__
945 #define __FUNCT__ "MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ"
946 PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
947 {
948   PetscErrorCode ierr;
949   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
950   PetscInt       am=A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb;
951   PetscInt       cm=C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k;
952   PetscLogDouble flops=0.0;
953   MatScalar      *aa=a->a,*ba,*ca=c->a,*caj;
954 
955   PetscFunctionBegin;
956   /* clear old values in C */
957   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
958 
959   /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
960   for (i=0;i<am;i++) {
961     bj   = b->j + bi[i];
962     ba   = b->a + bi[i];
963     bnzi = bi[i+1] - bi[i];
964     anzi = ai[i+1] - ai[i];
965     for (j=0; j<anzi; j++) {
966       nextb = 0;
967       crow  = *aj++;
968       cjj   = cj + ci[crow];
969       caj   = ca + ci[crow];
970       /* perform sparse axpy operation.  Note cjj includes bj. */
971       for (k=0; nextb<bnzi; k++) {
972         if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */
973           caj[k] += (*aa)*(*(ba+nextb));
974           nextb++;
975         }
976       }
977       flops += 2*bnzi;
978       aa++;
979     }
980   }
981 
982   /* Assemble the final matrix and clean up */
983   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
984   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
985   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
986   PetscFunctionReturn(0);
987 }
988 
989 EXTERN_C_BEGIN
990 #undef __FUNCT__
991 #define __FUNCT__ "MatMatMult_SeqAIJ_SeqDense"
992 PetscErrorCode MatMatMult_SeqAIJ_SeqDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
993 {
994   PetscErrorCode ierr;
995 
996   PetscFunctionBegin;
997   if (scall == MAT_INITIAL_MATRIX){
998     ierr = MatMatMultSymbolic_SeqAIJ_SeqDense(A,B,fill,C);CHKERRQ(ierr);
999   }
1000   ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,B,*C);CHKERRQ(ierr);
1001   PetscFunctionReturn(0);
1002 }
1003 EXTERN_C_END
1004 
1005 #undef __FUNCT__
1006 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqDense"
1007 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat *C)
1008 {
1009   PetscErrorCode ierr;
1010 
1011   PetscFunctionBegin;
1012   ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr);
1013   (*C)->ops->matmult = MatMatMult_SeqAIJ_SeqDense;
1014   PetscFunctionReturn(0);
1015 }
1016 
1017 #undef __FUNCT__
1018 #define __FUNCT__ "MatMatMultNumeric_SeqAIJ_SeqDense"
1019 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1020 {
1021   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
1022   PetscErrorCode ierr;
1023   PetscScalar    *b,*c,r1,r2,r3,r4,*b1,*b2,*b3,*b4;
1024   MatScalar      *aa;
1025   PetscInt       cm=C->rmap->n, cn=B->cmap->n, bm=B->rmap->n, col, i,j,n,*aj, am = A->rmap->n;
1026   PetscInt       am2 = 2*am, am3 = 3*am,  bm4 = 4*bm,colam;
1027 
1028   PetscFunctionBegin;
1029   if (!cm || !cn) PetscFunctionReturn(0);
1030   if (bm != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,bm);
1031   if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
1032   if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
1033   ierr = MatGetArray(B,&b);CHKERRQ(ierr);
1034   ierr = MatGetArray(C,&c);CHKERRQ(ierr);
1035   b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
1036   for (col=0; col<cn-4; col += 4){  /* over columns of C */
1037     colam = col*am;
1038     for (i=0; i<am; i++) {        /* over rows of C in those columns */
1039       r1 = r2 = r3 = r4 = 0.0;
1040       n   = a->i[i+1] - a->i[i];
1041       aj  = a->j + a->i[i];
1042       aa  = a->a + a->i[i];
1043       for (j=0; j<n; j++) {
1044         r1 += (*aa)*b1[*aj];
1045         r2 += (*aa)*b2[*aj];
1046         r3 += (*aa)*b3[*aj];
1047         r4 += (*aa++)*b4[*aj++];
1048       }
1049       c[colam + i]       = r1;
1050       c[colam + am + i]  = r2;
1051       c[colam + am2 + i] = r3;
1052       c[colam + am3 + i] = r4;
1053     }
1054     b1 += bm4;
1055     b2 += bm4;
1056     b3 += bm4;
1057     b4 += bm4;
1058   }
1059   for (;col<cn; col++){     /* over extra columns of C */
1060     for (i=0; i<am; i++) {  /* over rows of C in those columns */
1061       r1 = 0.0;
1062       n   = a->i[i+1] - a->i[i];
1063       aj  = a->j + a->i[i];
1064       aa  = a->a + a->i[i];
1065 
1066       for (j=0; j<n; j++) {
1067         r1 += (*aa++)*b1[*aj++];
1068       }
1069       c[col*am + i]     = r1;
1070     }
1071     b1 += bm;
1072   }
1073   ierr = PetscLogFlops(cn*(2.0*a->nz));CHKERRQ(ierr);
1074   ierr = MatRestoreArray(B,&b);CHKERRQ(ierr);
1075   ierr = MatRestoreArray(C,&c);CHKERRQ(ierr);
1076   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1077   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1078   PetscFunctionReturn(0);
1079 }
1080 
1081 /*
1082    Note very similar to MatMult_SeqAIJ(), should generate both codes from same base
1083 */
1084 #undef __FUNCT__
1085 #define __FUNCT__ "MatMatMultNumericAdd_SeqAIJ_SeqDense"
1086 PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1087 {
1088   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
1089   PetscErrorCode ierr;
1090   PetscScalar    *b,*c,r1,r2,r3,r4,*b1,*b2,*b3,*b4;
1091   MatScalar      *aa;
1092   PetscInt       cm=C->rmap->n, cn=B->cmap->n, bm=B->rmap->n, col, i,j,n,*aj, am = A->rmap->n,*ii,arm;
1093   PetscInt       am2 = 2*am, am3 = 3*am,  bm4 = 4*bm,colam,*ridx;
1094 
1095   PetscFunctionBegin;
1096   if (!cm || !cn) PetscFunctionReturn(0);
1097   ierr = MatGetArray(B,&b);CHKERRQ(ierr);
1098   ierr = MatGetArray(C,&c);CHKERRQ(ierr);
1099   b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
1100 
1101   if (a->compressedrow.use){ /* use compressed row format */
1102     for (col=0; col<cn-4; col += 4){  /* over columns of C */
1103       colam = col*am;
1104       arm   = a->compressedrow.nrows;
1105       ii    = a->compressedrow.i;
1106       ridx  = a->compressedrow.rindex;
1107       for (i=0; i<arm; i++) {        /* over rows of C in those columns */
1108 	r1 = r2 = r3 = r4 = 0.0;
1109 	n   = ii[i+1] - ii[i];
1110 	aj  = a->j + ii[i];
1111 	aa  = a->a + ii[i];
1112 	for (j=0; j<n; j++) {
1113 	  r1 += (*aa)*b1[*aj];
1114 	  r2 += (*aa)*b2[*aj];
1115 	  r3 += (*aa)*b3[*aj];
1116 	  r4 += (*aa++)*b4[*aj++];
1117 	}
1118 	c[colam       + ridx[i]] += r1;
1119 	c[colam + am  + ridx[i]] += r2;
1120 	c[colam + am2 + ridx[i]] += r3;
1121 	c[colam + am3 + ridx[i]] += r4;
1122       }
1123       b1 += bm4;
1124       b2 += bm4;
1125       b3 += bm4;
1126       b4 += bm4;
1127     }
1128     for (;col<cn; col++){     /* over extra columns of C */
1129       colam = col*am;
1130       arm   = a->compressedrow.nrows;
1131       ii    = a->compressedrow.i;
1132       ridx  = a->compressedrow.rindex;
1133       for (i=0; i<arm; i++) {  /* over rows of C in those columns */
1134 	r1 = 0.0;
1135 	n   = ii[i+1] - ii[i];
1136 	aj  = a->j + ii[i];
1137 	aa  = a->a + ii[i];
1138 
1139 	for (j=0; j<n; j++) {
1140 	  r1 += (*aa++)*b1[*aj++];
1141 	}
1142 	c[col*am + ridx[i]] += r1;
1143       }
1144       b1 += bm;
1145     }
1146   } else {
1147     for (col=0; col<cn-4; col += 4){  /* over columns of C */
1148       colam = col*am;
1149       for (i=0; i<am; i++) {        /* over rows of C in those columns */
1150 	r1 = r2 = r3 = r4 = 0.0;
1151 	n   = a->i[i+1] - a->i[i];
1152 	aj  = a->j + a->i[i];
1153 	aa  = a->a + a->i[i];
1154 	for (j=0; j<n; j++) {
1155 	  r1 += (*aa)*b1[*aj];
1156 	  r2 += (*aa)*b2[*aj];
1157 	  r3 += (*aa)*b3[*aj];
1158 	  r4 += (*aa++)*b4[*aj++];
1159 	}
1160 	c[colam + i]       += r1;
1161 	c[colam + am + i]  += r2;
1162 	c[colam + am2 + i] += r3;
1163 	c[colam + am3 + i] += r4;
1164       }
1165       b1 += bm4;
1166       b2 += bm4;
1167       b3 += bm4;
1168       b4 += bm4;
1169     }
1170     for (;col<cn; col++){     /* over extra columns of C */
1171       for (i=0; i<am; i++) {  /* over rows of C in those columns */
1172 	r1 = 0.0;
1173 	n   = a->i[i+1] - a->i[i];
1174 	aj  = a->j + a->i[i];
1175 	aa  = a->a + a->i[i];
1176 
1177 	for (j=0; j<n; j++) {
1178 	  r1 += (*aa++)*b1[*aj++];
1179 	}
1180 	c[col*am + i]     += r1;
1181       }
1182       b1 += bm;
1183     }
1184   }
1185   ierr = PetscLogFlops(cn*2.0*a->nz);CHKERRQ(ierr);
1186   ierr = MatRestoreArray(B,&b);CHKERRQ(ierr);
1187   ierr = MatRestoreArray(C,&c);CHKERRQ(ierr);
1188   PetscFunctionReturn(0);
1189 }
1190 
1191 #undef __FUNCT__
1192 #define __FUNCT__ "MatTransColoringApplySpToDen_SeqAIJ"
1193 PetscErrorCode  MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense)
1194 {
1195   PetscErrorCode ierr;
1196   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
1197   Mat_SeqDense   *btdense = (Mat_SeqDense*)Btdense->data;
1198   PetscInt       *bi=b->i,*bj=b->j;
1199   PetscInt       m=Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns;
1200   MatScalar      *btval,*btval_den,*ba=b->a;
1201   PetscInt       *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors;
1202 
1203   PetscFunctionBegin;
1204   btval_den=btdense->v;
1205   ierr = PetscMemzero(btval_den,(m*n)*sizeof(MatScalar));CHKERRQ(ierr);
1206   for (k=0; k<ncolors; k++) {
1207     ncolumns = coloring->ncolumns[k];
1208     for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */
1209       col   = *(columns + colorforcol[k] + l);
1210       btcol = bj + bi[col];
1211       btval = ba + bi[col];
1212       anz   = bi[col+1] - bi[col];
1213       for (j=0; j<anz; j++){
1214         brow            = btcol[j];
1215         btval_den[brow] = btval[j];
1216       }
1217     }
1218     btval_den += m;
1219   }
1220   PetscFunctionReturn(0);
1221 }
1222 
1223 #undef __FUNCT__
1224 #define __FUNCT__ "MatTransColoringApplyDenToSp_SeqAIJ"
1225 PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp)
1226 {
1227   PetscErrorCode ierr;
1228   Mat_SeqAIJ     *csp = (Mat_SeqAIJ*)Csp->data;
1229   PetscInt       k,l,*row,*idx,m,ncolors=matcoloring->ncolors,nrows;
1230   PetscScalar    *ca_den,*cp_den,*ca=csp->a;
1231   PetscInt       *rows=matcoloring->rows,*spidx=matcoloring->columnsforspidx,*colorforrow=matcoloring->colorforrow;
1232 
1233   PetscFunctionBegin;
1234   ierr = MatGetLocalSize(Csp,&m,PETSC_NULL);CHKERRQ(ierr);
1235   ierr = MatGetArray(Cden,&ca_den);CHKERRQ(ierr);
1236   cp_den = ca_den;
1237   for (k=0; k<ncolors; k++) {
1238     nrows = matcoloring->nrows[k];
1239     row   = rows  + colorforrow[k];
1240     idx   = spidx + colorforrow[k];
1241     for (l=0; l<nrows; l++){
1242       ca[idx[l]] = cp_den[row[l]];
1243     }
1244     cp_den += m;
1245   }
1246   ierr = MatRestoreArray(Cden,&ca_den);CHKERRQ(ierr);
1247   PetscFunctionReturn(0);
1248 }
1249 
1250 /*
1251  MatGetColumnIJ_SeqAIJ_Color() and MatRestoreColumnIJ_SeqAIJ_Color() are customized from
1252  MatGetColumnIJ_SeqAIJ() and MatRestoreColumnIJ_SeqAIJ() by adding an output
1253  spidx[], index of a->j, to be used for setting 'columnsforspidx' in MatTransposeColoringCreate_SeqAIJ().
1254  */
1255 #undef __FUNCT__
1256 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Color"
1257 PetscErrorCode MatGetColumnIJ_SeqAIJ_Color(Mat A,PetscInt oshift,PetscBool  symmetric,PetscBool  inodecompressed,PetscInt *nn,PetscInt *ia[],PetscInt *ja[],PetscInt *spidx[],PetscBool  *done)
1258 {
1259   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
1260   PetscErrorCode ierr;
1261   PetscInt       i,*collengths,*cia,*cja,n = A->cmap->n,m = A->rmap->n;
1262   PetscInt       nz = a->i[m],row,*jj,mr,col;
1263   PetscInt       *cspidx;
1264 
1265   PetscFunctionBegin;
1266   *nn = n;
1267   if (!ia) PetscFunctionReturn(0);
1268   if (symmetric) {
1269     SETERRQ(((PetscObject)A)->comm,PETSC_ERR_SUP,"MatGetColumnIJ_SeqAIJ_Color() not supported for the case symmetric");
1270     ierr = MatToSymmetricIJ_SeqAIJ(A->rmap->n,a->i,a->j,0,oshift,ia,ja);CHKERRQ(ierr);
1271   } else {
1272     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&collengths);CHKERRQ(ierr);
1273     ierr = PetscMemzero(collengths,n*sizeof(PetscInt));CHKERRQ(ierr);
1274     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&cia);CHKERRQ(ierr);
1275     ierr = PetscMalloc((nz+1)*sizeof(PetscInt),&cja);CHKERRQ(ierr);
1276     ierr = PetscMalloc((nz+1)*sizeof(PetscInt),&cspidx);CHKERRQ(ierr);
1277     jj = a->j;
1278     for (i=0; i<nz; i++) {
1279       collengths[jj[i]]++;
1280     }
1281     cia[0] = oshift;
1282     for (i=0; i<n; i++) {
1283       cia[i+1] = cia[i] + collengths[i];
1284     }
1285     ierr = PetscMemzero(collengths,n*sizeof(PetscInt));CHKERRQ(ierr);
1286     jj   = a->j;
1287     for (row=0; row<m; row++) {
1288       mr = a->i[row+1] - a->i[row];
1289       for (i=0; i<mr; i++) {
1290         col = *jj++;
1291         cspidx[cia[col] + collengths[col] - oshift] = a->i[row] + i; /* index of a->j */
1292         cja[cia[col] + collengths[col]++ - oshift] = row + oshift;
1293       }
1294     }
1295     ierr = PetscFree(collengths);CHKERRQ(ierr);
1296     *ia = cia; *ja = cja;
1297     *spidx = cspidx;
1298   }
1299   PetscFunctionReturn(0);
1300 }
1301 
1302 #undef __FUNCT__
1303 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Color"
1304 PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Color(Mat A,PetscInt oshift,PetscBool  symmetric,PetscBool  inodecompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscInt *spidx[],PetscBool  *done)
1305 {
1306   PetscErrorCode ierr;
1307 
1308   PetscFunctionBegin;
1309   if (!ia) PetscFunctionReturn(0);
1310 
1311   ierr = PetscFree(*ia);CHKERRQ(ierr);
1312   ierr = PetscFree(*ja);CHKERRQ(ierr);
1313   ierr = PetscFree(*spidx);CHKERRQ(ierr);
1314   PetscFunctionReturn(0);
1315 }
1316 
1317 #undef __FUNCT__
1318 #define __FUNCT__ "MatTransposeColoringCreate_SeqAIJ"
1319 PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c)
1320 {
1321   PetscErrorCode ierr;
1322   PetscInt       i,n,nrows,N,j,k,m,*row_idx,*ci,*cj,ncols,col,cm;
1323   const PetscInt *is;
1324   PetscInt       nis = iscoloring->n,*rowhit,bs = 1;
1325   IS             *isa;
1326   PetscBool      done;
1327   PetscBool      flg1,flg2;
1328   Mat_SeqAIJ     *csp = (Mat_SeqAIJ*)mat->data;
1329   PetscInt       *colorforrow,*rows,*rows_i,*columnsforspidx,*columnsforspidx_i,*idxhit,*spidx;
1330   PetscInt       *colorforcol,*columns,*columns_i;
1331 
1332   PetscFunctionBegin;
1333   ierr = ISColoringGetIS(iscoloring,PETSC_IGNORE,&isa);CHKERRQ(ierr);
1334 
1335   /* this is ugly way to get blocksize but cannot call MatGetBlockSize() because AIJ can have bs > 1 */
1336   ierr = PetscTypeCompare((PetscObject)mat,MATSEQBAIJ,&flg1);CHKERRQ(ierr);
1337   ierr = PetscTypeCompare((PetscObject)mat,MATMPIBAIJ,&flg2);CHKERRQ(ierr);
1338   if (flg1 || flg2) {
1339     ierr = MatGetBlockSize(mat,&bs);CHKERRQ(ierr);
1340   }
1341 
1342   N         = mat->cmap->N/bs;
1343   c->M      = mat->rmap->N/bs;  /* set total rows, columns and local rows */
1344   c->N      = mat->cmap->N/bs;
1345   c->m      = mat->rmap->N/bs;
1346   c->rstart = 0;
1347 
1348   c->ncolors = nis;
1349   ierr       = PetscMalloc(nis*sizeof(PetscInt),&c->ncolumns);CHKERRQ(ierr);
1350   ierr       = PetscMalloc(nis*sizeof(PetscInt),&c->nrows);CHKERRQ(ierr);
1351   ierr       = PetscMalloc2(csp->nz+1,PetscInt,&rows,csp->nz+1,PetscInt,&columnsforspidx);CHKERRQ(ierr);
1352   ierr       = PetscMalloc((nis+1)*sizeof(PetscInt),&colorforrow);CHKERRQ(ierr);
1353   colorforrow[0]    = 0;
1354   rows_i            = rows;
1355   columnsforspidx_i = columnsforspidx;
1356 
1357   ierr = PetscMalloc((nis+1)*sizeof(PetscInt),&colorforcol);CHKERRQ(ierr);
1358   ierr = PetscMalloc((N+1)*sizeof(PetscInt),&columns);CHKERRQ(ierr);
1359   colorforcol[0] = 0;
1360   columns_i      = columns;
1361 
1362   ierr = MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,&done);CHKERRQ(ierr); /* column-wise storage of mat */
1363   if (!done) SETERRQ1(((PetscObject)mat)->comm,PETSC_ERR_SUP,"MatGetColumnIJ() not supported for matrix type %s",((PetscObject)mat)->type_name);
1364 
1365   cm = c->m;
1366   ierr = PetscMalloc((cm+1)*sizeof(PetscInt),&rowhit);CHKERRQ(ierr);
1367   ierr = PetscMalloc((cm+1)*sizeof(PetscInt),&idxhit);CHKERRQ(ierr);
1368   for (i=0; i<nis; i++) {
1369     ierr = ISGetLocalSize(isa[i],&n);CHKERRQ(ierr);
1370     ierr = ISGetIndices(isa[i],&is);CHKERRQ(ierr);
1371     c->ncolumns[i] = n;
1372     if (n) {
1373       ierr = PetscMemcpy(columns_i,is,n*sizeof(PetscInt));CHKERRQ(ierr);
1374     }
1375     colorforcol[i+1] = colorforcol[i] + n;
1376     columns_i       += n;
1377 
1378     /* fast, crude version requires O(N*N) work */
1379     ierr = PetscMemzero(rowhit,cm*sizeof(PetscInt));CHKERRQ(ierr);
1380 
1381     /* loop over columns*/
1382     for (j=0; j<n; j++) {
1383       col     = is[j];
1384       row_idx = cj + ci[col];
1385       m       = ci[col+1] - ci[col];
1386       /* loop over columns marking them in rowhit */
1387       for (k=0; k<m; k++) {
1388         idxhit[*row_idx]   = spidx[ci[col] + k];
1389         rowhit[*row_idx++] = col + 1;
1390       }
1391     }
1392     /* count the number of hits */
1393     nrows = 0;
1394     for (j=0; j<cm; j++) {
1395       if (rowhit[j]) nrows++;
1396     }
1397     c->nrows[i]      = nrows;
1398     colorforrow[i+1] = colorforrow[i] + nrows;
1399 
1400     nrows       = 0;
1401     for (j=0; j<cm; j++) {
1402       if (rowhit[j]) {
1403         rows_i[nrows]            = j;
1404         columnsforspidx_i[nrows] = idxhit[j];
1405         nrows++;
1406       }
1407     }
1408     ierr = ISRestoreIndices(isa[i],&is);CHKERRQ(ierr);
1409     rows_i += nrows; columnsforspidx_i += nrows;
1410   }
1411   ierr = MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,&done);CHKERRQ(ierr);
1412   ierr = PetscFree(rowhit);CHKERRQ(ierr);
1413   ierr = ISColoringRestoreIS(iscoloring,&isa);CHKERRQ(ierr);
1414 #if defined(PETSC_USE_DEBUG)
1415   if (csp->nz != colorforrow[nis]) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"csp->nz %d != colorforrow[nis] %d",csp->nz,colorforrow[nis]);
1416 #endif
1417 
1418   c->colorforrow     = colorforrow;
1419   c->rows            = rows;
1420   c->columnsforspidx = columnsforspidx;
1421   c->colorforcol     = colorforcol;
1422   c->columns         = columns;
1423   ierr = PetscFree(idxhit);CHKERRQ(ierr);
1424   PetscFunctionReturn(0);
1425 }
1426