xref: /petsc/src/mat/impls/aij/seq/matmatmult.c (revision 7660c4db30226b837c824f5d7ea0156d5e82b17c)
1be1d678aSKris Buschelman 
2d50806bdSBarry Smith  /*
32499ec78SHong Zhang    Defines matrix-matrix product routines for pairs of SeqAIJ matrices
4d50806bdSBarry Smith            C = A * B
5d50806bdSBarry Smith  */
6d50806bdSBarry Smith 
7c6db04a5SJed Brown  #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
8c6db04a5SJed Brown  #include <../src/mat/utils/freespace.h>
9c6db04a5SJed Brown  #include <petscbt.h>
10af0996ceSBarry Smith  #include <petsc/private/isimpl.h>
1107475bc1SBarry Smith  #include <../src/mat/impls/dense/seq/dense.h>
127bab7c10SHong Zhang 
1358cf0668SJed Brown  static PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat,Mat,PetscReal,Mat*);
14cd093f1dSJed Brown 
155e5acdf2Sstefano_zampini  #if defined(PETSC_HAVE_HYPRE)
165e5acdf2Sstefano_zampini  PETSC_INTERN PetscErrorCode MatMatMultSymbolic_AIJ_AIJ_wHYPRE(Mat,Mat,PetscReal,Mat*);
175e5acdf2Sstefano_zampini  #endif
185e5acdf2Sstefano_zampini 
19150d2497SBarry Smith  PETSC_INTERN PetscErrorCode MatMatMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
2038baddfdSBarry Smith  {
21dfbe8321SBarry Smith    PetscErrorCode ierr;
225e5acdf2Sstefano_zampini  #if !defined(PETSC_HAVE_HYPRE)
23d7ed1a4dSandi selinger    const char     *algTypes[8] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","combined","rowmerge"};
24d013fc79Sandi selinger    PetscInt       nalg = 8;
25d7ed1a4dSandi selinger  #else
26d7ed1a4dSandi selinger    const char     *algTypes[9] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","combined","rowmerge","hypre"};
27d7ed1a4dSandi selinger    PetscInt       nalg = 9;
285e5acdf2Sstefano_zampini  #endif
296540a9cdSHong Zhang    PetscInt       alg = 0; /* set default algorithm */
30d013fc79Sandi selinger    PetscBool      combined = PETSC_FALSE;  /* Indicates whether the symbolic stage already computed the numerical values. */
315c66b693SKris Buschelman 
325c66b693SKris Buschelman    PetscFunctionBegin;
3326be0446SHong Zhang    if (scall == MAT_INITIAL_MATRIX) {
34152983bfSHong Zhang      ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
3568eacb73SHong Zhang      PetscOptionsObject->alreadyprinted = PETSC_FALSE; /* a hack to ensure the option shows in '-help' */
365e5acdf2Sstefano_zampini      ierr = PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,nalg,algTypes[0],&alg,NULL);CHKERRQ(ierr);
37d8bbc50fSBarry Smith      ierr = PetscOptionsEnd();CHKERRQ(ierr);
383ff4c91cSHong Zhang      ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
396540a9cdSHong Zhang      switch (alg) {
406540a9cdSHong Zhang      case 1:
41aacf854cSBarry Smith        ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A,B,fill,C);CHKERRQ(ierr);
426540a9cdSHong Zhang        break;
436540a9cdSHong Zhang      case 2:
446540a9cdSHong Zhang        ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A,B,fill,C);CHKERRQ(ierr);
456540a9cdSHong Zhang        break;
466540a9cdSHong Zhang      case 3:
470ced3a2bSJed Brown        ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A,B,fill,C);CHKERRQ(ierr);
486540a9cdSHong Zhang        break;
496540a9cdSHong Zhang      case 4:
508a07c6f1SJed Brown        ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A,B,fill,C);CHKERRQ(ierr);
516540a9cdSHong Zhang        break;
526540a9cdSHong Zhang      case 5:
5358cf0668SJed Brown        ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A,B,fill,C);CHKERRQ(ierr);
546540a9cdSHong Zhang        break;
555e5acdf2Sstefano_zampini      case 6:
56d013fc79Sandi selinger        ierr = MatMatMult_SeqAIJ_SeqAIJ_Combined(A,B,fill,C);CHKERRQ(ierr);
57d013fc79Sandi selinger        combined = PETSC_TRUE;
58d013fc79Sandi selinger        break;
59d013fc79Sandi selinger     case 7:
60d7ed1a4dSandi selinger        ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A,B,fill,C);CHKERRQ(ierr);
61d7ed1a4dSandi selinger        break;
62d7ed1a4dSandi selinger  #if defined(PETSC_HAVE_HYPRE)
63d7ed1a4dSandi selinger      case 8:
645e5acdf2Sstefano_zampini        ierr = MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A,B,fill,C);CHKERRQ(ierr);
655e5acdf2Sstefano_zampini        break;
665e5acdf2Sstefano_zampini  #endif
676540a9cdSHong Zhang      default:
6826be0446SHong Zhang        ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
696540a9cdSHong Zhang       break;
7025023028SHong Zhang      }
713ff4c91cSHong Zhang      ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
7226be0446SHong Zhang    }
735c913ed7SHong Zhang 
743ff4c91cSHong Zhang    ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
75d013fc79Sandi selinger    if (!combined) {
7601e47db4SHong Zhang      ierr = (*(*C)->ops->matmultnumeric)(A,B,*C);CHKERRQ(ierr);
77d013fc79Sandi selinger    }
783ff4c91cSHong Zhang    ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
795c66b693SKris Buschelman    PetscFunctionReturn(0);
805c66b693SKris Buschelman  }
811c24bd37SHong Zhang 
8258cf0668SJed Brown  static PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A,Mat B,PetscReal fill,Mat *C)
83b561aa9dSHong Zhang  {
84b561aa9dSHong Zhang    PetscErrorCode     ierr;
85b561aa9dSHong Zhang    Mat_SeqAIJ         *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
86971236abSHong Zhang    PetscInt           *ai=a->i,*bi=b->i,*ci,*cj;
87c1ba5575SJed Brown    PetscInt           am =A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
88b561aa9dSHong Zhang    PetscReal          afill;
89eca6b86aSHong Zhang    PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
90eca6b86aSHong Zhang    PetscTable         ta;
91fb908031SHong Zhang    PetscBT            lnkbt;
920298fd71SBarry Smith    PetscFreeSpaceList free_space=NULL,current_space=NULL;
93b561aa9dSHong Zhang 
94b561aa9dSHong Zhang    PetscFunctionBegin;
95bd958071SHong Zhang    /* Get ci and cj */
96bd958071SHong Zhang    /*---------------*/
97fb908031SHong Zhang    /* Allocate ci array, arrays for fill computation and */
98fb908031SHong Zhang    /* free space for accumulating nonzero column info */
99854ce69bSBarry Smith    ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
100fb908031SHong Zhang    ci[0] = 0;
101fb908031SHong Zhang 
102fb908031SHong Zhang    /* create and initialize a linked list */
103c373ccc6SHong Zhang    ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr);
104c373ccc6SHong Zhang    MatRowMergeMax_SeqAIJ(b,bm,ta);
105eca6b86aSHong Zhang    ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
106eca6b86aSHong Zhang    ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
107eca6b86aSHong Zhang 
108eca6b86aSHong Zhang    ierr = PetscLLCondensedCreate(Crmax,bn,&lnk,&lnkbt);CHKERRQ(ierr);
109fb908031SHong Zhang 
110fb908031SHong Zhang    /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
111f91af8c7SBarry Smith    ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
1122205254eSKarl Rupp 
113fb908031SHong Zhang    current_space = free_space;
114fb908031SHong Zhang 
115fb908031SHong Zhang    /* Determine ci and cj */
116fb908031SHong Zhang    for (i=0; i<am; i++) {
117fb908031SHong Zhang      anzi = ai[i+1] - ai[i];
118fb908031SHong Zhang      aj   = a->j + ai[i];
119fb908031SHong Zhang      for (j=0; j<anzi; j++) {
120fb908031SHong Zhang        brow = aj[j];
121fb908031SHong Zhang        bnzj = bi[brow+1] - bi[brow];
122fb908031SHong Zhang        bj   = b->j + bi[brow];
123fb908031SHong Zhang        /* add non-zero cols of B into the sorted linked list lnk */
124fb908031SHong Zhang        ierr = PetscLLCondensedAddSorted(bnzj,bj,lnk,lnkbt);CHKERRQ(ierr);
125fb908031SHong Zhang      }
126fb908031SHong Zhang      cnzi = lnk[0];
127fb908031SHong Zhang 
128fb908031SHong Zhang      /* If free space is not available, make more free space */
129fb908031SHong Zhang      /* Double the amount of total space in the list */
130fb908031SHong Zhang      if (current_space->local_remaining<cnzi) {
131f91af8c7SBarry Smith        ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);CHKERRQ(ierr);
132fb908031SHong Zhang        ndouble++;
133fb908031SHong Zhang      }
134fb908031SHong Zhang 
135fb908031SHong Zhang      /* Copy data into free space, then initialize lnk */
136fb908031SHong Zhang      ierr = PetscLLCondensedClean(bn,cnzi,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
1372205254eSKarl Rupp 
138fb908031SHong Zhang      current_space->array           += cnzi;
139fb908031SHong Zhang      current_space->local_used      += cnzi;
140fb908031SHong Zhang      current_space->local_remaining -= cnzi;
1412205254eSKarl Rupp 
142fb908031SHong Zhang      ci[i+1] = ci[i] + cnzi;
143fb908031SHong Zhang    }
144fb908031SHong Zhang 
145fb908031SHong Zhang    /* Column indices are in the list of free space */
146fb908031SHong Zhang    /* Allocate space for cj, initialize cj, and */
147fb908031SHong Zhang    /* destroy list of free space and other temporary array(s) */
148854ce69bSBarry Smith    ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr);
149fb908031SHong Zhang    ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
150fb908031SHong Zhang    ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr);
151b561aa9dSHong Zhang 
15226be0446SHong Zhang    /* put together the new symbolic matrix */
153ce94432eSBarry Smith    ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr);
15433d57670SJed Brown    ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
15502fe1965SBarry Smith    ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr);
15658c24d83SHong Zhang 
15758c24d83SHong Zhang   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
15858c24d83SHong Zhang   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
15958c24d83SHong Zhang   c                         = (Mat_SeqAIJ*)((*C)->data);
160aa1aec99SHong Zhang   c->free_a                 = PETSC_FALSE;
161e6b907acSBarry Smith   c->free_ij                = PETSC_TRUE;
16258c24d83SHong Zhang   c->nonew                  = 0;
163002e8affSHong Zhang   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ; /* fast, needs non-scalable O(bn) array 'abdense' */
1640b7e3e3dSHong Zhang 
1657212da7cSHong Zhang   /* set MatInfo */
1667212da7cSHong Zhang   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
167f2b054eeSHong Zhang   if (afill < 1.0) afill = 1.0;
1687212da7cSHong Zhang   c->maxnz                     = ci[am];
1697212da7cSHong Zhang   c->nz                        = ci[am];
170fb908031SHong Zhang   (*C)->info.mallocs           = ndouble;
1717212da7cSHong Zhang   (*C)->info.fill_ratio_given  = fill;
1727212da7cSHong Zhang   (*C)->info.fill_ratio_needed = afill;
1737212da7cSHong Zhang 
1747212da7cSHong Zhang #if defined(PETSC_USE_INFO)
1757212da7cSHong Zhang   if (ci[am]) {
17657622a8eSBarry Smith     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
17757622a8eSBarry Smith     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
178f2b054eeSHong Zhang   } else {
179f2b054eeSHong Zhang     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
180be0fcf8dSHong Zhang   }
181f2b054eeSHong Zhang #endif
18258c24d83SHong Zhang   PetscFunctionReturn(0);
18358c24d83SHong Zhang }
184d50806bdSBarry Smith 
185dfbe8321SBarry Smith PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
186d50806bdSBarry Smith {
187dfbe8321SBarry Smith   PetscErrorCode ierr;
188d13dce4bSSatish Balay   PetscLogDouble flops=0.0;
189d50806bdSBarry Smith   Mat_SeqAIJ     *a   = (Mat_SeqAIJ*)A->data;
190d50806bdSBarry Smith   Mat_SeqAIJ     *b   = (Mat_SeqAIJ*)B->data;
191d50806bdSBarry Smith   Mat_SeqAIJ     *c   = (Mat_SeqAIJ*)C->data;
19238baddfdSBarry Smith   PetscInt       *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
193c58ca2e3SHong Zhang   PetscInt       am   =A->rmap->n,cm=C->rmap->n;
194519eb980SHong Zhang   PetscInt       i,j,k,anzi,bnzi,cnzi,brow;
195aa1aec99SHong Zhang   PetscScalar    *aa=a->a,*ba=b->a,*baj,*ca,valtmp;
196aa1aec99SHong Zhang   PetscScalar    *ab_dense;
197d50806bdSBarry Smith 
198d50806bdSBarry Smith   PetscFunctionBegin;
199aa1aec99SHong Zhang   if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
200854ce69bSBarry Smith     ierr      = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
201aa1aec99SHong Zhang     c->a      = ca;
202aa1aec99SHong Zhang     c->free_a = PETSC_TRUE;
203aa1aec99SHong Zhang   } else {
204aa1aec99SHong Zhang     ca        = c->a;
205d90d86d0SMatthew G. Knepley   }
206d90d86d0SMatthew G. Knepley   if (!c->matmult_abdense) {
2071795a4d1SJed Brown     ierr = PetscCalloc1(B->cmap->N,&ab_dense);CHKERRQ(ierr);
208d90d86d0SMatthew G. Knepley     c->matmult_abdense = ab_dense;
209d90d86d0SMatthew G. Knepley   } else {
210aa1aec99SHong Zhang     ab_dense = c->matmult_abdense;
211aa1aec99SHong Zhang   }
212aa1aec99SHong Zhang 
213c124e916SHong Zhang   /* clean old values in C */
214c124e916SHong Zhang   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
215d50806bdSBarry Smith   /* Traverse A row-wise. */
216d50806bdSBarry Smith   /* Build the ith row in C by summing over nonzero columns in A, */
217d50806bdSBarry Smith   /* the rows of B corresponding to nonzeros of A. */
218d50806bdSBarry Smith   for (i=0; i<am; i++) {
219d50806bdSBarry Smith     anzi = ai[i+1] - ai[i];
220d50806bdSBarry Smith     for (j=0; j<anzi; j++) {
221519eb980SHong Zhang       brow = aj[j];
222d50806bdSBarry Smith       bnzi = bi[brow+1] - bi[brow];
223d50806bdSBarry Smith       bjj  = bj + bi[brow];
224d50806bdSBarry Smith       baj  = ba + bi[brow];
225519eb980SHong Zhang       /* perform dense axpy */
22636ec6d2dSHong Zhang       valtmp = aa[j];
227519eb980SHong Zhang       for (k=0; k<bnzi; k++) {
22836ec6d2dSHong Zhang         ab_dense[bjj[k]] += valtmp*baj[k];
229519eb980SHong Zhang       }
230519eb980SHong Zhang       flops += 2*bnzi;
231519eb980SHong Zhang     }
232c58ca2e3SHong Zhang     aj += anzi; aa += anzi;
233c58ca2e3SHong Zhang 
234c58ca2e3SHong Zhang     cnzi = ci[i+1] - ci[i];
235519eb980SHong Zhang     for (k=0; k<cnzi; k++) {
236519eb980SHong Zhang       ca[k]          += ab_dense[cj[k]];
237519eb980SHong Zhang       ab_dense[cj[k]] = 0.0; /* zero ab_dense */
238519eb980SHong Zhang     }
239519eb980SHong Zhang     flops += cnzi;
240519eb980SHong Zhang     cj    += cnzi; ca += cnzi;
241519eb980SHong Zhang   }
242c58ca2e3SHong Zhang   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
243c58ca2e3SHong Zhang   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
244c58ca2e3SHong Zhang   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
245c58ca2e3SHong Zhang   PetscFunctionReturn(0);
246c58ca2e3SHong Zhang }
247c58ca2e3SHong Zhang 
24825023028SHong Zhang PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C)
249c58ca2e3SHong Zhang {
250c58ca2e3SHong Zhang   PetscErrorCode ierr;
251c58ca2e3SHong Zhang   PetscLogDouble flops=0.0;
252c58ca2e3SHong Zhang   Mat_SeqAIJ     *a   = (Mat_SeqAIJ*)A->data;
253c58ca2e3SHong Zhang   Mat_SeqAIJ     *b   = (Mat_SeqAIJ*)B->data;
254c58ca2e3SHong Zhang   Mat_SeqAIJ     *c   = (Mat_SeqAIJ*)C->data;
255c58ca2e3SHong Zhang   PetscInt       *ai  = a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
256c58ca2e3SHong Zhang   PetscInt       am   = A->rmap->N,cm=C->rmap->N;
257c58ca2e3SHong Zhang   PetscInt       i,j,k,anzi,bnzi,cnzi,brow;
25836ec6d2dSHong Zhang   PetscScalar    *aa=a->a,*ba=b->a,*baj,*ca=c->a,valtmp;
259c58ca2e3SHong Zhang   PetscInt       nextb;
260c58ca2e3SHong Zhang 
261c58ca2e3SHong Zhang   PetscFunctionBegin;
262cf742fccSHong Zhang   if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
263cf742fccSHong Zhang     ierr      = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
264cf742fccSHong Zhang     c->a      = ca;
265cf742fccSHong Zhang     c->free_a = PETSC_TRUE;
266cf742fccSHong Zhang   }
267cf742fccSHong Zhang 
268c58ca2e3SHong Zhang   /* clean old values in C */
269c58ca2e3SHong Zhang   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
270c58ca2e3SHong Zhang   /* Traverse A row-wise. */
271c58ca2e3SHong Zhang   /* Build the ith row in C by summing over nonzero columns in A, */
272c58ca2e3SHong Zhang   /* the rows of B corresponding to nonzeros of A. */
273519eb980SHong Zhang   for (i=0; i<am; i++) {
274519eb980SHong Zhang     anzi = ai[i+1] - ai[i];
275519eb980SHong Zhang     cnzi = ci[i+1] - ci[i];
276519eb980SHong Zhang     for (j=0; j<anzi; j++) {
277519eb980SHong Zhang       brow = aj[j];
278519eb980SHong Zhang       bnzi = bi[brow+1] - bi[brow];
279519eb980SHong Zhang       bjj  = bj + bi[brow];
280519eb980SHong Zhang       baj  = ba + bi[brow];
281519eb980SHong Zhang       /* perform sparse axpy */
28236ec6d2dSHong Zhang       valtmp = aa[j];
283c124e916SHong Zhang       nextb  = 0;
284c124e916SHong Zhang       for (k=0; nextb<bnzi; k++) {
285c124e916SHong Zhang         if (cj[k] == bjj[nextb]) { /* ccol == bcol */
28636ec6d2dSHong Zhang           ca[k] += valtmp*baj[nextb++];
287c124e916SHong Zhang         }
288d50806bdSBarry Smith       }
289d50806bdSBarry Smith       flops += 2*bnzi;
290d50806bdSBarry Smith     }
291519eb980SHong Zhang     aj += anzi; aa += anzi;
292519eb980SHong Zhang     cj += cnzi; ca += cnzi;
293d50806bdSBarry Smith   }
294c58ca2e3SHong Zhang 
295716bacf3SKris Buschelman   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
296716bacf3SKris Buschelman   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
297d50806bdSBarry Smith   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
298d50806bdSBarry Smith   PetscFunctionReturn(0);
299d50806bdSBarry Smith }
300bc011b1eSHong Zhang 
3013c50cad2SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A,Mat B,PetscReal fill,Mat *C)
30225296bd5SBarry Smith {
30325296bd5SBarry Smith   PetscErrorCode     ierr;
30425296bd5SBarry Smith   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
30525296bd5SBarry Smith   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
3063c50cad2SHong Zhang   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
30725296bd5SBarry Smith   MatScalar          *ca;
30825296bd5SBarry Smith   PetscReal          afill;
309eca6b86aSHong Zhang   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
310eca6b86aSHong Zhang   PetscTable         ta;
3110298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
31225296bd5SBarry Smith 
31325296bd5SBarry Smith   PetscFunctionBegin;
3143c50cad2SHong Zhang   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
3153c50cad2SHong Zhang   /*-----------------------------------------------------------------------------------------*/
3163c50cad2SHong Zhang   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
317854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
3183c50cad2SHong Zhang   ci[0] = 0;
3193c50cad2SHong Zhang 
3203c50cad2SHong Zhang   /* create and initialize a linked list */
321c373ccc6SHong Zhang   ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr);
322c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b,bm,ta);
323eca6b86aSHong Zhang   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
324eca6b86aSHong Zhang   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
325eca6b86aSHong Zhang 
326eca6b86aSHong Zhang   ierr = PetscLLCondensedCreate_fast(Crmax,&lnk);CHKERRQ(ierr);
3273c50cad2SHong Zhang 
3283c50cad2SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
329f91af8c7SBarry Smith   ierr          = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
3303c50cad2SHong Zhang   current_space = free_space;
3313c50cad2SHong Zhang 
3323c50cad2SHong Zhang   /* Determine ci and cj */
3333c50cad2SHong Zhang   for (i=0; i<am; i++) {
3343c50cad2SHong Zhang     anzi = ai[i+1] - ai[i];
3353c50cad2SHong Zhang     aj   = a->j + ai[i];
3363c50cad2SHong Zhang     for (j=0; j<anzi; j++) {
3373c50cad2SHong Zhang       brow = aj[j];
3383c50cad2SHong Zhang       bnzj = bi[brow+1] - bi[brow];
3393c50cad2SHong Zhang       bj   = b->j + bi[brow];
3403c50cad2SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
3413c50cad2SHong Zhang       ierr = PetscLLCondensedAddSorted_fast(bnzj,bj,lnk);CHKERRQ(ierr);
3423c50cad2SHong Zhang     }
3433c50cad2SHong Zhang     cnzi = lnk[1];
3443c50cad2SHong Zhang 
3453c50cad2SHong Zhang     /* If free space is not available, make more free space */
3463c50cad2SHong Zhang     /* Double the amount of total space in the list */
3473c50cad2SHong Zhang     if (current_space->local_remaining<cnzi) {
348f91af8c7SBarry Smith       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);CHKERRQ(ierr);
3493c50cad2SHong Zhang       ndouble++;
3503c50cad2SHong Zhang     }
3513c50cad2SHong Zhang 
3523c50cad2SHong Zhang     /* Copy data into free space, then initialize lnk */
3533c50cad2SHong Zhang     ierr = PetscLLCondensedClean_fast(cnzi,current_space->array,lnk);CHKERRQ(ierr);
3542205254eSKarl Rupp 
3553c50cad2SHong Zhang     current_space->array           += cnzi;
3563c50cad2SHong Zhang     current_space->local_used      += cnzi;
3573c50cad2SHong Zhang     current_space->local_remaining -= cnzi;
3582205254eSKarl Rupp 
3593c50cad2SHong Zhang     ci[i+1] = ci[i] + cnzi;
3603c50cad2SHong Zhang   }
3613c50cad2SHong Zhang 
3623c50cad2SHong Zhang   /* Column indices are in the list of free space */
3633c50cad2SHong Zhang   /* Allocate space for cj, initialize cj, and */
3643c50cad2SHong Zhang   /* destroy list of free space and other temporary array(s) */
365854ce69bSBarry Smith   ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr);
3663c50cad2SHong Zhang   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
3673c50cad2SHong Zhang   ierr = PetscLLCondensedDestroy_fast(lnk);CHKERRQ(ierr);
36825296bd5SBarry Smith 
36925296bd5SBarry Smith   /* Allocate space for ca */
370854ce69bSBarry Smith   ierr = PetscMalloc1(ci[am]+1,&ca);CHKERRQ(ierr);
37125296bd5SBarry Smith   ierr = PetscMemzero(ca,(ci[am]+1)*sizeof(MatScalar));CHKERRQ(ierr);
37225296bd5SBarry Smith 
37325296bd5SBarry Smith   /* put together the new symbolic matrix */
374ce94432eSBarry Smith   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,C);CHKERRQ(ierr);
37533d57670SJed Brown   ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
37602fe1965SBarry Smith   ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr);
37725296bd5SBarry Smith 
37825296bd5SBarry Smith   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
37925296bd5SBarry Smith   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
38025296bd5SBarry Smith   c          = (Mat_SeqAIJ*)((*C)->data);
38125296bd5SBarry Smith   c->free_a  = PETSC_TRUE;
38225296bd5SBarry Smith   c->free_ij = PETSC_TRUE;
38325296bd5SBarry Smith   c->nonew   = 0;
3842205254eSKarl Rupp 
38525296bd5SBarry Smith   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; /* slower, less memory */
38625296bd5SBarry Smith 
38725296bd5SBarry Smith   /* set MatInfo */
38825296bd5SBarry Smith   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
38925296bd5SBarry Smith   if (afill < 1.0) afill = 1.0;
39025296bd5SBarry Smith   c->maxnz                     = ci[am];
39125296bd5SBarry Smith   c->nz                        = ci[am];
3923c50cad2SHong Zhang   (*C)->info.mallocs           = ndouble;
39325296bd5SBarry Smith   (*C)->info.fill_ratio_given  = fill;
39425296bd5SBarry Smith   (*C)->info.fill_ratio_needed = afill;
39525296bd5SBarry Smith 
39625296bd5SBarry Smith #if defined(PETSC_USE_INFO)
39725296bd5SBarry Smith   if (ci[am]) {
39857622a8eSBarry Smith     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
39957622a8eSBarry Smith     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
40025296bd5SBarry Smith   } else {
40125296bd5SBarry Smith     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
40225296bd5SBarry Smith   }
40325296bd5SBarry Smith #endif
40425296bd5SBarry Smith   PetscFunctionReturn(0);
40525296bd5SBarry Smith }
40625296bd5SBarry Smith 
40725296bd5SBarry Smith 
40825023028SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat *C)
409e9e4536cSHong Zhang {
410e9e4536cSHong Zhang   PetscErrorCode     ierr;
411e9e4536cSHong Zhang   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
412bf9555e6SHong Zhang   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
41325c41797SHong Zhang   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
414e9e4536cSHong Zhang   MatScalar          *ca;
415e9e4536cSHong Zhang   PetscReal          afill;
416eca6b86aSHong Zhang   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
417eca6b86aSHong Zhang   PetscTable         ta;
4180298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
419e9e4536cSHong Zhang 
420e9e4536cSHong Zhang   PetscFunctionBegin;
421bd958071SHong Zhang   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
422bd958071SHong Zhang   /*---------------------------------------------------------------------------------------------*/
423bd958071SHong Zhang   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
424854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
425bd958071SHong Zhang   ci[0] = 0;
426bd958071SHong Zhang 
427bd958071SHong Zhang   /* create and initialize a linked list */
428c373ccc6SHong Zhang   ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr);
429c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b,bm,ta);
430eca6b86aSHong Zhang   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
431eca6b86aSHong Zhang   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
432eca6b86aSHong Zhang   ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);
433bd958071SHong Zhang 
434bd958071SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
435f91af8c7SBarry Smith   ierr          = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
436bd958071SHong Zhang   current_space = free_space;
437bd958071SHong Zhang 
438bd958071SHong Zhang   /* Determine ci and cj */
439bd958071SHong Zhang   for (i=0; i<am; i++) {
440bd958071SHong Zhang     anzi = ai[i+1] - ai[i];
441bd958071SHong Zhang     aj   = a->j + ai[i];
442bd958071SHong Zhang     for (j=0; j<anzi; j++) {
443bd958071SHong Zhang       brow = aj[j];
444bd958071SHong Zhang       bnzj = bi[brow+1] - bi[brow];
445bd958071SHong Zhang       bj   = b->j + bi[brow];
446bd958071SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
447bd958071SHong Zhang       ierr = PetscLLCondensedAddSorted_Scalable(bnzj,bj,lnk);CHKERRQ(ierr);
448bd958071SHong Zhang     }
449bd958071SHong Zhang     cnzi = lnk[0];
450bd958071SHong Zhang 
451bd958071SHong Zhang     /* If free space is not available, make more free space */
452bd958071SHong Zhang     /* Double the amount of total space in the list */
453bd958071SHong Zhang     if (current_space->local_remaining<cnzi) {
454f91af8c7SBarry Smith       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);CHKERRQ(ierr);
455bd958071SHong Zhang       ndouble++;
456bd958071SHong Zhang     }
457bd958071SHong Zhang 
458bd958071SHong Zhang     /* Copy data into free space, then initialize lnk */
459bd958071SHong Zhang     ierr = PetscLLCondensedClean_Scalable(cnzi,current_space->array,lnk);CHKERRQ(ierr);
4602205254eSKarl Rupp 
461bd958071SHong Zhang     current_space->array           += cnzi;
462bd958071SHong Zhang     current_space->local_used      += cnzi;
463bd958071SHong Zhang     current_space->local_remaining -= cnzi;
4642205254eSKarl Rupp 
465bd958071SHong Zhang     ci[i+1] = ci[i] + cnzi;
466bd958071SHong Zhang   }
467bd958071SHong Zhang 
468bd958071SHong Zhang   /* Column indices are in the list of free space */
469bd958071SHong Zhang   /* Allocate space for cj, initialize cj, and */
470bd958071SHong Zhang   /* destroy list of free space and other temporary array(s) */
471854ce69bSBarry Smith   ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr);
472bd958071SHong Zhang   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
473bd958071SHong Zhang   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
474e9e4536cSHong Zhang 
475e9e4536cSHong Zhang   /* Allocate space for ca */
476bd958071SHong Zhang   /*-----------------------*/
477854ce69bSBarry Smith   ierr = PetscMalloc1(ci[am]+1,&ca);CHKERRQ(ierr);
478e9e4536cSHong Zhang   ierr = PetscMemzero(ca,(ci[am]+1)*sizeof(MatScalar));CHKERRQ(ierr);
479e9e4536cSHong Zhang 
480e9e4536cSHong Zhang   /* put together the new symbolic matrix */
481ce94432eSBarry Smith   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,C);CHKERRQ(ierr);
48233d57670SJed Brown   ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
48302fe1965SBarry Smith   ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr);
484e9e4536cSHong Zhang 
485e9e4536cSHong Zhang   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
486e9e4536cSHong Zhang   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
487e9e4536cSHong Zhang   c          = (Mat_SeqAIJ*)((*C)->data);
488e9e4536cSHong Zhang   c->free_a  = PETSC_TRUE;
489e9e4536cSHong Zhang   c->free_ij = PETSC_TRUE;
490e9e4536cSHong Zhang   c->nonew   = 0;
4912205254eSKarl Rupp 
49225023028SHong Zhang   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; /* slower, less memory */
493e9e4536cSHong Zhang 
494e9e4536cSHong Zhang   /* set MatInfo */
495e9e4536cSHong Zhang   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
496e9e4536cSHong Zhang   if (afill < 1.0) afill = 1.0;
497e9e4536cSHong Zhang   c->maxnz                     = ci[am];
498e9e4536cSHong Zhang   c->nz                        = ci[am];
499bd958071SHong Zhang   (*C)->info.mallocs           = ndouble;
500e9e4536cSHong Zhang   (*C)->info.fill_ratio_given  = fill;
501e9e4536cSHong Zhang   (*C)->info.fill_ratio_needed = afill;
502e9e4536cSHong Zhang 
503e9e4536cSHong Zhang #if defined(PETSC_USE_INFO)
504e9e4536cSHong Zhang   if (ci[am]) {
50557622a8eSBarry Smith     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
50657622a8eSBarry Smith     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
507e9e4536cSHong Zhang   } else {
508e9e4536cSHong Zhang     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
509e9e4536cSHong Zhang   }
510e9e4536cSHong Zhang #endif
511e9e4536cSHong Zhang   PetscFunctionReturn(0);
512e9e4536cSHong Zhang }
513e9e4536cSHong Zhang 
5140ced3a2bSJed Brown PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A,Mat B,PetscReal fill,Mat *C)
5150ced3a2bSJed Brown {
5160ced3a2bSJed Brown   PetscErrorCode     ierr;
5170ced3a2bSJed Brown   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
5180ced3a2bSJed Brown   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j;
5190ced3a2bSJed Brown   PetscInt           *ci,*cj,*bb;
5200ced3a2bSJed Brown   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
5210ced3a2bSJed Brown   PetscReal          afill;
5220ced3a2bSJed Brown   PetscInt           i,j,col,ndouble = 0;
5230298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
5240ced3a2bSJed Brown   PetscHeap          h;
5250ced3a2bSJed Brown 
5260ced3a2bSJed Brown   PetscFunctionBegin;
527cd093f1dSJed Brown   /* Get ci and cj - by merging sorted rows using a heap */
5280ced3a2bSJed Brown   /*---------------------------------------------------------------------------------------------*/
5290ced3a2bSJed Brown   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
530854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
5310ced3a2bSJed Brown   ci[0] = 0;
5320ced3a2bSJed Brown 
5330ced3a2bSJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
534f91af8c7SBarry Smith   ierr          = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
5350ced3a2bSJed Brown   current_space = free_space;
5360ced3a2bSJed Brown 
5370ced3a2bSJed Brown   ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr);
538785e854fSJed Brown   ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr);
5390ced3a2bSJed Brown 
5400ced3a2bSJed Brown   /* Determine ci and cj */
5410ced3a2bSJed Brown   for (i=0; i<am; i++) {
5420ced3a2bSJed Brown     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
5430ced3a2bSJed Brown     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
5440ced3a2bSJed Brown     ci[i+1] = ci[i];
5450ced3a2bSJed Brown     /* Populate the min heap */
5460ced3a2bSJed Brown     for (j=0; j<anzi; j++) {
5470ced3a2bSJed Brown       bb[j] = bi[acol[j]];         /* bb points at the start of the row */
5480ced3a2bSJed Brown       if (bb[j] < bi[acol[j]+1]) { /* Add if row is nonempty */
5490ced3a2bSJed Brown         ierr = PetscHeapAdd(h,j,bj[bb[j]++]);CHKERRQ(ierr);
5500ced3a2bSJed Brown       }
5510ced3a2bSJed Brown     }
5520ced3a2bSJed Brown     /* Pick off the min element, adding it to free space */
5530ced3a2bSJed Brown     ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
5540ced3a2bSJed Brown     while (j >= 0) {
5550ced3a2bSJed Brown       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
556f91af8c7SBarry Smith         ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space);CHKERRQ(ierr);
5570ced3a2bSJed Brown         ndouble++;
5580ced3a2bSJed Brown       }
5590ced3a2bSJed Brown       *(current_space->array++) = col;
5600ced3a2bSJed Brown       current_space->local_used++;
5610ced3a2bSJed Brown       current_space->local_remaining--;
5620ced3a2bSJed Brown       ci[i+1]++;
5630ced3a2bSJed Brown 
5640ced3a2bSJed Brown       /* stash if anything else remains in this row of B */
5650ced3a2bSJed Brown       if (bb[j] < bi[acol[j]+1]) {ierr = PetscHeapStash(h,j,bj[bb[j]++]);CHKERRQ(ierr);}
5660ced3a2bSJed Brown       while (1) {               /* pop and stash any other rows of B that also had an entry in this column */
5670ced3a2bSJed Brown         PetscInt j2,col2;
5680ced3a2bSJed Brown         ierr = PetscHeapPeek(h,&j2,&col2);CHKERRQ(ierr);
5690ced3a2bSJed Brown         if (col2 != col) break;
5700ced3a2bSJed Brown         ierr = PetscHeapPop(h,&j2,&col2);CHKERRQ(ierr);
5710ced3a2bSJed Brown         if (bb[j2] < bi[acol[j2]+1]) {ierr = PetscHeapStash(h,j2,bj[bb[j2]++]);CHKERRQ(ierr);}
5720ced3a2bSJed Brown       }
5730ced3a2bSJed Brown       /* Put any stashed elements back into the min heap */
5740ced3a2bSJed Brown       ierr = PetscHeapUnstash(h);CHKERRQ(ierr);
5750ced3a2bSJed Brown       ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
5760ced3a2bSJed Brown     }
5770ced3a2bSJed Brown   }
5780ced3a2bSJed Brown   ierr = PetscFree(bb);CHKERRQ(ierr);
5790ced3a2bSJed Brown   ierr = PetscHeapDestroy(&h);CHKERRQ(ierr);
5800ced3a2bSJed Brown 
5810ced3a2bSJed Brown   /* Column indices are in the list of free space */
5820ced3a2bSJed Brown   /* Allocate space for cj, initialize cj, and */
5830ced3a2bSJed Brown   /* destroy list of free space and other temporary array(s) */
584785e854fSJed Brown   ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr);
5850ced3a2bSJed Brown   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
5860ced3a2bSJed Brown 
5870ced3a2bSJed Brown   /* put together the new symbolic matrix */
588ce94432eSBarry Smith   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr);
58933d57670SJed Brown   ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
59002fe1965SBarry Smith   ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr);
5910ced3a2bSJed Brown 
5920ced3a2bSJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
5930ced3a2bSJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
5940ced3a2bSJed Brown   c          = (Mat_SeqAIJ*)((*C)->data);
5950ced3a2bSJed Brown   c->free_a  = PETSC_TRUE;
5960ced3a2bSJed Brown   c->free_ij = PETSC_TRUE;
5970ced3a2bSJed Brown   c->nonew   = 0;
59826fbe8dcSKarl Rupp 
59989d95c1aSJed Brown   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ;
6000ced3a2bSJed Brown 
6010ced3a2bSJed Brown   /* set MatInfo */
6020ced3a2bSJed Brown   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
6030ced3a2bSJed Brown   if (afill < 1.0) afill = 1.0;
6040ced3a2bSJed Brown   c->maxnz                     = ci[am];
6050ced3a2bSJed Brown   c->nz                        = ci[am];
6060ced3a2bSJed Brown   (*C)->info.mallocs           = ndouble;
6070ced3a2bSJed Brown   (*C)->info.fill_ratio_given  = fill;
6080ced3a2bSJed Brown   (*C)->info.fill_ratio_needed = afill;
6090ced3a2bSJed Brown 
6100ced3a2bSJed Brown #if defined(PETSC_USE_INFO)
6110ced3a2bSJed Brown   if (ci[am]) {
61257622a8eSBarry Smith     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
61357622a8eSBarry Smith     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
6140ced3a2bSJed Brown   } else {
6150ced3a2bSJed Brown     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
6160ced3a2bSJed Brown   }
6170ced3a2bSJed Brown #endif
6180ced3a2bSJed Brown   PetscFunctionReturn(0);
6190ced3a2bSJed Brown }
620e9e4536cSHong Zhang 
6218a07c6f1SJed Brown PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A,Mat B,PetscReal fill,Mat *C)
6228a07c6f1SJed Brown {
6238a07c6f1SJed Brown   PetscErrorCode     ierr;
6248a07c6f1SJed Brown   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
6258a07c6f1SJed Brown   const PetscInt     *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
6268a07c6f1SJed Brown   PetscInt           *ci,*cj,*bb;
6278a07c6f1SJed Brown   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
6288a07c6f1SJed Brown   PetscReal          afill;
6298a07c6f1SJed Brown   PetscInt           i,j,col,ndouble = 0;
6300298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
6318a07c6f1SJed Brown   PetscHeap          h;
6328a07c6f1SJed Brown   PetscBT            bt;
6338a07c6f1SJed Brown 
6348a07c6f1SJed Brown   PetscFunctionBegin;
635cd093f1dSJed Brown   /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */
6368a07c6f1SJed Brown   /*---------------------------------------------------------------------------------------------*/
6378a07c6f1SJed Brown   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
638854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
6398a07c6f1SJed Brown   ci[0] = 0;
6408a07c6f1SJed Brown 
6418a07c6f1SJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
642f91af8c7SBarry Smith   ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
6432205254eSKarl Rupp 
6448a07c6f1SJed Brown   current_space = free_space;
6458a07c6f1SJed Brown 
6468a07c6f1SJed Brown   ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr);
647785e854fSJed Brown   ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr);
6488a07c6f1SJed Brown   ierr = PetscBTCreate(bn,&bt);CHKERRQ(ierr);
6498a07c6f1SJed Brown 
6508a07c6f1SJed Brown   /* Determine ci and cj */
6518a07c6f1SJed Brown   for (i=0; i<am; i++) {
6528a07c6f1SJed Brown     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
6538a07c6f1SJed Brown     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
6548a07c6f1SJed Brown     const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */
6558a07c6f1SJed Brown     ci[i+1] = ci[i];
6568a07c6f1SJed Brown     /* Populate the min heap */
6578a07c6f1SJed Brown     for (j=0; j<anzi; j++) {
6588a07c6f1SJed Brown       PetscInt brow = acol[j];
6598a07c6f1SJed Brown       for (bb[j] = bi[brow]; bb[j] < bi[brow+1]; bb[j]++) {
6608a07c6f1SJed Brown         PetscInt bcol = bj[bb[j]];
6618a07c6f1SJed Brown         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
6628a07c6f1SJed Brown           ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr);
6638a07c6f1SJed Brown           bb[j]++;
6648a07c6f1SJed Brown           break;
6658a07c6f1SJed Brown         }
6668a07c6f1SJed Brown       }
6678a07c6f1SJed Brown     }
6688a07c6f1SJed Brown     /* Pick off the min element, adding it to free space */
6698a07c6f1SJed Brown     ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
6708a07c6f1SJed Brown     while (j >= 0) {
6718a07c6f1SJed Brown       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
6720298fd71SBarry Smith         fptr = NULL;                      /* need PetscBTMemzero */
673f91af8c7SBarry Smith         ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space);CHKERRQ(ierr);
6748a07c6f1SJed Brown         ndouble++;
6758a07c6f1SJed Brown       }
6768a07c6f1SJed Brown       *(current_space->array++) = col;
6778a07c6f1SJed Brown       current_space->local_used++;
6788a07c6f1SJed Brown       current_space->local_remaining--;
6798a07c6f1SJed Brown       ci[i+1]++;
6808a07c6f1SJed Brown 
6818a07c6f1SJed Brown       /* stash if anything else remains in this row of B */
6828a07c6f1SJed Brown       for (; bb[j] < bi[acol[j]+1]; bb[j]++) {
6838a07c6f1SJed Brown         PetscInt bcol = bj[bb[j]];
6848a07c6f1SJed Brown         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
6858a07c6f1SJed Brown           ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr);
6868a07c6f1SJed Brown           bb[j]++;
6878a07c6f1SJed Brown           break;
6888a07c6f1SJed Brown         }
6898a07c6f1SJed Brown       }
6908a07c6f1SJed Brown       ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
6918a07c6f1SJed Brown     }
6928a07c6f1SJed Brown     if (fptr) {                 /* Clear the bits for this row */
6938a07c6f1SJed Brown       for (; fptr<current_space->array; fptr++) {ierr = PetscBTClear(bt,*fptr);CHKERRQ(ierr);}
6948a07c6f1SJed Brown     } else {                    /* We reallocated so we don't remember (easily) how to clear only the bits we changed */
6958a07c6f1SJed Brown       ierr = PetscBTMemzero(bn,bt);CHKERRQ(ierr);
6968a07c6f1SJed Brown     }
6978a07c6f1SJed Brown   }
6988a07c6f1SJed Brown   ierr = PetscFree(bb);CHKERRQ(ierr);
6998a07c6f1SJed Brown   ierr = PetscHeapDestroy(&h);CHKERRQ(ierr);
7008a07c6f1SJed Brown   ierr = PetscBTDestroy(&bt);CHKERRQ(ierr);
7018a07c6f1SJed Brown 
7028a07c6f1SJed Brown   /* Column indices are in the list of free space */
7038a07c6f1SJed Brown   /* Allocate space for cj, initialize cj, and */
7048a07c6f1SJed Brown   /* destroy list of free space and other temporary array(s) */
705785e854fSJed Brown   ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr);
7068a07c6f1SJed Brown   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
7078a07c6f1SJed Brown 
7088a07c6f1SJed Brown   /* put together the new symbolic matrix */
709ce94432eSBarry Smith   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr);
71033d57670SJed Brown   ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
71102fe1965SBarry Smith   ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr);
7128a07c6f1SJed Brown 
7138a07c6f1SJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
7148a07c6f1SJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
7158a07c6f1SJed Brown   c          = (Mat_SeqAIJ*)((*C)->data);
7168a07c6f1SJed Brown   c->free_a  = PETSC_TRUE;
7178a07c6f1SJed Brown   c->free_ij = PETSC_TRUE;
7188a07c6f1SJed Brown   c->nonew   = 0;
71926fbe8dcSKarl Rupp 
72089d95c1aSJed Brown   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ;
7218a07c6f1SJed Brown 
7228a07c6f1SJed Brown   /* set MatInfo */
7238a07c6f1SJed Brown   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
7248a07c6f1SJed Brown   if (afill < 1.0) afill = 1.0;
7258a07c6f1SJed Brown   c->maxnz                     = ci[am];
7268a07c6f1SJed Brown   c->nz                        = ci[am];
7278a07c6f1SJed Brown   (*C)->info.mallocs           = ndouble;
7288a07c6f1SJed Brown   (*C)->info.fill_ratio_given  = fill;
7298a07c6f1SJed Brown   (*C)->info.fill_ratio_needed = afill;
7308a07c6f1SJed Brown 
7318a07c6f1SJed Brown #if defined(PETSC_USE_INFO)
7328a07c6f1SJed Brown   if (ci[am]) {
73357622a8eSBarry Smith     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
73457622a8eSBarry Smith     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
7358a07c6f1SJed Brown   } else {
7368a07c6f1SJed Brown     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
7378a07c6f1SJed Brown   }
7388a07c6f1SJed Brown #endif
7398a07c6f1SJed Brown   PetscFunctionReturn(0);
7408a07c6f1SJed Brown }
7418a07c6f1SJed Brown 
742d7ed1a4dSandi selinger 
743d7ed1a4dSandi selinger PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A,Mat B,PetscReal fill,Mat *C)
744d7ed1a4dSandi selinger {
745d7ed1a4dSandi selinger   PetscErrorCode     ierr;
746d7ed1a4dSandi selinger   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
747d7ed1a4dSandi selinger   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j,*inputi,*inputj,*inputcol,*inputcol_L1;
748d7ed1a4dSandi selinger   PetscInt           *ci,*cj,*outputj,worki_L1[9],worki_L2[9];
749d7ed1a4dSandi selinger   PetscInt           c_maxmem,a_maxrownnz=0,a_rownnz;
750d7ed1a4dSandi selinger   const PetscInt     workcol[8] = {0,1,2,3,4,5,6,7};
751d7ed1a4dSandi selinger   const PetscInt     am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
752d7ed1a4dSandi selinger   const PetscInt     *brow_ptr[8],*brow_end[8];
753d7ed1a4dSandi selinger   PetscInt           window[8];
754d7ed1a4dSandi selinger   PetscInt           window_min,old_window_min,ci_nnz,outputi_nnz=0,L1_nrows,L2_nrows;
755d7ed1a4dSandi selinger   PetscInt           i,k,ndouble = 0,L1_rowsleft,rowsleft;
756d7ed1a4dSandi selinger   PetscReal          afill;
757d7ed1a4dSandi selinger   PetscInt           *workj_L1,*workj_L2,*workj_L3_in,*workj_L3_out;
758*7660c4dbSandi selinger   PetscInt           L1_nnz,L2_nnz;
759d7ed1a4dSandi selinger 
760d7ed1a4dSandi selinger   /* Step 1: Get upper bound on memory required for allocation.
761d7ed1a4dSandi selinger              Because of the way virtual memory works,
762d7ed1a4dSandi selinger              only the memory pages that are actually needed will be physically allocated. */
763d7ed1a4dSandi selinger   PetscFunctionBegin;
764d7ed1a4dSandi selinger   ierr = PetscMalloc1(am+1,&ci);CHKERRQ(ierr);
765d7ed1a4dSandi selinger   for (i=0; i<am; i++) {
766d7ed1a4dSandi selinger     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
767d7ed1a4dSandi selinger     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
768d7ed1a4dSandi selinger     a_rownnz = 0;
769d7ed1a4dSandi selinger     for (k=0; k<anzi; ++k) {
770d7ed1a4dSandi selinger       a_rownnz += bi[acol[k]+1] - bi[acol[k]];
771d7ed1a4dSandi selinger       if (a_rownnz > bn) {
772d7ed1a4dSandi selinger         a_rownnz = bn;
773d7ed1a4dSandi selinger         break;
774d7ed1a4dSandi selinger       }
775d7ed1a4dSandi selinger     }
776d7ed1a4dSandi selinger     a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz);
777d7ed1a4dSandi selinger   }
778d7ed1a4dSandi selinger   /* temporary work areas for merging rows */
779d7ed1a4dSandi selinger   ierr = PetscMalloc1(a_maxrownnz*8,&workj_L1);CHKERRQ(ierr);
780*7660c4dbSandi selinger   ierr = PetscMalloc1(a_maxrownnz*10,&workj_L2);CHKERRQ(ierr);
781d7ed1a4dSandi selinger   ierr = PetscMalloc1(a_maxrownnz,&workj_L3_in);CHKERRQ(ierr);
782d7ed1a4dSandi selinger   ierr = PetscMalloc1(a_maxrownnz,&workj_L3_out);CHKERRQ(ierr);
783d7ed1a4dSandi selinger 
784*7660c4dbSandi selinger   /* This should be enough for almost all matrices. If not, memory is reallocated later. */
785*7660c4dbSandi selinger   c_maxmem = 8*(ai[am]+bi[bm]);
786d7ed1a4dSandi selinger   /* Step 2: Populate pattern for C */
787d7ed1a4dSandi selinger   ierr  = PetscMalloc1(c_maxmem,&cj);CHKERRQ(ierr);
788d7ed1a4dSandi selinger 
789d7ed1a4dSandi selinger   ci_nnz       = 0;
790d7ed1a4dSandi selinger   ci[0]        = 0;
791d7ed1a4dSandi selinger   worki_L1[0]  = 0;
792d7ed1a4dSandi selinger   worki_L2[0]  = 0;
793d7ed1a4dSandi selinger   for (i=0; i<am; i++) {
794d7ed1a4dSandi selinger     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
795d7ed1a4dSandi selinger     const PetscInt *acol = aj + ai[i];      /* column indices of nonzero entries in this row */
796d7ed1a4dSandi selinger     rowsleft             = anzi;
797d7ed1a4dSandi selinger     inputcol_L1          = acol;
798d7ed1a4dSandi selinger     L2_nnz               = 0;
799*7660c4dbSandi selinger     L2_nrows             = 1;  /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1   */
800*7660c4dbSandi selinger     worki_L2[1]          = 0;
801d7ed1a4dSandi selinger 
802d7ed1a4dSandi selinger     /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem  -> allocate more memory */
803d7ed1a4dSandi selinger     while (ci_nnz+a_maxrownnz > c_maxmem) {
804d7ed1a4dSandi selinger       c_maxmem *= 2;
805d7ed1a4dSandi selinger       ndouble++;
806d7ed1a4dSandi selinger       ierr = PetscRealloc(sizeof(PetscInt)*c_maxmem,&cj);CHKERRQ(ierr);
807d7ed1a4dSandi selinger     }
808d7ed1a4dSandi selinger 
809d7ed1a4dSandi selinger     while (rowsleft) {
810d7ed1a4dSandi selinger       L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */
811d7ed1a4dSandi selinger       L1_nrows    = 0;
812*7660c4dbSandi selinger       L1_nnz      = 0;
813d7ed1a4dSandi selinger       inputcol    = inputcol_L1;
814d7ed1a4dSandi selinger       inputi      = bi;
815d7ed1a4dSandi selinger       inputj      = bj;
816d7ed1a4dSandi selinger 
817d7ed1a4dSandi selinger       /* The following macro is used to specialize for small rows in A.
818d7ed1a4dSandi selinger          This helps with compiler unrolling, improving performance substantially.
819*7660c4dbSandi selinger           Input:  inputj   inputi  L3_nnz inputcol  bn
820d7ed1a4dSandi selinger           Output: outputj  outputi_nnz                       */
821d7ed1a4dSandi selinger        #define MatMatMultSymbolic_RowMergeMacro(ANNZ)                        \
822d7ed1a4dSandi selinger          window_min  = bn;                                                   \
823*7660c4dbSandi selinger          outputi_nnz = 0;                                                    \
824*7660c4dbSandi selinger          for (k=0; k<ANNZ; ++k) {                                            \
825d7ed1a4dSandi selinger            brow_ptr[k] = inputj + inputi[inputcol[k]];                       \
826d7ed1a4dSandi selinger            brow_end[k] = inputj + inputi[inputcol[k]+1];                     \
827d7ed1a4dSandi selinger          }                                                                   \
828d7ed1a4dSandi selinger          for (k=0; k<ANNZ; ++k) {                                            \
829d7ed1a4dSandi selinger            window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn;     \
830d7ed1a4dSandi selinger            window_min = PetscMin(window[k], window_min);                     \
831d7ed1a4dSandi selinger          }                                                                   \
832d7ed1a4dSandi selinger          while (window_min < bn) {                                           \
833d7ed1a4dSandi selinger            outputj[outputi_nnz++] = window_min;                              \
834d7ed1a4dSandi selinger            /* advance front and compute new minimum */                       \
835d7ed1a4dSandi selinger            old_window_min = window_min;                                      \
836d7ed1a4dSandi selinger            window_min = bn;                                                  \
837d7ed1a4dSandi selinger            for (k=0; k<ANNZ; ++k) {                                          \
838d7ed1a4dSandi selinger              if (window[k] == old_window_min) {                              \
839d7ed1a4dSandi selinger                brow_ptr[k]++;                                                \
840d7ed1a4dSandi selinger                window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
841d7ed1a4dSandi selinger              }                                                               \
842d7ed1a4dSandi selinger              window_min = PetscMin(window[k], window_min);                   \
843d7ed1a4dSandi selinger            }                                                                 \
844d7ed1a4dSandi selinger          }
845d7ed1a4dSandi selinger 
846d7ed1a4dSandi selinger       /************** L E V E L  1 ***************/
847d7ed1a4dSandi selinger       /* Merge up to 8 rows of B to L1 work array*/
848d7ed1a4dSandi selinger       while (L1_rowsleft) {
849*7660c4dbSandi selinger         outputi_nnz = 0;
850*7660c4dbSandi selinger         if (anzi > 8)  outputj = workj_L1 + L1_nnz;     /* Level 1 rowmerge*/
851*7660c4dbSandi selinger         else           outputj = cj + ci_nnz;           /* Merge directly to C */
852*7660c4dbSandi selinger 
853d7ed1a4dSandi selinger         switch (L1_rowsleft) {
854d7ed1a4dSandi selinger         case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
855d7ed1a4dSandi selinger                  brow_end[0] = inputj + inputi[inputcol[0]+1];
856d7ed1a4dSandi selinger                  for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
857d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
858d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
859d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
860d7ed1a4dSandi selinger                  break;
861d7ed1a4dSandi selinger         case 2:  MatMatMultSymbolic_RowMergeMacro(2);
862d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
863d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
864d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
865d7ed1a4dSandi selinger                  break;
866d7ed1a4dSandi selinger         case 3: MatMatMultSymbolic_RowMergeMacro(3);
867d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
868d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
869d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
870d7ed1a4dSandi selinger                  break;
871d7ed1a4dSandi selinger         case 4:  MatMatMultSymbolic_RowMergeMacro(4);
872d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
873d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
874d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
875d7ed1a4dSandi selinger                  break;
876d7ed1a4dSandi selinger         case 5:  MatMatMultSymbolic_RowMergeMacro(5);
877d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
878d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
879d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
880d7ed1a4dSandi selinger                  break;
881d7ed1a4dSandi selinger         case 6:  MatMatMultSymbolic_RowMergeMacro(6);
882d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
883d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
884d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
885d7ed1a4dSandi selinger                  break;
886d7ed1a4dSandi selinger         case 7:  MatMatMultSymbolic_RowMergeMacro(7);
887d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
888d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
889d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
890d7ed1a4dSandi selinger                  break;
891d7ed1a4dSandi selinger         default: MatMatMultSymbolic_RowMergeMacro(8);
892d7ed1a4dSandi selinger                  inputcol    += 8;
893d7ed1a4dSandi selinger                  rowsleft    -= 8;
894d7ed1a4dSandi selinger                  L1_rowsleft -= 8;
895d7ed1a4dSandi selinger                  break;
896d7ed1a4dSandi selinger         }
897d7ed1a4dSandi selinger         inputcol_L1           = inputcol;
898*7660c4dbSandi selinger         L1_nnz               += outputi_nnz;
899*7660c4dbSandi selinger         worki_L1[++L1_nrows]  = L1_nnz;
900d7ed1a4dSandi selinger       }
901d7ed1a4dSandi selinger 
902d7ed1a4dSandi selinger       /********************** L E V E L  2 ************************/
903d7ed1a4dSandi selinger       /* Merge from L1 work array to either C or to L2 work array */
904d7ed1a4dSandi selinger       if (anzi > 8) {
905d7ed1a4dSandi selinger         inputi      = worki_L1;
906d7ed1a4dSandi selinger         inputj      = workj_L1;
907d7ed1a4dSandi selinger         inputcol    = workcol;
908d7ed1a4dSandi selinger         outputi_nnz = 0;
909d7ed1a4dSandi selinger 
910d7ed1a4dSandi selinger         if (anzi <= 64) outputj = cj + ci_nnz;        /* Merge from L1 work array to C */
911d7ed1a4dSandi selinger         else            outputj = workj_L2 + L2_nnz;  /* Merge from L1 work array to L2 work array */
912d7ed1a4dSandi selinger 
913d7ed1a4dSandi selinger         switch (L1_nrows) {
914d7ed1a4dSandi selinger         case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
915d7ed1a4dSandi selinger                  brow_end[0] = inputj + inputi[inputcol[0]+1];
916d7ed1a4dSandi selinger                  for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
917d7ed1a4dSandi selinger                  break;
918d7ed1a4dSandi selinger         case 2:  MatMatMultSymbolic_RowMergeMacro(2); break;
919d7ed1a4dSandi selinger         case 3:  MatMatMultSymbolic_RowMergeMacro(3); break;
920d7ed1a4dSandi selinger         case 4:  MatMatMultSymbolic_RowMergeMacro(4); break;
921d7ed1a4dSandi selinger         case 5:  MatMatMultSymbolic_RowMergeMacro(5); break;
922d7ed1a4dSandi selinger         case 6:  MatMatMultSymbolic_RowMergeMacro(6); break;
923d7ed1a4dSandi selinger         case 7:  MatMatMultSymbolic_RowMergeMacro(7); break;
924d7ed1a4dSandi selinger         case 8:  MatMatMultSymbolic_RowMergeMacro(8); break;
925d7ed1a4dSandi selinger         default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L1 work array!");
926d7ed1a4dSandi selinger         }
927d7ed1a4dSandi selinger         L2_nnz               += outputi_nnz;
928d7ed1a4dSandi selinger         worki_L2[++L2_nrows]  = L2_nnz;
929d7ed1a4dSandi selinger 
930*7660c4dbSandi selinger         /************************ L E V E L  3 **********************/
931*7660c4dbSandi selinger         /* Merge from L2 work array to either C or to L2 work array */
932d7ed1a4dSandi selinger         if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) {
933d7ed1a4dSandi selinger           inputi      = worki_L2;
934d7ed1a4dSandi selinger           inputj      = workj_L2;
935d7ed1a4dSandi selinger           inputcol    = workcol;
936d7ed1a4dSandi selinger           outputi_nnz = 0;
937d7ed1a4dSandi selinger           if (rowsleft) outputj = workj_L3_out;
938d7ed1a4dSandi selinger           else          outputj = cj + ci_nnz;
939d7ed1a4dSandi selinger           switch (L2_nrows) {
940d7ed1a4dSandi selinger           case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
941d7ed1a4dSandi selinger                    brow_end[0] = inputj + inputi[inputcol[0]+1];
942d7ed1a4dSandi selinger                    for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
943d7ed1a4dSandi selinger                    break;
944d7ed1a4dSandi selinger           case 2:  MatMatMultSymbolic_RowMergeMacro(2); break;
945d7ed1a4dSandi selinger           case 3:  MatMatMultSymbolic_RowMergeMacro(3); break;
946d7ed1a4dSandi selinger           case 4:  MatMatMultSymbolic_RowMergeMacro(4); break;
947d7ed1a4dSandi selinger           case 5:  MatMatMultSymbolic_RowMergeMacro(5); break;
948d7ed1a4dSandi selinger           case 6:  MatMatMultSymbolic_RowMergeMacro(6); break;
949d7ed1a4dSandi selinger           case 7:  MatMatMultSymbolic_RowMergeMacro(7); break;
950d7ed1a4dSandi selinger           case 8:  MatMatMultSymbolic_RowMergeMacro(8); break;
951d7ed1a4dSandi selinger           default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L2 work array!");
952d7ed1a4dSandi selinger           }
953d7ed1a4dSandi selinger           L2_nrows    = 1;
954*7660c4dbSandi selinger           L2_nnz      = outputi_nnz;
955*7660c4dbSandi selinger           worki_L2[1] = outputi_nnz;
956*7660c4dbSandi selinger           /* Copy to workj_L2 */
957d7ed1a4dSandi selinger           if (rowsleft) {
958*7660c4dbSandi selinger             for (k=0; k<outputi_nnz; ++k)  workj_L2[k] = outputj[k];
959d7ed1a4dSandi selinger           }
960d7ed1a4dSandi selinger         }
961d7ed1a4dSandi selinger       }
962d7ed1a4dSandi selinger     }  /* while (rowsleft) */
963d7ed1a4dSandi selinger #undef MatMatMultSymbolic_RowMergeMacro
964d7ed1a4dSandi selinger 
965d7ed1a4dSandi selinger     /* terminate current row */
966d7ed1a4dSandi selinger     ci_nnz += outputi_nnz;
967d7ed1a4dSandi selinger     ci[i+1] = ci_nnz;
968d7ed1a4dSandi selinger   }
969d7ed1a4dSandi selinger 
970d7ed1a4dSandi selinger   /* Step 3: Create the new symbolic matrix */
971d7ed1a4dSandi selinger   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr);
972d7ed1a4dSandi selinger   ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
973d7ed1a4dSandi selinger 
974d7ed1a4dSandi selinger   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
975d7ed1a4dSandi selinger   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
976d7ed1a4dSandi selinger   c          = (Mat_SeqAIJ*)((*C)->data);
977d7ed1a4dSandi selinger   c->free_a  = PETSC_TRUE;
978d7ed1a4dSandi selinger   c->free_ij = PETSC_TRUE;
979d7ed1a4dSandi selinger   c->nonew   = 0;
980d7ed1a4dSandi selinger 
981d7ed1a4dSandi selinger   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ;
982d7ed1a4dSandi selinger 
983d7ed1a4dSandi selinger   /* set MatInfo */
984d7ed1a4dSandi selinger   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
985d7ed1a4dSandi selinger   if (afill < 1.0) afill = 1.0;
986d7ed1a4dSandi selinger   c->maxnz                     = ci[am];
987d7ed1a4dSandi selinger   c->nz                        = ci[am];
988d7ed1a4dSandi selinger   (*C)->info.mallocs           = ndouble;
989d7ed1a4dSandi selinger   (*C)->info.fill_ratio_given  = fill;
990d7ed1a4dSandi selinger   (*C)->info.fill_ratio_needed = afill;
991d7ed1a4dSandi selinger 
992d7ed1a4dSandi selinger #if defined(PETSC_USE_INFO)
993d7ed1a4dSandi selinger   if (ci[am]) {
994d7ed1a4dSandi selinger     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
995d7ed1a4dSandi selinger     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
996d7ed1a4dSandi selinger   } else {
997d7ed1a4dSandi selinger     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
998d7ed1a4dSandi selinger   }
999d7ed1a4dSandi selinger #endif
1000d7ed1a4dSandi selinger 
1001d7ed1a4dSandi selinger   /* Step 4: Free temporary work areas */
1002d7ed1a4dSandi selinger   ierr = PetscFree(workj_L1);CHKERRQ(ierr);
1003d7ed1a4dSandi selinger   ierr = PetscFree(workj_L2);CHKERRQ(ierr);
1004d7ed1a4dSandi selinger   ierr = PetscFree(workj_L3_in);CHKERRQ(ierr);
1005d7ed1a4dSandi selinger   ierr = PetscFree(workj_L3_out);CHKERRQ(ierr);
1006d7ed1a4dSandi selinger   PetscFunctionReturn(0);
1007d7ed1a4dSandi selinger }
1008d7ed1a4dSandi selinger 
1009cd093f1dSJed Brown /* concatenate unique entries and then sort */
101058cf0668SJed Brown PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
1011cd093f1dSJed Brown {
1012cd093f1dSJed Brown   PetscErrorCode     ierr;
1013cd093f1dSJed Brown   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
1014cd093f1dSJed Brown   const PetscInt     *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
1015cd093f1dSJed Brown   PetscInt           *ci,*cj;
1016cd093f1dSJed Brown   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
1017cd093f1dSJed Brown   PetscReal          afill;
1018cd093f1dSJed Brown   PetscInt           i,j,ndouble = 0;
1019cd093f1dSJed Brown   PetscSegBuffer     seg,segrow;
1020cd093f1dSJed Brown   char               *seen;
1021cd093f1dSJed Brown 
1022cd093f1dSJed Brown   PetscFunctionBegin;
1023854ce69bSBarry Smith   ierr  = PetscMalloc1(am+1,&ci);CHKERRQ(ierr);
1024cd093f1dSJed Brown   ci[0] = 0;
1025cd093f1dSJed Brown 
1026cd093f1dSJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
1027cd093f1dSJed Brown   ierr = PetscSegBufferCreate(sizeof(PetscInt),(PetscInt)(fill*(ai[am]+bi[bm])),&seg);CHKERRQ(ierr);
1028cd093f1dSJed Brown   ierr = PetscSegBufferCreate(sizeof(PetscInt),100,&segrow);CHKERRQ(ierr);
1029785e854fSJed Brown   ierr = PetscMalloc1(bn,&seen);CHKERRQ(ierr);
1030cd093f1dSJed Brown   ierr = PetscMemzero(seen,bn*sizeof(char));CHKERRQ(ierr);
1031cd093f1dSJed Brown 
1032cd093f1dSJed Brown   /* Determine ci and cj */
1033cd093f1dSJed Brown   for (i=0; i<am; i++) {
1034cd093f1dSJed Brown     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
1035cd093f1dSJed Brown     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
1036cd093f1dSJed Brown     PetscInt packlen = 0,*PETSC_RESTRICT crow;
1037cd093f1dSJed Brown     /* Pack segrow */
1038cd093f1dSJed Brown     for (j=0; j<anzi; j++) {
1039cd093f1dSJed Brown       PetscInt brow = acol[j],bjstart = bi[brow],bjend = bi[brow+1],k;
1040cd093f1dSJed Brown       for (k=bjstart; k<bjend; k++) {
1041cd093f1dSJed Brown         PetscInt bcol = bj[k];
1042cd093f1dSJed Brown         if (!seen[bcol]) { /* new entry */
1043cd093f1dSJed Brown           PetscInt *PETSC_RESTRICT slot;
1044cd093f1dSJed Brown           ierr = PetscSegBufferGetInts(segrow,1,&slot);CHKERRQ(ierr);
1045cd093f1dSJed Brown           *slot = bcol;
1046cd093f1dSJed Brown           seen[bcol] = 1;
1047cd093f1dSJed Brown           packlen++;
1048cd093f1dSJed Brown         }
1049cd093f1dSJed Brown       }
1050cd093f1dSJed Brown     }
1051cd093f1dSJed Brown     ierr = PetscSegBufferGetInts(seg,packlen,&crow);CHKERRQ(ierr);
1052cd093f1dSJed Brown     ierr = PetscSegBufferExtractTo(segrow,crow);CHKERRQ(ierr);
1053cd093f1dSJed Brown     ierr = PetscSortInt(packlen,crow);CHKERRQ(ierr);
1054cd093f1dSJed Brown     ci[i+1] = ci[i] + packlen;
1055cd093f1dSJed Brown     for (j=0; j<packlen; j++) seen[crow[j]] = 0;
1056cd093f1dSJed Brown   }
1057cd093f1dSJed Brown   ierr = PetscSegBufferDestroy(&segrow);CHKERRQ(ierr);
1058cd093f1dSJed Brown   ierr = PetscFree(seen);CHKERRQ(ierr);
1059cd093f1dSJed Brown 
1060cd093f1dSJed Brown   /* Column indices are in the segmented buffer */
1061cd093f1dSJed Brown   ierr = PetscSegBufferExtractAlloc(seg,&cj);CHKERRQ(ierr);
1062cd093f1dSJed Brown   ierr = PetscSegBufferDestroy(&seg);CHKERRQ(ierr);
1063cd093f1dSJed Brown 
1064cd093f1dSJed Brown   /* put together the new symbolic matrix */
1065cd093f1dSJed Brown   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr);
106633d57670SJed Brown   ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
106702fe1965SBarry Smith   ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr);
1068cd093f1dSJed Brown 
1069cd093f1dSJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1070cd093f1dSJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1071cd093f1dSJed Brown   c          = (Mat_SeqAIJ*)((*C)->data);
1072cd093f1dSJed Brown   c->free_a  = PETSC_TRUE;
1073cd093f1dSJed Brown   c->free_ij = PETSC_TRUE;
1074cd093f1dSJed Brown   c->nonew   = 0;
1075cd093f1dSJed Brown 
1076cd093f1dSJed Brown   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ;
1077cd093f1dSJed Brown 
1078cd093f1dSJed Brown   /* set MatInfo */
1079cd093f1dSJed Brown   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
1080cd093f1dSJed Brown   if (afill < 1.0) afill = 1.0;
1081cd093f1dSJed Brown   c->maxnz                     = ci[am];
1082cd093f1dSJed Brown   c->nz                        = ci[am];
1083cd093f1dSJed Brown   (*C)->info.mallocs           = ndouble;
1084cd093f1dSJed Brown   (*C)->info.fill_ratio_given  = fill;
1085cd093f1dSJed Brown   (*C)->info.fill_ratio_needed = afill;
1086cd093f1dSJed Brown 
1087cd093f1dSJed Brown #if defined(PETSC_USE_INFO)
1088cd093f1dSJed Brown   if (ci[am]) {
108957622a8eSBarry Smith     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
109057622a8eSBarry Smith     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
1091cd093f1dSJed Brown   } else {
1092cd093f1dSJed Brown     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
1093cd093f1dSJed Brown   }
1094cd093f1dSJed Brown #endif
1095cd093f1dSJed Brown   PetscFunctionReturn(0);
1096cd093f1dSJed Brown }
1097cd093f1dSJed Brown 
1098d2853540SHong Zhang /* This routine is not used. Should be removed! */
10996fc122caSHong Zhang PetscErrorCode MatMatTransposeMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
11005df89d91SHong Zhang {
1101bc011b1eSHong Zhang   PetscErrorCode ierr;
1102bc011b1eSHong Zhang 
1103bc011b1eSHong Zhang   PetscFunctionBegin;
1104bc011b1eSHong Zhang   if (scall == MAT_INITIAL_MATRIX) {
11053ff4c91cSHong Zhang     ierr = PetscLogEventBegin(MAT_MatTransposeMultSymbolic,A,B,0,0);CHKERRQ(ierr);
11066fc122caSHong Zhang     ierr = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
11073ff4c91cSHong Zhang     ierr = PetscLogEventEnd(MAT_MatTransposeMultSymbolic,A,B,0,0);CHKERRQ(ierr);
1108bc011b1eSHong Zhang   }
11093ff4c91cSHong Zhang   ierr = PetscLogEventBegin(MAT_MatTransposeMultNumeric,A,B,0,0);CHKERRQ(ierr);
11106fc122caSHong Zhang   ierr = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(A,B,*C);CHKERRQ(ierr);
11113ff4c91cSHong Zhang   ierr = PetscLogEventEnd(MAT_MatTransposeMultNumeric,A,B,0,0);CHKERRQ(ierr);
1112bc011b1eSHong Zhang   PetscFunctionReturn(0);
1113bc011b1eSHong Zhang }
1114bc011b1eSHong Zhang 
11152128a86cSHong Zhang PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(Mat A)
11162128a86cSHong Zhang {
11172128a86cSHong Zhang   PetscErrorCode      ierr;
11184c7df5ccSHong Zhang   Mat_SeqAIJ          *a=(Mat_SeqAIJ*)A->data;
111940192850SHong Zhang   Mat_MatMatTransMult *abt=a->abt;
11202128a86cSHong Zhang 
11212128a86cSHong Zhang   PetscFunctionBegin;
112240192850SHong Zhang   ierr = (abt->destroy)(A);CHKERRQ(ierr);
112340192850SHong Zhang   ierr = MatTransposeColoringDestroy(&abt->matcoloring);CHKERRQ(ierr);
112440192850SHong Zhang   ierr = MatDestroy(&abt->Bt_den);CHKERRQ(ierr);
112540192850SHong Zhang   ierr = MatDestroy(&abt->ABt_den);CHKERRQ(ierr);
112640192850SHong Zhang   ierr = PetscFree(abt);CHKERRQ(ierr);
11272128a86cSHong Zhang   PetscFunctionReturn(0);
11282128a86cSHong Zhang }
11292128a86cSHong Zhang 
11306fc122caSHong Zhang PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
1131bc011b1eSHong Zhang {
11325df89d91SHong Zhang   PetscErrorCode      ierr;
113381d82fe4SHong Zhang   Mat                 Bt;
113481d82fe4SHong Zhang   PetscInt            *bti,*btj;
113540192850SHong Zhang   Mat_MatMatTransMult *abt;
11364c7df5ccSHong Zhang   Mat_SeqAIJ          *c;
1137d2853540SHong Zhang 
113881d82fe4SHong Zhang   PetscFunctionBegin;
113981d82fe4SHong Zhang   /* create symbolic Bt */
114081d82fe4SHong Zhang   ierr = MatGetSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
11410298fd71SBarry Smith   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,B->cmap->n,B->rmap->n,bti,btj,NULL,&Bt);CHKERRQ(ierr);
114233d57670SJed Brown   ierr = MatSetBlockSizes(Bt,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr);
114304b858e0SBarry Smith   ierr = MatSetType(Bt,((PetscObject)A)->type_name);CHKERRQ(ierr);
114481d82fe4SHong Zhang 
114581d82fe4SHong Zhang   /* get symbolic C=A*Bt */
114681d82fe4SHong Zhang   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C);CHKERRQ(ierr);
114781d82fe4SHong Zhang 
11482128a86cSHong Zhang   /* create a supporting struct for reuse intermidiate dense matrices with matcoloring */
1149b00a9115SJed Brown   ierr   = PetscNew(&abt);CHKERRQ(ierr);
11504c7df5ccSHong Zhang   c      = (Mat_SeqAIJ*)(*C)->data;
115140192850SHong Zhang   c->abt = abt;
11522128a86cSHong Zhang 
115340192850SHong Zhang   abt->usecoloring = PETSC_FALSE;
115440192850SHong Zhang   abt->destroy     = (*C)->ops->destroy;
11552128a86cSHong Zhang   (*C)->ops->destroy     = MatDestroy_SeqAIJ_MatMatMultTrans;
11562128a86cSHong Zhang 
1157c5929fdfSBarry Smith   ierr = PetscOptionsGetBool(((PetscObject)A)->options,NULL,"-matmattransmult_color",&abt->usecoloring,NULL);CHKERRQ(ierr);
115840192850SHong Zhang   if (abt->usecoloring) {
1159b9af6bddSHong Zhang     /* Create MatTransposeColoring from symbolic C=A*B^T */
1160b9af6bddSHong Zhang     MatTransposeColoring matcoloring;
1161335efc43SPeter Brune     MatColoring          coloring;
11622128a86cSHong Zhang     ISColoring           iscoloring;
11632128a86cSHong Zhang     Mat                  Bt_dense,C_dense;
11644d478ae7SHong Zhang     Mat_SeqAIJ           *c=(Mat_SeqAIJ*)(*C)->data;
11654d478ae7SHong Zhang     /* inode causes memory problem, don't know why */
11664d478ae7SHong Zhang     if (c->inode.use) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_USE_INODES is not supported. Use '-mat_no_inode'");
1167e8354b3eSHong Zhang 
1168335efc43SPeter Brune     ierr = MatColoringCreate(*C,&coloring);CHKERRQ(ierr);
1169335efc43SPeter Brune     ierr = MatColoringSetDistance(coloring,2);CHKERRQ(ierr);
1170335efc43SPeter Brune     ierr = MatColoringSetType(coloring,MATCOLORINGSL);CHKERRQ(ierr);
1171335efc43SPeter Brune     ierr = MatColoringSetFromOptions(coloring);CHKERRQ(ierr);
1172335efc43SPeter Brune     ierr = MatColoringApply(coloring,&iscoloring);CHKERRQ(ierr);
1173335efc43SPeter Brune     ierr = MatColoringDestroy(&coloring);CHKERRQ(ierr);
1174b9af6bddSHong Zhang     ierr = MatTransposeColoringCreate(*C,iscoloring,&matcoloring);CHKERRQ(ierr);
11752205254eSKarl Rupp 
117640192850SHong Zhang     abt->matcoloring = matcoloring;
11772205254eSKarl Rupp 
11782128a86cSHong Zhang     ierr = ISColoringDestroy(&iscoloring);CHKERRQ(ierr);
11792128a86cSHong Zhang 
11802128a86cSHong Zhang     /* Create Bt_dense and C_dense = A*Bt_dense */
11812128a86cSHong Zhang     ierr = MatCreate(PETSC_COMM_SELF,&Bt_dense);CHKERRQ(ierr);
11822128a86cSHong Zhang     ierr = MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors);CHKERRQ(ierr);
11832128a86cSHong Zhang     ierr = MatSetType(Bt_dense,MATSEQDENSE);CHKERRQ(ierr);
11840298fd71SBarry Smith     ierr = MatSeqDenseSetPreallocation(Bt_dense,NULL);CHKERRQ(ierr);
11852205254eSKarl Rupp 
11862128a86cSHong Zhang     Bt_dense->assembled = PETSC_TRUE;
118740192850SHong Zhang     abt->Bt_den   = Bt_dense;
11882128a86cSHong Zhang 
11892128a86cSHong Zhang     ierr = MatCreate(PETSC_COMM_SELF,&C_dense);CHKERRQ(ierr);
11902128a86cSHong Zhang     ierr = MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors);CHKERRQ(ierr);
11912128a86cSHong Zhang     ierr = MatSetType(C_dense,MATSEQDENSE);CHKERRQ(ierr);
11920298fd71SBarry Smith     ierr = MatSeqDenseSetPreallocation(C_dense,NULL);CHKERRQ(ierr);
11932205254eSKarl Rupp 
11942128a86cSHong Zhang     Bt_dense->assembled = PETSC_TRUE;
119540192850SHong Zhang     abt->ABt_den  = C_dense;
1196f94ccd6cSHong Zhang 
1197f94ccd6cSHong Zhang #if defined(PETSC_USE_INFO)
11981ce71dffSSatish Balay     {
1199f94ccd6cSHong Zhang       Mat_SeqAIJ *c = (Mat_SeqAIJ*)(*C)->data;
1200c40ebe3bSHong Zhang       ierr = PetscInfo7(*C,"Use coloring of C=A*B^T; B^T: %D %D, Bt_dense: %D,%D; Cnz %D / (cm*ncolors %D) = %g\n",B->cmap->n,B->rmap->n,Bt_dense->rmap->n,Bt_dense->cmap->n,c->nz,A->rmap->n*matcoloring->ncolors,(PetscReal)(c->nz)/(A->rmap->n*matcoloring->ncolors));CHKERRQ(ierr);
12011ce71dffSSatish Balay     }
1202f94ccd6cSHong Zhang #endif
12032128a86cSHong Zhang   }
1204e99be685SHong Zhang   /* clean up */
1205e99be685SHong Zhang   ierr = MatDestroy(&Bt);CHKERRQ(ierr);
1206e99be685SHong Zhang   ierr = MatRestoreSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
12075df89d91SHong Zhang   PetscFunctionReturn(0);
12085df89d91SHong Zhang }
12095df89d91SHong Zhang 
12106fc122caSHong Zhang PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
12115df89d91SHong Zhang {
12125df89d91SHong Zhang   PetscErrorCode      ierr;
12135df89d91SHong Zhang   Mat_SeqAIJ          *a   =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1214e2cac8caSJed Brown   PetscInt            *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow;
121581d82fe4SHong Zhang   PetscInt            cm   =C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol;
12165df89d91SHong Zhang   PetscLogDouble      flops=0.0;
1217aa1aec99SHong Zhang   MatScalar           *aa  =a->a,*aval,*ba=b->a,*bval,*ca,*cval;
121840192850SHong Zhang   Mat_MatMatTransMult *abt = c->abt;
12195df89d91SHong Zhang 
12205df89d91SHong Zhang   PetscFunctionBegin;
122158ed3ceaSHong Zhang   /* clear old values in C */
122258ed3ceaSHong Zhang   if (!c->a) {
1223854ce69bSBarry Smith     ierr      = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
122458ed3ceaSHong Zhang     c->a      = ca;
122558ed3ceaSHong Zhang     c->free_a = PETSC_TRUE;
122658ed3ceaSHong Zhang   } else {
122758ed3ceaSHong Zhang     ca =  c->a;
122858ed3ceaSHong Zhang   }
122958ed3ceaSHong Zhang   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
123058ed3ceaSHong Zhang 
123140192850SHong Zhang   if (abt->usecoloring) {
123240192850SHong Zhang     MatTransposeColoring matcoloring = abt->matcoloring;
123340192850SHong Zhang     Mat                  Bt_dense,C_dense = abt->ABt_den;
1234c8db22f6SHong Zhang 
1235b9af6bddSHong Zhang     /* Get Bt_dense by Apply MatTransposeColoring to B */
123640192850SHong Zhang     Bt_dense = abt->Bt_den;
1237b9af6bddSHong Zhang     ierr = MatTransColoringApplySpToDen(matcoloring,B,Bt_dense);CHKERRQ(ierr);
1238c8db22f6SHong Zhang 
1239c8db22f6SHong Zhang     /* C_dense = A*Bt_dense */
12402128a86cSHong Zhang     ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense);CHKERRQ(ierr);
1241c8db22f6SHong Zhang 
12422128a86cSHong Zhang     /* Recover C from C_dense */
1243b9af6bddSHong Zhang     ierr = MatTransColoringApplyDenToSp(matcoloring,C_dense,C);CHKERRQ(ierr);
1244c8db22f6SHong Zhang     PetscFunctionReturn(0);
1245c8db22f6SHong Zhang   }
1246c8db22f6SHong Zhang 
12471fa1209cSHong Zhang   for (i=0; i<cm; i++) {
124881d82fe4SHong Zhang     anzi = ai[i+1] - ai[i];
124981d82fe4SHong Zhang     acol = aj + ai[i];
12506973769bSHong Zhang     aval = aa + ai[i];
12511fa1209cSHong Zhang     cnzi = ci[i+1] - ci[i];
12521fa1209cSHong Zhang     ccol = cj + ci[i];
12536973769bSHong Zhang     cval = ca + ci[i];
12541fa1209cSHong Zhang     for (j=0; j<cnzi; j++) {
125581d82fe4SHong Zhang       brow = ccol[j];
125681d82fe4SHong Zhang       bnzj = bi[brow+1] - bi[brow];
125781d82fe4SHong Zhang       bcol = bj + bi[brow];
12586973769bSHong Zhang       bval = ba + bi[brow];
12596973769bSHong Zhang 
126081d82fe4SHong Zhang       /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
126181d82fe4SHong Zhang       nexta = 0; nextb = 0;
126281d82fe4SHong Zhang       while (nexta<anzi && nextb<bnzj) {
12637b6d5e96SMark Adams         while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++;
126481d82fe4SHong Zhang         if (nexta == anzi) break;
12657b6d5e96SMark Adams         while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++;
126681d82fe4SHong Zhang         if (nextb == bnzj) break;
126781d82fe4SHong Zhang         if (acol[nexta] == bcol[nextb]) {
12686973769bSHong Zhang           cval[j] += aval[nexta]*bval[nextb];
126981d82fe4SHong Zhang           nexta++; nextb++;
127081d82fe4SHong Zhang           flops += 2;
12711fa1209cSHong Zhang         }
12721fa1209cSHong Zhang       }
127381d82fe4SHong Zhang     }
127481d82fe4SHong Zhang   }
127581d82fe4SHong Zhang   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
127681d82fe4SHong Zhang   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
127781d82fe4SHong Zhang   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
12785df89d91SHong Zhang   PetscFunctionReturn(0);
12795df89d91SHong Zhang }
12805df89d91SHong Zhang 
12816d373c3eSHong Zhang PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(Mat A)
12826d373c3eSHong Zhang {
12836d373c3eSHong Zhang   PetscErrorCode      ierr;
12846d373c3eSHong Zhang   Mat_SeqAIJ          *a = (Mat_SeqAIJ*)A->data;
12856d373c3eSHong Zhang   Mat_MatTransMatMult *atb = a->atb;
12866d373c3eSHong Zhang 
12876d373c3eSHong Zhang   PetscFunctionBegin;
12886d373c3eSHong Zhang   ierr = MatDestroy(&atb->At);CHKERRQ(ierr);
12896d373c3eSHong Zhang   ierr = (atb->destroy)(A);CHKERRQ(ierr);
12906d373c3eSHong Zhang   ierr = PetscFree(atb);CHKERRQ(ierr);
12916d373c3eSHong Zhang   PetscFunctionReturn(0);
12926d373c3eSHong Zhang }
12936d373c3eSHong Zhang 
12940adebc6cSBarry Smith PetscErrorCode MatTransposeMatMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
12950adebc6cSBarry Smith {
12965df89d91SHong Zhang   PetscErrorCode      ierr;
12976d373c3eSHong Zhang   const char          *algTypes[2] = {"matmatmult","outerproduct"};
12986d373c3eSHong Zhang   PetscInt            alg=0; /* set default algorithm */
12996d373c3eSHong Zhang   Mat                 At;
13006d373c3eSHong Zhang   Mat_MatTransMatMult *atb;
13016d373c3eSHong Zhang   Mat_SeqAIJ          *c;
13025df89d91SHong Zhang 
13035df89d91SHong Zhang   PetscFunctionBegin;
13045df89d91SHong Zhang   if (scall == MAT_INITIAL_MATRIX) {
13056d373c3eSHong Zhang     ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
13066d373c3eSHong Zhang     PetscOptionsObject->alreadyprinted = PETSC_FALSE; /* a hack to ensure the option shows in '-help' */
13076d373c3eSHong Zhang     ierr = PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,2,algTypes[0],&alg,NULL);CHKERRQ(ierr);
13086d373c3eSHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
13096d373c3eSHong Zhang 
13106d373c3eSHong Zhang     switch (alg) {
13116d373c3eSHong Zhang     case 1:
131275648e8dSHong Zhang       ierr = MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
13136d373c3eSHong Zhang       break;
13146d373c3eSHong Zhang     default:
13156d373c3eSHong Zhang       ierr = PetscNew(&atb);CHKERRQ(ierr);
13166d373c3eSHong Zhang       ierr = MatTranspose_SeqAIJ(A,MAT_INITIAL_MATRIX,&At);CHKERRQ(ierr);
13176d373c3eSHong Zhang       ierr = MatMatMult_SeqAIJ_SeqAIJ(At,B,MAT_INITIAL_MATRIX,fill,C);CHKERRQ(ierr);
13186d373c3eSHong Zhang 
1319618cf492SHong Zhang       c                  = (Mat_SeqAIJ*)(*C)->data;
13206d373c3eSHong Zhang       c->atb             = atb;
13216d373c3eSHong Zhang       atb->At            = At;
13226d373c3eSHong Zhang       atb->destroy       = (*C)->ops->destroy;
13236d373c3eSHong Zhang       (*C)->ops->destroy = MatDestroy_SeqAIJ_MatTransMatMult;
13246d373c3eSHong Zhang 
13256d373c3eSHong Zhang       break;
13265df89d91SHong Zhang     }
13276d373c3eSHong Zhang   }
13286d373c3eSHong Zhang   if (alg) {
13296d373c3eSHong Zhang     ierr = (*(*C)->ops->mattransposemultnumeric)(A,B,*C);CHKERRQ(ierr);
13306d373c3eSHong Zhang   } else if (!alg && scall == MAT_REUSE_MATRIX) {
13316d373c3eSHong Zhang     c   = (Mat_SeqAIJ*)(*C)->data;
13326d373c3eSHong Zhang     atb = c->atb;
13336d373c3eSHong Zhang     At  = atb->At;
13346d373c3eSHong Zhang     ierr = MatTranspose_SeqAIJ(A,MAT_REUSE_MATRIX,&At);CHKERRQ(ierr);
13356d373c3eSHong Zhang     ierr = MatMatMult_SeqAIJ_SeqAIJ(At,B,MAT_REUSE_MATRIX,fill,C);CHKERRQ(ierr);
13366d373c3eSHong Zhang   }
13375df89d91SHong Zhang   PetscFunctionReturn(0);
13385df89d91SHong Zhang }
13395df89d91SHong Zhang 
134075648e8dSHong Zhang PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
13415df89d91SHong Zhang {
1342bc011b1eSHong Zhang   PetscErrorCode ierr;
1343bc011b1eSHong Zhang   Mat            At;
134438baddfdSBarry Smith   PetscInt       *ati,*atj;
1345bc011b1eSHong Zhang 
1346bc011b1eSHong Zhang   PetscFunctionBegin;
1347bc011b1eSHong Zhang   /* create symbolic At */
1348bc011b1eSHong Zhang   ierr = MatGetSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
13490298fd71SBarry Smith   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,A->cmap->n,A->rmap->n,ati,atj,NULL,&At);CHKERRQ(ierr);
135033d57670SJed Brown   ierr = MatSetBlockSizes(At,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr);
135104b858e0SBarry Smith   ierr = MatSetType(At,((PetscObject)A)->type_name);CHKERRQ(ierr);
1352bc011b1eSHong Zhang 
1353bc011b1eSHong Zhang   /* get symbolic C=At*B */
1354bc011b1eSHong Zhang   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(At,B,fill,C);CHKERRQ(ierr);
1355bc011b1eSHong Zhang 
1356bc011b1eSHong Zhang   /* clean up */
13576bf464f9SBarry Smith   ierr = MatDestroy(&At);CHKERRQ(ierr);
1358bc011b1eSHong Zhang   ierr = MatRestoreSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
13596d373c3eSHong Zhang 
13606d373c3eSHong Zhang   (*C)->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ;
1361bc011b1eSHong Zhang   PetscFunctionReturn(0);
1362bc011b1eSHong Zhang }
1363bc011b1eSHong Zhang 
136475648e8dSHong Zhang PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
1365bc011b1eSHong Zhang {
1366bc011b1eSHong Zhang   PetscErrorCode ierr;
13670fbc74f4SHong Zhang   Mat_SeqAIJ     *a   =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1368d0f46423SBarry Smith   PetscInt       am   =A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb;
1369d0f46423SBarry Smith   PetscInt       cm   =C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k;
1370d13dce4bSSatish Balay   PetscLogDouble flops=0.0;
1371aa1aec99SHong Zhang   MatScalar      *aa  =a->a,*ba,*ca,*caj;
1372bc011b1eSHong Zhang 
1373bc011b1eSHong Zhang   PetscFunctionBegin;
1374aa1aec99SHong Zhang   if (!c->a) {
1375854ce69bSBarry Smith     ierr = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
13762205254eSKarl Rupp 
1377aa1aec99SHong Zhang     c->a      = ca;
1378aa1aec99SHong Zhang     c->free_a = PETSC_TRUE;
1379aa1aec99SHong Zhang   } else {
1380aa1aec99SHong Zhang     ca = c->a;
1381aa1aec99SHong Zhang   }
1382bc011b1eSHong Zhang   /* clear old values in C */
1383bc011b1eSHong Zhang   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
1384bc011b1eSHong Zhang 
1385bc011b1eSHong Zhang   /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
1386bc011b1eSHong Zhang   for (i=0; i<am; i++) {
1387bc011b1eSHong Zhang     bj   = b->j + bi[i];
1388bc011b1eSHong Zhang     ba   = b->a + bi[i];
1389bc011b1eSHong Zhang     bnzi = bi[i+1] - bi[i];
1390bc011b1eSHong Zhang     anzi = ai[i+1] - ai[i];
1391bc011b1eSHong Zhang     for (j=0; j<anzi; j++) {
1392bc011b1eSHong Zhang       nextb = 0;
13930fbc74f4SHong Zhang       crow  = *aj++;
1394bc011b1eSHong Zhang       cjj   = cj + ci[crow];
1395bc011b1eSHong Zhang       caj   = ca + ci[crow];
1396bc011b1eSHong Zhang       /* perform sparse axpy operation.  Note cjj includes bj. */
1397bc011b1eSHong Zhang       for (k=0; nextb<bnzi; k++) {
13980fbc74f4SHong Zhang         if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */
13990fbc74f4SHong Zhang           caj[k] += (*aa)*(*(ba+nextb));
1400bc011b1eSHong Zhang           nextb++;
1401bc011b1eSHong Zhang         }
1402bc011b1eSHong Zhang       }
1403bc011b1eSHong Zhang       flops += 2*bnzi;
14040fbc74f4SHong Zhang       aa++;
1405bc011b1eSHong Zhang     }
1406bc011b1eSHong Zhang   }
1407bc011b1eSHong Zhang 
1408bc011b1eSHong Zhang   /* Assemble the final matrix and clean up */
1409bc011b1eSHong Zhang   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1410bc011b1eSHong Zhang   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1411bc011b1eSHong Zhang   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
1412bc011b1eSHong Zhang   PetscFunctionReturn(0);
1413bc011b1eSHong Zhang }
14149123193aSHong Zhang 
1415150d2497SBarry Smith PETSC_INTERN PetscErrorCode MatMatMult_SeqAIJ_SeqDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
14169123193aSHong Zhang {
14179123193aSHong Zhang   PetscErrorCode ierr;
14189123193aSHong Zhang 
14199123193aSHong Zhang   PetscFunctionBegin;
14209123193aSHong Zhang   if (scall == MAT_INITIAL_MATRIX) {
14213ff4c91cSHong Zhang     ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
14229123193aSHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqDense(A,B,fill,C);CHKERRQ(ierr);
14233ff4c91cSHong Zhang     ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
14249123193aSHong Zhang   }
14253ff4c91cSHong Zhang   ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
14269123193aSHong Zhang   ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,B,*C);CHKERRQ(ierr);
14274614e006SHong Zhang   ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
14289123193aSHong Zhang   PetscFunctionReturn(0);
14299123193aSHong Zhang }
1430edd81eecSMatthew Knepley 
14319123193aSHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat *C)
14329123193aSHong Zhang {
14339123193aSHong Zhang   PetscErrorCode ierr;
14349123193aSHong Zhang 
14359123193aSHong Zhang   PetscFunctionBegin;
14365a586d82SBarry Smith   ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr);
14372205254eSKarl Rupp 
1438d73949e8SHong Zhang   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense;
14399123193aSHong Zhang   PetscFunctionReturn(0);
14409123193aSHong Zhang }
14419123193aSHong Zhang 
14429123193aSHong Zhang PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
14439123193aSHong Zhang {
1444f32d5d43SBarry Smith   Mat_SeqAIJ        *a=(Mat_SeqAIJ*)A->data;
1445612438f1SStefano Zampini   Mat_SeqDense      *bd = (Mat_SeqDense*)B->data;
14469123193aSHong Zhang   PetscErrorCode    ierr;
1447daf57211SHong Zhang   PetscScalar       *c,*b,r1,r2,r3,r4,*c1,*c2,*c3,*c4,aatmp;
1448daf57211SHong Zhang   const PetscScalar *aa,*b1,*b2,*b3,*b4;
1449daf57211SHong Zhang   const PetscInt    *aj;
1450612438f1SStefano Zampini   PetscInt          cm=C->rmap->n,cn=B->cmap->n,bm=bd->lda,am=A->rmap->n;
1451daf57211SHong Zhang   PetscInt          am4=4*am,bm4=4*bm,col,i,j,n,ajtmp;
14529123193aSHong Zhang 
14539123193aSHong Zhang   PetscFunctionBegin;
1454f32d5d43SBarry Smith   if (!cm || !cn) PetscFunctionReturn(0);
1455612438f1SStefano Zampini   if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n);
1456e32f2f54SBarry Smith   if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
1457e32f2f54SBarry Smith   if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
1458612438f1SStefano Zampini   b = bd->v;
14598c778c55SBarry Smith   ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr);
1460f32d5d43SBarry Smith   b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
1461730858b9SHong Zhang   c1 = c; c2 = c1 + am; c3 = c2 + am; c4 = c3 + am;
1462f32d5d43SBarry Smith   for (col=0; col<cn-4; col += 4) {  /* over columns of C */
1463f32d5d43SBarry Smith     for (i=0; i<am; i++) {        /* over rows of C in those columns */
1464f32d5d43SBarry Smith       r1 = r2 = r3 = r4 = 0.0;
1465f32d5d43SBarry Smith       n  = a->i[i+1] - a->i[i];
1466f32d5d43SBarry Smith       aj = a->j + a->i[i];
1467f32d5d43SBarry Smith       aa = a->a + a->i[i];
1468f32d5d43SBarry Smith       for (j=0; j<n; j++) {
1469730858b9SHong Zhang         aatmp = aa[j]; ajtmp = aj[j];
1470730858b9SHong Zhang         r1 += aatmp*b1[ajtmp];
1471730858b9SHong Zhang         r2 += aatmp*b2[ajtmp];
1472730858b9SHong Zhang         r3 += aatmp*b3[ajtmp];
1473730858b9SHong Zhang         r4 += aatmp*b4[ajtmp];
14749123193aSHong Zhang       }
1475730858b9SHong Zhang       c1[i] = r1;
1476730858b9SHong Zhang       c2[i] = r2;
1477730858b9SHong Zhang       c3[i] = r3;
1478730858b9SHong Zhang       c4[i] = r4;
1479f32d5d43SBarry Smith     }
1480730858b9SHong Zhang     b1 += bm4; b2 += bm4; b3 += bm4; b4 += bm4;
1481730858b9SHong Zhang     c1 += am4; c2 += am4; c3 += am4; c4 += am4;
1482f32d5d43SBarry Smith   }
1483f32d5d43SBarry Smith   for (; col<cn; col++) {   /* over extra columns of C */
1484f32d5d43SBarry Smith     for (i=0; i<am; i++) {  /* over rows of C in those columns */
1485f32d5d43SBarry Smith       r1 = 0.0;
1486f32d5d43SBarry Smith       n  = a->i[i+1] - a->i[i];
1487f32d5d43SBarry Smith       aj = a->j + a->i[i];
1488f32d5d43SBarry Smith       aa = a->a + a->i[i];
1489f32d5d43SBarry Smith       for (j=0; j<n; j++) {
1490730858b9SHong Zhang         r1 += aa[j]*b1[aj[j]];
1491f32d5d43SBarry Smith       }
1492730858b9SHong Zhang       c1[i] = r1;
1493f32d5d43SBarry Smith     }
1494f32d5d43SBarry Smith     b1 += bm;
1495730858b9SHong Zhang     c1 += am;
1496f32d5d43SBarry Smith   }
1497dc0b31edSSatish Balay   ierr = PetscLogFlops(cn*(2.0*a->nz));CHKERRQ(ierr);
14988c778c55SBarry Smith   ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr);
1499f32d5d43SBarry Smith   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1500f32d5d43SBarry Smith   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1501f32d5d43SBarry Smith   PetscFunctionReturn(0);
1502f32d5d43SBarry Smith }
1503f32d5d43SBarry Smith 
1504f32d5d43SBarry Smith /*
15054324174eSBarry Smith    Note very similar to MatMult_SeqAIJ(), should generate both codes from same base
1506f32d5d43SBarry Smith */
1507f32d5d43SBarry Smith PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1508f32d5d43SBarry Smith {
1509f32d5d43SBarry Smith   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
151090f5ea3eSStefano Zampini   Mat_SeqDense   *bd = (Mat_SeqDense*)B->data;
1511f32d5d43SBarry Smith   PetscErrorCode ierr;
1512dd6ea824SBarry Smith   PetscScalar    *b,*c,r1,r2,r3,r4,*b1,*b2,*b3,*b4;
1513dd6ea824SBarry Smith   MatScalar      *aa;
151490f5ea3eSStefano Zampini   PetscInt       cm  = C->rmap->n, cn=B->cmap->n, bm=bd->lda, col, i,j,n,*aj, am = A->rmap->n,*ii,arm;
15154324174eSBarry Smith   PetscInt       am2 = 2*am, am3 = 3*am,  bm4 = 4*bm,colam,*ridx;
1516f32d5d43SBarry Smith 
1517f32d5d43SBarry Smith   PetscFunctionBegin;
1518f32d5d43SBarry Smith   if (!cm || !cn) PetscFunctionReturn(0);
151990f5ea3eSStefano Zampini   b = bd->v;
15208c778c55SBarry Smith   ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr);
1521f32d5d43SBarry Smith   b1   = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
15224324174eSBarry Smith 
15234324174eSBarry Smith   if (a->compressedrow.use) { /* use compressed row format */
15244324174eSBarry Smith     for (col=0; col<cn-4; col += 4) {  /* over columns of C */
15254324174eSBarry Smith       colam = col*am;
15264324174eSBarry Smith       arm   = a->compressedrow.nrows;
15274324174eSBarry Smith       ii    = a->compressedrow.i;
15284324174eSBarry Smith       ridx  = a->compressedrow.rindex;
15294324174eSBarry Smith       for (i=0; i<arm; i++) {        /* over rows of C in those columns */
15304324174eSBarry Smith         r1 = r2 = r3 = r4 = 0.0;
15314324174eSBarry Smith         n  = ii[i+1] - ii[i];
15324324174eSBarry Smith         aj = a->j + ii[i];
15334324174eSBarry Smith         aa = a->a + ii[i];
15344324174eSBarry Smith         for (j=0; j<n; j++) {
15354324174eSBarry Smith           r1 += (*aa)*b1[*aj];
15364324174eSBarry Smith           r2 += (*aa)*b2[*aj];
15374324174eSBarry Smith           r3 += (*aa)*b3[*aj];
15384324174eSBarry Smith           r4 += (*aa++)*b4[*aj++];
15394324174eSBarry Smith         }
15404324174eSBarry Smith         c[colam       + ridx[i]] += r1;
15414324174eSBarry Smith         c[colam + am  + ridx[i]] += r2;
15424324174eSBarry Smith         c[colam + am2 + ridx[i]] += r3;
15434324174eSBarry Smith         c[colam + am3 + ridx[i]] += r4;
15444324174eSBarry Smith       }
15454324174eSBarry Smith       b1 += bm4;
15464324174eSBarry Smith       b2 += bm4;
15474324174eSBarry Smith       b3 += bm4;
15484324174eSBarry Smith       b4 += bm4;
15494324174eSBarry Smith     }
15504324174eSBarry Smith     for (; col<cn; col++) {     /* over extra columns of C */
15514324174eSBarry Smith       colam = col*am;
15524324174eSBarry Smith       arm   = a->compressedrow.nrows;
15534324174eSBarry Smith       ii    = a->compressedrow.i;
15544324174eSBarry Smith       ridx  = a->compressedrow.rindex;
15554324174eSBarry Smith       for (i=0; i<arm; i++) {  /* over rows of C in those columns */
15564324174eSBarry Smith         r1 = 0.0;
15574324174eSBarry Smith         n  = ii[i+1] - ii[i];
15584324174eSBarry Smith         aj = a->j + ii[i];
15594324174eSBarry Smith         aa = a->a + ii[i];
15604324174eSBarry Smith 
15614324174eSBarry Smith         for (j=0; j<n; j++) {
15624324174eSBarry Smith           r1 += (*aa++)*b1[*aj++];
15634324174eSBarry Smith         }
1564a2ea699eSBarry Smith         c[colam + ridx[i]] += r1;
15654324174eSBarry Smith       }
15664324174eSBarry Smith       b1 += bm;
15674324174eSBarry Smith     }
15684324174eSBarry Smith   } else {
1569f32d5d43SBarry Smith     for (col=0; col<cn-4; col += 4) {  /* over columns of C */
1570f32d5d43SBarry Smith       colam = col*am;
1571f32d5d43SBarry Smith       for (i=0; i<am; i++) {        /* over rows of C in those columns */
1572f32d5d43SBarry Smith         r1 = r2 = r3 = r4 = 0.0;
1573f32d5d43SBarry Smith         n  = a->i[i+1] - a->i[i];
1574f32d5d43SBarry Smith         aj = a->j + a->i[i];
1575f32d5d43SBarry Smith         aa = a->a + a->i[i];
1576f32d5d43SBarry Smith         for (j=0; j<n; j++) {
1577f32d5d43SBarry Smith           r1 += (*aa)*b1[*aj];
1578f32d5d43SBarry Smith           r2 += (*aa)*b2[*aj];
1579f32d5d43SBarry Smith           r3 += (*aa)*b3[*aj];
1580f32d5d43SBarry Smith           r4 += (*aa++)*b4[*aj++];
1581f32d5d43SBarry Smith         }
1582f32d5d43SBarry Smith         c[colam + i]       += r1;
1583f32d5d43SBarry Smith         c[colam + am + i]  += r2;
1584f32d5d43SBarry Smith         c[colam + am2 + i] += r3;
1585f32d5d43SBarry Smith         c[colam + am3 + i] += r4;
1586f32d5d43SBarry Smith       }
1587f32d5d43SBarry Smith       b1 += bm4;
1588f32d5d43SBarry Smith       b2 += bm4;
1589f32d5d43SBarry Smith       b3 += bm4;
1590f32d5d43SBarry Smith       b4 += bm4;
1591f32d5d43SBarry Smith     }
1592f32d5d43SBarry Smith     for (; col<cn; col++) {     /* over extra columns of C */
1593a2ea699eSBarry Smith       colam = col*am;
1594f32d5d43SBarry Smith       for (i=0; i<am; i++) {  /* over rows of C in those columns */
1595f32d5d43SBarry Smith         r1 = 0.0;
1596f32d5d43SBarry Smith         n  = a->i[i+1] - a->i[i];
1597f32d5d43SBarry Smith         aj = a->j + a->i[i];
1598f32d5d43SBarry Smith         aa = a->a + a->i[i];
1599f32d5d43SBarry Smith 
1600f32d5d43SBarry Smith         for (j=0; j<n; j++) {
1601f32d5d43SBarry Smith           r1 += (*aa++)*b1[*aj++];
1602f32d5d43SBarry Smith         }
1603a2ea699eSBarry Smith         c[colam + i] += r1;
1604f32d5d43SBarry Smith       }
1605f32d5d43SBarry Smith       b1 += bm;
1606f32d5d43SBarry Smith     }
16074324174eSBarry Smith   }
1608dc0b31edSSatish Balay   ierr = PetscLogFlops(cn*2.0*a->nz);CHKERRQ(ierr);
16098c778c55SBarry Smith   ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr);
16109123193aSHong Zhang   PetscFunctionReturn(0);
16119123193aSHong Zhang }
1612b1683b59SHong Zhang 
1613b9af6bddSHong Zhang PetscErrorCode  MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense)
1614c8db22f6SHong Zhang {
1615c8db22f6SHong Zhang   PetscErrorCode ierr;
16162f5f1f90SHong Zhang   Mat_SeqAIJ     *b       = (Mat_SeqAIJ*)B->data;
16172f5f1f90SHong Zhang   Mat_SeqDense   *btdense = (Mat_SeqDense*)Btdense->data;
16182f5f1f90SHong Zhang   PetscInt       *bi      = b->i,*bj=b->j;
16192f5f1f90SHong Zhang   PetscInt       m        = Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns;
16202f5f1f90SHong Zhang   MatScalar      *btval,*btval_den,*ba=b->a;
16212f5f1f90SHong Zhang   PetscInt       *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors;
1622c8db22f6SHong Zhang 
1623c8db22f6SHong Zhang   PetscFunctionBegin;
16242f5f1f90SHong Zhang   btval_den=btdense->v;
16252f5f1f90SHong Zhang   ierr     = PetscMemzero(btval_den,(m*n)*sizeof(MatScalar));CHKERRQ(ierr);
16262f5f1f90SHong Zhang   for (k=0; k<ncolors; k++) {
16272f5f1f90SHong Zhang     ncolumns = coloring->ncolumns[k];
16282f5f1f90SHong Zhang     for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */
1629d2853540SHong Zhang       col   = *(columns + colorforcol[k] + l);
16302f5f1f90SHong Zhang       btcol = bj + bi[col];
16312f5f1f90SHong Zhang       btval = ba + bi[col];
16322f5f1f90SHong Zhang       anz   = bi[col+1] - bi[col];
1633d2853540SHong Zhang       for (j=0; j<anz; j++) {
16342f5f1f90SHong Zhang         brow            = btcol[j];
16352f5f1f90SHong Zhang         btval_den[brow] = btval[j];
1636c8db22f6SHong Zhang       }
1637c8db22f6SHong Zhang     }
16382f5f1f90SHong Zhang     btval_den += m;
1639c8db22f6SHong Zhang   }
1640c8db22f6SHong Zhang   PetscFunctionReturn(0);
1641c8db22f6SHong Zhang }
1642c8db22f6SHong Zhang 
1643b9af6bddSHong Zhang PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp)
16448972f759SHong Zhang {
16458972f759SHong Zhang   PetscErrorCode ierr;
1646b2d2b04fSHong Zhang   Mat_SeqAIJ     *csp = (Mat_SeqAIJ*)Csp->data;
1647077f23c2SHong Zhang   PetscScalar    *ca_den,*ca_den_ptr,*ca=csp->a;
1648f99a636bSHong Zhang   PetscInt       k,l,m=Cden->rmap->n,ncolors=matcoloring->ncolors;
1649e88777f2SHong Zhang   PetscInt       brows=matcoloring->brows,*den2sp=matcoloring->den2sp;
1650077f23c2SHong Zhang   PetscInt       nrows,*row,*idx;
1651077f23c2SHong Zhang   PetscInt       *rows=matcoloring->rows,*colorforrow=matcoloring->colorforrow;
16528972f759SHong Zhang 
16538972f759SHong Zhang   PetscFunctionBegin;
1654a3fe58edSHong Zhang   ierr   = MatDenseGetArray(Cden,&ca_den);CHKERRQ(ierr);
1655f99a636bSHong Zhang 
1656077f23c2SHong Zhang   if (brows > 0) {
1657077f23c2SHong Zhang     PetscInt *lstart,row_end,row_start;
1658980ae229SHong Zhang     lstart = matcoloring->lstart;
1659eeb4fd02SHong Zhang     ierr = PetscMemzero(lstart,ncolors*sizeof(PetscInt));CHKERRQ(ierr);
1660eeb4fd02SHong Zhang 
1661077f23c2SHong Zhang     row_end = brows;
1662eeb4fd02SHong Zhang     if (row_end > m) row_end = m;
1663077f23c2SHong Zhang     for (row_start=0; row_start<m; row_start+=brows) { /* loop over row blocks of Csp */
1664077f23c2SHong Zhang       ca_den_ptr = ca_den;
1665980ae229SHong Zhang       for (k=0; k<ncolors; k++) { /* loop over colors (columns of Cden) */
1666eeb4fd02SHong Zhang         nrows = matcoloring->nrows[k];
1667eeb4fd02SHong Zhang         row   = rows  + colorforrow[k];
1668eeb4fd02SHong Zhang         idx   = den2sp + colorforrow[k];
1669eeb4fd02SHong Zhang         for (l=lstart[k]; l<nrows; l++) {
1670eeb4fd02SHong Zhang           if (row[l] >= row_end) {
1671eeb4fd02SHong Zhang             lstart[k] = l;
1672eeb4fd02SHong Zhang             break;
1673eeb4fd02SHong Zhang           } else {
1674077f23c2SHong Zhang             ca[idx[l]] = ca_den_ptr[row[l]];
1675eeb4fd02SHong Zhang           }
1676eeb4fd02SHong Zhang         }
1677077f23c2SHong Zhang         ca_den_ptr += m;
1678eeb4fd02SHong Zhang       }
1679077f23c2SHong Zhang       row_end += brows;
1680eeb4fd02SHong Zhang       if (row_end > m) row_end = m;
1681eeb4fd02SHong Zhang     }
1682077f23c2SHong Zhang   } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */
1683077f23c2SHong Zhang     ca_den_ptr = ca_den;
1684077f23c2SHong Zhang     for (k=0; k<ncolors; k++) {
1685077f23c2SHong Zhang       nrows = matcoloring->nrows[k];
1686077f23c2SHong Zhang       row   = rows  + colorforrow[k];
1687077f23c2SHong Zhang       idx   = den2sp + colorforrow[k];
1688077f23c2SHong Zhang       for (l=0; l<nrows; l++) {
1689077f23c2SHong Zhang         ca[idx[l]] = ca_den_ptr[row[l]];
1690077f23c2SHong Zhang       }
1691077f23c2SHong Zhang       ca_den_ptr += m;
1692077f23c2SHong Zhang     }
1693f99a636bSHong Zhang   }
1694f99a636bSHong Zhang 
1695a3fe58edSHong Zhang   ierr = MatDenseRestoreArray(Cden,&ca_den);CHKERRQ(ierr);
1696f99a636bSHong Zhang #if defined(PETSC_USE_INFO)
1697077f23c2SHong Zhang   if (matcoloring->brows > 0) {
1698f99a636bSHong Zhang     ierr = PetscInfo1(Csp,"Loop over %D row blocks for den2sp\n",brows);CHKERRQ(ierr);
1699e88777f2SHong Zhang   } else {
1700077f23c2SHong Zhang     ierr = PetscInfo(Csp,"Loop over colors/columns of Cden, inefficient for large sparse matrix product \n");CHKERRQ(ierr);
1701e88777f2SHong Zhang   }
1702f99a636bSHong Zhang #endif
17038972f759SHong Zhang   PetscFunctionReturn(0);
17048972f759SHong Zhang }
17058972f759SHong Zhang 
1706b9af6bddSHong Zhang PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c)
1707b1683b59SHong Zhang {
1708b1683b59SHong Zhang   PetscErrorCode ierr;
1709e88777f2SHong Zhang   PetscInt       i,n,nrows,Nbs,j,k,m,ncols,col,cm;
17101a83f524SJed Brown   const PetscInt *is,*ci,*cj,*row_idx;
1711b28d80bdSHong Zhang   PetscInt       nis = iscoloring->n,*rowhit,bs = 1;
1712b1683b59SHong Zhang   IS             *isa;
1713b28d80bdSHong Zhang   Mat_SeqAIJ     *csp = (Mat_SeqAIJ*)mat->data;
1714e88777f2SHong Zhang   PetscInt       *colorforrow,*rows,*rows_i,*idxhit,*spidx,*den2sp,*den2sp_i;
1715e88777f2SHong Zhang   PetscInt       *colorforcol,*columns,*columns_i,brows;
1716e88777f2SHong Zhang   PetscBool      flg;
1717b1683b59SHong Zhang 
1718b1683b59SHong Zhang   PetscFunctionBegin;
1719b1683b59SHong Zhang   ierr = ISColoringGetIS(iscoloring,PETSC_IGNORE,&isa);CHKERRQ(ierr);
1720e99be685SHong Zhang 
17217ecccc15SHong Zhang   /* bs >1 is not being tested yet! */
1722e88777f2SHong Zhang   Nbs       = mat->cmap->N/bs;
1723b1683b59SHong Zhang   c->M      = mat->rmap->N/bs;  /* set total rows, columns and local rows */
1724e88777f2SHong Zhang   c->N      = Nbs;
1725e88777f2SHong Zhang   c->m      = c->M;
1726b1683b59SHong Zhang   c->rstart = 0;
1727e88777f2SHong Zhang   c->brows  = 100;
1728b1683b59SHong Zhang 
1729b1683b59SHong Zhang   c->ncolors = nis;
1730dcca6d9dSJed Brown   ierr = PetscMalloc3(nis,&c->ncolumns,nis,&c->nrows,nis+1,&colorforrow);CHKERRQ(ierr);
1731854ce69bSBarry Smith   ierr = PetscMalloc1(csp->nz+1,&rows);CHKERRQ(ierr);
1732854ce69bSBarry Smith   ierr = PetscMalloc1(csp->nz+1,&den2sp);CHKERRQ(ierr);
1733e88777f2SHong Zhang 
1734e88777f2SHong Zhang   brows = c->brows;
1735c5929fdfSBarry Smith   ierr = PetscOptionsGetInt(NULL,NULL,"-matden2sp_brows",&brows,&flg);CHKERRQ(ierr);
1736e88777f2SHong Zhang   if (flg) c->brows = brows;
1737eeb4fd02SHong Zhang   if (brows > 0) {
1738854ce69bSBarry Smith     ierr = PetscMalloc1(nis+1,&c->lstart);CHKERRQ(ierr);
1739e88777f2SHong Zhang   }
17402205254eSKarl Rupp 
1741d2853540SHong Zhang   colorforrow[0] = 0;
1742d2853540SHong Zhang   rows_i         = rows;
1743f99a636bSHong Zhang   den2sp_i       = den2sp;
1744b1683b59SHong Zhang 
1745854ce69bSBarry Smith   ierr = PetscMalloc1(nis+1,&colorforcol);CHKERRQ(ierr);
1746854ce69bSBarry Smith   ierr = PetscMalloc1(Nbs+1,&columns);CHKERRQ(ierr);
17472205254eSKarl Rupp 
1748d2853540SHong Zhang   colorforcol[0] = 0;
1749d2853540SHong Zhang   columns_i      = columns;
1750d2853540SHong Zhang 
1751077f23c2SHong Zhang   /* get column-wise storage of mat */
1752077f23c2SHong Zhang   ierr = MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr);
1753b1683b59SHong Zhang 
1754b28d80bdSHong Zhang   cm   = c->m;
1755854ce69bSBarry Smith   ierr = PetscMalloc1(cm+1,&rowhit);CHKERRQ(ierr);
1756854ce69bSBarry Smith   ierr = PetscMalloc1(cm+1,&idxhit);CHKERRQ(ierr);
1757f99a636bSHong Zhang   for (i=0; i<nis; i++) { /* loop over color */
1758b1683b59SHong Zhang     ierr = ISGetLocalSize(isa[i],&n);CHKERRQ(ierr);
1759b1683b59SHong Zhang     ierr = ISGetIndices(isa[i],&is);CHKERRQ(ierr);
17602205254eSKarl Rupp 
1761b1683b59SHong Zhang     c->ncolumns[i] = n;
1762b1683b59SHong Zhang     if (n) {
1763d2853540SHong Zhang       ierr = PetscMemcpy(columns_i,is,n*sizeof(PetscInt));CHKERRQ(ierr);
1764b1683b59SHong Zhang     }
1765d2853540SHong Zhang     colorforcol[i+1] = colorforcol[i] + n;
1766d2853540SHong Zhang     columns_i       += n;
1767b1683b59SHong Zhang 
1768b1683b59SHong Zhang     /* fast, crude version requires O(N*N) work */
1769e8354b3eSHong Zhang     ierr = PetscMemzero(rowhit,cm*sizeof(PetscInt));CHKERRQ(ierr);
1770e99be685SHong Zhang 
1771b7caf3d6SHong Zhang     for (j=0; j<n; j++) { /* loop over columns*/
1772b1683b59SHong Zhang       col     = is[j];
1773d2853540SHong Zhang       row_idx = cj + ci[col];
1774b1683b59SHong Zhang       m       = ci[col+1] - ci[col];
1775b7caf3d6SHong Zhang       for (k=0; k<m; k++) { /* loop over columns marking them in rowhit */
1776e99be685SHong Zhang         idxhit[*row_idx]   = spidx[ci[col] + k];
1777d2853540SHong Zhang         rowhit[*row_idx++] = col + 1;
1778b1683b59SHong Zhang       }
1779b1683b59SHong Zhang     }
1780b1683b59SHong Zhang     /* count the number of hits */
1781b1683b59SHong Zhang     nrows = 0;
1782e8354b3eSHong Zhang     for (j=0; j<cm; j++) {
1783b1683b59SHong Zhang       if (rowhit[j]) nrows++;
1784b1683b59SHong Zhang     }
1785b1683b59SHong Zhang     c->nrows[i]      = nrows;
1786d2853540SHong Zhang     colorforrow[i+1] = colorforrow[i] + nrows;
1787d2853540SHong Zhang 
1788b1683b59SHong Zhang     nrows = 0;
1789b7caf3d6SHong Zhang     for (j=0; j<cm; j++) { /* loop over rows */
1790b1683b59SHong Zhang       if (rowhit[j]) {
1791d2853540SHong Zhang         rows_i[nrows]   = j;
179212b89a43SHong Zhang         den2sp_i[nrows] = idxhit[j];
1793b1683b59SHong Zhang         nrows++;
1794b1683b59SHong Zhang       }
1795b1683b59SHong Zhang     }
1796e88777f2SHong Zhang     den2sp_i += nrows;
1797077f23c2SHong Zhang 
1798b1683b59SHong Zhang     ierr    = ISRestoreIndices(isa[i],&is);CHKERRQ(ierr);
1799f99a636bSHong Zhang     rows_i += nrows;
1800b1683b59SHong Zhang   }
18010298fd71SBarry Smith   ierr = MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr);
1802b28d80bdSHong Zhang   ierr = PetscFree(rowhit);CHKERRQ(ierr);
1803b1683b59SHong Zhang   ierr = ISColoringRestoreIS(iscoloring,&isa);CHKERRQ(ierr);
1804d2853540SHong Zhang   if (csp->nz != colorforrow[nis]) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"csp->nz %d != colorforrow[nis] %d",csp->nz,colorforrow[nis]);
1805b28d80bdSHong Zhang 
1806d2853540SHong Zhang   c->colorforrow = colorforrow;
1807d2853540SHong Zhang   c->rows        = rows;
1808f99a636bSHong Zhang   c->den2sp      = den2sp;
1809d2853540SHong Zhang   c->colorforcol = colorforcol;
1810d2853540SHong Zhang   c->columns     = columns;
18112205254eSKarl Rupp 
1812f94ccd6cSHong Zhang   ierr = PetscFree(idxhit);CHKERRQ(ierr);
1813b1683b59SHong Zhang   PetscFunctionReturn(0);
1814b1683b59SHong Zhang }
1815d013fc79Sandi selinger 
181673b67375Sandi selinger /* This algorithm combines the symbolic and numeric phase of matrix-matrix multiplication. */
1817d013fc79Sandi selinger PetscErrorCode MatMatMult_SeqAIJ_SeqAIJ_Combined(Mat A,Mat B,PetscReal fill,Mat *C)
1818d013fc79Sandi selinger {
1819d013fc79Sandi selinger   PetscErrorCode     ierr;
1820d013fc79Sandi selinger   PetscLogDouble     flops=0.0;
1821d013fc79Sandi selinger   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
18222869b61bSandi selinger   const PetscInt     *ai=a->i,*bi=b->i;
1823d013fc79Sandi selinger   PetscInt           *ci,*cj,*cj_i;
1824d013fc79Sandi selinger   PetscScalar        *ca,*ca_i;
18252869b61bSandi selinger   PetscInt           b_maxmemrow,c_maxmem,a_col;
1826d013fc79Sandi selinger   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
1827d013fc79Sandi selinger   PetscInt           i,k,ndouble=0;
1828d013fc79Sandi selinger   PetscReal          afill;
1829d013fc79Sandi selinger   PetscScalar        *c_row_val_dense;
1830d013fc79Sandi selinger   PetscBool          *c_row_idx_flags;
1831d013fc79Sandi selinger   PetscInt           *aj_i=a->j;
1832d013fc79Sandi selinger   PetscScalar        *aa_i=a->a;
1833d013fc79Sandi selinger 
1834d013fc79Sandi selinger   PetscFunctionBegin;
18352869b61bSandi selinger 
18362869b61bSandi selinger   /* Step 1: Determine upper bounds on memory for C and allocate memory */
18372869b61bSandi selinger   /* This should be enough for almost all matrices. If still more memory is needed, it is reallocated later. */
18382869b61bSandi selinger   c_maxmem    = 8*(ai[am]+bi[bm]);
18392869b61bSandi selinger   b_maxmemrow = PetscMin(bi[bm],bn);
1840d013fc79Sandi selinger   ierr  = PetscMalloc1(am+1,&ci);CHKERRQ(ierr);
1841d013fc79Sandi selinger   ierr  = PetscMalloc1(bn,&c_row_val_dense);CHKERRQ(ierr);
1842d013fc79Sandi selinger   ierr  = PetscMalloc1(bn,&c_row_idx_flags);CHKERRQ(ierr);
1843d013fc79Sandi selinger   ierr  = PetscMalloc1(c_maxmem,&cj);CHKERRQ(ierr);
1844d013fc79Sandi selinger   ierr  = PetscMalloc1(c_maxmem,&ca);CHKERRQ(ierr);
1845d013fc79Sandi selinger   ca_i  = ca;
1846d013fc79Sandi selinger   cj_i  = cj;
1847d013fc79Sandi selinger   ci[0] = 0;
184873b67375Sandi selinger   ierr  = PetscMemzero(c_row_val_dense,bn*sizeof(PetscScalar));CHKERRQ(ierr);
184973b67375Sandi selinger   ierr  = PetscMemzero(c_row_idx_flags,bn*sizeof(PetscBool));CHKERRQ(ierr);
1850d013fc79Sandi selinger   for (i=0; i<am; i++) {
1851d013fc79Sandi selinger     /* Step 2: Initialize the dense row vector for C  */
1852d013fc79Sandi selinger     const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
1853d013fc79Sandi selinger     PetscInt       cnzi = 0;
1854d013fc79Sandi selinger     PetscInt       *bj_i;
1855d013fc79Sandi selinger     PetscScalar    *ba_i;
18562869b61bSandi selinger     /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem  -> allocate more memory
18572869b61bSandi selinger        Usually, there is enough memory in the first place, so this is not executed. */
18582869b61bSandi selinger     while (ci[i] + b_maxmemrow > c_maxmem) {
18592869b61bSandi selinger       c_maxmem *= 2;
18602869b61bSandi selinger       ndouble++;
18612869b61bSandi selinger       ierr = PetscRealloc(sizeof(PetscInt)*c_maxmem,&cj);
18622869b61bSandi selinger       ierr = PetscRealloc(sizeof(PetscScalar)*c_maxmem,&ca);
18632869b61bSandi selinger     }
1864d013fc79Sandi selinger 
1865d013fc79Sandi selinger     /* Step 3: Do the numerical calculations */
1866d013fc79Sandi selinger     for (a_col=0; a_col<anzi; a_col++) {          /* iterate over all non zero values in a row of A */
1867d013fc79Sandi selinger       PetscInt       a_col_index = aj_i[a_col];
1868d013fc79Sandi selinger       const PetscInt bnzi        = bi[a_col_index+1] - bi[a_col_index];
1869d013fc79Sandi selinger       flops += 2*bnzi;
1870d013fc79Sandi selinger       bj_i   = b->j + bi[a_col_index];   /* points to the current row in bj */
1871d013fc79Sandi selinger       ba_i   = b->a + bi[a_col_index];   /* points to the current row in ba */
1872d013fc79Sandi selinger       for (k=0; k<bnzi; ++k) { /* iterate over all non zeros of this row in B */
1873d013fc79Sandi selinger         if (c_row_idx_flags[bj_i[k]] == PETSC_FALSE) {
18742869b61bSandi selinger           cj_i[cnzi++]             = bj_i[k];
1875d013fc79Sandi selinger           c_row_idx_flags[bj_i[k]] = PETSC_TRUE;
1876d013fc79Sandi selinger         }
1877d013fc79Sandi selinger         c_row_val_dense[bj_i[k]] += aa_i[a_col] * ba_i[k];
1878d013fc79Sandi selinger       }
1879d013fc79Sandi selinger     }
1880d013fc79Sandi selinger 
1881d013fc79Sandi selinger     /* Sort array */
18823353a62bSKarl Rupp     ierr = PetscSortInt(cnzi,cj_i);CHKERRQ(ierr);
1883d013fc79Sandi selinger     /* Step 4 */
1884d013fc79Sandi selinger     for (k=0; k<cnzi; k++) {
1885d013fc79Sandi selinger       ca_i[k]                  = c_row_val_dense[cj_i[k]];
1886d013fc79Sandi selinger       c_row_val_dense[cj_i[k]] = 0.;
1887d013fc79Sandi selinger       c_row_idx_flags[cj_i[k]] = PETSC_FALSE;
1888d013fc79Sandi selinger     }
1889d013fc79Sandi selinger     /* terminate current row */
1890d013fc79Sandi selinger     aa_i   += anzi;
1891d013fc79Sandi selinger     aj_i   += anzi;
1892d013fc79Sandi selinger     ca_i   += cnzi;
1893d013fc79Sandi selinger     cj_i   += cnzi;
1894d013fc79Sandi selinger     ci[i+1] = ci[i] + cnzi;
1895d013fc79Sandi selinger     flops  += cnzi;
1896d013fc79Sandi selinger   }
1897d013fc79Sandi selinger 
1898d013fc79Sandi selinger   /* Step 5 */
1899d013fc79Sandi selinger   /* Create the new matrix */
1900d013fc79Sandi selinger   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr);
1901d013fc79Sandi selinger   ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
190202fe1965SBarry Smith   ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr);
1903d013fc79Sandi selinger 
1904d013fc79Sandi selinger   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1905d013fc79Sandi selinger   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1906d013fc79Sandi selinger   c          = (Mat_SeqAIJ*)((*C)->data);
1907d013fc79Sandi selinger   c->a       = ca;
1908d013fc79Sandi selinger   c->free_a  = PETSC_TRUE;
1909d013fc79Sandi selinger   c->free_ij = PETSC_TRUE;
1910d013fc79Sandi selinger   c->nonew   = 0;
1911d013fc79Sandi selinger 
1912d013fc79Sandi selinger   /* set MatInfo */
1913d013fc79Sandi selinger   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
1914d013fc79Sandi selinger   if (afill < 1.0) afill = 1.0;
1915d013fc79Sandi selinger   c->maxnz                     = ci[am];
1916d013fc79Sandi selinger   c->nz                        = ci[am];
1917d013fc79Sandi selinger   (*C)->info.mallocs           = ndouble;
1918d013fc79Sandi selinger   (*C)->info.fill_ratio_given  = fill;
1919d013fc79Sandi selinger   (*C)->info.fill_ratio_needed = afill;
1920d013fc79Sandi selinger 
192173b67375Sandi selinger   ierr = PetscFree(c_row_val_dense);CHKERRQ(ierr);
192273b67375Sandi selinger   ierr = PetscFree(c_row_idx_flags);CHKERRQ(ierr);
1923d013fc79Sandi selinger 
1924d013fc79Sandi selinger   ierr = MatAssemblyBegin(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1925d013fc79Sandi selinger   ierr = MatAssemblyEnd(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1926d013fc79Sandi selinger   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
1927d013fc79Sandi selinger   PetscFunctionReturn(0);
1928d013fc79Sandi selinger }
1929