xref: /petsc/src/mat/impls/aij/seq/matmatmult.c (revision 071fcb05f2a6aea8aef2c2090580530b9e9df77e)
1be1d678aSKris Buschelman 
2d50806bdSBarry Smith  /*
32499ec78SHong Zhang    Defines matrix-matrix product routines for pairs of SeqAIJ matrices
4d50806bdSBarry Smith            C = A * B
5d50806bdSBarry Smith  */
6d50806bdSBarry Smith 
7c6db04a5SJed Brown  #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
8c6db04a5SJed Brown  #include <../src/mat/utils/freespace.h>
9c6db04a5SJed Brown  #include <petscbt.h>
10af0996ceSBarry Smith  #include <petsc/private/isimpl.h>
1107475bc1SBarry Smith  #include <../src/mat/impls/dense/seq/dense.h>
127bab7c10SHong Zhang 
135e5acdf2Sstefano_zampini 
14150d2497SBarry Smith  PETSC_INTERN PetscErrorCode MatMatMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
1538baddfdSBarry Smith  {
16dfbe8321SBarry Smith    PetscErrorCode ierr;
17df97dc6dSFande Kong 
18df97dc6dSFande Kong    PetscFunctionBegin;
19df97dc6dSFande Kong    if (scall == MAT_INITIAL_MATRIX) {
20df97dc6dSFande Kong      ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
21df97dc6dSFande Kong      ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
22df97dc6dSFande Kong      ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
23df97dc6dSFande Kong    }
24df97dc6dSFande Kong 
25df97dc6dSFande Kong    ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
26df97dc6dSFande Kong    ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ(A,B,*C);CHKERRQ(ierr);
27df97dc6dSFande Kong    ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
28df97dc6dSFande Kong    PetscFunctionReturn(0);
29df97dc6dSFande Kong  }
30df97dc6dSFande Kong 
31df97dc6dSFande Kong  PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
32df97dc6dSFande Kong  {
33df97dc6dSFande Kong    PetscErrorCode ierr;
34df97dc6dSFande Kong 
35df97dc6dSFande Kong    PetscFunctionBegin;
36df97dc6dSFande Kong    if (C->ops->matmultnumeric) {
37df97dc6dSFande Kong      ierr = (*C->ops->matmultnumeric)(A,B,C);CHKERRQ(ierr);
38df97dc6dSFande Kong    } else {
39df97dc6dSFande Kong      ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A,B,C);CHKERRQ(ierr);
40df97dc6dSFande Kong    }
41df97dc6dSFande Kong    PetscFunctionReturn(0);
42df97dc6dSFande Kong  }
43df97dc6dSFande Kong 
44df97dc6dSFande Kong  PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
45df97dc6dSFande Kong  {
46df97dc6dSFande Kong    PetscErrorCode ierr;
475e5acdf2Sstefano_zampini  #if !defined(PETSC_HAVE_HYPRE)
48d7ed1a4dSandi selinger    const char     *algTypes[8] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","combined","rowmerge"};
49d013fc79Sandi selinger    PetscInt       nalg = 8;
50d7ed1a4dSandi selinger  #else
51d7ed1a4dSandi selinger    const char     *algTypes[9] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","combined","rowmerge","hypre"};
52d7ed1a4dSandi selinger    PetscInt       nalg = 9;
535e5acdf2Sstefano_zampini  #endif
546540a9cdSHong Zhang    PetscInt       alg = 0; /* set default algorithm */
555c66b693SKris Buschelman 
565c66b693SKris Buschelman    PetscFunctionBegin;
57715a5346SHong Zhang    ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
585e5acdf2Sstefano_zampini    ierr = PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,nalg,algTypes[0],&alg,NULL);CHKERRQ(ierr);
59d8bbc50fSBarry Smith    ierr = PetscOptionsEnd();CHKERRQ(ierr);
606540a9cdSHong Zhang    switch (alg) {
616540a9cdSHong Zhang    case 1:
62aacf854cSBarry Smith      ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A,B,fill,C);CHKERRQ(ierr);
636540a9cdSHong Zhang      break;
646540a9cdSHong Zhang    case 2:
656540a9cdSHong Zhang      ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A,B,fill,C);CHKERRQ(ierr);
666540a9cdSHong Zhang      break;
676540a9cdSHong Zhang    case 3:
680ced3a2bSJed Brown      ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A,B,fill,C);CHKERRQ(ierr);
696540a9cdSHong Zhang      break;
706540a9cdSHong Zhang    case 4:
718a07c6f1SJed Brown      ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A,B,fill,C);CHKERRQ(ierr);
726540a9cdSHong Zhang      break;
736540a9cdSHong Zhang    case 5:
7458cf0668SJed Brown      ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A,B,fill,C);CHKERRQ(ierr);
756540a9cdSHong Zhang      break;
765e5acdf2Sstefano_zampini    case 6:
77d013fc79Sandi selinger      ierr = MatMatMult_SeqAIJ_SeqAIJ_Combined(A,B,fill,C);CHKERRQ(ierr);
78d013fc79Sandi selinger      break;
79d013fc79Sandi selinger    case 7:
80d7ed1a4dSandi selinger      ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A,B,fill,C);CHKERRQ(ierr);
81d7ed1a4dSandi selinger      break;
82d7ed1a4dSandi selinger  #if defined(PETSC_HAVE_HYPRE)
83d7ed1a4dSandi selinger    case 8:
845e5acdf2Sstefano_zampini      ierr = MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A,B,fill,C);CHKERRQ(ierr);
855e5acdf2Sstefano_zampini      break;
865e5acdf2Sstefano_zampini  #endif
876540a9cdSHong Zhang    default:
88df97dc6dSFande Kong      ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A,B,fill,C);CHKERRQ(ierr);
896540a9cdSHong Zhang      break;
9025023028SHong Zhang    }
915c66b693SKris Buschelman    PetscFunctionReturn(0);
925c66b693SKris Buschelman  }
931c24bd37SHong Zhang 
94df97dc6dSFande Kong  PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A,Mat B,PetscReal fill,Mat *C)
95b561aa9dSHong Zhang  {
96b561aa9dSHong Zhang    PetscErrorCode     ierr;
97b561aa9dSHong Zhang    Mat_SeqAIJ         *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
98971236abSHong Zhang    PetscInt           *ai=a->i,*bi=b->i,*ci,*cj;
99c1ba5575SJed Brown    PetscInt           am =A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
100b561aa9dSHong Zhang    PetscReal          afill;
101eca6b86aSHong Zhang    PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
102eca6b86aSHong Zhang    PetscTable         ta;
103fb908031SHong Zhang    PetscBT            lnkbt;
1040298fd71SBarry Smith    PetscFreeSpaceList free_space=NULL,current_space=NULL;
105b561aa9dSHong Zhang 
106b561aa9dSHong Zhang    PetscFunctionBegin;
107bd958071SHong Zhang    /* Get ci and cj */
108bd958071SHong Zhang    /*---------------*/
109fb908031SHong Zhang    /* Allocate ci array, arrays for fill computation and */
110fb908031SHong Zhang    /* free space for accumulating nonzero column info */
111854ce69bSBarry Smith    ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
112fb908031SHong Zhang    ci[0] = 0;
113fb908031SHong Zhang 
114fb908031SHong Zhang    /* create and initialize a linked list */
115c373ccc6SHong Zhang    ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr);
116c373ccc6SHong Zhang    MatRowMergeMax_SeqAIJ(b,bm,ta);
117eca6b86aSHong Zhang    ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
118eca6b86aSHong Zhang    ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
119eca6b86aSHong Zhang 
120eca6b86aSHong Zhang    ierr = PetscLLCondensedCreate(Crmax,bn,&lnk,&lnkbt);CHKERRQ(ierr);
121fb908031SHong Zhang 
122fb908031SHong Zhang    /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
123f91af8c7SBarry Smith    ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
1242205254eSKarl Rupp 
125fb908031SHong Zhang    current_space = free_space;
126fb908031SHong Zhang 
127fb908031SHong Zhang    /* Determine ci and cj */
128fb908031SHong Zhang    for (i=0; i<am; i++) {
129fb908031SHong Zhang      anzi = ai[i+1] - ai[i];
130fb908031SHong Zhang      aj   = a->j + ai[i];
131fb908031SHong Zhang      for (j=0; j<anzi; j++) {
132fb908031SHong Zhang        brow = aj[j];
133fb908031SHong Zhang        bnzj = bi[brow+1] - bi[brow];
134fb908031SHong Zhang        bj   = b->j + bi[brow];
135fb908031SHong Zhang        /* add non-zero cols of B into the sorted linked list lnk */
136fb908031SHong Zhang        ierr = PetscLLCondensedAddSorted(bnzj,bj,lnk,lnkbt);CHKERRQ(ierr);
137fb908031SHong Zhang      }
138fb908031SHong Zhang      cnzi = lnk[0];
139fb908031SHong Zhang 
140fb908031SHong Zhang      /* If free space is not available, make more free space */
141fb908031SHong Zhang      /* Double the amount of total space in the list */
142fb908031SHong Zhang      if (current_space->local_remaining<cnzi) {
143f91af8c7SBarry Smith        ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);CHKERRQ(ierr);
144fb908031SHong Zhang        ndouble++;
145fb908031SHong Zhang      }
146fb908031SHong Zhang 
147fb908031SHong Zhang      /* Copy data into free space, then initialize lnk */
148fb908031SHong Zhang      ierr = PetscLLCondensedClean(bn,cnzi,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
1492205254eSKarl Rupp 
150fb908031SHong Zhang      current_space->array           += cnzi;
151fb908031SHong Zhang      current_space->local_used      += cnzi;
152fb908031SHong Zhang      current_space->local_remaining -= cnzi;
1532205254eSKarl Rupp 
154fb908031SHong Zhang      ci[i+1] = ci[i] + cnzi;
155fb908031SHong Zhang    }
156fb908031SHong Zhang 
157fb908031SHong Zhang    /* Column indices are in the list of free space */
158fb908031SHong Zhang    /* Allocate space for cj, initialize cj, and */
159fb908031SHong Zhang    /* destroy list of free space and other temporary array(s) */
160854ce69bSBarry Smith    ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr);
161fb908031SHong Zhang    ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
162fb908031SHong Zhang    ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr);
163b561aa9dSHong Zhang 
16426be0446SHong Zhang    /* put together the new symbolic matrix */
165ce94432eSBarry Smith    ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr);
16633d57670SJed Brown    ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
16702fe1965SBarry Smith    ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr);
16858c24d83SHong Zhang 
16958c24d83SHong Zhang   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
17058c24d83SHong Zhang   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
17158c24d83SHong Zhang   c                         = (Mat_SeqAIJ*)((*C)->data);
172aa1aec99SHong Zhang   c->free_a                 = PETSC_FALSE;
173e6b907acSBarry Smith   c->free_ij                = PETSC_TRUE;
17458c24d83SHong Zhang   c->nonew                  = 0;
175df97dc6dSFande Kong   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; /* fast, needs non-scalable O(bn) array 'abdense' */
1760b7e3e3dSHong Zhang 
1777212da7cSHong Zhang   /* set MatInfo */
1787212da7cSHong Zhang   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
179f2b054eeSHong Zhang   if (afill < 1.0) afill = 1.0;
1807212da7cSHong Zhang   c->maxnz                     = ci[am];
1817212da7cSHong Zhang   c->nz                        = ci[am];
182fb908031SHong Zhang   (*C)->info.mallocs           = ndouble;
1837212da7cSHong Zhang   (*C)->info.fill_ratio_given  = fill;
1847212da7cSHong Zhang   (*C)->info.fill_ratio_needed = afill;
1857212da7cSHong Zhang 
1867212da7cSHong Zhang #if defined(PETSC_USE_INFO)
1877212da7cSHong Zhang   if (ci[am]) {
18857622a8eSBarry Smith     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
18957622a8eSBarry Smith     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
190f2b054eeSHong Zhang   } else {
191f2b054eeSHong Zhang     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
192be0fcf8dSHong Zhang   }
193f2b054eeSHong Zhang #endif
19458c24d83SHong Zhang   PetscFunctionReturn(0);
19558c24d83SHong Zhang }
196d50806bdSBarry Smith 
197df97dc6dSFande Kong PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,Mat C)
198d50806bdSBarry Smith {
199dfbe8321SBarry Smith   PetscErrorCode ierr;
200d13dce4bSSatish Balay   PetscLogDouble flops=0.0;
201d50806bdSBarry Smith   Mat_SeqAIJ     *a   = (Mat_SeqAIJ*)A->data;
202d50806bdSBarry Smith   Mat_SeqAIJ     *b   = (Mat_SeqAIJ*)B->data;
203d50806bdSBarry Smith   Mat_SeqAIJ     *c   = (Mat_SeqAIJ*)C->data;
20438baddfdSBarry Smith   PetscInt       *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
205c58ca2e3SHong Zhang   PetscInt       am   =A->rmap->n,cm=C->rmap->n;
206519eb980SHong Zhang   PetscInt       i,j,k,anzi,bnzi,cnzi,brow;
207aa1aec99SHong Zhang   PetscScalar    *aa=a->a,*ba=b->a,*baj,*ca,valtmp;
208aa1aec99SHong Zhang   PetscScalar    *ab_dense;
209d50806bdSBarry Smith 
210d50806bdSBarry Smith   PetscFunctionBegin;
211aa1aec99SHong Zhang   if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
212854ce69bSBarry Smith     ierr      = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
213aa1aec99SHong Zhang     c->a      = ca;
214aa1aec99SHong Zhang     c->free_a = PETSC_TRUE;
215aa1aec99SHong Zhang   } else {
216aa1aec99SHong Zhang     ca        = c->a;
217d90d86d0SMatthew G. Knepley   }
218d90d86d0SMatthew G. Knepley   if (!c->matmult_abdense) {
2191795a4d1SJed Brown     ierr = PetscCalloc1(B->cmap->N,&ab_dense);CHKERRQ(ierr);
220d90d86d0SMatthew G. Knepley     c->matmult_abdense = ab_dense;
221d90d86d0SMatthew G. Knepley   } else {
222aa1aec99SHong Zhang     ab_dense = c->matmult_abdense;
223aa1aec99SHong Zhang   }
224aa1aec99SHong Zhang 
225c124e916SHong Zhang   /* clean old values in C */
226580bdb30SBarry Smith   ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr);
227d50806bdSBarry Smith   /* Traverse A row-wise. */
228d50806bdSBarry Smith   /* Build the ith row in C by summing over nonzero columns in A, */
229d50806bdSBarry Smith   /* the rows of B corresponding to nonzeros of A. */
230d50806bdSBarry Smith   for (i=0; i<am; i++) {
231d50806bdSBarry Smith     anzi = ai[i+1] - ai[i];
232d50806bdSBarry Smith     for (j=0; j<anzi; j++) {
233519eb980SHong Zhang       brow = aj[j];
234d50806bdSBarry Smith       bnzi = bi[brow+1] - bi[brow];
235d50806bdSBarry Smith       bjj  = bj + bi[brow];
236d50806bdSBarry Smith       baj  = ba + bi[brow];
237519eb980SHong Zhang       /* perform dense axpy */
23836ec6d2dSHong Zhang       valtmp = aa[j];
239519eb980SHong Zhang       for (k=0; k<bnzi; k++) {
24036ec6d2dSHong Zhang         ab_dense[bjj[k]] += valtmp*baj[k];
241519eb980SHong Zhang       }
242519eb980SHong Zhang       flops += 2*bnzi;
243519eb980SHong Zhang     }
244c58ca2e3SHong Zhang     aj += anzi; aa += anzi;
245c58ca2e3SHong Zhang 
246c58ca2e3SHong Zhang     cnzi = ci[i+1] - ci[i];
247519eb980SHong Zhang     for (k=0; k<cnzi; k++) {
248519eb980SHong Zhang       ca[k]          += ab_dense[cj[k]];
249519eb980SHong Zhang       ab_dense[cj[k]] = 0.0; /* zero ab_dense */
250519eb980SHong Zhang     }
251519eb980SHong Zhang     flops += cnzi;
252519eb980SHong Zhang     cj    += cnzi; ca += cnzi;
253519eb980SHong Zhang   }
254c58ca2e3SHong Zhang   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
255c58ca2e3SHong Zhang   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
256c58ca2e3SHong Zhang   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
257c58ca2e3SHong Zhang   PetscFunctionReturn(0);
258c58ca2e3SHong Zhang }
259c58ca2e3SHong Zhang 
26025023028SHong Zhang PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C)
261c58ca2e3SHong Zhang {
262c58ca2e3SHong Zhang   PetscErrorCode ierr;
263c58ca2e3SHong Zhang   PetscLogDouble flops=0.0;
264c58ca2e3SHong Zhang   Mat_SeqAIJ     *a   = (Mat_SeqAIJ*)A->data;
265c58ca2e3SHong Zhang   Mat_SeqAIJ     *b   = (Mat_SeqAIJ*)B->data;
266c58ca2e3SHong Zhang   Mat_SeqAIJ     *c   = (Mat_SeqAIJ*)C->data;
267c58ca2e3SHong Zhang   PetscInt       *ai  = a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
268c58ca2e3SHong Zhang   PetscInt       am   = A->rmap->N,cm=C->rmap->N;
269c58ca2e3SHong Zhang   PetscInt       i,j,k,anzi,bnzi,cnzi,brow;
27036ec6d2dSHong Zhang   PetscScalar    *aa=a->a,*ba=b->a,*baj,*ca=c->a,valtmp;
271c58ca2e3SHong Zhang   PetscInt       nextb;
272c58ca2e3SHong Zhang 
273c58ca2e3SHong Zhang   PetscFunctionBegin;
274cf742fccSHong Zhang   if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
275cf742fccSHong Zhang     ierr      = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
276cf742fccSHong Zhang     c->a      = ca;
277cf742fccSHong Zhang     c->free_a = PETSC_TRUE;
278cf742fccSHong Zhang   }
279cf742fccSHong Zhang 
280c58ca2e3SHong Zhang   /* clean old values in C */
281580bdb30SBarry Smith   ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr);
282c58ca2e3SHong Zhang   /* Traverse A row-wise. */
283c58ca2e3SHong Zhang   /* Build the ith row in C by summing over nonzero columns in A, */
284c58ca2e3SHong Zhang   /* the rows of B corresponding to nonzeros of A. */
285519eb980SHong Zhang   for (i=0; i<am; i++) {
286519eb980SHong Zhang     anzi = ai[i+1] - ai[i];
287519eb980SHong Zhang     cnzi = ci[i+1] - ci[i];
288519eb980SHong Zhang     for (j=0; j<anzi; j++) {
289519eb980SHong Zhang       brow = aj[j];
290519eb980SHong Zhang       bnzi = bi[brow+1] - bi[brow];
291519eb980SHong Zhang       bjj  = bj + bi[brow];
292519eb980SHong Zhang       baj  = ba + bi[brow];
293519eb980SHong Zhang       /* perform sparse axpy */
29436ec6d2dSHong Zhang       valtmp = aa[j];
295c124e916SHong Zhang       nextb  = 0;
296c124e916SHong Zhang       for (k=0; nextb<bnzi; k++) {
297c124e916SHong Zhang         if (cj[k] == bjj[nextb]) { /* ccol == bcol */
29836ec6d2dSHong Zhang           ca[k] += valtmp*baj[nextb++];
299c124e916SHong Zhang         }
300d50806bdSBarry Smith       }
301d50806bdSBarry Smith       flops += 2*bnzi;
302d50806bdSBarry Smith     }
303519eb980SHong Zhang     aj += anzi; aa += anzi;
304519eb980SHong Zhang     cj += cnzi; ca += cnzi;
305d50806bdSBarry Smith   }
306c58ca2e3SHong Zhang 
307716bacf3SKris Buschelman   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
308716bacf3SKris Buschelman   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
309d50806bdSBarry Smith   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
310d50806bdSBarry Smith   PetscFunctionReturn(0);
311d50806bdSBarry Smith }
312bc011b1eSHong Zhang 
3133c50cad2SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A,Mat B,PetscReal fill,Mat *C)
31425296bd5SBarry Smith {
31525296bd5SBarry Smith   PetscErrorCode     ierr;
31625296bd5SBarry Smith   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
31725296bd5SBarry Smith   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
3183c50cad2SHong Zhang   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
31925296bd5SBarry Smith   MatScalar          *ca;
32025296bd5SBarry Smith   PetscReal          afill;
321eca6b86aSHong Zhang   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
322eca6b86aSHong Zhang   PetscTable         ta;
3230298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
32425296bd5SBarry Smith 
32525296bd5SBarry Smith   PetscFunctionBegin;
3263c50cad2SHong Zhang   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
3273c50cad2SHong Zhang   /*-----------------------------------------------------------------------------------------*/
3283c50cad2SHong Zhang   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
329854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
3303c50cad2SHong Zhang   ci[0] = 0;
3313c50cad2SHong Zhang 
3323c50cad2SHong Zhang   /* create and initialize a linked list */
333c373ccc6SHong Zhang   ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr);
334c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b,bm,ta);
335eca6b86aSHong Zhang   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
336eca6b86aSHong Zhang   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
337eca6b86aSHong Zhang 
338eca6b86aSHong Zhang   ierr = PetscLLCondensedCreate_fast(Crmax,&lnk);CHKERRQ(ierr);
3393c50cad2SHong Zhang 
3403c50cad2SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
341f91af8c7SBarry Smith   ierr          = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
3423c50cad2SHong Zhang   current_space = free_space;
3433c50cad2SHong Zhang 
3443c50cad2SHong Zhang   /* Determine ci and cj */
3453c50cad2SHong Zhang   for (i=0; i<am; i++) {
3463c50cad2SHong Zhang     anzi = ai[i+1] - ai[i];
3473c50cad2SHong Zhang     aj   = a->j + ai[i];
3483c50cad2SHong Zhang     for (j=0; j<anzi; j++) {
3493c50cad2SHong Zhang       brow = aj[j];
3503c50cad2SHong Zhang       bnzj = bi[brow+1] - bi[brow];
3513c50cad2SHong Zhang       bj   = b->j + bi[brow];
3523c50cad2SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
3533c50cad2SHong Zhang       ierr = PetscLLCondensedAddSorted_fast(bnzj,bj,lnk);CHKERRQ(ierr);
3543c50cad2SHong Zhang     }
3553c50cad2SHong Zhang     cnzi = lnk[1];
3563c50cad2SHong Zhang 
3573c50cad2SHong Zhang     /* If free space is not available, make more free space */
3583c50cad2SHong Zhang     /* Double the amount of total space in the list */
3593c50cad2SHong Zhang     if (current_space->local_remaining<cnzi) {
360f91af8c7SBarry Smith       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);CHKERRQ(ierr);
3613c50cad2SHong Zhang       ndouble++;
3623c50cad2SHong Zhang     }
3633c50cad2SHong Zhang 
3643c50cad2SHong Zhang     /* Copy data into free space, then initialize lnk */
3653c50cad2SHong Zhang     ierr = PetscLLCondensedClean_fast(cnzi,current_space->array,lnk);CHKERRQ(ierr);
3662205254eSKarl Rupp 
3673c50cad2SHong Zhang     current_space->array           += cnzi;
3683c50cad2SHong Zhang     current_space->local_used      += cnzi;
3693c50cad2SHong Zhang     current_space->local_remaining -= cnzi;
3702205254eSKarl Rupp 
3713c50cad2SHong Zhang     ci[i+1] = ci[i] + cnzi;
3723c50cad2SHong Zhang   }
3733c50cad2SHong Zhang 
3743c50cad2SHong Zhang   /* Column indices are in the list of free space */
3753c50cad2SHong Zhang   /* Allocate space for cj, initialize cj, and */
3763c50cad2SHong Zhang   /* destroy list of free space and other temporary array(s) */
377854ce69bSBarry Smith   ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr);
3783c50cad2SHong Zhang   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
3793c50cad2SHong Zhang   ierr = PetscLLCondensedDestroy_fast(lnk);CHKERRQ(ierr);
38025296bd5SBarry Smith 
38125296bd5SBarry Smith   /* Allocate space for ca */
382580bdb30SBarry Smith   ierr = PetscCalloc1(ci[am]+1,&ca);CHKERRQ(ierr);
38325296bd5SBarry Smith 
38425296bd5SBarry Smith   /* put together the new symbolic matrix */
385ce94432eSBarry Smith   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,C);CHKERRQ(ierr);
38633d57670SJed Brown   ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
38702fe1965SBarry Smith   ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr);
38825296bd5SBarry Smith 
38925296bd5SBarry Smith   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
39025296bd5SBarry Smith   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
39125296bd5SBarry Smith   c          = (Mat_SeqAIJ*)((*C)->data);
39225296bd5SBarry Smith   c->free_a  = PETSC_TRUE;
39325296bd5SBarry Smith   c->free_ij = PETSC_TRUE;
39425296bd5SBarry Smith   c->nonew   = 0;
3952205254eSKarl Rupp 
39625296bd5SBarry Smith   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; /* slower, less memory */
39725296bd5SBarry Smith 
39825296bd5SBarry Smith   /* set MatInfo */
39925296bd5SBarry Smith   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
40025296bd5SBarry Smith   if (afill < 1.0) afill = 1.0;
40125296bd5SBarry Smith   c->maxnz                     = ci[am];
40225296bd5SBarry Smith   c->nz                        = ci[am];
4033c50cad2SHong Zhang   (*C)->info.mallocs           = ndouble;
40425296bd5SBarry Smith   (*C)->info.fill_ratio_given  = fill;
40525296bd5SBarry Smith   (*C)->info.fill_ratio_needed = afill;
40625296bd5SBarry Smith 
40725296bd5SBarry Smith #if defined(PETSC_USE_INFO)
40825296bd5SBarry Smith   if (ci[am]) {
40957622a8eSBarry Smith     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
41057622a8eSBarry Smith     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
41125296bd5SBarry Smith   } else {
41225296bd5SBarry Smith     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
41325296bd5SBarry Smith   }
41425296bd5SBarry Smith #endif
41525296bd5SBarry Smith   PetscFunctionReturn(0);
41625296bd5SBarry Smith }
41725296bd5SBarry Smith 
41825296bd5SBarry Smith 
41925023028SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat *C)
420e9e4536cSHong Zhang {
421e9e4536cSHong Zhang   PetscErrorCode     ierr;
422e9e4536cSHong Zhang   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
423bf9555e6SHong Zhang   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
42425c41797SHong Zhang   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
425e9e4536cSHong Zhang   MatScalar          *ca;
426e9e4536cSHong Zhang   PetscReal          afill;
427eca6b86aSHong Zhang   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
428eca6b86aSHong Zhang   PetscTable         ta;
4290298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
430e9e4536cSHong Zhang 
431e9e4536cSHong Zhang   PetscFunctionBegin;
432bd958071SHong Zhang   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
433bd958071SHong Zhang   /*---------------------------------------------------------------------------------------------*/
434bd958071SHong Zhang   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
435854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
436bd958071SHong Zhang   ci[0] = 0;
437bd958071SHong Zhang 
438bd958071SHong Zhang   /* create and initialize a linked list */
439c373ccc6SHong Zhang   ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr);
440c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b,bm,ta);
441eca6b86aSHong Zhang   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
442eca6b86aSHong Zhang   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
443eca6b86aSHong Zhang   ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);
444bd958071SHong Zhang 
445bd958071SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
446f91af8c7SBarry Smith   ierr          = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
447bd958071SHong Zhang   current_space = free_space;
448bd958071SHong Zhang 
449bd958071SHong Zhang   /* Determine ci and cj */
450bd958071SHong Zhang   for (i=0; i<am; i++) {
451bd958071SHong Zhang     anzi = ai[i+1] - ai[i];
452bd958071SHong Zhang     aj   = a->j + ai[i];
453bd958071SHong Zhang     for (j=0; j<anzi; j++) {
454bd958071SHong Zhang       brow = aj[j];
455bd958071SHong Zhang       bnzj = bi[brow+1] - bi[brow];
456bd958071SHong Zhang       bj   = b->j + bi[brow];
457bd958071SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
458bd958071SHong Zhang       ierr = PetscLLCondensedAddSorted_Scalable(bnzj,bj,lnk);CHKERRQ(ierr);
459bd958071SHong Zhang     }
460bd958071SHong Zhang     cnzi = lnk[0];
461bd958071SHong Zhang 
462bd958071SHong Zhang     /* If free space is not available, make more free space */
463bd958071SHong Zhang     /* Double the amount of total space in the list */
464bd958071SHong Zhang     if (current_space->local_remaining<cnzi) {
465f91af8c7SBarry Smith       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);CHKERRQ(ierr);
466bd958071SHong Zhang       ndouble++;
467bd958071SHong Zhang     }
468bd958071SHong Zhang 
469bd958071SHong Zhang     /* Copy data into free space, then initialize lnk */
470bd958071SHong Zhang     ierr = PetscLLCondensedClean_Scalable(cnzi,current_space->array,lnk);CHKERRQ(ierr);
4712205254eSKarl Rupp 
472bd958071SHong Zhang     current_space->array           += cnzi;
473bd958071SHong Zhang     current_space->local_used      += cnzi;
474bd958071SHong Zhang     current_space->local_remaining -= cnzi;
4752205254eSKarl Rupp 
476bd958071SHong Zhang     ci[i+1] = ci[i] + cnzi;
477bd958071SHong Zhang   }
478bd958071SHong Zhang 
479bd958071SHong Zhang   /* Column indices are in the list of free space */
480bd958071SHong Zhang   /* Allocate space for cj, initialize cj, and */
481bd958071SHong Zhang   /* destroy list of free space and other temporary array(s) */
482854ce69bSBarry Smith   ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr);
483bd958071SHong Zhang   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
484bd958071SHong Zhang   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
485e9e4536cSHong Zhang 
486e9e4536cSHong Zhang   /* Allocate space for ca */
487bd958071SHong Zhang   /*-----------------------*/
488580bdb30SBarry Smith   ierr = PetscCalloc1(ci[am]+1,&ca);CHKERRQ(ierr);
489e9e4536cSHong Zhang 
490e9e4536cSHong Zhang   /* put together the new symbolic matrix */
491ce94432eSBarry Smith   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,C);CHKERRQ(ierr);
49233d57670SJed Brown   ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
49302fe1965SBarry Smith   ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr);
494e9e4536cSHong Zhang 
495e9e4536cSHong Zhang   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
496e9e4536cSHong Zhang   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
497e9e4536cSHong Zhang   c          = (Mat_SeqAIJ*)((*C)->data);
498e9e4536cSHong Zhang   c->free_a  = PETSC_TRUE;
499e9e4536cSHong Zhang   c->free_ij = PETSC_TRUE;
500e9e4536cSHong Zhang   c->nonew   = 0;
5012205254eSKarl Rupp 
50225023028SHong Zhang   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; /* slower, less memory */
503e9e4536cSHong Zhang 
504e9e4536cSHong Zhang   /* set MatInfo */
505e9e4536cSHong Zhang   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
506e9e4536cSHong Zhang   if (afill < 1.0) afill = 1.0;
507e9e4536cSHong Zhang   c->maxnz                     = ci[am];
508e9e4536cSHong Zhang   c->nz                        = ci[am];
509bd958071SHong Zhang   (*C)->info.mallocs           = ndouble;
510e9e4536cSHong Zhang   (*C)->info.fill_ratio_given  = fill;
511e9e4536cSHong Zhang   (*C)->info.fill_ratio_needed = afill;
512e9e4536cSHong Zhang 
513e9e4536cSHong Zhang #if defined(PETSC_USE_INFO)
514e9e4536cSHong Zhang   if (ci[am]) {
51557622a8eSBarry Smith     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
51657622a8eSBarry Smith     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
517e9e4536cSHong Zhang   } else {
518e9e4536cSHong Zhang     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
519e9e4536cSHong Zhang   }
520e9e4536cSHong Zhang #endif
521e9e4536cSHong Zhang   PetscFunctionReturn(0);
522e9e4536cSHong Zhang }
523e9e4536cSHong Zhang 
5240ced3a2bSJed Brown PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A,Mat B,PetscReal fill,Mat *C)
5250ced3a2bSJed Brown {
5260ced3a2bSJed Brown   PetscErrorCode     ierr;
5270ced3a2bSJed Brown   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
5280ced3a2bSJed Brown   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j;
5290ced3a2bSJed Brown   PetscInt           *ci,*cj,*bb;
5300ced3a2bSJed Brown   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
5310ced3a2bSJed Brown   PetscReal          afill;
5320ced3a2bSJed Brown   PetscInt           i,j,col,ndouble = 0;
5330298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
5340ced3a2bSJed Brown   PetscHeap          h;
5350ced3a2bSJed Brown 
5360ced3a2bSJed Brown   PetscFunctionBegin;
537cd093f1dSJed Brown   /* Get ci and cj - by merging sorted rows using a heap */
5380ced3a2bSJed Brown   /*---------------------------------------------------------------------------------------------*/
5390ced3a2bSJed Brown   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
540854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
5410ced3a2bSJed Brown   ci[0] = 0;
5420ced3a2bSJed Brown 
5430ced3a2bSJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
544f91af8c7SBarry Smith   ierr          = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
5450ced3a2bSJed Brown   current_space = free_space;
5460ced3a2bSJed Brown 
5470ced3a2bSJed Brown   ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr);
548785e854fSJed Brown   ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr);
5490ced3a2bSJed Brown 
5500ced3a2bSJed Brown   /* Determine ci and cj */
5510ced3a2bSJed Brown   for (i=0; i<am; i++) {
5520ced3a2bSJed Brown     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
5530ced3a2bSJed Brown     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
5540ced3a2bSJed Brown     ci[i+1] = ci[i];
5550ced3a2bSJed Brown     /* Populate the min heap */
5560ced3a2bSJed Brown     for (j=0; j<anzi; j++) {
5570ced3a2bSJed Brown       bb[j] = bi[acol[j]];         /* bb points at the start of the row */
5580ced3a2bSJed Brown       if (bb[j] < bi[acol[j]+1]) { /* Add if row is nonempty */
5590ced3a2bSJed Brown         ierr = PetscHeapAdd(h,j,bj[bb[j]++]);CHKERRQ(ierr);
5600ced3a2bSJed Brown       }
5610ced3a2bSJed Brown     }
5620ced3a2bSJed Brown     /* Pick off the min element, adding it to free space */
5630ced3a2bSJed Brown     ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
5640ced3a2bSJed Brown     while (j >= 0) {
5650ced3a2bSJed Brown       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
566f91af8c7SBarry Smith         ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space);CHKERRQ(ierr);
5670ced3a2bSJed Brown         ndouble++;
5680ced3a2bSJed Brown       }
5690ced3a2bSJed Brown       *(current_space->array++) = col;
5700ced3a2bSJed Brown       current_space->local_used++;
5710ced3a2bSJed Brown       current_space->local_remaining--;
5720ced3a2bSJed Brown       ci[i+1]++;
5730ced3a2bSJed Brown 
5740ced3a2bSJed Brown       /* stash if anything else remains in this row of B */
5750ced3a2bSJed Brown       if (bb[j] < bi[acol[j]+1]) {ierr = PetscHeapStash(h,j,bj[bb[j]++]);CHKERRQ(ierr);}
5760ced3a2bSJed Brown       while (1) {               /* pop and stash any other rows of B that also had an entry in this column */
5770ced3a2bSJed Brown         PetscInt j2,col2;
5780ced3a2bSJed Brown         ierr = PetscHeapPeek(h,&j2,&col2);CHKERRQ(ierr);
5790ced3a2bSJed Brown         if (col2 != col) break;
5800ced3a2bSJed Brown         ierr = PetscHeapPop(h,&j2,&col2);CHKERRQ(ierr);
5810ced3a2bSJed Brown         if (bb[j2] < bi[acol[j2]+1]) {ierr = PetscHeapStash(h,j2,bj[bb[j2]++]);CHKERRQ(ierr);}
5820ced3a2bSJed Brown       }
5830ced3a2bSJed Brown       /* Put any stashed elements back into the min heap */
5840ced3a2bSJed Brown       ierr = PetscHeapUnstash(h);CHKERRQ(ierr);
5850ced3a2bSJed Brown       ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
5860ced3a2bSJed Brown     }
5870ced3a2bSJed Brown   }
5880ced3a2bSJed Brown   ierr = PetscFree(bb);CHKERRQ(ierr);
5890ced3a2bSJed Brown   ierr = PetscHeapDestroy(&h);CHKERRQ(ierr);
5900ced3a2bSJed Brown 
5910ced3a2bSJed Brown   /* Column indices are in the list of free space */
5920ced3a2bSJed Brown   /* Allocate space for cj, initialize cj, and */
5930ced3a2bSJed Brown   /* destroy list of free space and other temporary array(s) */
594785e854fSJed Brown   ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr);
5950ced3a2bSJed Brown   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
5960ced3a2bSJed Brown 
5970ced3a2bSJed Brown   /* put together the new symbolic matrix */
598ce94432eSBarry Smith   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr);
59933d57670SJed Brown   ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
60002fe1965SBarry Smith   ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr);
6010ced3a2bSJed Brown 
6020ced3a2bSJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
6030ced3a2bSJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
6040ced3a2bSJed Brown   c          = (Mat_SeqAIJ*)((*C)->data);
6050ced3a2bSJed Brown   c->free_a  = PETSC_TRUE;
6060ced3a2bSJed Brown   c->free_ij = PETSC_TRUE;
6070ced3a2bSJed Brown   c->nonew   = 0;
60826fbe8dcSKarl Rupp 
609df97dc6dSFande Kong   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
6100ced3a2bSJed Brown 
6110ced3a2bSJed Brown   /* set MatInfo */
6120ced3a2bSJed Brown   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
6130ced3a2bSJed Brown   if (afill < 1.0) afill = 1.0;
6140ced3a2bSJed Brown   c->maxnz                     = ci[am];
6150ced3a2bSJed Brown   c->nz                        = ci[am];
6160ced3a2bSJed Brown   (*C)->info.mallocs           = ndouble;
6170ced3a2bSJed Brown   (*C)->info.fill_ratio_given  = fill;
6180ced3a2bSJed Brown   (*C)->info.fill_ratio_needed = afill;
6190ced3a2bSJed Brown 
6200ced3a2bSJed Brown #if defined(PETSC_USE_INFO)
6210ced3a2bSJed Brown   if (ci[am]) {
62257622a8eSBarry Smith     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
62357622a8eSBarry Smith     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
6240ced3a2bSJed Brown   } else {
6250ced3a2bSJed Brown     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
6260ced3a2bSJed Brown   }
6270ced3a2bSJed Brown #endif
6280ced3a2bSJed Brown   PetscFunctionReturn(0);
6290ced3a2bSJed Brown }
630e9e4536cSHong Zhang 
6318a07c6f1SJed Brown PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A,Mat B,PetscReal fill,Mat *C)
6328a07c6f1SJed Brown {
6338a07c6f1SJed Brown   PetscErrorCode     ierr;
6348a07c6f1SJed Brown   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
6358a07c6f1SJed Brown   const PetscInt     *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
6368a07c6f1SJed Brown   PetscInt           *ci,*cj,*bb;
6378a07c6f1SJed Brown   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
6388a07c6f1SJed Brown   PetscReal          afill;
6398a07c6f1SJed Brown   PetscInt           i,j,col,ndouble = 0;
6400298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
6418a07c6f1SJed Brown   PetscHeap          h;
6428a07c6f1SJed Brown   PetscBT            bt;
6438a07c6f1SJed Brown 
6448a07c6f1SJed Brown   PetscFunctionBegin;
645cd093f1dSJed Brown   /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */
6468a07c6f1SJed Brown   /*---------------------------------------------------------------------------------------------*/
6478a07c6f1SJed Brown   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
648854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
6498a07c6f1SJed Brown   ci[0] = 0;
6508a07c6f1SJed Brown 
6518a07c6f1SJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
652f91af8c7SBarry Smith   ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
6532205254eSKarl Rupp 
6548a07c6f1SJed Brown   current_space = free_space;
6558a07c6f1SJed Brown 
6568a07c6f1SJed Brown   ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr);
657785e854fSJed Brown   ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr);
6588a07c6f1SJed Brown   ierr = PetscBTCreate(bn,&bt);CHKERRQ(ierr);
6598a07c6f1SJed Brown 
6608a07c6f1SJed Brown   /* Determine ci and cj */
6618a07c6f1SJed Brown   for (i=0; i<am; i++) {
6628a07c6f1SJed Brown     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
6638a07c6f1SJed Brown     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
6648a07c6f1SJed Brown     const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */
6658a07c6f1SJed Brown     ci[i+1] = ci[i];
6668a07c6f1SJed Brown     /* Populate the min heap */
6678a07c6f1SJed Brown     for (j=0; j<anzi; j++) {
6688a07c6f1SJed Brown       PetscInt brow = acol[j];
6698a07c6f1SJed Brown       for (bb[j] = bi[brow]; bb[j] < bi[brow+1]; bb[j]++) {
6708a07c6f1SJed Brown         PetscInt bcol = bj[bb[j]];
6718a07c6f1SJed Brown         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
6728a07c6f1SJed Brown           ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr);
6738a07c6f1SJed Brown           bb[j]++;
6748a07c6f1SJed Brown           break;
6758a07c6f1SJed Brown         }
6768a07c6f1SJed Brown       }
6778a07c6f1SJed Brown     }
6788a07c6f1SJed Brown     /* Pick off the min element, adding it to free space */
6798a07c6f1SJed Brown     ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
6808a07c6f1SJed Brown     while (j >= 0) {
6818a07c6f1SJed Brown       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
6820298fd71SBarry Smith         fptr = NULL;                      /* need PetscBTMemzero */
683f91af8c7SBarry Smith         ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space);CHKERRQ(ierr);
6848a07c6f1SJed Brown         ndouble++;
6858a07c6f1SJed Brown       }
6868a07c6f1SJed Brown       *(current_space->array++) = col;
6878a07c6f1SJed Brown       current_space->local_used++;
6888a07c6f1SJed Brown       current_space->local_remaining--;
6898a07c6f1SJed Brown       ci[i+1]++;
6908a07c6f1SJed Brown 
6918a07c6f1SJed Brown       /* stash if anything else remains in this row of B */
6928a07c6f1SJed Brown       for (; bb[j] < bi[acol[j]+1]; bb[j]++) {
6938a07c6f1SJed Brown         PetscInt bcol = bj[bb[j]];
6948a07c6f1SJed Brown         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
6958a07c6f1SJed Brown           ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr);
6968a07c6f1SJed Brown           bb[j]++;
6978a07c6f1SJed Brown           break;
6988a07c6f1SJed Brown         }
6998a07c6f1SJed Brown       }
7008a07c6f1SJed Brown       ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
7018a07c6f1SJed Brown     }
7028a07c6f1SJed Brown     if (fptr) {                 /* Clear the bits for this row */
7038a07c6f1SJed Brown       for (; fptr<current_space->array; fptr++) {ierr = PetscBTClear(bt,*fptr);CHKERRQ(ierr);}
7048a07c6f1SJed Brown     } else {                    /* We reallocated so we don't remember (easily) how to clear only the bits we changed */
7058a07c6f1SJed Brown       ierr = PetscBTMemzero(bn,bt);CHKERRQ(ierr);
7068a07c6f1SJed Brown     }
7078a07c6f1SJed Brown   }
7088a07c6f1SJed Brown   ierr = PetscFree(bb);CHKERRQ(ierr);
7098a07c6f1SJed Brown   ierr = PetscHeapDestroy(&h);CHKERRQ(ierr);
7108a07c6f1SJed Brown   ierr = PetscBTDestroy(&bt);CHKERRQ(ierr);
7118a07c6f1SJed Brown 
7128a07c6f1SJed Brown   /* Column indices are in the list of free space */
7138a07c6f1SJed Brown   /* Allocate space for cj, initialize cj, and */
7148a07c6f1SJed Brown   /* destroy list of free space and other temporary array(s) */
715785e854fSJed Brown   ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr);
7168a07c6f1SJed Brown   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
7178a07c6f1SJed Brown 
7188a07c6f1SJed Brown   /* put together the new symbolic matrix */
719ce94432eSBarry Smith   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr);
72033d57670SJed Brown   ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
72102fe1965SBarry Smith   ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr);
7228a07c6f1SJed Brown 
7238a07c6f1SJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
7248a07c6f1SJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
7258a07c6f1SJed Brown   c          = (Mat_SeqAIJ*)((*C)->data);
7268a07c6f1SJed Brown   c->free_a  = PETSC_TRUE;
7278a07c6f1SJed Brown   c->free_ij = PETSC_TRUE;
7288a07c6f1SJed Brown   c->nonew   = 0;
72926fbe8dcSKarl Rupp 
730df97dc6dSFande Kong   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
7318a07c6f1SJed Brown 
7328a07c6f1SJed Brown   /* set MatInfo */
7338a07c6f1SJed Brown   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
7348a07c6f1SJed Brown   if (afill < 1.0) afill = 1.0;
7358a07c6f1SJed Brown   c->maxnz                     = ci[am];
7368a07c6f1SJed Brown   c->nz                        = ci[am];
7378a07c6f1SJed Brown   (*C)->info.mallocs           = ndouble;
7388a07c6f1SJed Brown   (*C)->info.fill_ratio_given  = fill;
7398a07c6f1SJed Brown   (*C)->info.fill_ratio_needed = afill;
7408a07c6f1SJed Brown 
7418a07c6f1SJed Brown #if defined(PETSC_USE_INFO)
7428a07c6f1SJed Brown   if (ci[am]) {
74357622a8eSBarry Smith     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
74457622a8eSBarry Smith     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
7458a07c6f1SJed Brown   } else {
7468a07c6f1SJed Brown     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
7478a07c6f1SJed Brown   }
7488a07c6f1SJed Brown #endif
7498a07c6f1SJed Brown   PetscFunctionReturn(0);
7508a07c6f1SJed Brown }
7518a07c6f1SJed Brown 
752d7ed1a4dSandi selinger 
753d7ed1a4dSandi selinger PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A,Mat B,PetscReal fill,Mat *C)
754d7ed1a4dSandi selinger {
755d7ed1a4dSandi selinger   PetscErrorCode     ierr;
756d7ed1a4dSandi selinger   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
757d7ed1a4dSandi selinger   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j,*inputi,*inputj,*inputcol,*inputcol_L1;
758d7ed1a4dSandi selinger   PetscInt           *ci,*cj,*outputj,worki_L1[9],worki_L2[9];
759d7ed1a4dSandi selinger   PetscInt           c_maxmem,a_maxrownnz=0,a_rownnz;
760d7ed1a4dSandi selinger   const PetscInt     workcol[8]={0,1,2,3,4,5,6,7};
761d7ed1a4dSandi selinger   const PetscInt     am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
762d7ed1a4dSandi selinger   const PetscInt     *brow_ptr[8],*brow_end[8];
763d7ed1a4dSandi selinger   PetscInt           window[8];
764d7ed1a4dSandi selinger   PetscInt           window_min,old_window_min,ci_nnz,outputi_nnz=0,L1_nrows,L2_nrows;
765d7ed1a4dSandi selinger   PetscInt           i,k,ndouble=0,L1_rowsleft,rowsleft;
766d7ed1a4dSandi selinger   PetscReal          afill;
767f83700f2Sandi selinger   PetscInt           *workj_L1,*workj_L2,*workj_L3;
7687660c4dbSandi selinger   PetscInt           L1_nnz,L2_nnz;
769d7ed1a4dSandi selinger 
770d7ed1a4dSandi selinger   /* Step 1: Get upper bound on memory required for allocation.
771d7ed1a4dSandi selinger              Because of the way virtual memory works,
772d7ed1a4dSandi selinger              only the memory pages that are actually needed will be physically allocated. */
773d7ed1a4dSandi selinger   PetscFunctionBegin;
774d7ed1a4dSandi selinger   ierr = PetscMalloc1(am+1,&ci);CHKERRQ(ierr);
775d7ed1a4dSandi selinger   for (i=0; i<am; i++) {
776d7ed1a4dSandi selinger     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
777d7ed1a4dSandi selinger     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
778d7ed1a4dSandi selinger     a_rownnz = 0;
779d7ed1a4dSandi selinger     for (k=0; k<anzi; ++k) {
780d7ed1a4dSandi selinger       a_rownnz += bi[acol[k]+1] - bi[acol[k]];
781d7ed1a4dSandi selinger       if (a_rownnz > bn) {
782d7ed1a4dSandi selinger         a_rownnz = bn;
783d7ed1a4dSandi selinger         break;
784d7ed1a4dSandi selinger       }
785d7ed1a4dSandi selinger     }
786d7ed1a4dSandi selinger     a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz);
787d7ed1a4dSandi selinger   }
788d7ed1a4dSandi selinger   /* temporary work areas for merging rows */
789d7ed1a4dSandi selinger   ierr = PetscMalloc1(a_maxrownnz*8,&workj_L1);CHKERRQ(ierr);
790f83700f2Sandi selinger   ierr = PetscMalloc1(a_maxrownnz*8,&workj_L2);CHKERRQ(ierr);
791f83700f2Sandi selinger   ierr = PetscMalloc1(a_maxrownnz,&workj_L3);CHKERRQ(ierr);
792d7ed1a4dSandi selinger 
7937660c4dbSandi selinger   /* This should be enough for almost all matrices. If not, memory is reallocated later. */
7947660c4dbSandi selinger   c_maxmem = 8*(ai[am]+bi[bm]);
795d7ed1a4dSandi selinger   /* Step 2: Populate pattern for C */
796d7ed1a4dSandi selinger   ierr  = PetscMalloc1(c_maxmem,&cj);CHKERRQ(ierr);
797d7ed1a4dSandi selinger 
798d7ed1a4dSandi selinger   ci_nnz       = 0;
799d7ed1a4dSandi selinger   ci[0]        = 0;
800d7ed1a4dSandi selinger   worki_L1[0]  = 0;
801d7ed1a4dSandi selinger   worki_L2[0]  = 0;
802d7ed1a4dSandi selinger   for (i=0; i<am; i++) {
803d7ed1a4dSandi selinger     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
804d7ed1a4dSandi selinger     const PetscInt *acol = aj + ai[i];      /* column indices of nonzero entries in this row */
805d7ed1a4dSandi selinger     rowsleft             = anzi;
806d7ed1a4dSandi selinger     inputcol_L1          = acol;
807d7ed1a4dSandi selinger     L2_nnz               = 0;
8087660c4dbSandi selinger     L2_nrows             = 1;  /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1   */
8097660c4dbSandi selinger     worki_L2[1]          = 0;
81008fe1b3cSKarl Rupp     outputi_nnz          = 0;
811d7ed1a4dSandi selinger 
812d7ed1a4dSandi selinger     /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem  -> allocate more memory */
813d7ed1a4dSandi selinger     while (ci_nnz+a_maxrownnz > c_maxmem) {
814d7ed1a4dSandi selinger       c_maxmem *= 2;
815d7ed1a4dSandi selinger       ndouble++;
816d7ed1a4dSandi selinger       ierr = PetscRealloc(sizeof(PetscInt)*c_maxmem,&cj);CHKERRQ(ierr);
817d7ed1a4dSandi selinger     }
818d7ed1a4dSandi selinger 
819d7ed1a4dSandi selinger     while (rowsleft) {
820d7ed1a4dSandi selinger       L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */
821d7ed1a4dSandi selinger       L1_nrows    = 0;
8227660c4dbSandi selinger       L1_nnz      = 0;
823d7ed1a4dSandi selinger       inputcol    = inputcol_L1;
824d7ed1a4dSandi selinger       inputi      = bi;
825d7ed1a4dSandi selinger       inputj      = bj;
826d7ed1a4dSandi selinger 
827d7ed1a4dSandi selinger       /* The following macro is used to specialize for small rows in A.
828d7ed1a4dSandi selinger          This helps with compiler unrolling, improving performance substantially.
829f83700f2Sandi selinger           Input:  inputj   inputi  inputcol  bn
830d7ed1a4dSandi selinger           Output: outputj  outputi_nnz                       */
831d7ed1a4dSandi selinger        #define MatMatMultSymbolic_RowMergeMacro(ANNZ)                        \
832d7ed1a4dSandi selinger          window_min  = bn;                                                   \
8337660c4dbSandi selinger          outputi_nnz = 0;                                                    \
8347660c4dbSandi selinger          for (k=0; k<ANNZ; ++k) {                                            \
835d7ed1a4dSandi selinger            brow_ptr[k] = inputj + inputi[inputcol[k]];                       \
836d7ed1a4dSandi selinger            brow_end[k] = inputj + inputi[inputcol[k]+1];                     \
837d7ed1a4dSandi selinger            window[k]   = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn;   \
838d7ed1a4dSandi selinger            window_min  = PetscMin(window[k], window_min);                    \
839d7ed1a4dSandi selinger          }                                                                   \
840d7ed1a4dSandi selinger          while (window_min < bn) {                                           \
841d7ed1a4dSandi selinger            outputj[outputi_nnz++] = window_min;                              \
842d7ed1a4dSandi selinger            /* advance front and compute new minimum */                       \
843d7ed1a4dSandi selinger            old_window_min = window_min;                                      \
844d7ed1a4dSandi selinger            window_min = bn;                                                  \
845d7ed1a4dSandi selinger            for (k=0; k<ANNZ; ++k) {                                          \
846d7ed1a4dSandi selinger              if (window[k] == old_window_min) {                              \
847d7ed1a4dSandi selinger                brow_ptr[k]++;                                                \
848d7ed1a4dSandi selinger                window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
849d7ed1a4dSandi selinger              }                                                               \
850d7ed1a4dSandi selinger              window_min = PetscMin(window[k], window_min);                   \
851d7ed1a4dSandi selinger            }                                                                 \
852d7ed1a4dSandi selinger          }
853d7ed1a4dSandi selinger 
854d7ed1a4dSandi selinger       /************** L E V E L  1 ***************/
855d7ed1a4dSandi selinger       /* Merge up to 8 rows of B to L1 work array*/
856d7ed1a4dSandi selinger       while (L1_rowsleft) {
8577660c4dbSandi selinger         outputi_nnz = 0;
8587660c4dbSandi selinger         if (anzi > 8)  outputj = workj_L1 + L1_nnz;     /* Level 1 rowmerge*/
8597660c4dbSandi selinger         else           outputj = cj + ci_nnz;           /* Merge directly to C */
8607660c4dbSandi selinger 
861d7ed1a4dSandi selinger         switch (L1_rowsleft) {
862d7ed1a4dSandi selinger         case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
863d7ed1a4dSandi selinger                  brow_end[0] = inputj + inputi[inputcol[0]+1];
864d7ed1a4dSandi selinger                  for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
865d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
866d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
867d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
868d7ed1a4dSandi selinger                  break;
869d7ed1a4dSandi selinger         case 2:  MatMatMultSymbolic_RowMergeMacro(2);
870d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
871d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
872d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
873d7ed1a4dSandi selinger                  break;
874d7ed1a4dSandi selinger         case 3: MatMatMultSymbolic_RowMergeMacro(3);
875d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
876d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
877d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
878d7ed1a4dSandi selinger                  break;
879d7ed1a4dSandi selinger         case 4:  MatMatMultSymbolic_RowMergeMacro(4);
880d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
881d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
882d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
883d7ed1a4dSandi selinger                  break;
884d7ed1a4dSandi selinger         case 5:  MatMatMultSymbolic_RowMergeMacro(5);
885d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
886d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
887d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
888d7ed1a4dSandi selinger                  break;
889d7ed1a4dSandi selinger         case 6:  MatMatMultSymbolic_RowMergeMacro(6);
890d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
891d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
892d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
893d7ed1a4dSandi selinger                  break;
894d7ed1a4dSandi selinger         case 7:  MatMatMultSymbolic_RowMergeMacro(7);
895d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
896d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
897d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
898d7ed1a4dSandi selinger                  break;
899d7ed1a4dSandi selinger         default: MatMatMultSymbolic_RowMergeMacro(8);
900d7ed1a4dSandi selinger                  inputcol    += 8;
901d7ed1a4dSandi selinger                  rowsleft    -= 8;
902d7ed1a4dSandi selinger                  L1_rowsleft -= 8;
903d7ed1a4dSandi selinger                  break;
904d7ed1a4dSandi selinger         }
905d7ed1a4dSandi selinger         inputcol_L1           = inputcol;
9067660c4dbSandi selinger         L1_nnz               += outputi_nnz;
9077660c4dbSandi selinger         worki_L1[++L1_nrows]  = L1_nnz;
908d7ed1a4dSandi selinger       }
909d7ed1a4dSandi selinger 
910d7ed1a4dSandi selinger       /********************** L E V E L  2 ************************/
911d7ed1a4dSandi selinger       /* Merge from L1 work array to either C or to L2 work array */
912d7ed1a4dSandi selinger       if (anzi > 8) {
913d7ed1a4dSandi selinger         inputi      = worki_L1;
914d7ed1a4dSandi selinger         inputj      = workj_L1;
915d7ed1a4dSandi selinger         inputcol    = workcol;
916d7ed1a4dSandi selinger         outputi_nnz = 0;
917d7ed1a4dSandi selinger 
918d7ed1a4dSandi selinger         if (anzi <= 64) outputj = cj + ci_nnz;        /* Merge from L1 work array to C */
919d7ed1a4dSandi selinger         else            outputj = workj_L2 + L2_nnz;  /* Merge from L1 work array to L2 work array */
920d7ed1a4dSandi selinger 
921d7ed1a4dSandi selinger         switch (L1_nrows) {
922d7ed1a4dSandi selinger         case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
923d7ed1a4dSandi selinger                  brow_end[0] = inputj + inputi[inputcol[0]+1];
924d7ed1a4dSandi selinger                  for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
925d7ed1a4dSandi selinger                  break;
926d7ed1a4dSandi selinger         case 2:  MatMatMultSymbolic_RowMergeMacro(2); break;
927d7ed1a4dSandi selinger         case 3:  MatMatMultSymbolic_RowMergeMacro(3); break;
928d7ed1a4dSandi selinger         case 4:  MatMatMultSymbolic_RowMergeMacro(4); break;
929d7ed1a4dSandi selinger         case 5:  MatMatMultSymbolic_RowMergeMacro(5); break;
930d7ed1a4dSandi selinger         case 6:  MatMatMultSymbolic_RowMergeMacro(6); break;
931d7ed1a4dSandi selinger         case 7:  MatMatMultSymbolic_RowMergeMacro(7); break;
932d7ed1a4dSandi selinger         case 8:  MatMatMultSymbolic_RowMergeMacro(8); break;
933d7ed1a4dSandi selinger         default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L1 work array!");
934d7ed1a4dSandi selinger         }
935d7ed1a4dSandi selinger         L2_nnz               += outputi_nnz;
936d7ed1a4dSandi selinger         worki_L2[++L2_nrows]  = L2_nnz;
937d7ed1a4dSandi selinger 
9387660c4dbSandi selinger         /************************ L E V E L  3 **********************/
9397660c4dbSandi selinger         /* Merge from L2 work array to either C or to L2 work array */
940d7ed1a4dSandi selinger         if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) {
941d7ed1a4dSandi selinger           inputi      = worki_L2;
942d7ed1a4dSandi selinger           inputj      = workj_L2;
943d7ed1a4dSandi selinger           inputcol    = workcol;
944d7ed1a4dSandi selinger           outputi_nnz = 0;
945f83700f2Sandi selinger           if (rowsleft) outputj = workj_L3;
946d7ed1a4dSandi selinger           else          outputj = cj + ci_nnz;
947d7ed1a4dSandi selinger           switch (L2_nrows) {
948d7ed1a4dSandi selinger           case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
949d7ed1a4dSandi selinger                    brow_end[0] = inputj + inputi[inputcol[0]+1];
950d7ed1a4dSandi selinger                    for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
951d7ed1a4dSandi selinger                    break;
952d7ed1a4dSandi selinger           case 2:  MatMatMultSymbolic_RowMergeMacro(2); break;
953d7ed1a4dSandi selinger           case 3:  MatMatMultSymbolic_RowMergeMacro(3); break;
954d7ed1a4dSandi selinger           case 4:  MatMatMultSymbolic_RowMergeMacro(4); break;
955d7ed1a4dSandi selinger           case 5:  MatMatMultSymbolic_RowMergeMacro(5); break;
956d7ed1a4dSandi selinger           case 6:  MatMatMultSymbolic_RowMergeMacro(6); break;
957d7ed1a4dSandi selinger           case 7:  MatMatMultSymbolic_RowMergeMacro(7); break;
958d7ed1a4dSandi selinger           case 8:  MatMatMultSymbolic_RowMergeMacro(8); break;
959d7ed1a4dSandi selinger           default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L2 work array!");
960d7ed1a4dSandi selinger           }
961d7ed1a4dSandi selinger           L2_nrows    = 1;
9627660c4dbSandi selinger           L2_nnz      = outputi_nnz;
9637660c4dbSandi selinger           worki_L2[1] = outputi_nnz;
9647660c4dbSandi selinger           /* Copy to workj_L2 */
965d7ed1a4dSandi selinger           if (rowsleft) {
9667660c4dbSandi selinger             for (k=0; k<outputi_nnz; ++k)  workj_L2[k] = outputj[k];
967d7ed1a4dSandi selinger           }
968d7ed1a4dSandi selinger         }
969d7ed1a4dSandi selinger       }
970d7ed1a4dSandi selinger     }  /* while (rowsleft) */
971d7ed1a4dSandi selinger #undef MatMatMultSymbolic_RowMergeMacro
972d7ed1a4dSandi selinger 
973d7ed1a4dSandi selinger     /* terminate current row */
974d7ed1a4dSandi selinger     ci_nnz += outputi_nnz;
975d7ed1a4dSandi selinger     ci[i+1] = ci_nnz;
976d7ed1a4dSandi selinger   }
977d7ed1a4dSandi selinger 
978d7ed1a4dSandi selinger   /* Step 3: Create the new symbolic matrix */
979d7ed1a4dSandi selinger   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr);
980d7ed1a4dSandi selinger   ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
981f83700f2Sandi selinger   ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr);
982d7ed1a4dSandi selinger 
983d7ed1a4dSandi selinger   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
984d7ed1a4dSandi selinger   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
985d7ed1a4dSandi selinger   c          = (Mat_SeqAIJ*)((*C)->data);
986d7ed1a4dSandi selinger   c->free_a  = PETSC_TRUE;
987d7ed1a4dSandi selinger   c->free_ij = PETSC_TRUE;
988d7ed1a4dSandi selinger   c->nonew   = 0;
989d7ed1a4dSandi selinger 
990df97dc6dSFande Kong   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
991d7ed1a4dSandi selinger 
992d7ed1a4dSandi selinger   /* set MatInfo */
993d7ed1a4dSandi selinger   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
994d7ed1a4dSandi selinger   if (afill < 1.0) afill = 1.0;
995d7ed1a4dSandi selinger   c->maxnz                     = ci[am];
996d7ed1a4dSandi selinger   c->nz                        = ci[am];
997d7ed1a4dSandi selinger   (*C)->info.mallocs           = ndouble;
998d7ed1a4dSandi selinger   (*C)->info.fill_ratio_given  = fill;
999d7ed1a4dSandi selinger   (*C)->info.fill_ratio_needed = afill;
1000d7ed1a4dSandi selinger 
1001d7ed1a4dSandi selinger #if defined(PETSC_USE_INFO)
1002d7ed1a4dSandi selinger   if (ci[am]) {
1003d7ed1a4dSandi selinger     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
1004d7ed1a4dSandi selinger     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
1005d7ed1a4dSandi selinger   } else {
1006d7ed1a4dSandi selinger     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
1007d7ed1a4dSandi selinger   }
1008d7ed1a4dSandi selinger #endif
1009d7ed1a4dSandi selinger 
1010d7ed1a4dSandi selinger   /* Step 4: Free temporary work areas */
1011d7ed1a4dSandi selinger   ierr = PetscFree(workj_L1);CHKERRQ(ierr);
1012d7ed1a4dSandi selinger   ierr = PetscFree(workj_L2);CHKERRQ(ierr);
1013f83700f2Sandi selinger   ierr = PetscFree(workj_L3);CHKERRQ(ierr);
1014d7ed1a4dSandi selinger   PetscFunctionReturn(0);
1015d7ed1a4dSandi selinger }
1016d7ed1a4dSandi selinger 
1017cd093f1dSJed Brown /* concatenate unique entries and then sort */
1018df97dc6dSFande Kong PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,PetscReal fill,Mat *C)
1019cd093f1dSJed Brown {
1020cd093f1dSJed Brown   PetscErrorCode     ierr;
1021cd093f1dSJed Brown   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
1022cd093f1dSJed Brown   const PetscInt     *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
1023cd093f1dSJed Brown   PetscInt           *ci,*cj;
1024cd093f1dSJed Brown   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
1025cd093f1dSJed Brown   PetscReal          afill;
1026cd093f1dSJed Brown   PetscInt           i,j,ndouble = 0;
1027cd093f1dSJed Brown   PetscSegBuffer     seg,segrow;
1028cd093f1dSJed Brown   char               *seen;
1029cd093f1dSJed Brown 
1030cd093f1dSJed Brown   PetscFunctionBegin;
1031854ce69bSBarry Smith   ierr  = PetscMalloc1(am+1,&ci);CHKERRQ(ierr);
1032cd093f1dSJed Brown   ci[0] = 0;
1033cd093f1dSJed Brown 
1034cd093f1dSJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
1035cd093f1dSJed Brown   ierr = PetscSegBufferCreate(sizeof(PetscInt),(PetscInt)(fill*(ai[am]+bi[bm])),&seg);CHKERRQ(ierr);
1036cd093f1dSJed Brown   ierr = PetscSegBufferCreate(sizeof(PetscInt),100,&segrow);CHKERRQ(ierr);
1037580bdb30SBarry Smith   ierr = PetscCalloc1(bn,&seen);CHKERRQ(ierr);
1038cd093f1dSJed Brown 
1039cd093f1dSJed Brown   /* Determine ci and cj */
1040cd093f1dSJed Brown   for (i=0; i<am; i++) {
1041cd093f1dSJed Brown     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
1042cd093f1dSJed Brown     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
1043cd093f1dSJed Brown     PetscInt packlen = 0,*PETSC_RESTRICT crow;
1044cd093f1dSJed Brown     /* Pack segrow */
1045cd093f1dSJed Brown     for (j=0; j<anzi; j++) {
1046cd093f1dSJed Brown       PetscInt brow = acol[j],bjstart = bi[brow],bjend = bi[brow+1],k;
1047cd093f1dSJed Brown       for (k=bjstart; k<bjend; k++) {
1048cd093f1dSJed Brown         PetscInt bcol = bj[k];
1049cd093f1dSJed Brown         if (!seen[bcol]) { /* new entry */
1050cd093f1dSJed Brown           PetscInt *PETSC_RESTRICT slot;
1051cd093f1dSJed Brown           ierr = PetscSegBufferGetInts(segrow,1,&slot);CHKERRQ(ierr);
1052cd093f1dSJed Brown           *slot = bcol;
1053cd093f1dSJed Brown           seen[bcol] = 1;
1054cd093f1dSJed Brown           packlen++;
1055cd093f1dSJed Brown         }
1056cd093f1dSJed Brown       }
1057cd093f1dSJed Brown     }
1058cd093f1dSJed Brown     ierr = PetscSegBufferGetInts(seg,packlen,&crow);CHKERRQ(ierr);
1059cd093f1dSJed Brown     ierr = PetscSegBufferExtractTo(segrow,crow);CHKERRQ(ierr);
1060cd093f1dSJed Brown     ierr = PetscSortInt(packlen,crow);CHKERRQ(ierr);
1061cd093f1dSJed Brown     ci[i+1] = ci[i] + packlen;
1062cd093f1dSJed Brown     for (j=0; j<packlen; j++) seen[crow[j]] = 0;
1063cd093f1dSJed Brown   }
1064cd093f1dSJed Brown   ierr = PetscSegBufferDestroy(&segrow);CHKERRQ(ierr);
1065cd093f1dSJed Brown   ierr = PetscFree(seen);CHKERRQ(ierr);
1066cd093f1dSJed Brown 
1067cd093f1dSJed Brown   /* Column indices are in the segmented buffer */
1068cd093f1dSJed Brown   ierr = PetscSegBufferExtractAlloc(seg,&cj);CHKERRQ(ierr);
1069cd093f1dSJed Brown   ierr = PetscSegBufferDestroy(&seg);CHKERRQ(ierr);
1070cd093f1dSJed Brown 
1071cd093f1dSJed Brown   /* put together the new symbolic matrix */
1072cd093f1dSJed Brown   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr);
107333d57670SJed Brown   ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
107402fe1965SBarry Smith   ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr);
1075cd093f1dSJed Brown 
1076cd093f1dSJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1077cd093f1dSJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1078cd093f1dSJed Brown   c          = (Mat_SeqAIJ*)((*C)->data);
1079cd093f1dSJed Brown   c->free_a  = PETSC_TRUE;
1080cd093f1dSJed Brown   c->free_ij = PETSC_TRUE;
1081cd093f1dSJed Brown   c->nonew   = 0;
1082cd093f1dSJed Brown 
1083df97dc6dSFande Kong   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1084cd093f1dSJed Brown 
1085cd093f1dSJed Brown   /* set MatInfo */
1086cd093f1dSJed Brown   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
1087cd093f1dSJed Brown   if (afill < 1.0) afill = 1.0;
1088cd093f1dSJed Brown   c->maxnz                     = ci[am];
1089cd093f1dSJed Brown   c->nz                        = ci[am];
1090cd093f1dSJed Brown   (*C)->info.mallocs           = ndouble;
1091cd093f1dSJed Brown   (*C)->info.fill_ratio_given  = fill;
1092cd093f1dSJed Brown   (*C)->info.fill_ratio_needed = afill;
1093cd093f1dSJed Brown 
1094cd093f1dSJed Brown #if defined(PETSC_USE_INFO)
1095cd093f1dSJed Brown   if (ci[am]) {
109657622a8eSBarry Smith     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
109757622a8eSBarry Smith     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
1098cd093f1dSJed Brown   } else {
1099cd093f1dSJed Brown     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
1100cd093f1dSJed Brown   }
1101cd093f1dSJed Brown #endif
1102cd093f1dSJed Brown   PetscFunctionReturn(0);
1103cd093f1dSJed Brown }
1104cd093f1dSJed Brown 
1105d2853540SHong Zhang /* This routine is not used. Should be removed! */
11066fc122caSHong Zhang PetscErrorCode MatMatTransposeMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
11075df89d91SHong Zhang {
1108bc011b1eSHong Zhang   PetscErrorCode ierr;
1109bc011b1eSHong Zhang 
1110bc011b1eSHong Zhang   PetscFunctionBegin;
1111bc011b1eSHong Zhang   if (scall == MAT_INITIAL_MATRIX) {
11123ff4c91cSHong Zhang     ierr = PetscLogEventBegin(MAT_MatTransposeMultSymbolic,A,B,0,0);CHKERRQ(ierr);
11136fc122caSHong Zhang     ierr = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
11143ff4c91cSHong Zhang     ierr = PetscLogEventEnd(MAT_MatTransposeMultSymbolic,A,B,0,0);CHKERRQ(ierr);
1115bc011b1eSHong Zhang   }
11163ff4c91cSHong Zhang   ierr = PetscLogEventBegin(MAT_MatTransposeMultNumeric,A,B,0,0);CHKERRQ(ierr);
11176fc122caSHong Zhang   ierr = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(A,B,*C);CHKERRQ(ierr);
11183ff4c91cSHong Zhang   ierr = PetscLogEventEnd(MAT_MatTransposeMultNumeric,A,B,0,0);CHKERRQ(ierr);
1119bc011b1eSHong Zhang   PetscFunctionReturn(0);
1120bc011b1eSHong Zhang }
1121bc011b1eSHong Zhang 
11222128a86cSHong Zhang PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(Mat A)
11232128a86cSHong Zhang {
11242128a86cSHong Zhang   PetscErrorCode      ierr;
11254c7df5ccSHong Zhang   Mat_SeqAIJ          *a=(Mat_SeqAIJ*)A->data;
112640192850SHong Zhang   Mat_MatMatTransMult *abt=a->abt;
11272128a86cSHong Zhang 
11282128a86cSHong Zhang   PetscFunctionBegin;
112940192850SHong Zhang   ierr = (abt->destroy)(A);CHKERRQ(ierr);
113040192850SHong Zhang   ierr = MatTransposeColoringDestroy(&abt->matcoloring);CHKERRQ(ierr);
113140192850SHong Zhang   ierr = MatDestroy(&abt->Bt_den);CHKERRQ(ierr);
113240192850SHong Zhang   ierr = MatDestroy(&abt->ABt_den);CHKERRQ(ierr);
113340192850SHong Zhang   ierr = PetscFree(abt);CHKERRQ(ierr);
11342128a86cSHong Zhang   PetscFunctionReturn(0);
11352128a86cSHong Zhang }
11362128a86cSHong Zhang 
11376fc122caSHong Zhang PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
1138bc011b1eSHong Zhang {
11395df89d91SHong Zhang   PetscErrorCode      ierr;
114081d82fe4SHong Zhang   Mat                 Bt;
114181d82fe4SHong Zhang   PetscInt            *bti,*btj;
114240192850SHong Zhang   Mat_MatMatTransMult *abt;
11434c7df5ccSHong Zhang   Mat_SeqAIJ          *c;
1144d2853540SHong Zhang 
114581d82fe4SHong Zhang   PetscFunctionBegin;
114681d82fe4SHong Zhang   /* create symbolic Bt */
114781d82fe4SHong Zhang   ierr = MatGetSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
11480298fd71SBarry Smith   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,B->cmap->n,B->rmap->n,bti,btj,NULL,&Bt);CHKERRQ(ierr);
114933d57670SJed Brown   ierr = MatSetBlockSizes(Bt,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr);
115004b858e0SBarry Smith   ierr = MatSetType(Bt,((PetscObject)A)->type_name);CHKERRQ(ierr);
115181d82fe4SHong Zhang 
115281d82fe4SHong Zhang   /* get symbolic C=A*Bt */
115381d82fe4SHong Zhang   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C);CHKERRQ(ierr);
115481d82fe4SHong Zhang 
11552128a86cSHong Zhang   /* create a supporting struct for reuse intermidiate dense matrices with matcoloring */
1156b00a9115SJed Brown   ierr   = PetscNew(&abt);CHKERRQ(ierr);
11574c7df5ccSHong Zhang   c      = (Mat_SeqAIJ*)(*C)->data;
115840192850SHong Zhang   c->abt = abt;
11592128a86cSHong Zhang 
116040192850SHong Zhang   abt->usecoloring = PETSC_FALSE;
116140192850SHong Zhang   abt->destroy     = (*C)->ops->destroy;
11622128a86cSHong Zhang   (*C)->ops->destroy     = MatDestroy_SeqAIJ_MatMatMultTrans;
11632128a86cSHong Zhang 
1164c5929fdfSBarry Smith   ierr = PetscOptionsGetBool(((PetscObject)A)->options,NULL,"-matmattransmult_color",&abt->usecoloring,NULL);CHKERRQ(ierr);
116540192850SHong Zhang   if (abt->usecoloring) {
1166b9af6bddSHong Zhang     /* Create MatTransposeColoring from symbolic C=A*B^T */
1167b9af6bddSHong Zhang     MatTransposeColoring matcoloring;
1168335efc43SPeter Brune     MatColoring          coloring;
11692128a86cSHong Zhang     ISColoring           iscoloring;
11702128a86cSHong Zhang     Mat                  Bt_dense,C_dense;
11714d478ae7SHong Zhang     Mat_SeqAIJ           *c=(Mat_SeqAIJ*)(*C)->data;
11724d478ae7SHong Zhang     /* inode causes memory problem, don't know why */
11734d478ae7SHong Zhang     if (c->inode.use) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_USE_INODES is not supported. Use '-mat_no_inode'");
1174e8354b3eSHong Zhang 
1175335efc43SPeter Brune     ierr = MatColoringCreate(*C,&coloring);CHKERRQ(ierr);
1176335efc43SPeter Brune     ierr = MatColoringSetDistance(coloring,2);CHKERRQ(ierr);
1177335efc43SPeter Brune     ierr = MatColoringSetType(coloring,MATCOLORINGSL);CHKERRQ(ierr);
1178335efc43SPeter Brune     ierr = MatColoringSetFromOptions(coloring);CHKERRQ(ierr);
1179335efc43SPeter Brune     ierr = MatColoringApply(coloring,&iscoloring);CHKERRQ(ierr);
1180335efc43SPeter Brune     ierr = MatColoringDestroy(&coloring);CHKERRQ(ierr);
1181b9af6bddSHong Zhang     ierr = MatTransposeColoringCreate(*C,iscoloring,&matcoloring);CHKERRQ(ierr);
11822205254eSKarl Rupp 
118340192850SHong Zhang     abt->matcoloring = matcoloring;
11842205254eSKarl Rupp 
11852128a86cSHong Zhang     ierr = ISColoringDestroy(&iscoloring);CHKERRQ(ierr);
11862128a86cSHong Zhang 
11872128a86cSHong Zhang     /* Create Bt_dense and C_dense = A*Bt_dense */
11882128a86cSHong Zhang     ierr = MatCreate(PETSC_COMM_SELF,&Bt_dense);CHKERRQ(ierr);
11892128a86cSHong Zhang     ierr = MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors);CHKERRQ(ierr);
11902128a86cSHong Zhang     ierr = MatSetType(Bt_dense,MATSEQDENSE);CHKERRQ(ierr);
11910298fd71SBarry Smith     ierr = MatSeqDenseSetPreallocation(Bt_dense,NULL);CHKERRQ(ierr);
11922205254eSKarl Rupp 
11932128a86cSHong Zhang     Bt_dense->assembled = PETSC_TRUE;
119440192850SHong Zhang     abt->Bt_den   = Bt_dense;
11952128a86cSHong Zhang 
11962128a86cSHong Zhang     ierr = MatCreate(PETSC_COMM_SELF,&C_dense);CHKERRQ(ierr);
11972128a86cSHong Zhang     ierr = MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors);CHKERRQ(ierr);
11982128a86cSHong Zhang     ierr = MatSetType(C_dense,MATSEQDENSE);CHKERRQ(ierr);
11990298fd71SBarry Smith     ierr = MatSeqDenseSetPreallocation(C_dense,NULL);CHKERRQ(ierr);
12002205254eSKarl Rupp 
12012128a86cSHong Zhang     Bt_dense->assembled = PETSC_TRUE;
120240192850SHong Zhang     abt->ABt_den  = C_dense;
1203f94ccd6cSHong Zhang 
1204f94ccd6cSHong Zhang #if defined(PETSC_USE_INFO)
12051ce71dffSSatish Balay     {
1206f94ccd6cSHong Zhang       Mat_SeqAIJ *c = (Mat_SeqAIJ*)(*C)->data;
1207c40ebe3bSHong Zhang       ierr = PetscInfo7(*C,"Use coloring of C=A*B^T; B^T: %D %D, Bt_dense: %D,%D; Cnz %D / (cm*ncolors %D) = %g\n",B->cmap->n,B->rmap->n,Bt_dense->rmap->n,Bt_dense->cmap->n,c->nz,A->rmap->n*matcoloring->ncolors,(PetscReal)(c->nz)/(A->rmap->n*matcoloring->ncolors));CHKERRQ(ierr);
12081ce71dffSSatish Balay     }
1209f94ccd6cSHong Zhang #endif
12102128a86cSHong Zhang   }
1211e99be685SHong Zhang   /* clean up */
1212e99be685SHong Zhang   ierr = MatDestroy(&Bt);CHKERRQ(ierr);
1213e99be685SHong Zhang   ierr = MatRestoreSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
12145df89d91SHong Zhang   PetscFunctionReturn(0);
12155df89d91SHong Zhang }
12165df89d91SHong Zhang 
12176fc122caSHong Zhang PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
12185df89d91SHong Zhang {
12195df89d91SHong Zhang   PetscErrorCode      ierr;
12205df89d91SHong Zhang   Mat_SeqAIJ          *a   =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1221e2cac8caSJed Brown   PetscInt            *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow;
122281d82fe4SHong Zhang   PetscInt            cm   =C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol;
12235df89d91SHong Zhang   PetscLogDouble      flops=0.0;
1224aa1aec99SHong Zhang   MatScalar           *aa  =a->a,*aval,*ba=b->a,*bval,*ca,*cval;
122540192850SHong Zhang   Mat_MatMatTransMult *abt = c->abt;
12265df89d91SHong Zhang 
12275df89d91SHong Zhang   PetscFunctionBegin;
122858ed3ceaSHong Zhang   /* clear old values in C */
122958ed3ceaSHong Zhang   if (!c->a) {
1230580bdb30SBarry Smith     ierr      = PetscCalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
123158ed3ceaSHong Zhang     c->a      = ca;
123258ed3ceaSHong Zhang     c->free_a = PETSC_TRUE;
123358ed3ceaSHong Zhang   } else {
123458ed3ceaSHong Zhang     ca =  c->a;
1235580bdb30SBarry Smith     ierr = PetscArrayzero(ca,ci[cm]+1);CHKERRQ(ierr);
123658ed3ceaSHong Zhang   }
123758ed3ceaSHong Zhang 
123840192850SHong Zhang   if (abt->usecoloring) {
123940192850SHong Zhang     MatTransposeColoring matcoloring = abt->matcoloring;
124040192850SHong Zhang     Mat                  Bt_dense,C_dense = abt->ABt_den;
1241c8db22f6SHong Zhang 
1242b9af6bddSHong Zhang     /* Get Bt_dense by Apply MatTransposeColoring to B */
124340192850SHong Zhang     Bt_dense = abt->Bt_den;
1244b9af6bddSHong Zhang     ierr = MatTransColoringApplySpToDen(matcoloring,B,Bt_dense);CHKERRQ(ierr);
1245c8db22f6SHong Zhang 
1246c8db22f6SHong Zhang     /* C_dense = A*Bt_dense */
12472128a86cSHong Zhang     ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense);CHKERRQ(ierr);
1248c8db22f6SHong Zhang 
12492128a86cSHong Zhang     /* Recover C from C_dense */
1250b9af6bddSHong Zhang     ierr = MatTransColoringApplyDenToSp(matcoloring,C_dense,C);CHKERRQ(ierr);
1251c8db22f6SHong Zhang     PetscFunctionReturn(0);
1252c8db22f6SHong Zhang   }
1253c8db22f6SHong Zhang 
12541fa1209cSHong Zhang   for (i=0; i<cm; i++) {
125581d82fe4SHong Zhang     anzi = ai[i+1] - ai[i];
125681d82fe4SHong Zhang     acol = aj + ai[i];
12576973769bSHong Zhang     aval = aa + ai[i];
12581fa1209cSHong Zhang     cnzi = ci[i+1] - ci[i];
12591fa1209cSHong Zhang     ccol = cj + ci[i];
12606973769bSHong Zhang     cval = ca + ci[i];
12611fa1209cSHong Zhang     for (j=0; j<cnzi; j++) {
126281d82fe4SHong Zhang       brow = ccol[j];
126381d82fe4SHong Zhang       bnzj = bi[brow+1] - bi[brow];
126481d82fe4SHong Zhang       bcol = bj + bi[brow];
12656973769bSHong Zhang       bval = ba + bi[brow];
12666973769bSHong Zhang 
126781d82fe4SHong Zhang       /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
126881d82fe4SHong Zhang       nexta = 0; nextb = 0;
126981d82fe4SHong Zhang       while (nexta<anzi && nextb<bnzj) {
12707b6d5e96SMark Adams         while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++;
127181d82fe4SHong Zhang         if (nexta == anzi) break;
12727b6d5e96SMark Adams         while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++;
127381d82fe4SHong Zhang         if (nextb == bnzj) break;
127481d82fe4SHong Zhang         if (acol[nexta] == bcol[nextb]) {
12756973769bSHong Zhang           cval[j] += aval[nexta]*bval[nextb];
127681d82fe4SHong Zhang           nexta++; nextb++;
127781d82fe4SHong Zhang           flops += 2;
12781fa1209cSHong Zhang         }
12791fa1209cSHong Zhang       }
128081d82fe4SHong Zhang     }
128181d82fe4SHong Zhang   }
128281d82fe4SHong Zhang   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
128381d82fe4SHong Zhang   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
128481d82fe4SHong Zhang   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
12855df89d91SHong Zhang   PetscFunctionReturn(0);
12865df89d91SHong Zhang }
12875df89d91SHong Zhang 
12886d373c3eSHong Zhang PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(Mat A)
12896d373c3eSHong Zhang {
12906d373c3eSHong Zhang   PetscErrorCode      ierr;
12916d373c3eSHong Zhang   Mat_SeqAIJ          *a = (Mat_SeqAIJ*)A->data;
12926d373c3eSHong Zhang   Mat_MatTransMatMult *atb = a->atb;
12936d373c3eSHong Zhang 
12946d373c3eSHong Zhang   PetscFunctionBegin;
12956473ade5SStefano Zampini   if (atb) {
12966d373c3eSHong Zhang     ierr = MatDestroy(&atb->At);CHKERRQ(ierr);
12976473ade5SStefano Zampini     ierr = (*atb->destroy)(A);CHKERRQ(ierr);
12986473ade5SStefano Zampini   }
12996d373c3eSHong Zhang   ierr = PetscFree(atb);CHKERRQ(ierr);
13006d373c3eSHong Zhang   PetscFunctionReturn(0);
13016d373c3eSHong Zhang }
13026d373c3eSHong Zhang 
13030adebc6cSBarry Smith PetscErrorCode MatTransposeMatMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
13040adebc6cSBarry Smith {
13055df89d91SHong Zhang   PetscErrorCode      ierr;
13066d373c3eSHong Zhang   const char          *algTypes[2] = {"matmatmult","outerproduct"};
13076d373c3eSHong Zhang   PetscInt            alg=0; /* set default algorithm */
13086d373c3eSHong Zhang   Mat                 At;
13096d373c3eSHong Zhang   Mat_MatTransMatMult *atb;
13106d373c3eSHong Zhang   Mat_SeqAIJ          *c;
13115df89d91SHong Zhang 
13125df89d91SHong Zhang   PetscFunctionBegin;
13135df89d91SHong Zhang   if (scall == MAT_INITIAL_MATRIX) {
1314715a5346SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
13156d373c3eSHong Zhang     ierr = PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,2,algTypes[0],&alg,NULL);CHKERRQ(ierr);
13166d373c3eSHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
13176d373c3eSHong Zhang 
13186d373c3eSHong Zhang     switch (alg) {
13196d373c3eSHong Zhang     case 1:
132075648e8dSHong Zhang       ierr = MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
13216d373c3eSHong Zhang       break;
13226d373c3eSHong Zhang     default:
13236d373c3eSHong Zhang       ierr = PetscNew(&atb);CHKERRQ(ierr);
13246d373c3eSHong Zhang       ierr = MatTranspose_SeqAIJ(A,MAT_INITIAL_MATRIX,&At);CHKERRQ(ierr);
13256d373c3eSHong Zhang       ierr = MatMatMult_SeqAIJ_SeqAIJ(At,B,MAT_INITIAL_MATRIX,fill,C);CHKERRQ(ierr);
13266d373c3eSHong Zhang 
1327618cf492SHong Zhang       c                  = (Mat_SeqAIJ*)(*C)->data;
13286d373c3eSHong Zhang       c->atb             = atb;
13296d373c3eSHong Zhang       atb->At            = At;
13306d373c3eSHong Zhang       atb->destroy       = (*C)->ops->destroy;
13316d373c3eSHong Zhang       (*C)->ops->destroy = MatDestroy_SeqAIJ_MatTransMatMult;
13326d373c3eSHong Zhang 
13336d373c3eSHong Zhang       break;
13345df89d91SHong Zhang     }
13356d373c3eSHong Zhang   }
13366d373c3eSHong Zhang   if (alg) {
13376d373c3eSHong Zhang     ierr = (*(*C)->ops->mattransposemultnumeric)(A,B,*C);CHKERRQ(ierr);
13386d373c3eSHong Zhang   } else if (!alg && scall == MAT_REUSE_MATRIX) {
13396d373c3eSHong Zhang     c   = (Mat_SeqAIJ*)(*C)->data;
13406d373c3eSHong Zhang     atb = c->atb;
13416d373c3eSHong Zhang     At  = atb->At;
13426d373c3eSHong Zhang     ierr = MatTranspose_SeqAIJ(A,MAT_REUSE_MATRIX,&At);CHKERRQ(ierr);
13436d373c3eSHong Zhang     ierr = MatMatMult_SeqAIJ_SeqAIJ(At,B,MAT_REUSE_MATRIX,fill,C);CHKERRQ(ierr);
13446d373c3eSHong Zhang   }
13455df89d91SHong Zhang   PetscFunctionReturn(0);
13465df89d91SHong Zhang }
13475df89d91SHong Zhang 
134875648e8dSHong Zhang PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
13495df89d91SHong Zhang {
1350bc011b1eSHong Zhang   PetscErrorCode ierr;
1351bc011b1eSHong Zhang   Mat            At;
135238baddfdSBarry Smith   PetscInt       *ati,*atj;
1353bc011b1eSHong Zhang 
1354bc011b1eSHong Zhang   PetscFunctionBegin;
1355bc011b1eSHong Zhang   /* create symbolic At */
1356bc011b1eSHong Zhang   ierr = MatGetSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
13570298fd71SBarry Smith   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,A->cmap->n,A->rmap->n,ati,atj,NULL,&At);CHKERRQ(ierr);
135833d57670SJed Brown   ierr = MatSetBlockSizes(At,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr);
135904b858e0SBarry Smith   ierr = MatSetType(At,((PetscObject)A)->type_name);CHKERRQ(ierr);
1360bc011b1eSHong Zhang 
1361bc011b1eSHong Zhang   /* get symbolic C=At*B */
1362bc011b1eSHong Zhang   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(At,B,fill,C);CHKERRQ(ierr);
1363bc011b1eSHong Zhang 
1364bc011b1eSHong Zhang   /* clean up */
13656bf464f9SBarry Smith   ierr = MatDestroy(&At);CHKERRQ(ierr);
1366bc011b1eSHong Zhang   ierr = MatRestoreSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
13676d373c3eSHong Zhang 
13686d373c3eSHong Zhang   (*C)->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ;
1369bc011b1eSHong Zhang   PetscFunctionReturn(0);
1370bc011b1eSHong Zhang }
1371bc011b1eSHong Zhang 
137275648e8dSHong Zhang PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
1373bc011b1eSHong Zhang {
1374bc011b1eSHong Zhang   PetscErrorCode ierr;
13750fbc74f4SHong Zhang   Mat_SeqAIJ     *a   =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1376d0f46423SBarry Smith   PetscInt       am   =A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb;
1377d0f46423SBarry Smith   PetscInt       cm   =C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k;
1378d13dce4bSSatish Balay   PetscLogDouble flops=0.0;
1379aa1aec99SHong Zhang   MatScalar      *aa  =a->a,*ba,*ca,*caj;
1380bc011b1eSHong Zhang 
1381bc011b1eSHong Zhang   PetscFunctionBegin;
1382aa1aec99SHong Zhang   if (!c->a) {
1383580bdb30SBarry Smith     ierr = PetscCalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
13842205254eSKarl Rupp 
1385aa1aec99SHong Zhang     c->a      = ca;
1386aa1aec99SHong Zhang     c->free_a = PETSC_TRUE;
1387aa1aec99SHong Zhang   } else {
1388aa1aec99SHong Zhang     ca   = c->a;
1389580bdb30SBarry Smith     ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr);
1390aa1aec99SHong Zhang   }
1391bc011b1eSHong Zhang 
1392bc011b1eSHong Zhang   /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
1393bc011b1eSHong Zhang   for (i=0; i<am; i++) {
1394bc011b1eSHong Zhang     bj   = b->j + bi[i];
1395bc011b1eSHong Zhang     ba   = b->a + bi[i];
1396bc011b1eSHong Zhang     bnzi = bi[i+1] - bi[i];
1397bc011b1eSHong Zhang     anzi = ai[i+1] - ai[i];
1398bc011b1eSHong Zhang     for (j=0; j<anzi; j++) {
1399bc011b1eSHong Zhang       nextb = 0;
14000fbc74f4SHong Zhang       crow  = *aj++;
1401bc011b1eSHong Zhang       cjj   = cj + ci[crow];
1402bc011b1eSHong Zhang       caj   = ca + ci[crow];
1403bc011b1eSHong Zhang       /* perform sparse axpy operation.  Note cjj includes bj. */
1404bc011b1eSHong Zhang       for (k=0; nextb<bnzi; k++) {
14050fbc74f4SHong Zhang         if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */
14060fbc74f4SHong Zhang           caj[k] += (*aa)*(*(ba+nextb));
1407bc011b1eSHong Zhang           nextb++;
1408bc011b1eSHong Zhang         }
1409bc011b1eSHong Zhang       }
1410bc011b1eSHong Zhang       flops += 2*bnzi;
14110fbc74f4SHong Zhang       aa++;
1412bc011b1eSHong Zhang     }
1413bc011b1eSHong Zhang   }
1414bc011b1eSHong Zhang 
1415bc011b1eSHong Zhang   /* Assemble the final matrix and clean up */
1416bc011b1eSHong Zhang   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1417bc011b1eSHong Zhang   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1418bc011b1eSHong Zhang   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
1419bc011b1eSHong Zhang   PetscFunctionReturn(0);
1420bc011b1eSHong Zhang }
14219123193aSHong Zhang 
1422150d2497SBarry Smith PETSC_INTERN PetscErrorCode MatMatMult_SeqAIJ_SeqDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
14239123193aSHong Zhang {
14249123193aSHong Zhang   PetscErrorCode ierr;
14259123193aSHong Zhang 
14269123193aSHong Zhang   PetscFunctionBegin;
14279123193aSHong Zhang   if (scall == MAT_INITIAL_MATRIX) {
14283ff4c91cSHong Zhang     ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
14299123193aSHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqDense(A,B,fill,C);CHKERRQ(ierr);
14303ff4c91cSHong Zhang     ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
14319123193aSHong Zhang   }
14323ff4c91cSHong Zhang   ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
14339123193aSHong Zhang   ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,B,*C);CHKERRQ(ierr);
14344614e006SHong Zhang   ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
14359123193aSHong Zhang   PetscFunctionReturn(0);
14369123193aSHong Zhang }
1437edd81eecSMatthew Knepley 
14389123193aSHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat *C)
14399123193aSHong Zhang {
14409123193aSHong Zhang   PetscErrorCode ierr;
14419123193aSHong Zhang 
14429123193aSHong Zhang   PetscFunctionBegin;
14435a586d82SBarry Smith   ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr);
14442205254eSKarl Rupp 
1445d73949e8SHong Zhang   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense;
14469123193aSHong Zhang   PetscFunctionReturn(0);
14479123193aSHong Zhang }
14489123193aSHong Zhang 
14499123193aSHong Zhang PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
14509123193aSHong Zhang {
1451f32d5d43SBarry Smith   Mat_SeqAIJ        *a=(Mat_SeqAIJ*)A->data;
1452612438f1SStefano Zampini   Mat_SeqDense      *bd = (Mat_SeqDense*)B->data;
14539123193aSHong Zhang   PetscErrorCode    ierr;
1454daf57211SHong Zhang   PetscScalar       *c,*b,r1,r2,r3,r4,*c1,*c2,*c3,*c4,aatmp;
1455daf57211SHong Zhang   const PetscScalar *aa,*b1,*b2,*b3,*b4;
1456daf57211SHong Zhang   const PetscInt    *aj;
1457612438f1SStefano Zampini   PetscInt          cm=C->rmap->n,cn=B->cmap->n,bm=bd->lda,am=A->rmap->n;
1458daf57211SHong Zhang   PetscInt          am4=4*am,bm4=4*bm,col,i,j,n,ajtmp;
14599123193aSHong Zhang 
14609123193aSHong Zhang   PetscFunctionBegin;
1461f32d5d43SBarry Smith   if (!cm || !cn) PetscFunctionReturn(0);
1462612438f1SStefano Zampini   if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n);
1463e32f2f54SBarry Smith   if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
1464e32f2f54SBarry Smith   if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
1465612438f1SStefano Zampini   b = bd->v;
14668c778c55SBarry Smith   ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr);
1467f32d5d43SBarry Smith   b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
1468730858b9SHong Zhang   c1 = c; c2 = c1 + am; c3 = c2 + am; c4 = c3 + am;
1469f32d5d43SBarry Smith   for (col=0; col<cn-4; col += 4) {  /* over columns of C */
1470f32d5d43SBarry Smith     for (i=0; i<am; i++) {        /* over rows of C in those columns */
1471f32d5d43SBarry Smith       r1 = r2 = r3 = r4 = 0.0;
1472f32d5d43SBarry Smith       n  = a->i[i+1] - a->i[i];
1473f32d5d43SBarry Smith       aj = a->j + a->i[i];
1474f32d5d43SBarry Smith       aa = a->a + a->i[i];
1475f32d5d43SBarry Smith       for (j=0; j<n; j++) {
1476730858b9SHong Zhang         aatmp = aa[j]; ajtmp = aj[j];
1477730858b9SHong Zhang         r1 += aatmp*b1[ajtmp];
1478730858b9SHong Zhang         r2 += aatmp*b2[ajtmp];
1479730858b9SHong Zhang         r3 += aatmp*b3[ajtmp];
1480730858b9SHong Zhang         r4 += aatmp*b4[ajtmp];
14819123193aSHong Zhang       }
1482730858b9SHong Zhang       c1[i] = r1;
1483730858b9SHong Zhang       c2[i] = r2;
1484730858b9SHong Zhang       c3[i] = r3;
1485730858b9SHong Zhang       c4[i] = r4;
1486f32d5d43SBarry Smith     }
1487730858b9SHong Zhang     b1 += bm4; b2 += bm4; b3 += bm4; b4 += bm4;
1488730858b9SHong Zhang     c1 += am4; c2 += am4; c3 += am4; c4 += am4;
1489f32d5d43SBarry Smith   }
1490f32d5d43SBarry Smith   for (; col<cn; col++) {   /* over extra columns of C */
1491f32d5d43SBarry Smith     for (i=0; i<am; i++) {  /* over rows of C in those columns */
1492f32d5d43SBarry Smith       r1 = 0.0;
1493f32d5d43SBarry Smith       n  = a->i[i+1] - a->i[i];
1494f32d5d43SBarry Smith       aj = a->j + a->i[i];
1495f32d5d43SBarry Smith       aa = a->a + a->i[i];
1496f32d5d43SBarry Smith       for (j=0; j<n; j++) {
1497730858b9SHong Zhang         r1 += aa[j]*b1[aj[j]];
1498f32d5d43SBarry Smith       }
1499730858b9SHong Zhang       c1[i] = r1;
1500f32d5d43SBarry Smith     }
1501f32d5d43SBarry Smith     b1 += bm;
1502730858b9SHong Zhang     c1 += am;
1503f32d5d43SBarry Smith   }
1504dc0b31edSSatish Balay   ierr = PetscLogFlops(cn*(2.0*a->nz));CHKERRQ(ierr);
15058c778c55SBarry Smith   ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr);
1506f32d5d43SBarry Smith   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1507f32d5d43SBarry Smith   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1508f32d5d43SBarry Smith   PetscFunctionReturn(0);
1509f32d5d43SBarry Smith }
1510f32d5d43SBarry Smith 
1511f32d5d43SBarry Smith /*
15124324174eSBarry Smith    Note very similar to MatMult_SeqAIJ(), should generate both codes from same base
1513f32d5d43SBarry Smith */
1514f32d5d43SBarry Smith PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1515f32d5d43SBarry Smith {
1516f32d5d43SBarry Smith   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
151790f5ea3eSStefano Zampini   Mat_SeqDense   *bd = (Mat_SeqDense*)B->data;
1518f32d5d43SBarry Smith   PetscErrorCode ierr;
1519dd6ea824SBarry Smith   PetscScalar    *b,*c,r1,r2,r3,r4,*b1,*b2,*b3,*b4;
1520dd6ea824SBarry Smith   MatScalar      *aa;
152190f5ea3eSStefano Zampini   PetscInt       cm  = C->rmap->n, cn=B->cmap->n, bm=bd->lda, col, i,j,n,*aj, am = A->rmap->n,*ii,arm;
15224324174eSBarry Smith   PetscInt       am2 = 2*am, am3 = 3*am,  bm4 = 4*bm,colam,*ridx;
1523f32d5d43SBarry Smith 
1524f32d5d43SBarry Smith   PetscFunctionBegin;
1525f32d5d43SBarry Smith   if (!cm || !cn) PetscFunctionReturn(0);
152690f5ea3eSStefano Zampini   b = bd->v;
15278c778c55SBarry Smith   ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr);
1528f32d5d43SBarry Smith   b1   = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
15294324174eSBarry Smith 
15304324174eSBarry Smith   if (a->compressedrow.use) { /* use compressed row format */
15314324174eSBarry Smith     for (col=0; col<cn-4; col += 4) {  /* over columns of C */
15324324174eSBarry Smith       colam = col*am;
15334324174eSBarry Smith       arm   = a->compressedrow.nrows;
15344324174eSBarry Smith       ii    = a->compressedrow.i;
15354324174eSBarry Smith       ridx  = a->compressedrow.rindex;
15364324174eSBarry Smith       for (i=0; i<arm; i++) {        /* over rows of C in those columns */
15374324174eSBarry Smith         r1 = r2 = r3 = r4 = 0.0;
15384324174eSBarry Smith         n  = ii[i+1] - ii[i];
15394324174eSBarry Smith         aj = a->j + ii[i];
15404324174eSBarry Smith         aa = a->a + ii[i];
15414324174eSBarry Smith         for (j=0; j<n; j++) {
15424324174eSBarry Smith           r1 += (*aa)*b1[*aj];
15434324174eSBarry Smith           r2 += (*aa)*b2[*aj];
15444324174eSBarry Smith           r3 += (*aa)*b3[*aj];
15454324174eSBarry Smith           r4 += (*aa++)*b4[*aj++];
15464324174eSBarry Smith         }
15474324174eSBarry Smith         c[colam       + ridx[i]] += r1;
15484324174eSBarry Smith         c[colam + am  + ridx[i]] += r2;
15494324174eSBarry Smith         c[colam + am2 + ridx[i]] += r3;
15504324174eSBarry Smith         c[colam + am3 + ridx[i]] += r4;
15514324174eSBarry Smith       }
15524324174eSBarry Smith       b1 += bm4;
15534324174eSBarry Smith       b2 += bm4;
15544324174eSBarry Smith       b3 += bm4;
15554324174eSBarry Smith       b4 += bm4;
15564324174eSBarry Smith     }
15574324174eSBarry Smith     for (; col<cn; col++) {     /* over extra columns of C */
15584324174eSBarry Smith       colam = col*am;
15594324174eSBarry Smith       arm   = a->compressedrow.nrows;
15604324174eSBarry Smith       ii    = a->compressedrow.i;
15614324174eSBarry Smith       ridx  = a->compressedrow.rindex;
15624324174eSBarry Smith       for (i=0; i<arm; i++) {  /* over rows of C in those columns */
15634324174eSBarry Smith         r1 = 0.0;
15644324174eSBarry Smith         n  = ii[i+1] - ii[i];
15654324174eSBarry Smith         aj = a->j + ii[i];
15664324174eSBarry Smith         aa = a->a + ii[i];
15674324174eSBarry Smith 
15684324174eSBarry Smith         for (j=0; j<n; j++) {
15694324174eSBarry Smith           r1 += (*aa++)*b1[*aj++];
15704324174eSBarry Smith         }
1571a2ea699eSBarry Smith         c[colam + ridx[i]] += r1;
15724324174eSBarry Smith       }
15734324174eSBarry Smith       b1 += bm;
15744324174eSBarry Smith     }
15754324174eSBarry Smith   } else {
1576f32d5d43SBarry Smith     for (col=0; col<cn-4; col += 4) {  /* over columns of C */
1577f32d5d43SBarry Smith       colam = col*am;
1578f32d5d43SBarry Smith       for (i=0; i<am; i++) {        /* over rows of C in those columns */
1579f32d5d43SBarry Smith         r1 = r2 = r3 = r4 = 0.0;
1580f32d5d43SBarry Smith         n  = a->i[i+1] - a->i[i];
1581f32d5d43SBarry Smith         aj = a->j + a->i[i];
1582f32d5d43SBarry Smith         aa = a->a + a->i[i];
1583f32d5d43SBarry Smith         for (j=0; j<n; j++) {
1584f32d5d43SBarry Smith           r1 += (*aa)*b1[*aj];
1585f32d5d43SBarry Smith           r2 += (*aa)*b2[*aj];
1586f32d5d43SBarry Smith           r3 += (*aa)*b3[*aj];
1587f32d5d43SBarry Smith           r4 += (*aa++)*b4[*aj++];
1588f32d5d43SBarry Smith         }
1589f32d5d43SBarry Smith         c[colam + i]       += r1;
1590f32d5d43SBarry Smith         c[colam + am + i]  += r2;
1591f32d5d43SBarry Smith         c[colam + am2 + i] += r3;
1592f32d5d43SBarry Smith         c[colam + am3 + i] += r4;
1593f32d5d43SBarry Smith       }
1594f32d5d43SBarry Smith       b1 += bm4;
1595f32d5d43SBarry Smith       b2 += bm4;
1596f32d5d43SBarry Smith       b3 += bm4;
1597f32d5d43SBarry Smith       b4 += bm4;
1598f32d5d43SBarry Smith     }
1599f32d5d43SBarry Smith     for (; col<cn; col++) {     /* over extra columns of C */
1600a2ea699eSBarry Smith       colam = col*am;
1601f32d5d43SBarry Smith       for (i=0; i<am; i++) {  /* over rows of C in those columns */
1602f32d5d43SBarry Smith         r1 = 0.0;
1603f32d5d43SBarry Smith         n  = a->i[i+1] - a->i[i];
1604f32d5d43SBarry Smith         aj = a->j + a->i[i];
1605f32d5d43SBarry Smith         aa = a->a + a->i[i];
1606f32d5d43SBarry Smith 
1607f32d5d43SBarry Smith         for (j=0; j<n; j++) {
1608f32d5d43SBarry Smith           r1 += (*aa++)*b1[*aj++];
1609f32d5d43SBarry Smith         }
1610a2ea699eSBarry Smith         c[colam + i] += r1;
1611f32d5d43SBarry Smith       }
1612f32d5d43SBarry Smith       b1 += bm;
1613f32d5d43SBarry Smith     }
16144324174eSBarry Smith   }
1615dc0b31edSSatish Balay   ierr = PetscLogFlops(cn*2.0*a->nz);CHKERRQ(ierr);
16168c778c55SBarry Smith   ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr);
16179123193aSHong Zhang   PetscFunctionReturn(0);
16189123193aSHong Zhang }
1619b1683b59SHong Zhang 
1620b9af6bddSHong Zhang PetscErrorCode  MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense)
1621c8db22f6SHong Zhang {
1622c8db22f6SHong Zhang   PetscErrorCode ierr;
16232f5f1f90SHong Zhang   Mat_SeqAIJ     *b       = (Mat_SeqAIJ*)B->data;
16242f5f1f90SHong Zhang   Mat_SeqDense   *btdense = (Mat_SeqDense*)Btdense->data;
16252f5f1f90SHong Zhang   PetscInt       *bi      = b->i,*bj=b->j;
16262f5f1f90SHong Zhang   PetscInt       m        = Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns;
16272f5f1f90SHong Zhang   MatScalar      *btval,*btval_den,*ba=b->a;
16282f5f1f90SHong Zhang   PetscInt       *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors;
1629c8db22f6SHong Zhang 
1630c8db22f6SHong Zhang   PetscFunctionBegin;
16312f5f1f90SHong Zhang   btval_den=btdense->v;
1632580bdb30SBarry Smith   ierr     = PetscArrayzero(btval_den,m*n);CHKERRQ(ierr);
16332f5f1f90SHong Zhang   for (k=0; k<ncolors; k++) {
16342f5f1f90SHong Zhang     ncolumns = coloring->ncolumns[k];
16352f5f1f90SHong Zhang     for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */
1636d2853540SHong Zhang       col   = *(columns + colorforcol[k] + l);
16372f5f1f90SHong Zhang       btcol = bj + bi[col];
16382f5f1f90SHong Zhang       btval = ba + bi[col];
16392f5f1f90SHong Zhang       anz   = bi[col+1] - bi[col];
1640d2853540SHong Zhang       for (j=0; j<anz; j++) {
16412f5f1f90SHong Zhang         brow            = btcol[j];
16422f5f1f90SHong Zhang         btval_den[brow] = btval[j];
1643c8db22f6SHong Zhang       }
1644c8db22f6SHong Zhang     }
16452f5f1f90SHong Zhang     btval_den += m;
1646c8db22f6SHong Zhang   }
1647c8db22f6SHong Zhang   PetscFunctionReturn(0);
1648c8db22f6SHong Zhang }
1649c8db22f6SHong Zhang 
1650b9af6bddSHong Zhang PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp)
16518972f759SHong Zhang {
16528972f759SHong Zhang   PetscErrorCode    ierr;
1653b2d2b04fSHong Zhang   Mat_SeqAIJ        *csp = (Mat_SeqAIJ*)Csp->data;
16541683a169SBarry Smith   const PetscScalar *ca_den,*ca_den_ptr;
16551683a169SBarry Smith   PetscScalar       *ca=csp->a;
1656f99a636bSHong Zhang   PetscInt          k,l,m=Cden->rmap->n,ncolors=matcoloring->ncolors;
1657e88777f2SHong Zhang   PetscInt          brows=matcoloring->brows,*den2sp=matcoloring->den2sp;
1658077f23c2SHong Zhang   PetscInt          nrows,*row,*idx;
1659077f23c2SHong Zhang   PetscInt          *rows=matcoloring->rows,*colorforrow=matcoloring->colorforrow;
16608972f759SHong Zhang 
16618972f759SHong Zhang   PetscFunctionBegin;
16621683a169SBarry Smith   ierr   = MatDenseGetArrayRead(Cden,&ca_den);CHKERRQ(ierr);
1663f99a636bSHong Zhang 
1664077f23c2SHong Zhang   if (brows > 0) {
1665077f23c2SHong Zhang     PetscInt *lstart,row_end,row_start;
1666980ae229SHong Zhang     lstart = matcoloring->lstart;
1667580bdb30SBarry Smith     ierr = PetscArrayzero(lstart,ncolors);CHKERRQ(ierr);
1668eeb4fd02SHong Zhang 
1669077f23c2SHong Zhang     row_end = brows;
1670eeb4fd02SHong Zhang     if (row_end > m) row_end = m;
1671077f23c2SHong Zhang     for (row_start=0; row_start<m; row_start+=brows) { /* loop over row blocks of Csp */
1672077f23c2SHong Zhang       ca_den_ptr = ca_den;
1673980ae229SHong Zhang       for (k=0; k<ncolors; k++) { /* loop over colors (columns of Cden) */
1674eeb4fd02SHong Zhang         nrows = matcoloring->nrows[k];
1675eeb4fd02SHong Zhang         row   = rows  + colorforrow[k];
1676eeb4fd02SHong Zhang         idx   = den2sp + colorforrow[k];
1677eeb4fd02SHong Zhang         for (l=lstart[k]; l<nrows; l++) {
1678eeb4fd02SHong Zhang           if (row[l] >= row_end) {
1679eeb4fd02SHong Zhang             lstart[k] = l;
1680eeb4fd02SHong Zhang             break;
1681eeb4fd02SHong Zhang           } else {
1682077f23c2SHong Zhang             ca[idx[l]] = ca_den_ptr[row[l]];
1683eeb4fd02SHong Zhang           }
1684eeb4fd02SHong Zhang         }
1685077f23c2SHong Zhang         ca_den_ptr += m;
1686eeb4fd02SHong Zhang       }
1687077f23c2SHong Zhang       row_end += brows;
1688eeb4fd02SHong Zhang       if (row_end > m) row_end = m;
1689eeb4fd02SHong Zhang     }
1690077f23c2SHong Zhang   } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */
1691077f23c2SHong Zhang     ca_den_ptr = ca_den;
1692077f23c2SHong Zhang     for (k=0; k<ncolors; k++) {
1693077f23c2SHong Zhang       nrows = matcoloring->nrows[k];
1694077f23c2SHong Zhang       row   = rows  + colorforrow[k];
1695077f23c2SHong Zhang       idx   = den2sp + colorforrow[k];
1696077f23c2SHong Zhang       for (l=0; l<nrows; l++) {
1697077f23c2SHong Zhang         ca[idx[l]] = ca_den_ptr[row[l]];
1698077f23c2SHong Zhang       }
1699077f23c2SHong Zhang       ca_den_ptr += m;
1700077f23c2SHong Zhang     }
1701f99a636bSHong Zhang   }
1702f99a636bSHong Zhang 
17031683a169SBarry Smith   ierr = MatDenseRestoreArrayRead(Cden,&ca_den);CHKERRQ(ierr);
1704f99a636bSHong Zhang #if defined(PETSC_USE_INFO)
1705077f23c2SHong Zhang   if (matcoloring->brows > 0) {
1706f99a636bSHong Zhang     ierr = PetscInfo1(Csp,"Loop over %D row blocks for den2sp\n",brows);CHKERRQ(ierr);
1707e88777f2SHong Zhang   } else {
1708077f23c2SHong Zhang     ierr = PetscInfo(Csp,"Loop over colors/columns of Cden, inefficient for large sparse matrix product \n");CHKERRQ(ierr);
1709e88777f2SHong Zhang   }
1710f99a636bSHong Zhang #endif
17118972f759SHong Zhang   PetscFunctionReturn(0);
17128972f759SHong Zhang }
17138972f759SHong Zhang 
1714b9af6bddSHong Zhang PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c)
1715b1683b59SHong Zhang {
1716b1683b59SHong Zhang   PetscErrorCode ierr;
1717e88777f2SHong Zhang   PetscInt       i,n,nrows,Nbs,j,k,m,ncols,col,cm;
17181a83f524SJed Brown   const PetscInt *is,*ci,*cj,*row_idx;
1719b28d80bdSHong Zhang   PetscInt       nis = iscoloring->n,*rowhit,bs = 1;
1720b1683b59SHong Zhang   IS             *isa;
1721b28d80bdSHong Zhang   Mat_SeqAIJ     *csp = (Mat_SeqAIJ*)mat->data;
1722e88777f2SHong Zhang   PetscInt       *colorforrow,*rows,*rows_i,*idxhit,*spidx,*den2sp,*den2sp_i;
1723e88777f2SHong Zhang   PetscInt       *colorforcol,*columns,*columns_i,brows;
1724e88777f2SHong Zhang   PetscBool      flg;
1725b1683b59SHong Zhang 
1726b1683b59SHong Zhang   PetscFunctionBegin;
1727*071fcb05SBarry Smith   ierr = ISColoringGetIS(iscoloring,PETSC_USE_POINTER,PETSC_IGNORE,&isa);CHKERRQ(ierr);
1728e99be685SHong Zhang 
17297ecccc15SHong Zhang   /* bs >1 is not being tested yet! */
1730e88777f2SHong Zhang   Nbs       = mat->cmap->N/bs;
1731b1683b59SHong Zhang   c->M      = mat->rmap->N/bs;  /* set total rows, columns and local rows */
1732e88777f2SHong Zhang   c->N      = Nbs;
1733e88777f2SHong Zhang   c->m      = c->M;
1734b1683b59SHong Zhang   c->rstart = 0;
1735e88777f2SHong Zhang   c->brows  = 100;
1736b1683b59SHong Zhang 
1737b1683b59SHong Zhang   c->ncolors = nis;
1738dcca6d9dSJed Brown   ierr = PetscMalloc3(nis,&c->ncolumns,nis,&c->nrows,nis+1,&colorforrow);CHKERRQ(ierr);
1739854ce69bSBarry Smith   ierr = PetscMalloc1(csp->nz+1,&rows);CHKERRQ(ierr);
1740854ce69bSBarry Smith   ierr = PetscMalloc1(csp->nz+1,&den2sp);CHKERRQ(ierr);
1741e88777f2SHong Zhang 
1742e88777f2SHong Zhang   brows = c->brows;
1743c5929fdfSBarry Smith   ierr = PetscOptionsGetInt(NULL,NULL,"-matden2sp_brows",&brows,&flg);CHKERRQ(ierr);
1744e88777f2SHong Zhang   if (flg) c->brows = brows;
1745eeb4fd02SHong Zhang   if (brows > 0) {
1746854ce69bSBarry Smith     ierr = PetscMalloc1(nis+1,&c->lstart);CHKERRQ(ierr);
1747e88777f2SHong Zhang   }
17482205254eSKarl Rupp 
1749d2853540SHong Zhang   colorforrow[0] = 0;
1750d2853540SHong Zhang   rows_i         = rows;
1751f99a636bSHong Zhang   den2sp_i       = den2sp;
1752b1683b59SHong Zhang 
1753854ce69bSBarry Smith   ierr = PetscMalloc1(nis+1,&colorforcol);CHKERRQ(ierr);
1754854ce69bSBarry Smith   ierr = PetscMalloc1(Nbs+1,&columns);CHKERRQ(ierr);
17552205254eSKarl Rupp 
1756d2853540SHong Zhang   colorforcol[0] = 0;
1757d2853540SHong Zhang   columns_i      = columns;
1758d2853540SHong Zhang 
1759077f23c2SHong Zhang   /* get column-wise storage of mat */
1760077f23c2SHong Zhang   ierr = MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr);
1761b1683b59SHong Zhang 
1762b28d80bdSHong Zhang   cm   = c->m;
1763854ce69bSBarry Smith   ierr = PetscMalloc1(cm+1,&rowhit);CHKERRQ(ierr);
1764854ce69bSBarry Smith   ierr = PetscMalloc1(cm+1,&idxhit);CHKERRQ(ierr);
1765f99a636bSHong Zhang   for (i=0; i<nis; i++) { /* loop over color */
1766b1683b59SHong Zhang     ierr = ISGetLocalSize(isa[i],&n);CHKERRQ(ierr);
1767b1683b59SHong Zhang     ierr = ISGetIndices(isa[i],&is);CHKERRQ(ierr);
17682205254eSKarl Rupp 
1769b1683b59SHong Zhang     c->ncolumns[i] = n;
1770b1683b59SHong Zhang     if (n) {
1771580bdb30SBarry Smith       ierr = PetscArraycpy(columns_i,is,n);CHKERRQ(ierr);
1772b1683b59SHong Zhang     }
1773d2853540SHong Zhang     colorforcol[i+1] = colorforcol[i] + n;
1774d2853540SHong Zhang     columns_i       += n;
1775b1683b59SHong Zhang 
1776b1683b59SHong Zhang     /* fast, crude version requires O(N*N) work */
1777580bdb30SBarry Smith     ierr = PetscArrayzero(rowhit,cm);CHKERRQ(ierr);
1778e99be685SHong Zhang 
1779b7caf3d6SHong Zhang     for (j=0; j<n; j++) { /* loop over columns*/
1780b1683b59SHong Zhang       col     = is[j];
1781d2853540SHong Zhang       row_idx = cj + ci[col];
1782b1683b59SHong Zhang       m       = ci[col+1] - ci[col];
1783b7caf3d6SHong Zhang       for (k=0; k<m; k++) { /* loop over columns marking them in rowhit */
1784e99be685SHong Zhang         idxhit[*row_idx]   = spidx[ci[col] + k];
1785d2853540SHong Zhang         rowhit[*row_idx++] = col + 1;
1786b1683b59SHong Zhang       }
1787b1683b59SHong Zhang     }
1788b1683b59SHong Zhang     /* count the number of hits */
1789b1683b59SHong Zhang     nrows = 0;
1790e8354b3eSHong Zhang     for (j=0; j<cm; j++) {
1791b1683b59SHong Zhang       if (rowhit[j]) nrows++;
1792b1683b59SHong Zhang     }
1793b1683b59SHong Zhang     c->nrows[i]      = nrows;
1794d2853540SHong Zhang     colorforrow[i+1] = colorforrow[i] + nrows;
1795d2853540SHong Zhang 
1796b1683b59SHong Zhang     nrows = 0;
1797b7caf3d6SHong Zhang     for (j=0; j<cm; j++) { /* loop over rows */
1798b1683b59SHong Zhang       if (rowhit[j]) {
1799d2853540SHong Zhang         rows_i[nrows]   = j;
180012b89a43SHong Zhang         den2sp_i[nrows] = idxhit[j];
1801b1683b59SHong Zhang         nrows++;
1802b1683b59SHong Zhang       }
1803b1683b59SHong Zhang     }
1804e88777f2SHong Zhang     den2sp_i += nrows;
1805077f23c2SHong Zhang 
1806b1683b59SHong Zhang     ierr    = ISRestoreIndices(isa[i],&is);CHKERRQ(ierr);
1807f99a636bSHong Zhang     rows_i += nrows;
1808b1683b59SHong Zhang   }
18090298fd71SBarry Smith   ierr = MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr);
1810b28d80bdSHong Zhang   ierr = PetscFree(rowhit);CHKERRQ(ierr);
1811*071fcb05SBarry Smith   ierr = ISColoringRestoreIS(iscoloring,PETSC_USE_POINTER,&isa);CHKERRQ(ierr);
1812d2853540SHong Zhang   if (csp->nz != colorforrow[nis]) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"csp->nz %d != colorforrow[nis] %d",csp->nz,colorforrow[nis]);
1813b28d80bdSHong Zhang 
1814d2853540SHong Zhang   c->colorforrow = colorforrow;
1815d2853540SHong Zhang   c->rows        = rows;
1816f99a636bSHong Zhang   c->den2sp      = den2sp;
1817d2853540SHong Zhang   c->colorforcol = colorforcol;
1818d2853540SHong Zhang   c->columns     = columns;
18192205254eSKarl Rupp 
1820f94ccd6cSHong Zhang   ierr = PetscFree(idxhit);CHKERRQ(ierr);
1821b1683b59SHong Zhang   PetscFunctionReturn(0);
1822b1683b59SHong Zhang }
1823d013fc79Sandi selinger 
1824df97dc6dSFande Kong /* The combine method has done the symbolic and numeric in the first phase, and so we just return here */
1825df97dc6dSFande Kong PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Combined(Mat A,Mat B,Mat C)
1826df97dc6dSFande Kong {
1827df97dc6dSFande Kong   PetscFunctionBegin;
1828df97dc6dSFande Kong   /* Empty function */
1829df97dc6dSFande Kong   PetscFunctionReturn(0);
1830df97dc6dSFande Kong }
1831df97dc6dSFande Kong 
183273b67375Sandi selinger /* This algorithm combines the symbolic and numeric phase of matrix-matrix multiplication. */
1833d013fc79Sandi selinger PetscErrorCode MatMatMult_SeqAIJ_SeqAIJ_Combined(Mat A,Mat B,PetscReal fill,Mat *C)
1834d013fc79Sandi selinger {
1835d013fc79Sandi selinger   PetscErrorCode     ierr;
1836d013fc79Sandi selinger   PetscLogDouble     flops=0.0;
1837d013fc79Sandi selinger   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
18382869b61bSandi selinger   const PetscInt     *ai=a->i,*bi=b->i;
1839d013fc79Sandi selinger   PetscInt           *ci,*cj,*cj_i;
1840d013fc79Sandi selinger   PetscScalar        *ca,*ca_i;
18412869b61bSandi selinger   PetscInt           b_maxmemrow,c_maxmem,a_col;
1842d013fc79Sandi selinger   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
1843d013fc79Sandi selinger   PetscInt           i,k,ndouble=0;
1844d013fc79Sandi selinger   PetscReal          afill;
1845d013fc79Sandi selinger   PetscScalar        *c_row_val_dense;
1846d013fc79Sandi selinger   PetscBool          *c_row_idx_flags;
1847d013fc79Sandi selinger   PetscInt           *aj_i=a->j;
1848d013fc79Sandi selinger   PetscScalar        *aa_i=a->a;
1849d013fc79Sandi selinger 
1850d013fc79Sandi selinger   PetscFunctionBegin;
18512869b61bSandi selinger 
18522869b61bSandi selinger   /* Step 1: Determine upper bounds on memory for C and allocate memory */
18532869b61bSandi selinger   /* This should be enough for almost all matrices. If still more memory is needed, it is reallocated later. */
18542869b61bSandi selinger   c_maxmem    = 8*(ai[am]+bi[bm]);
18552869b61bSandi selinger   b_maxmemrow = PetscMin(bi[bm],bn);
1856d013fc79Sandi selinger   ierr  = PetscMalloc1(am+1,&ci);CHKERRQ(ierr);
1857580bdb30SBarry Smith   ierr  = PetscCalloc1(bn,&c_row_val_dense);CHKERRQ(ierr);
1858580bdb30SBarry Smith   ierr  = PetscCalloc1(bn,&c_row_idx_flags);CHKERRQ(ierr);
1859d013fc79Sandi selinger   ierr  = PetscMalloc1(c_maxmem,&cj);CHKERRQ(ierr);
1860d013fc79Sandi selinger   ierr  = PetscMalloc1(c_maxmem,&ca);CHKERRQ(ierr);
1861d013fc79Sandi selinger   ca_i  = ca;
1862d013fc79Sandi selinger   cj_i  = cj;
1863d013fc79Sandi selinger   ci[0] = 0;
1864d013fc79Sandi selinger   for (i=0; i<am; i++) {
1865d013fc79Sandi selinger     /* Step 2: Initialize the dense row vector for C  */
1866d013fc79Sandi selinger     const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
1867d013fc79Sandi selinger     PetscInt       cnzi = 0;
1868d013fc79Sandi selinger     PetscInt       *bj_i;
1869d013fc79Sandi selinger     PetscScalar    *ba_i;
18702869b61bSandi selinger     /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem  -> allocate more memory
18712869b61bSandi selinger        Usually, there is enough memory in the first place, so this is not executed. */
18722869b61bSandi selinger     while (ci[i] + b_maxmemrow > c_maxmem) {
18732869b61bSandi selinger       c_maxmem *= 2;
18742869b61bSandi selinger       ndouble++;
1875928bb9adSStefano Zampini       ierr = PetscRealloc(sizeof(PetscInt)*c_maxmem,&cj);CHKERRQ(ierr);
1876928bb9adSStefano Zampini       ierr = PetscRealloc(sizeof(PetscScalar)*c_maxmem,&ca);CHKERRQ(ierr);
18772869b61bSandi selinger     }
1878d013fc79Sandi selinger 
1879d013fc79Sandi selinger     /* Step 3: Do the numerical calculations */
1880d013fc79Sandi selinger     for (a_col=0; a_col<anzi; a_col++) {          /* iterate over all non zero values in a row of A */
1881d013fc79Sandi selinger       PetscInt       a_col_index = aj_i[a_col];
1882d013fc79Sandi selinger       const PetscInt bnzi        = bi[a_col_index+1] - bi[a_col_index];
1883d013fc79Sandi selinger       flops += 2*bnzi;
1884d013fc79Sandi selinger       bj_i   = b->j + bi[a_col_index];   /* points to the current row in bj */
1885d013fc79Sandi selinger       ba_i   = b->a + bi[a_col_index];   /* points to the current row in ba */
1886d013fc79Sandi selinger       for (k=0; k<bnzi; ++k) { /* iterate over all non zeros of this row in B */
1887d013fc79Sandi selinger         if (c_row_idx_flags[bj_i[k]] == PETSC_FALSE) {
18882869b61bSandi selinger           cj_i[cnzi++]             = bj_i[k];
1889d013fc79Sandi selinger           c_row_idx_flags[bj_i[k]] = PETSC_TRUE;
1890d013fc79Sandi selinger         }
1891d013fc79Sandi selinger         c_row_val_dense[bj_i[k]] += aa_i[a_col] * ba_i[k];
1892d013fc79Sandi selinger       }
1893d013fc79Sandi selinger     }
1894d013fc79Sandi selinger 
1895d013fc79Sandi selinger     /* Sort array */
18963353a62bSKarl Rupp     ierr = PetscSortInt(cnzi,cj_i);CHKERRQ(ierr);
1897d013fc79Sandi selinger     /* Step 4 */
1898d013fc79Sandi selinger     for (k=0; k<cnzi; k++) {
1899d013fc79Sandi selinger       ca_i[k]                  = c_row_val_dense[cj_i[k]];
1900d013fc79Sandi selinger       c_row_val_dense[cj_i[k]] = 0.;
1901d013fc79Sandi selinger       c_row_idx_flags[cj_i[k]] = PETSC_FALSE;
1902d013fc79Sandi selinger     }
1903d013fc79Sandi selinger     /* terminate current row */
1904d013fc79Sandi selinger     aa_i   += anzi;
1905d013fc79Sandi selinger     aj_i   += anzi;
1906d013fc79Sandi selinger     ca_i   += cnzi;
1907d013fc79Sandi selinger     cj_i   += cnzi;
1908d013fc79Sandi selinger     ci[i+1] = ci[i] + cnzi;
1909d013fc79Sandi selinger     flops  += cnzi;
1910d013fc79Sandi selinger   }
1911d013fc79Sandi selinger 
1912d013fc79Sandi selinger   /* Step 5 */
1913d013fc79Sandi selinger   /* Create the new matrix */
1914d013fc79Sandi selinger   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr);
1915d013fc79Sandi selinger   ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
191602fe1965SBarry Smith   ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr);
1917d013fc79Sandi selinger 
1918d013fc79Sandi selinger   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1919d013fc79Sandi selinger   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1920d013fc79Sandi selinger   c          = (Mat_SeqAIJ*)((*C)->data);
1921d013fc79Sandi selinger   c->a       = ca;
1922d013fc79Sandi selinger   c->free_a  = PETSC_TRUE;
1923d013fc79Sandi selinger   c->free_ij = PETSC_TRUE;
1924d013fc79Sandi selinger   c->nonew   = 0;
1925d013fc79Sandi selinger 
1926d013fc79Sandi selinger   /* set MatInfo */
1927d013fc79Sandi selinger   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
1928d013fc79Sandi selinger   if (afill < 1.0) afill = 1.0;
1929d013fc79Sandi selinger   c->maxnz                     = ci[am];
1930d013fc79Sandi selinger   c->nz                        = ci[am];
1931d013fc79Sandi selinger   (*C)->info.mallocs           = ndouble;
1932d013fc79Sandi selinger   (*C)->info.fill_ratio_given  = fill;
1933d013fc79Sandi selinger   (*C)->info.fill_ratio_needed = afill;
1934df97dc6dSFande Kong   (*C)->ops->matmultnumeric    = MatMatMultNumeric_SeqAIJ_SeqAIJ_Combined;
1935d013fc79Sandi selinger 
193673b67375Sandi selinger   ierr = PetscFree(c_row_val_dense);CHKERRQ(ierr);
193773b67375Sandi selinger   ierr = PetscFree(c_row_idx_flags);CHKERRQ(ierr);
1938d013fc79Sandi selinger 
1939d013fc79Sandi selinger   ierr = MatAssemblyBegin(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1940d013fc79Sandi selinger   ierr = MatAssemblyEnd(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1941d013fc79Sandi selinger   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
1942d013fc79Sandi selinger   PetscFunctionReturn(0);
1943d013fc79Sandi selinger }
1944