xref: /petsc/src/mat/impls/aij/seq/matmatmult.c (revision 715a534632cc6029c3649e8dc2fa74cc65e81325)
1be1d678aSKris Buschelman 
2d50806bdSBarry Smith  /*
32499ec78SHong Zhang    Defines matrix-matrix product routines for pairs of SeqAIJ matrices
4d50806bdSBarry Smith            C = A * B
5d50806bdSBarry Smith  */
6d50806bdSBarry Smith 
7c6db04a5SJed Brown  #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
8c6db04a5SJed Brown  #include <../src/mat/utils/freespace.h>
9c6db04a5SJed Brown  #include <petscbt.h>
10af0996ceSBarry Smith  #include <petsc/private/isimpl.h>
1107475bc1SBarry Smith  #include <../src/mat/impls/dense/seq/dense.h>
127bab7c10SHong Zhang 
1358cf0668SJed Brown  static PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat,Mat,PetscReal,Mat*);
14cd093f1dSJed Brown 
155e5acdf2Sstefano_zampini  #if defined(PETSC_HAVE_HYPRE)
165e5acdf2Sstefano_zampini  PETSC_INTERN PetscErrorCode MatMatMultSymbolic_AIJ_AIJ_wHYPRE(Mat,Mat,PetscReal,Mat*);
175e5acdf2Sstefano_zampini  #endif
185e5acdf2Sstefano_zampini 
19150d2497SBarry Smith  PETSC_INTERN PetscErrorCode MatMatMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
2038baddfdSBarry Smith  {
21dfbe8321SBarry Smith    PetscErrorCode ierr;
225e5acdf2Sstefano_zampini  #if !defined(PETSC_HAVE_HYPRE)
23d7ed1a4dSandi selinger    const char     *algTypes[8] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","combined","rowmerge"};
24d013fc79Sandi selinger    PetscInt       nalg = 8;
25d7ed1a4dSandi selinger  #else
26d7ed1a4dSandi selinger    const char     *algTypes[9] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","combined","rowmerge","hypre"};
27d7ed1a4dSandi selinger    PetscInt       nalg = 9;
285e5acdf2Sstefano_zampini  #endif
296540a9cdSHong Zhang    PetscInt       alg = 0; /* set default algorithm */
30d013fc79Sandi selinger    PetscBool      combined = PETSC_FALSE;  /* Indicates whether the symbolic stage already computed the numerical values. */
315c66b693SKris Buschelman 
325c66b693SKris Buschelman    PetscFunctionBegin;
3326be0446SHong Zhang    if (scall == MAT_INITIAL_MATRIX) {
34*715a5346SHong Zhang      ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
355e5acdf2Sstefano_zampini      ierr = PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,nalg,algTypes[0],&alg,NULL);CHKERRQ(ierr);
36d8bbc50fSBarry Smith      ierr = PetscOptionsEnd();CHKERRQ(ierr);
373ff4c91cSHong Zhang      ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
386540a9cdSHong Zhang      switch (alg) {
396540a9cdSHong Zhang      case 1:
40aacf854cSBarry Smith        ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A,B,fill,C);CHKERRQ(ierr);
416540a9cdSHong Zhang        break;
426540a9cdSHong Zhang      case 2:
436540a9cdSHong Zhang        ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A,B,fill,C);CHKERRQ(ierr);
446540a9cdSHong Zhang        break;
456540a9cdSHong Zhang      case 3:
460ced3a2bSJed Brown        ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A,B,fill,C);CHKERRQ(ierr);
476540a9cdSHong Zhang        break;
486540a9cdSHong Zhang      case 4:
498a07c6f1SJed Brown        ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A,B,fill,C);CHKERRQ(ierr);
506540a9cdSHong Zhang        break;
516540a9cdSHong Zhang      case 5:
5258cf0668SJed Brown        ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A,B,fill,C);CHKERRQ(ierr);
536540a9cdSHong Zhang        break;
545e5acdf2Sstefano_zampini      case 6:
55d013fc79Sandi selinger        ierr = MatMatMult_SeqAIJ_SeqAIJ_Combined(A,B,fill,C);CHKERRQ(ierr);
56d013fc79Sandi selinger        combined = PETSC_TRUE;
57d013fc79Sandi selinger        break;
58d013fc79Sandi selinger     case 7:
59d7ed1a4dSandi selinger        ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A,B,fill,C);CHKERRQ(ierr);
60d7ed1a4dSandi selinger        break;
61d7ed1a4dSandi selinger  #if defined(PETSC_HAVE_HYPRE)
62d7ed1a4dSandi selinger      case 8:
635e5acdf2Sstefano_zampini        ierr = MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A,B,fill,C);CHKERRQ(ierr);
645e5acdf2Sstefano_zampini        break;
655e5acdf2Sstefano_zampini  #endif
666540a9cdSHong Zhang      default:
6726be0446SHong Zhang        ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
686540a9cdSHong Zhang       break;
6925023028SHong Zhang      }
703ff4c91cSHong Zhang      ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
7126be0446SHong Zhang    }
725c913ed7SHong Zhang 
733ff4c91cSHong Zhang    ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
74d013fc79Sandi selinger    if (!combined) {
7501e47db4SHong Zhang      ierr = (*(*C)->ops->matmultnumeric)(A,B,*C);CHKERRQ(ierr);
76d013fc79Sandi selinger    }
773ff4c91cSHong Zhang    ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
785c66b693SKris Buschelman    PetscFunctionReturn(0);
795c66b693SKris Buschelman  }
801c24bd37SHong Zhang 
8158cf0668SJed Brown  static PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A,Mat B,PetscReal fill,Mat *C)
82b561aa9dSHong Zhang  {
83b561aa9dSHong Zhang    PetscErrorCode     ierr;
84b561aa9dSHong Zhang    Mat_SeqAIJ         *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
85971236abSHong Zhang    PetscInt           *ai=a->i,*bi=b->i,*ci,*cj;
86c1ba5575SJed Brown    PetscInt           am =A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
87b561aa9dSHong Zhang    PetscReal          afill;
88eca6b86aSHong Zhang    PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
89eca6b86aSHong Zhang    PetscTable         ta;
90fb908031SHong Zhang    PetscBT            lnkbt;
910298fd71SBarry Smith    PetscFreeSpaceList free_space=NULL,current_space=NULL;
92b561aa9dSHong Zhang 
93b561aa9dSHong Zhang    PetscFunctionBegin;
94bd958071SHong Zhang    /* Get ci and cj */
95bd958071SHong Zhang    /*---------------*/
96fb908031SHong Zhang    /* Allocate ci array, arrays for fill computation and */
97fb908031SHong Zhang    /* free space for accumulating nonzero column info */
98854ce69bSBarry Smith    ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
99fb908031SHong Zhang    ci[0] = 0;
100fb908031SHong Zhang 
101fb908031SHong Zhang    /* create and initialize a linked list */
102c373ccc6SHong Zhang    ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr);
103c373ccc6SHong Zhang    MatRowMergeMax_SeqAIJ(b,bm,ta);
104eca6b86aSHong Zhang    ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
105eca6b86aSHong Zhang    ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
106eca6b86aSHong Zhang 
107eca6b86aSHong Zhang    ierr = PetscLLCondensedCreate(Crmax,bn,&lnk,&lnkbt);CHKERRQ(ierr);
108fb908031SHong Zhang 
109fb908031SHong Zhang    /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
110f91af8c7SBarry Smith    ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
1112205254eSKarl Rupp 
112fb908031SHong Zhang    current_space = free_space;
113fb908031SHong Zhang 
114fb908031SHong Zhang    /* Determine ci and cj */
115fb908031SHong Zhang    for (i=0; i<am; i++) {
116fb908031SHong Zhang      anzi = ai[i+1] - ai[i];
117fb908031SHong Zhang      aj   = a->j + ai[i];
118fb908031SHong Zhang      for (j=0; j<anzi; j++) {
119fb908031SHong Zhang        brow = aj[j];
120fb908031SHong Zhang        bnzj = bi[brow+1] - bi[brow];
121fb908031SHong Zhang        bj   = b->j + bi[brow];
122fb908031SHong Zhang        /* add non-zero cols of B into the sorted linked list lnk */
123fb908031SHong Zhang        ierr = PetscLLCondensedAddSorted(bnzj,bj,lnk,lnkbt);CHKERRQ(ierr);
124fb908031SHong Zhang      }
125fb908031SHong Zhang      cnzi = lnk[0];
126fb908031SHong Zhang 
127fb908031SHong Zhang      /* If free space is not available, make more free space */
128fb908031SHong Zhang      /* Double the amount of total space in the list */
129fb908031SHong Zhang      if (current_space->local_remaining<cnzi) {
130f91af8c7SBarry Smith        ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);CHKERRQ(ierr);
131fb908031SHong Zhang        ndouble++;
132fb908031SHong Zhang      }
133fb908031SHong Zhang 
134fb908031SHong Zhang      /* Copy data into free space, then initialize lnk */
135fb908031SHong Zhang      ierr = PetscLLCondensedClean(bn,cnzi,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
1362205254eSKarl Rupp 
137fb908031SHong Zhang      current_space->array           += cnzi;
138fb908031SHong Zhang      current_space->local_used      += cnzi;
139fb908031SHong Zhang      current_space->local_remaining -= cnzi;
1402205254eSKarl Rupp 
141fb908031SHong Zhang      ci[i+1] = ci[i] + cnzi;
142fb908031SHong Zhang    }
143fb908031SHong Zhang 
144fb908031SHong Zhang    /* Column indices are in the list of free space */
145fb908031SHong Zhang    /* Allocate space for cj, initialize cj, and */
146fb908031SHong Zhang    /* destroy list of free space and other temporary array(s) */
147854ce69bSBarry Smith    ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr);
148fb908031SHong Zhang    ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
149fb908031SHong Zhang    ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr);
150b561aa9dSHong Zhang 
15126be0446SHong Zhang    /* put together the new symbolic matrix */
152ce94432eSBarry Smith    ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr);
15333d57670SJed Brown    ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
15402fe1965SBarry Smith    ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr);
15558c24d83SHong Zhang 
15658c24d83SHong Zhang   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
15758c24d83SHong Zhang   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
15858c24d83SHong Zhang   c                         = (Mat_SeqAIJ*)((*C)->data);
159aa1aec99SHong Zhang   c->free_a                 = PETSC_FALSE;
160e6b907acSBarry Smith   c->free_ij                = PETSC_TRUE;
16158c24d83SHong Zhang   c->nonew                  = 0;
162002e8affSHong Zhang   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ; /* fast, needs non-scalable O(bn) array 'abdense' */
1630b7e3e3dSHong Zhang 
1647212da7cSHong Zhang   /* set MatInfo */
1657212da7cSHong Zhang   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
166f2b054eeSHong Zhang   if (afill < 1.0) afill = 1.0;
1677212da7cSHong Zhang   c->maxnz                     = ci[am];
1687212da7cSHong Zhang   c->nz                        = ci[am];
169fb908031SHong Zhang   (*C)->info.mallocs           = ndouble;
1707212da7cSHong Zhang   (*C)->info.fill_ratio_given  = fill;
1717212da7cSHong Zhang   (*C)->info.fill_ratio_needed = afill;
1727212da7cSHong Zhang 
1737212da7cSHong Zhang #if defined(PETSC_USE_INFO)
1747212da7cSHong Zhang   if (ci[am]) {
17557622a8eSBarry Smith     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
17657622a8eSBarry Smith     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
177f2b054eeSHong Zhang   } else {
178f2b054eeSHong Zhang     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
179be0fcf8dSHong Zhang   }
180f2b054eeSHong Zhang #endif
18158c24d83SHong Zhang   PetscFunctionReturn(0);
18258c24d83SHong Zhang }
183d50806bdSBarry Smith 
184dfbe8321SBarry Smith PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
185d50806bdSBarry Smith {
186dfbe8321SBarry Smith   PetscErrorCode ierr;
187d13dce4bSSatish Balay   PetscLogDouble flops=0.0;
188d50806bdSBarry Smith   Mat_SeqAIJ     *a   = (Mat_SeqAIJ*)A->data;
189d50806bdSBarry Smith   Mat_SeqAIJ     *b   = (Mat_SeqAIJ*)B->data;
190d50806bdSBarry Smith   Mat_SeqAIJ     *c   = (Mat_SeqAIJ*)C->data;
19138baddfdSBarry Smith   PetscInt       *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
192c58ca2e3SHong Zhang   PetscInt       am   =A->rmap->n,cm=C->rmap->n;
193519eb980SHong Zhang   PetscInt       i,j,k,anzi,bnzi,cnzi,brow;
194aa1aec99SHong Zhang   PetscScalar    *aa=a->a,*ba=b->a,*baj,*ca,valtmp;
195aa1aec99SHong Zhang   PetscScalar    *ab_dense;
196d50806bdSBarry Smith 
197d50806bdSBarry Smith   PetscFunctionBegin;
198aa1aec99SHong Zhang   if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
199854ce69bSBarry Smith     ierr      = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
200aa1aec99SHong Zhang     c->a      = ca;
201aa1aec99SHong Zhang     c->free_a = PETSC_TRUE;
202aa1aec99SHong Zhang   } else {
203aa1aec99SHong Zhang     ca        = c->a;
204d90d86d0SMatthew G. Knepley   }
205d90d86d0SMatthew G. Knepley   if (!c->matmult_abdense) {
2061795a4d1SJed Brown     ierr = PetscCalloc1(B->cmap->N,&ab_dense);CHKERRQ(ierr);
207d90d86d0SMatthew G. Knepley     c->matmult_abdense = ab_dense;
208d90d86d0SMatthew G. Knepley   } else {
209aa1aec99SHong Zhang     ab_dense = c->matmult_abdense;
210aa1aec99SHong Zhang   }
211aa1aec99SHong Zhang 
212c124e916SHong Zhang   /* clean old values in C */
213c124e916SHong Zhang   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
214d50806bdSBarry Smith   /* Traverse A row-wise. */
215d50806bdSBarry Smith   /* Build the ith row in C by summing over nonzero columns in A, */
216d50806bdSBarry Smith   /* the rows of B corresponding to nonzeros of A. */
217d50806bdSBarry Smith   for (i=0; i<am; i++) {
218d50806bdSBarry Smith     anzi = ai[i+1] - ai[i];
219d50806bdSBarry Smith     for (j=0; j<anzi; j++) {
220519eb980SHong Zhang       brow = aj[j];
221d50806bdSBarry Smith       bnzi = bi[brow+1] - bi[brow];
222d50806bdSBarry Smith       bjj  = bj + bi[brow];
223d50806bdSBarry Smith       baj  = ba + bi[brow];
224519eb980SHong Zhang       /* perform dense axpy */
22536ec6d2dSHong Zhang       valtmp = aa[j];
226519eb980SHong Zhang       for (k=0; k<bnzi; k++) {
22736ec6d2dSHong Zhang         ab_dense[bjj[k]] += valtmp*baj[k];
228519eb980SHong Zhang       }
229519eb980SHong Zhang       flops += 2*bnzi;
230519eb980SHong Zhang     }
231c58ca2e3SHong Zhang     aj += anzi; aa += anzi;
232c58ca2e3SHong Zhang 
233c58ca2e3SHong Zhang     cnzi = ci[i+1] - ci[i];
234519eb980SHong Zhang     for (k=0; k<cnzi; k++) {
235519eb980SHong Zhang       ca[k]          += ab_dense[cj[k]];
236519eb980SHong Zhang       ab_dense[cj[k]] = 0.0; /* zero ab_dense */
237519eb980SHong Zhang     }
238519eb980SHong Zhang     flops += cnzi;
239519eb980SHong Zhang     cj    += cnzi; ca += cnzi;
240519eb980SHong Zhang   }
241c58ca2e3SHong Zhang   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
242c58ca2e3SHong Zhang   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
243c58ca2e3SHong Zhang   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
244c58ca2e3SHong Zhang   PetscFunctionReturn(0);
245c58ca2e3SHong Zhang }
246c58ca2e3SHong Zhang 
24725023028SHong Zhang PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C)
248c58ca2e3SHong Zhang {
249c58ca2e3SHong Zhang   PetscErrorCode ierr;
250c58ca2e3SHong Zhang   PetscLogDouble flops=0.0;
251c58ca2e3SHong Zhang   Mat_SeqAIJ     *a   = (Mat_SeqAIJ*)A->data;
252c58ca2e3SHong Zhang   Mat_SeqAIJ     *b   = (Mat_SeqAIJ*)B->data;
253c58ca2e3SHong Zhang   Mat_SeqAIJ     *c   = (Mat_SeqAIJ*)C->data;
254c58ca2e3SHong Zhang   PetscInt       *ai  = a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
255c58ca2e3SHong Zhang   PetscInt       am   = A->rmap->N,cm=C->rmap->N;
256c58ca2e3SHong Zhang   PetscInt       i,j,k,anzi,bnzi,cnzi,brow;
25736ec6d2dSHong Zhang   PetscScalar    *aa=a->a,*ba=b->a,*baj,*ca=c->a,valtmp;
258c58ca2e3SHong Zhang   PetscInt       nextb;
259c58ca2e3SHong Zhang 
260c58ca2e3SHong Zhang   PetscFunctionBegin;
261cf742fccSHong Zhang   if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
262cf742fccSHong Zhang     ierr      = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
263cf742fccSHong Zhang     c->a      = ca;
264cf742fccSHong Zhang     c->free_a = PETSC_TRUE;
265cf742fccSHong Zhang   }
266cf742fccSHong Zhang 
267c58ca2e3SHong Zhang   /* clean old values in C */
268c58ca2e3SHong Zhang   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
269c58ca2e3SHong Zhang   /* Traverse A row-wise. */
270c58ca2e3SHong Zhang   /* Build the ith row in C by summing over nonzero columns in A, */
271c58ca2e3SHong Zhang   /* the rows of B corresponding to nonzeros of A. */
272519eb980SHong Zhang   for (i=0; i<am; i++) {
273519eb980SHong Zhang     anzi = ai[i+1] - ai[i];
274519eb980SHong Zhang     cnzi = ci[i+1] - ci[i];
275519eb980SHong Zhang     for (j=0; j<anzi; j++) {
276519eb980SHong Zhang       brow = aj[j];
277519eb980SHong Zhang       bnzi = bi[brow+1] - bi[brow];
278519eb980SHong Zhang       bjj  = bj + bi[brow];
279519eb980SHong Zhang       baj  = ba + bi[brow];
280519eb980SHong Zhang       /* perform sparse axpy */
28136ec6d2dSHong Zhang       valtmp = aa[j];
282c124e916SHong Zhang       nextb  = 0;
283c124e916SHong Zhang       for (k=0; nextb<bnzi; k++) {
284c124e916SHong Zhang         if (cj[k] == bjj[nextb]) { /* ccol == bcol */
28536ec6d2dSHong Zhang           ca[k] += valtmp*baj[nextb++];
286c124e916SHong Zhang         }
287d50806bdSBarry Smith       }
288d50806bdSBarry Smith       flops += 2*bnzi;
289d50806bdSBarry Smith     }
290519eb980SHong Zhang     aj += anzi; aa += anzi;
291519eb980SHong Zhang     cj += cnzi; ca += cnzi;
292d50806bdSBarry Smith   }
293c58ca2e3SHong Zhang 
294716bacf3SKris Buschelman   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
295716bacf3SKris Buschelman   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
296d50806bdSBarry Smith   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
297d50806bdSBarry Smith   PetscFunctionReturn(0);
298d50806bdSBarry Smith }
299bc011b1eSHong Zhang 
3003c50cad2SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A,Mat B,PetscReal fill,Mat *C)
30125296bd5SBarry Smith {
30225296bd5SBarry Smith   PetscErrorCode     ierr;
30325296bd5SBarry Smith   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
30425296bd5SBarry Smith   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
3053c50cad2SHong Zhang   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
30625296bd5SBarry Smith   MatScalar          *ca;
30725296bd5SBarry Smith   PetscReal          afill;
308eca6b86aSHong Zhang   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
309eca6b86aSHong Zhang   PetscTable         ta;
3100298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
31125296bd5SBarry Smith 
31225296bd5SBarry Smith   PetscFunctionBegin;
3133c50cad2SHong Zhang   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
3143c50cad2SHong Zhang   /*-----------------------------------------------------------------------------------------*/
3153c50cad2SHong Zhang   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
316854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
3173c50cad2SHong Zhang   ci[0] = 0;
3183c50cad2SHong Zhang 
3193c50cad2SHong Zhang   /* create and initialize a linked list */
320c373ccc6SHong Zhang   ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr);
321c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b,bm,ta);
322eca6b86aSHong Zhang   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
323eca6b86aSHong Zhang   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
324eca6b86aSHong Zhang 
325eca6b86aSHong Zhang   ierr = PetscLLCondensedCreate_fast(Crmax,&lnk);CHKERRQ(ierr);
3263c50cad2SHong Zhang 
3273c50cad2SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
328f91af8c7SBarry Smith   ierr          = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
3293c50cad2SHong Zhang   current_space = free_space;
3303c50cad2SHong Zhang 
3313c50cad2SHong Zhang   /* Determine ci and cj */
3323c50cad2SHong Zhang   for (i=0; i<am; i++) {
3333c50cad2SHong Zhang     anzi = ai[i+1] - ai[i];
3343c50cad2SHong Zhang     aj   = a->j + ai[i];
3353c50cad2SHong Zhang     for (j=0; j<anzi; j++) {
3363c50cad2SHong Zhang       brow = aj[j];
3373c50cad2SHong Zhang       bnzj = bi[brow+1] - bi[brow];
3383c50cad2SHong Zhang       bj   = b->j + bi[brow];
3393c50cad2SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
3403c50cad2SHong Zhang       ierr = PetscLLCondensedAddSorted_fast(bnzj,bj,lnk);CHKERRQ(ierr);
3413c50cad2SHong Zhang     }
3423c50cad2SHong Zhang     cnzi = lnk[1];
3433c50cad2SHong Zhang 
3443c50cad2SHong Zhang     /* If free space is not available, make more free space */
3453c50cad2SHong Zhang     /* Double the amount of total space in the list */
3463c50cad2SHong Zhang     if (current_space->local_remaining<cnzi) {
347f91af8c7SBarry Smith       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);CHKERRQ(ierr);
3483c50cad2SHong Zhang       ndouble++;
3493c50cad2SHong Zhang     }
3503c50cad2SHong Zhang 
3513c50cad2SHong Zhang     /* Copy data into free space, then initialize lnk */
3523c50cad2SHong Zhang     ierr = PetscLLCondensedClean_fast(cnzi,current_space->array,lnk);CHKERRQ(ierr);
3532205254eSKarl Rupp 
3543c50cad2SHong Zhang     current_space->array           += cnzi;
3553c50cad2SHong Zhang     current_space->local_used      += cnzi;
3563c50cad2SHong Zhang     current_space->local_remaining -= cnzi;
3572205254eSKarl Rupp 
3583c50cad2SHong Zhang     ci[i+1] = ci[i] + cnzi;
3593c50cad2SHong Zhang   }
3603c50cad2SHong Zhang 
3613c50cad2SHong Zhang   /* Column indices are in the list of free space */
3623c50cad2SHong Zhang   /* Allocate space for cj, initialize cj, and */
3633c50cad2SHong Zhang   /* destroy list of free space and other temporary array(s) */
364854ce69bSBarry Smith   ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr);
3653c50cad2SHong Zhang   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
3663c50cad2SHong Zhang   ierr = PetscLLCondensedDestroy_fast(lnk);CHKERRQ(ierr);
36725296bd5SBarry Smith 
36825296bd5SBarry Smith   /* Allocate space for ca */
369854ce69bSBarry Smith   ierr = PetscMalloc1(ci[am]+1,&ca);CHKERRQ(ierr);
37025296bd5SBarry Smith   ierr = PetscMemzero(ca,(ci[am]+1)*sizeof(MatScalar));CHKERRQ(ierr);
37125296bd5SBarry Smith 
37225296bd5SBarry Smith   /* put together the new symbolic matrix */
373ce94432eSBarry Smith   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,C);CHKERRQ(ierr);
37433d57670SJed Brown   ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
37502fe1965SBarry Smith   ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr);
37625296bd5SBarry Smith 
37725296bd5SBarry Smith   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
37825296bd5SBarry Smith   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
37925296bd5SBarry Smith   c          = (Mat_SeqAIJ*)((*C)->data);
38025296bd5SBarry Smith   c->free_a  = PETSC_TRUE;
38125296bd5SBarry Smith   c->free_ij = PETSC_TRUE;
38225296bd5SBarry Smith   c->nonew   = 0;
3832205254eSKarl Rupp 
38425296bd5SBarry Smith   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; /* slower, less memory */
38525296bd5SBarry Smith 
38625296bd5SBarry Smith   /* set MatInfo */
38725296bd5SBarry Smith   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
38825296bd5SBarry Smith   if (afill < 1.0) afill = 1.0;
38925296bd5SBarry Smith   c->maxnz                     = ci[am];
39025296bd5SBarry Smith   c->nz                        = ci[am];
3913c50cad2SHong Zhang   (*C)->info.mallocs           = ndouble;
39225296bd5SBarry Smith   (*C)->info.fill_ratio_given  = fill;
39325296bd5SBarry Smith   (*C)->info.fill_ratio_needed = afill;
39425296bd5SBarry Smith 
39525296bd5SBarry Smith #if defined(PETSC_USE_INFO)
39625296bd5SBarry Smith   if (ci[am]) {
39757622a8eSBarry Smith     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
39857622a8eSBarry Smith     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
39925296bd5SBarry Smith   } else {
40025296bd5SBarry Smith     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
40125296bd5SBarry Smith   }
40225296bd5SBarry Smith #endif
40325296bd5SBarry Smith   PetscFunctionReturn(0);
40425296bd5SBarry Smith }
40525296bd5SBarry Smith 
40625296bd5SBarry Smith 
40725023028SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat *C)
408e9e4536cSHong Zhang {
409e9e4536cSHong Zhang   PetscErrorCode     ierr;
410e9e4536cSHong Zhang   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
411bf9555e6SHong Zhang   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
41225c41797SHong Zhang   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
413e9e4536cSHong Zhang   MatScalar          *ca;
414e9e4536cSHong Zhang   PetscReal          afill;
415eca6b86aSHong Zhang   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
416eca6b86aSHong Zhang   PetscTable         ta;
4170298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
418e9e4536cSHong Zhang 
419e9e4536cSHong Zhang   PetscFunctionBegin;
420bd958071SHong Zhang   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
421bd958071SHong Zhang   /*---------------------------------------------------------------------------------------------*/
422bd958071SHong Zhang   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
423854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
424bd958071SHong Zhang   ci[0] = 0;
425bd958071SHong Zhang 
426bd958071SHong Zhang   /* create and initialize a linked list */
427c373ccc6SHong Zhang   ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr);
428c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b,bm,ta);
429eca6b86aSHong Zhang   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
430eca6b86aSHong Zhang   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
431eca6b86aSHong Zhang   ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);
432bd958071SHong Zhang 
433bd958071SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
434f91af8c7SBarry Smith   ierr          = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
435bd958071SHong Zhang   current_space = free_space;
436bd958071SHong Zhang 
437bd958071SHong Zhang   /* Determine ci and cj */
438bd958071SHong Zhang   for (i=0; i<am; i++) {
439bd958071SHong Zhang     anzi = ai[i+1] - ai[i];
440bd958071SHong Zhang     aj   = a->j + ai[i];
441bd958071SHong Zhang     for (j=0; j<anzi; j++) {
442bd958071SHong Zhang       brow = aj[j];
443bd958071SHong Zhang       bnzj = bi[brow+1] - bi[brow];
444bd958071SHong Zhang       bj   = b->j + bi[brow];
445bd958071SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
446bd958071SHong Zhang       ierr = PetscLLCondensedAddSorted_Scalable(bnzj,bj,lnk);CHKERRQ(ierr);
447bd958071SHong Zhang     }
448bd958071SHong Zhang     cnzi = lnk[0];
449bd958071SHong Zhang 
450bd958071SHong Zhang     /* If free space is not available, make more free space */
451bd958071SHong Zhang     /* Double the amount of total space in the list */
452bd958071SHong Zhang     if (current_space->local_remaining<cnzi) {
453f91af8c7SBarry Smith       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);CHKERRQ(ierr);
454bd958071SHong Zhang       ndouble++;
455bd958071SHong Zhang     }
456bd958071SHong Zhang 
457bd958071SHong Zhang     /* Copy data into free space, then initialize lnk */
458bd958071SHong Zhang     ierr = PetscLLCondensedClean_Scalable(cnzi,current_space->array,lnk);CHKERRQ(ierr);
4592205254eSKarl Rupp 
460bd958071SHong Zhang     current_space->array           += cnzi;
461bd958071SHong Zhang     current_space->local_used      += cnzi;
462bd958071SHong Zhang     current_space->local_remaining -= cnzi;
4632205254eSKarl Rupp 
464bd958071SHong Zhang     ci[i+1] = ci[i] + cnzi;
465bd958071SHong Zhang   }
466bd958071SHong Zhang 
467bd958071SHong Zhang   /* Column indices are in the list of free space */
468bd958071SHong Zhang   /* Allocate space for cj, initialize cj, and */
469bd958071SHong Zhang   /* destroy list of free space and other temporary array(s) */
470854ce69bSBarry Smith   ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr);
471bd958071SHong Zhang   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
472bd958071SHong Zhang   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
473e9e4536cSHong Zhang 
474e9e4536cSHong Zhang   /* Allocate space for ca */
475bd958071SHong Zhang   /*-----------------------*/
476854ce69bSBarry Smith   ierr = PetscMalloc1(ci[am]+1,&ca);CHKERRQ(ierr);
477e9e4536cSHong Zhang   ierr = PetscMemzero(ca,(ci[am]+1)*sizeof(MatScalar));CHKERRQ(ierr);
478e9e4536cSHong Zhang 
479e9e4536cSHong Zhang   /* put together the new symbolic matrix */
480ce94432eSBarry Smith   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,C);CHKERRQ(ierr);
48133d57670SJed Brown   ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
48202fe1965SBarry Smith   ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr);
483e9e4536cSHong Zhang 
484e9e4536cSHong Zhang   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
485e9e4536cSHong Zhang   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
486e9e4536cSHong Zhang   c          = (Mat_SeqAIJ*)((*C)->data);
487e9e4536cSHong Zhang   c->free_a  = PETSC_TRUE;
488e9e4536cSHong Zhang   c->free_ij = PETSC_TRUE;
489e9e4536cSHong Zhang   c->nonew   = 0;
4902205254eSKarl Rupp 
49125023028SHong Zhang   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; /* slower, less memory */
492e9e4536cSHong Zhang 
493e9e4536cSHong Zhang   /* set MatInfo */
494e9e4536cSHong Zhang   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
495e9e4536cSHong Zhang   if (afill < 1.0) afill = 1.0;
496e9e4536cSHong Zhang   c->maxnz                     = ci[am];
497e9e4536cSHong Zhang   c->nz                        = ci[am];
498bd958071SHong Zhang   (*C)->info.mallocs           = ndouble;
499e9e4536cSHong Zhang   (*C)->info.fill_ratio_given  = fill;
500e9e4536cSHong Zhang   (*C)->info.fill_ratio_needed = afill;
501e9e4536cSHong Zhang 
502e9e4536cSHong Zhang #if defined(PETSC_USE_INFO)
503e9e4536cSHong Zhang   if (ci[am]) {
50457622a8eSBarry Smith     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
50557622a8eSBarry Smith     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
506e9e4536cSHong Zhang   } else {
507e9e4536cSHong Zhang     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
508e9e4536cSHong Zhang   }
509e9e4536cSHong Zhang #endif
510e9e4536cSHong Zhang   PetscFunctionReturn(0);
511e9e4536cSHong Zhang }
512e9e4536cSHong Zhang 
5130ced3a2bSJed Brown PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A,Mat B,PetscReal fill,Mat *C)
5140ced3a2bSJed Brown {
5150ced3a2bSJed Brown   PetscErrorCode     ierr;
5160ced3a2bSJed Brown   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
5170ced3a2bSJed Brown   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j;
5180ced3a2bSJed Brown   PetscInt           *ci,*cj,*bb;
5190ced3a2bSJed Brown   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
5200ced3a2bSJed Brown   PetscReal          afill;
5210ced3a2bSJed Brown   PetscInt           i,j,col,ndouble = 0;
5220298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
5230ced3a2bSJed Brown   PetscHeap          h;
5240ced3a2bSJed Brown 
5250ced3a2bSJed Brown   PetscFunctionBegin;
526cd093f1dSJed Brown   /* Get ci and cj - by merging sorted rows using a heap */
5270ced3a2bSJed Brown   /*---------------------------------------------------------------------------------------------*/
5280ced3a2bSJed Brown   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
529854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
5300ced3a2bSJed Brown   ci[0] = 0;
5310ced3a2bSJed Brown 
5320ced3a2bSJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
533f91af8c7SBarry Smith   ierr          = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
5340ced3a2bSJed Brown   current_space = free_space;
5350ced3a2bSJed Brown 
5360ced3a2bSJed Brown   ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr);
537785e854fSJed Brown   ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr);
5380ced3a2bSJed Brown 
5390ced3a2bSJed Brown   /* Determine ci and cj */
5400ced3a2bSJed Brown   for (i=0; i<am; i++) {
5410ced3a2bSJed Brown     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
5420ced3a2bSJed Brown     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
5430ced3a2bSJed Brown     ci[i+1] = ci[i];
5440ced3a2bSJed Brown     /* Populate the min heap */
5450ced3a2bSJed Brown     for (j=0; j<anzi; j++) {
5460ced3a2bSJed Brown       bb[j] = bi[acol[j]];         /* bb points at the start of the row */
5470ced3a2bSJed Brown       if (bb[j] < bi[acol[j]+1]) { /* Add if row is nonempty */
5480ced3a2bSJed Brown         ierr = PetscHeapAdd(h,j,bj[bb[j]++]);CHKERRQ(ierr);
5490ced3a2bSJed Brown       }
5500ced3a2bSJed Brown     }
5510ced3a2bSJed Brown     /* Pick off the min element, adding it to free space */
5520ced3a2bSJed Brown     ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
5530ced3a2bSJed Brown     while (j >= 0) {
5540ced3a2bSJed Brown       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
555f91af8c7SBarry Smith         ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space);CHKERRQ(ierr);
5560ced3a2bSJed Brown         ndouble++;
5570ced3a2bSJed Brown       }
5580ced3a2bSJed Brown       *(current_space->array++) = col;
5590ced3a2bSJed Brown       current_space->local_used++;
5600ced3a2bSJed Brown       current_space->local_remaining--;
5610ced3a2bSJed Brown       ci[i+1]++;
5620ced3a2bSJed Brown 
5630ced3a2bSJed Brown       /* stash if anything else remains in this row of B */
5640ced3a2bSJed Brown       if (bb[j] < bi[acol[j]+1]) {ierr = PetscHeapStash(h,j,bj[bb[j]++]);CHKERRQ(ierr);}
5650ced3a2bSJed Brown       while (1) {               /* pop and stash any other rows of B that also had an entry in this column */
5660ced3a2bSJed Brown         PetscInt j2,col2;
5670ced3a2bSJed Brown         ierr = PetscHeapPeek(h,&j2,&col2);CHKERRQ(ierr);
5680ced3a2bSJed Brown         if (col2 != col) break;
5690ced3a2bSJed Brown         ierr = PetscHeapPop(h,&j2,&col2);CHKERRQ(ierr);
5700ced3a2bSJed Brown         if (bb[j2] < bi[acol[j2]+1]) {ierr = PetscHeapStash(h,j2,bj[bb[j2]++]);CHKERRQ(ierr);}
5710ced3a2bSJed Brown       }
5720ced3a2bSJed Brown       /* Put any stashed elements back into the min heap */
5730ced3a2bSJed Brown       ierr = PetscHeapUnstash(h);CHKERRQ(ierr);
5740ced3a2bSJed Brown       ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
5750ced3a2bSJed Brown     }
5760ced3a2bSJed Brown   }
5770ced3a2bSJed Brown   ierr = PetscFree(bb);CHKERRQ(ierr);
5780ced3a2bSJed Brown   ierr = PetscHeapDestroy(&h);CHKERRQ(ierr);
5790ced3a2bSJed Brown 
5800ced3a2bSJed Brown   /* Column indices are in the list of free space */
5810ced3a2bSJed Brown   /* Allocate space for cj, initialize cj, and */
5820ced3a2bSJed Brown   /* destroy list of free space and other temporary array(s) */
583785e854fSJed Brown   ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr);
5840ced3a2bSJed Brown   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
5850ced3a2bSJed Brown 
5860ced3a2bSJed Brown   /* put together the new symbolic matrix */
587ce94432eSBarry Smith   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr);
58833d57670SJed Brown   ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
58902fe1965SBarry Smith   ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr);
5900ced3a2bSJed Brown 
5910ced3a2bSJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
5920ced3a2bSJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
5930ced3a2bSJed Brown   c          = (Mat_SeqAIJ*)((*C)->data);
5940ced3a2bSJed Brown   c->free_a  = PETSC_TRUE;
5950ced3a2bSJed Brown   c->free_ij = PETSC_TRUE;
5960ced3a2bSJed Brown   c->nonew   = 0;
59726fbe8dcSKarl Rupp 
59889d95c1aSJed Brown   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ;
5990ced3a2bSJed Brown 
6000ced3a2bSJed Brown   /* set MatInfo */
6010ced3a2bSJed Brown   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
6020ced3a2bSJed Brown   if (afill < 1.0) afill = 1.0;
6030ced3a2bSJed Brown   c->maxnz                     = ci[am];
6040ced3a2bSJed Brown   c->nz                        = ci[am];
6050ced3a2bSJed Brown   (*C)->info.mallocs           = ndouble;
6060ced3a2bSJed Brown   (*C)->info.fill_ratio_given  = fill;
6070ced3a2bSJed Brown   (*C)->info.fill_ratio_needed = afill;
6080ced3a2bSJed Brown 
6090ced3a2bSJed Brown #if defined(PETSC_USE_INFO)
6100ced3a2bSJed Brown   if (ci[am]) {
61157622a8eSBarry Smith     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
61257622a8eSBarry Smith     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
6130ced3a2bSJed Brown   } else {
6140ced3a2bSJed Brown     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
6150ced3a2bSJed Brown   }
6160ced3a2bSJed Brown #endif
6170ced3a2bSJed Brown   PetscFunctionReturn(0);
6180ced3a2bSJed Brown }
619e9e4536cSHong Zhang 
6208a07c6f1SJed Brown PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A,Mat B,PetscReal fill,Mat *C)
6218a07c6f1SJed Brown {
6228a07c6f1SJed Brown   PetscErrorCode     ierr;
6238a07c6f1SJed Brown   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
6248a07c6f1SJed Brown   const PetscInt     *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
6258a07c6f1SJed Brown   PetscInt           *ci,*cj,*bb;
6268a07c6f1SJed Brown   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
6278a07c6f1SJed Brown   PetscReal          afill;
6288a07c6f1SJed Brown   PetscInt           i,j,col,ndouble = 0;
6290298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
6308a07c6f1SJed Brown   PetscHeap          h;
6318a07c6f1SJed Brown   PetscBT            bt;
6328a07c6f1SJed Brown 
6338a07c6f1SJed Brown   PetscFunctionBegin;
634cd093f1dSJed Brown   /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */
6358a07c6f1SJed Brown   /*---------------------------------------------------------------------------------------------*/
6368a07c6f1SJed Brown   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
637854ce69bSBarry Smith   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
6388a07c6f1SJed Brown   ci[0] = 0;
6398a07c6f1SJed Brown 
6408a07c6f1SJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
641f91af8c7SBarry Smith   ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
6422205254eSKarl Rupp 
6438a07c6f1SJed Brown   current_space = free_space;
6448a07c6f1SJed Brown 
6458a07c6f1SJed Brown   ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr);
646785e854fSJed Brown   ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr);
6478a07c6f1SJed Brown   ierr = PetscBTCreate(bn,&bt);CHKERRQ(ierr);
6488a07c6f1SJed Brown 
6498a07c6f1SJed Brown   /* Determine ci and cj */
6508a07c6f1SJed Brown   for (i=0; i<am; i++) {
6518a07c6f1SJed Brown     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
6528a07c6f1SJed Brown     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
6538a07c6f1SJed Brown     const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */
6548a07c6f1SJed Brown     ci[i+1] = ci[i];
6558a07c6f1SJed Brown     /* Populate the min heap */
6568a07c6f1SJed Brown     for (j=0; j<anzi; j++) {
6578a07c6f1SJed Brown       PetscInt brow = acol[j];
6588a07c6f1SJed Brown       for (bb[j] = bi[brow]; bb[j] < bi[brow+1]; bb[j]++) {
6598a07c6f1SJed Brown         PetscInt bcol = bj[bb[j]];
6608a07c6f1SJed Brown         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
6618a07c6f1SJed Brown           ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr);
6628a07c6f1SJed Brown           bb[j]++;
6638a07c6f1SJed Brown           break;
6648a07c6f1SJed Brown         }
6658a07c6f1SJed Brown       }
6668a07c6f1SJed Brown     }
6678a07c6f1SJed Brown     /* Pick off the min element, adding it to free space */
6688a07c6f1SJed Brown     ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
6698a07c6f1SJed Brown     while (j >= 0) {
6708a07c6f1SJed Brown       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
6710298fd71SBarry Smith         fptr = NULL;                      /* need PetscBTMemzero */
672f91af8c7SBarry Smith         ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space);CHKERRQ(ierr);
6738a07c6f1SJed Brown         ndouble++;
6748a07c6f1SJed Brown       }
6758a07c6f1SJed Brown       *(current_space->array++) = col;
6768a07c6f1SJed Brown       current_space->local_used++;
6778a07c6f1SJed Brown       current_space->local_remaining--;
6788a07c6f1SJed Brown       ci[i+1]++;
6798a07c6f1SJed Brown 
6808a07c6f1SJed Brown       /* stash if anything else remains in this row of B */
6818a07c6f1SJed Brown       for (; bb[j] < bi[acol[j]+1]; bb[j]++) {
6828a07c6f1SJed Brown         PetscInt bcol = bj[bb[j]];
6838a07c6f1SJed Brown         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
6848a07c6f1SJed Brown           ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr);
6858a07c6f1SJed Brown           bb[j]++;
6868a07c6f1SJed Brown           break;
6878a07c6f1SJed Brown         }
6888a07c6f1SJed Brown       }
6898a07c6f1SJed Brown       ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
6908a07c6f1SJed Brown     }
6918a07c6f1SJed Brown     if (fptr) {                 /* Clear the bits for this row */
6928a07c6f1SJed Brown       for (; fptr<current_space->array; fptr++) {ierr = PetscBTClear(bt,*fptr);CHKERRQ(ierr);}
6938a07c6f1SJed Brown     } else {                    /* We reallocated so we don't remember (easily) how to clear only the bits we changed */
6948a07c6f1SJed Brown       ierr = PetscBTMemzero(bn,bt);CHKERRQ(ierr);
6958a07c6f1SJed Brown     }
6968a07c6f1SJed Brown   }
6978a07c6f1SJed Brown   ierr = PetscFree(bb);CHKERRQ(ierr);
6988a07c6f1SJed Brown   ierr = PetscHeapDestroy(&h);CHKERRQ(ierr);
6998a07c6f1SJed Brown   ierr = PetscBTDestroy(&bt);CHKERRQ(ierr);
7008a07c6f1SJed Brown 
7018a07c6f1SJed Brown   /* Column indices are in the list of free space */
7028a07c6f1SJed Brown   /* Allocate space for cj, initialize cj, and */
7038a07c6f1SJed Brown   /* destroy list of free space and other temporary array(s) */
704785e854fSJed Brown   ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr);
7058a07c6f1SJed Brown   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
7068a07c6f1SJed Brown 
7078a07c6f1SJed Brown   /* put together the new symbolic matrix */
708ce94432eSBarry Smith   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr);
70933d57670SJed Brown   ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
71002fe1965SBarry Smith   ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr);
7118a07c6f1SJed Brown 
7128a07c6f1SJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
7138a07c6f1SJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
7148a07c6f1SJed Brown   c          = (Mat_SeqAIJ*)((*C)->data);
7158a07c6f1SJed Brown   c->free_a  = PETSC_TRUE;
7168a07c6f1SJed Brown   c->free_ij = PETSC_TRUE;
7178a07c6f1SJed Brown   c->nonew   = 0;
71826fbe8dcSKarl Rupp 
71989d95c1aSJed Brown   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ;
7208a07c6f1SJed Brown 
7218a07c6f1SJed Brown   /* set MatInfo */
7228a07c6f1SJed Brown   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
7238a07c6f1SJed Brown   if (afill < 1.0) afill = 1.0;
7248a07c6f1SJed Brown   c->maxnz                     = ci[am];
7258a07c6f1SJed Brown   c->nz                        = ci[am];
7268a07c6f1SJed Brown   (*C)->info.mallocs           = ndouble;
7278a07c6f1SJed Brown   (*C)->info.fill_ratio_given  = fill;
7288a07c6f1SJed Brown   (*C)->info.fill_ratio_needed = afill;
7298a07c6f1SJed Brown 
7308a07c6f1SJed Brown #if defined(PETSC_USE_INFO)
7318a07c6f1SJed Brown   if (ci[am]) {
73257622a8eSBarry Smith     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
73357622a8eSBarry Smith     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
7348a07c6f1SJed Brown   } else {
7358a07c6f1SJed Brown     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
7368a07c6f1SJed Brown   }
7378a07c6f1SJed Brown #endif
7388a07c6f1SJed Brown   PetscFunctionReturn(0);
7398a07c6f1SJed Brown }
7408a07c6f1SJed Brown 
741d7ed1a4dSandi selinger 
742d7ed1a4dSandi selinger PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A,Mat B,PetscReal fill,Mat *C)
743d7ed1a4dSandi selinger {
744d7ed1a4dSandi selinger   PetscErrorCode     ierr;
745d7ed1a4dSandi selinger   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
746d7ed1a4dSandi selinger   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j,*inputi,*inputj,*inputcol,*inputcol_L1;
747d7ed1a4dSandi selinger   PetscInt           *ci,*cj,*outputj,worki_L1[9],worki_L2[9];
748d7ed1a4dSandi selinger   PetscInt           c_maxmem,a_maxrownnz=0,a_rownnz;
749d7ed1a4dSandi selinger   const PetscInt     workcol[8]={0,1,2,3,4,5,6,7};
750d7ed1a4dSandi selinger   const PetscInt     am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
751d7ed1a4dSandi selinger   const PetscInt     *brow_ptr[8],*brow_end[8];
752d7ed1a4dSandi selinger   PetscInt           window[8];
753d7ed1a4dSandi selinger   PetscInt           window_min,old_window_min,ci_nnz,outputi_nnz=0,L1_nrows,L2_nrows;
754d7ed1a4dSandi selinger   PetscInt           i,k,ndouble=0,L1_rowsleft,rowsleft;
755d7ed1a4dSandi selinger   PetscReal          afill;
756f83700f2Sandi selinger   PetscInt           *workj_L1,*workj_L2,*workj_L3;
7577660c4dbSandi selinger   PetscInt           L1_nnz,L2_nnz;
758d7ed1a4dSandi selinger 
759d7ed1a4dSandi selinger   /* Step 1: Get upper bound on memory required for allocation.
760d7ed1a4dSandi selinger              Because of the way virtual memory works,
761d7ed1a4dSandi selinger              only the memory pages that are actually needed will be physically allocated. */
762d7ed1a4dSandi selinger   PetscFunctionBegin;
763d7ed1a4dSandi selinger   ierr = PetscMalloc1(am+1,&ci);CHKERRQ(ierr);
764d7ed1a4dSandi selinger   for (i=0; i<am; i++) {
765d7ed1a4dSandi selinger     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
766d7ed1a4dSandi selinger     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
767d7ed1a4dSandi selinger     a_rownnz = 0;
768d7ed1a4dSandi selinger     for (k=0; k<anzi; ++k) {
769d7ed1a4dSandi selinger       a_rownnz += bi[acol[k]+1] - bi[acol[k]];
770d7ed1a4dSandi selinger       if (a_rownnz > bn) {
771d7ed1a4dSandi selinger         a_rownnz = bn;
772d7ed1a4dSandi selinger         break;
773d7ed1a4dSandi selinger       }
774d7ed1a4dSandi selinger     }
775d7ed1a4dSandi selinger     a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz);
776d7ed1a4dSandi selinger   }
777d7ed1a4dSandi selinger   /* temporary work areas for merging rows */
778d7ed1a4dSandi selinger   ierr = PetscMalloc1(a_maxrownnz*8,&workj_L1);CHKERRQ(ierr);
779f83700f2Sandi selinger   ierr = PetscMalloc1(a_maxrownnz*8,&workj_L2);CHKERRQ(ierr);
780f83700f2Sandi selinger   ierr = PetscMalloc1(a_maxrownnz,&workj_L3);CHKERRQ(ierr);
781d7ed1a4dSandi selinger 
7827660c4dbSandi selinger   /* This should be enough for almost all matrices. If not, memory is reallocated later. */
7837660c4dbSandi selinger   c_maxmem = 8*(ai[am]+bi[bm]);
784d7ed1a4dSandi selinger   /* Step 2: Populate pattern for C */
785d7ed1a4dSandi selinger   ierr  = PetscMalloc1(c_maxmem,&cj);CHKERRQ(ierr);
786d7ed1a4dSandi selinger 
787d7ed1a4dSandi selinger   ci_nnz       = 0;
788d7ed1a4dSandi selinger   ci[0]        = 0;
789d7ed1a4dSandi selinger   worki_L1[0]  = 0;
790d7ed1a4dSandi selinger   worki_L2[0]  = 0;
791d7ed1a4dSandi selinger   for (i=0; i<am; i++) {
792d7ed1a4dSandi selinger     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
793d7ed1a4dSandi selinger     const PetscInt *acol = aj + ai[i];      /* column indices of nonzero entries in this row */
794d7ed1a4dSandi selinger     rowsleft             = anzi;
795d7ed1a4dSandi selinger     inputcol_L1          = acol;
796d7ed1a4dSandi selinger     L2_nnz               = 0;
7977660c4dbSandi selinger     L2_nrows             = 1;  /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1   */
7987660c4dbSandi selinger     worki_L2[1]          = 0;
79908fe1b3cSKarl Rupp     outputi_nnz          = 0;
800d7ed1a4dSandi selinger 
801d7ed1a4dSandi selinger     /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem  -> allocate more memory */
802d7ed1a4dSandi selinger     while (ci_nnz+a_maxrownnz > c_maxmem) {
803d7ed1a4dSandi selinger       c_maxmem *= 2;
804d7ed1a4dSandi selinger       ndouble++;
805d7ed1a4dSandi selinger       ierr = PetscRealloc(sizeof(PetscInt)*c_maxmem,&cj);CHKERRQ(ierr);
806d7ed1a4dSandi selinger     }
807d7ed1a4dSandi selinger 
808d7ed1a4dSandi selinger     while (rowsleft) {
809d7ed1a4dSandi selinger       L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */
810d7ed1a4dSandi selinger       L1_nrows    = 0;
8117660c4dbSandi selinger       L1_nnz      = 0;
812d7ed1a4dSandi selinger       inputcol    = inputcol_L1;
813d7ed1a4dSandi selinger       inputi      = bi;
814d7ed1a4dSandi selinger       inputj      = bj;
815d7ed1a4dSandi selinger 
816d7ed1a4dSandi selinger       /* The following macro is used to specialize for small rows in A.
817d7ed1a4dSandi selinger          This helps with compiler unrolling, improving performance substantially.
818f83700f2Sandi selinger           Input:  inputj   inputi  inputcol  bn
819d7ed1a4dSandi selinger           Output: outputj  outputi_nnz                       */
820d7ed1a4dSandi selinger        #define MatMatMultSymbolic_RowMergeMacro(ANNZ)                        \
821d7ed1a4dSandi selinger          window_min  = bn;                                                   \
8227660c4dbSandi selinger          outputi_nnz = 0;                                                    \
8237660c4dbSandi selinger          for (k=0; k<ANNZ; ++k) {                                            \
824d7ed1a4dSandi selinger            brow_ptr[k] = inputj + inputi[inputcol[k]];                       \
825d7ed1a4dSandi selinger            brow_end[k] = inputj + inputi[inputcol[k]+1];                     \
826d7ed1a4dSandi selinger            window[k]   = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn;   \
827d7ed1a4dSandi selinger            window_min  = PetscMin(window[k], window_min);                    \
828d7ed1a4dSandi selinger          }                                                                   \
829d7ed1a4dSandi selinger          while (window_min < bn) {                                           \
830d7ed1a4dSandi selinger            outputj[outputi_nnz++] = window_min;                              \
831d7ed1a4dSandi selinger            /* advance front and compute new minimum */                       \
832d7ed1a4dSandi selinger            old_window_min = window_min;                                      \
833d7ed1a4dSandi selinger            window_min = bn;                                                  \
834d7ed1a4dSandi selinger            for (k=0; k<ANNZ; ++k) {                                          \
835d7ed1a4dSandi selinger              if (window[k] == old_window_min) {                              \
836d7ed1a4dSandi selinger                brow_ptr[k]++;                                                \
837d7ed1a4dSandi selinger                window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
838d7ed1a4dSandi selinger              }                                                               \
839d7ed1a4dSandi selinger              window_min = PetscMin(window[k], window_min);                   \
840d7ed1a4dSandi selinger            }                                                                 \
841d7ed1a4dSandi selinger          }
842d7ed1a4dSandi selinger 
843d7ed1a4dSandi selinger       /************** L E V E L  1 ***************/
844d7ed1a4dSandi selinger       /* Merge up to 8 rows of B to L1 work array*/
845d7ed1a4dSandi selinger       while (L1_rowsleft) {
8467660c4dbSandi selinger         outputi_nnz = 0;
8477660c4dbSandi selinger         if (anzi > 8)  outputj = workj_L1 + L1_nnz;     /* Level 1 rowmerge*/
8487660c4dbSandi selinger         else           outputj = cj + ci_nnz;           /* Merge directly to C */
8497660c4dbSandi selinger 
850d7ed1a4dSandi selinger         switch (L1_rowsleft) {
851d7ed1a4dSandi selinger         case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
852d7ed1a4dSandi selinger                  brow_end[0] = inputj + inputi[inputcol[0]+1];
853d7ed1a4dSandi selinger                  for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
854d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
855d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
856d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
857d7ed1a4dSandi selinger                  break;
858d7ed1a4dSandi selinger         case 2:  MatMatMultSymbolic_RowMergeMacro(2);
859d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
860d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
861d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
862d7ed1a4dSandi selinger                  break;
863d7ed1a4dSandi selinger         case 3: MatMatMultSymbolic_RowMergeMacro(3);
864d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
865d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
866d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
867d7ed1a4dSandi selinger                  break;
868d7ed1a4dSandi selinger         case 4:  MatMatMultSymbolic_RowMergeMacro(4);
869d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
870d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
871d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
872d7ed1a4dSandi selinger                  break;
873d7ed1a4dSandi selinger         case 5:  MatMatMultSymbolic_RowMergeMacro(5);
874d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
875d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
876d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
877d7ed1a4dSandi selinger                  break;
878d7ed1a4dSandi selinger         case 6:  MatMatMultSymbolic_RowMergeMacro(6);
879d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
880d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
881d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
882d7ed1a4dSandi selinger                  break;
883d7ed1a4dSandi selinger         case 7:  MatMatMultSymbolic_RowMergeMacro(7);
884d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
885d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
886d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
887d7ed1a4dSandi selinger                  break;
888d7ed1a4dSandi selinger         default: MatMatMultSymbolic_RowMergeMacro(8);
889d7ed1a4dSandi selinger                  inputcol    += 8;
890d7ed1a4dSandi selinger                  rowsleft    -= 8;
891d7ed1a4dSandi selinger                  L1_rowsleft -= 8;
892d7ed1a4dSandi selinger                  break;
893d7ed1a4dSandi selinger         }
894d7ed1a4dSandi selinger         inputcol_L1           = inputcol;
8957660c4dbSandi selinger         L1_nnz               += outputi_nnz;
8967660c4dbSandi selinger         worki_L1[++L1_nrows]  = L1_nnz;
897d7ed1a4dSandi selinger       }
898d7ed1a4dSandi selinger 
899d7ed1a4dSandi selinger       /********************** L E V E L  2 ************************/
900d7ed1a4dSandi selinger       /* Merge from L1 work array to either C or to L2 work array */
901d7ed1a4dSandi selinger       if (anzi > 8) {
902d7ed1a4dSandi selinger         inputi      = worki_L1;
903d7ed1a4dSandi selinger         inputj      = workj_L1;
904d7ed1a4dSandi selinger         inputcol    = workcol;
905d7ed1a4dSandi selinger         outputi_nnz = 0;
906d7ed1a4dSandi selinger 
907d7ed1a4dSandi selinger         if (anzi <= 64) outputj = cj + ci_nnz;        /* Merge from L1 work array to C */
908d7ed1a4dSandi selinger         else            outputj = workj_L2 + L2_nnz;  /* Merge from L1 work array to L2 work array */
909d7ed1a4dSandi selinger 
910d7ed1a4dSandi selinger         switch (L1_nrows) {
911d7ed1a4dSandi selinger         case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
912d7ed1a4dSandi selinger                  brow_end[0] = inputj + inputi[inputcol[0]+1];
913d7ed1a4dSandi selinger                  for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
914d7ed1a4dSandi selinger                  break;
915d7ed1a4dSandi selinger         case 2:  MatMatMultSymbolic_RowMergeMacro(2); break;
916d7ed1a4dSandi selinger         case 3:  MatMatMultSymbolic_RowMergeMacro(3); break;
917d7ed1a4dSandi selinger         case 4:  MatMatMultSymbolic_RowMergeMacro(4); break;
918d7ed1a4dSandi selinger         case 5:  MatMatMultSymbolic_RowMergeMacro(5); break;
919d7ed1a4dSandi selinger         case 6:  MatMatMultSymbolic_RowMergeMacro(6); break;
920d7ed1a4dSandi selinger         case 7:  MatMatMultSymbolic_RowMergeMacro(7); break;
921d7ed1a4dSandi selinger         case 8:  MatMatMultSymbolic_RowMergeMacro(8); break;
922d7ed1a4dSandi selinger         default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L1 work array!");
923d7ed1a4dSandi selinger         }
924d7ed1a4dSandi selinger         L2_nnz               += outputi_nnz;
925d7ed1a4dSandi selinger         worki_L2[++L2_nrows]  = L2_nnz;
926d7ed1a4dSandi selinger 
9277660c4dbSandi selinger         /************************ L E V E L  3 **********************/
9287660c4dbSandi selinger         /* Merge from L2 work array to either C or to L2 work array */
929d7ed1a4dSandi selinger         if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) {
930d7ed1a4dSandi selinger           inputi      = worki_L2;
931d7ed1a4dSandi selinger           inputj      = workj_L2;
932d7ed1a4dSandi selinger           inputcol    = workcol;
933d7ed1a4dSandi selinger           outputi_nnz = 0;
934f83700f2Sandi selinger           if (rowsleft) outputj = workj_L3;
935d7ed1a4dSandi selinger           else          outputj = cj + ci_nnz;
936d7ed1a4dSandi selinger           switch (L2_nrows) {
937d7ed1a4dSandi selinger           case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
938d7ed1a4dSandi selinger                    brow_end[0] = inputj + inputi[inputcol[0]+1];
939d7ed1a4dSandi selinger                    for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
940d7ed1a4dSandi selinger                    break;
941d7ed1a4dSandi selinger           case 2:  MatMatMultSymbolic_RowMergeMacro(2); break;
942d7ed1a4dSandi selinger           case 3:  MatMatMultSymbolic_RowMergeMacro(3); break;
943d7ed1a4dSandi selinger           case 4:  MatMatMultSymbolic_RowMergeMacro(4); break;
944d7ed1a4dSandi selinger           case 5:  MatMatMultSymbolic_RowMergeMacro(5); break;
945d7ed1a4dSandi selinger           case 6:  MatMatMultSymbolic_RowMergeMacro(6); break;
946d7ed1a4dSandi selinger           case 7:  MatMatMultSymbolic_RowMergeMacro(7); break;
947d7ed1a4dSandi selinger           case 8:  MatMatMultSymbolic_RowMergeMacro(8); break;
948d7ed1a4dSandi selinger           default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L2 work array!");
949d7ed1a4dSandi selinger           }
950d7ed1a4dSandi selinger           L2_nrows    = 1;
9517660c4dbSandi selinger           L2_nnz      = outputi_nnz;
9527660c4dbSandi selinger           worki_L2[1] = outputi_nnz;
9537660c4dbSandi selinger           /* Copy to workj_L2 */
954d7ed1a4dSandi selinger           if (rowsleft) {
9557660c4dbSandi selinger             for (k=0; k<outputi_nnz; ++k)  workj_L2[k] = outputj[k];
956d7ed1a4dSandi selinger           }
957d7ed1a4dSandi selinger         }
958d7ed1a4dSandi selinger       }
959d7ed1a4dSandi selinger     }  /* while (rowsleft) */
960d7ed1a4dSandi selinger #undef MatMatMultSymbolic_RowMergeMacro
961d7ed1a4dSandi selinger 
962d7ed1a4dSandi selinger     /* terminate current row */
963d7ed1a4dSandi selinger     ci_nnz += outputi_nnz;
964d7ed1a4dSandi selinger     ci[i+1] = ci_nnz;
965d7ed1a4dSandi selinger   }
966d7ed1a4dSandi selinger 
967d7ed1a4dSandi selinger   /* Step 3: Create the new symbolic matrix */
968d7ed1a4dSandi selinger   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr);
969d7ed1a4dSandi selinger   ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
970f83700f2Sandi selinger   ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr);
971d7ed1a4dSandi selinger 
972d7ed1a4dSandi selinger   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
973d7ed1a4dSandi selinger   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
974d7ed1a4dSandi selinger   c          = (Mat_SeqAIJ*)((*C)->data);
975d7ed1a4dSandi selinger   c->free_a  = PETSC_TRUE;
976d7ed1a4dSandi selinger   c->free_ij = PETSC_TRUE;
977d7ed1a4dSandi selinger   c->nonew   = 0;
978d7ed1a4dSandi selinger 
979d7ed1a4dSandi selinger   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ;
980d7ed1a4dSandi selinger 
981d7ed1a4dSandi selinger   /* set MatInfo */
982d7ed1a4dSandi selinger   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
983d7ed1a4dSandi selinger   if (afill < 1.0) afill = 1.0;
984d7ed1a4dSandi selinger   c->maxnz                     = ci[am];
985d7ed1a4dSandi selinger   c->nz                        = ci[am];
986d7ed1a4dSandi selinger   (*C)->info.mallocs           = ndouble;
987d7ed1a4dSandi selinger   (*C)->info.fill_ratio_given  = fill;
988d7ed1a4dSandi selinger   (*C)->info.fill_ratio_needed = afill;
989d7ed1a4dSandi selinger 
990d7ed1a4dSandi selinger #if defined(PETSC_USE_INFO)
991d7ed1a4dSandi selinger   if (ci[am]) {
992d7ed1a4dSandi selinger     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
993d7ed1a4dSandi selinger     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
994d7ed1a4dSandi selinger   } else {
995d7ed1a4dSandi selinger     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
996d7ed1a4dSandi selinger   }
997d7ed1a4dSandi selinger #endif
998d7ed1a4dSandi selinger 
999d7ed1a4dSandi selinger   /* Step 4: Free temporary work areas */
1000d7ed1a4dSandi selinger   ierr = PetscFree(workj_L1);CHKERRQ(ierr);
1001d7ed1a4dSandi selinger   ierr = PetscFree(workj_L2);CHKERRQ(ierr);
1002f83700f2Sandi selinger   ierr = PetscFree(workj_L3);CHKERRQ(ierr);
1003d7ed1a4dSandi selinger   PetscFunctionReturn(0);
1004d7ed1a4dSandi selinger }
1005d7ed1a4dSandi selinger 
1006cd093f1dSJed Brown /* concatenate unique entries and then sort */
100758cf0668SJed Brown PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
1008cd093f1dSJed Brown {
1009cd093f1dSJed Brown   PetscErrorCode     ierr;
1010cd093f1dSJed Brown   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
1011cd093f1dSJed Brown   const PetscInt     *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
1012cd093f1dSJed Brown   PetscInt           *ci,*cj;
1013cd093f1dSJed Brown   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
1014cd093f1dSJed Brown   PetscReal          afill;
1015cd093f1dSJed Brown   PetscInt           i,j,ndouble = 0;
1016cd093f1dSJed Brown   PetscSegBuffer     seg,segrow;
1017cd093f1dSJed Brown   char               *seen;
1018cd093f1dSJed Brown 
1019cd093f1dSJed Brown   PetscFunctionBegin;
1020854ce69bSBarry Smith   ierr  = PetscMalloc1(am+1,&ci);CHKERRQ(ierr);
1021cd093f1dSJed Brown   ci[0] = 0;
1022cd093f1dSJed Brown 
1023cd093f1dSJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
1024cd093f1dSJed Brown   ierr = PetscSegBufferCreate(sizeof(PetscInt),(PetscInt)(fill*(ai[am]+bi[bm])),&seg);CHKERRQ(ierr);
1025cd093f1dSJed Brown   ierr = PetscSegBufferCreate(sizeof(PetscInt),100,&segrow);CHKERRQ(ierr);
1026785e854fSJed Brown   ierr = PetscMalloc1(bn,&seen);CHKERRQ(ierr);
1027cd093f1dSJed Brown   ierr = PetscMemzero(seen,bn*sizeof(char));CHKERRQ(ierr);
1028cd093f1dSJed Brown 
1029cd093f1dSJed Brown   /* Determine ci and cj */
1030cd093f1dSJed Brown   for (i=0; i<am; i++) {
1031cd093f1dSJed Brown     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
1032cd093f1dSJed Brown     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
1033cd093f1dSJed Brown     PetscInt packlen = 0,*PETSC_RESTRICT crow;
1034cd093f1dSJed Brown     /* Pack segrow */
1035cd093f1dSJed Brown     for (j=0; j<anzi; j++) {
1036cd093f1dSJed Brown       PetscInt brow = acol[j],bjstart = bi[brow],bjend = bi[brow+1],k;
1037cd093f1dSJed Brown       for (k=bjstart; k<bjend; k++) {
1038cd093f1dSJed Brown         PetscInt bcol = bj[k];
1039cd093f1dSJed Brown         if (!seen[bcol]) { /* new entry */
1040cd093f1dSJed Brown           PetscInt *PETSC_RESTRICT slot;
1041cd093f1dSJed Brown           ierr = PetscSegBufferGetInts(segrow,1,&slot);CHKERRQ(ierr);
1042cd093f1dSJed Brown           *slot = bcol;
1043cd093f1dSJed Brown           seen[bcol] = 1;
1044cd093f1dSJed Brown           packlen++;
1045cd093f1dSJed Brown         }
1046cd093f1dSJed Brown       }
1047cd093f1dSJed Brown     }
1048cd093f1dSJed Brown     ierr = PetscSegBufferGetInts(seg,packlen,&crow);CHKERRQ(ierr);
1049cd093f1dSJed Brown     ierr = PetscSegBufferExtractTo(segrow,crow);CHKERRQ(ierr);
1050cd093f1dSJed Brown     ierr = PetscSortInt(packlen,crow);CHKERRQ(ierr);
1051cd093f1dSJed Brown     ci[i+1] = ci[i] + packlen;
1052cd093f1dSJed Brown     for (j=0; j<packlen; j++) seen[crow[j]] = 0;
1053cd093f1dSJed Brown   }
1054cd093f1dSJed Brown   ierr = PetscSegBufferDestroy(&segrow);CHKERRQ(ierr);
1055cd093f1dSJed Brown   ierr = PetscFree(seen);CHKERRQ(ierr);
1056cd093f1dSJed Brown 
1057cd093f1dSJed Brown   /* Column indices are in the segmented buffer */
1058cd093f1dSJed Brown   ierr = PetscSegBufferExtractAlloc(seg,&cj);CHKERRQ(ierr);
1059cd093f1dSJed Brown   ierr = PetscSegBufferDestroy(&seg);CHKERRQ(ierr);
1060cd093f1dSJed Brown 
1061cd093f1dSJed Brown   /* put together the new symbolic matrix */
1062cd093f1dSJed Brown   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr);
106333d57670SJed Brown   ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
106402fe1965SBarry Smith   ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr);
1065cd093f1dSJed Brown 
1066cd093f1dSJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1067cd093f1dSJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1068cd093f1dSJed Brown   c          = (Mat_SeqAIJ*)((*C)->data);
1069cd093f1dSJed Brown   c->free_a  = PETSC_TRUE;
1070cd093f1dSJed Brown   c->free_ij = PETSC_TRUE;
1071cd093f1dSJed Brown   c->nonew   = 0;
1072cd093f1dSJed Brown 
1073cd093f1dSJed Brown   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ;
1074cd093f1dSJed Brown 
1075cd093f1dSJed Brown   /* set MatInfo */
1076cd093f1dSJed Brown   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
1077cd093f1dSJed Brown   if (afill < 1.0) afill = 1.0;
1078cd093f1dSJed Brown   c->maxnz                     = ci[am];
1079cd093f1dSJed Brown   c->nz                        = ci[am];
1080cd093f1dSJed Brown   (*C)->info.mallocs           = ndouble;
1081cd093f1dSJed Brown   (*C)->info.fill_ratio_given  = fill;
1082cd093f1dSJed Brown   (*C)->info.fill_ratio_needed = afill;
1083cd093f1dSJed Brown 
1084cd093f1dSJed Brown #if defined(PETSC_USE_INFO)
1085cd093f1dSJed Brown   if (ci[am]) {
108657622a8eSBarry Smith     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
108757622a8eSBarry Smith     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
1088cd093f1dSJed Brown   } else {
1089cd093f1dSJed Brown     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
1090cd093f1dSJed Brown   }
1091cd093f1dSJed Brown #endif
1092cd093f1dSJed Brown   PetscFunctionReturn(0);
1093cd093f1dSJed Brown }
1094cd093f1dSJed Brown 
1095d2853540SHong Zhang /* This routine is not used. Should be removed! */
10966fc122caSHong Zhang PetscErrorCode MatMatTransposeMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
10975df89d91SHong Zhang {
1098bc011b1eSHong Zhang   PetscErrorCode ierr;
1099bc011b1eSHong Zhang 
1100bc011b1eSHong Zhang   PetscFunctionBegin;
1101bc011b1eSHong Zhang   if (scall == MAT_INITIAL_MATRIX) {
11023ff4c91cSHong Zhang     ierr = PetscLogEventBegin(MAT_MatTransposeMultSymbolic,A,B,0,0);CHKERRQ(ierr);
11036fc122caSHong Zhang     ierr = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
11043ff4c91cSHong Zhang     ierr = PetscLogEventEnd(MAT_MatTransposeMultSymbolic,A,B,0,0);CHKERRQ(ierr);
1105bc011b1eSHong Zhang   }
11063ff4c91cSHong Zhang   ierr = PetscLogEventBegin(MAT_MatTransposeMultNumeric,A,B,0,0);CHKERRQ(ierr);
11076fc122caSHong Zhang   ierr = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(A,B,*C);CHKERRQ(ierr);
11083ff4c91cSHong Zhang   ierr = PetscLogEventEnd(MAT_MatTransposeMultNumeric,A,B,0,0);CHKERRQ(ierr);
1109bc011b1eSHong Zhang   PetscFunctionReturn(0);
1110bc011b1eSHong Zhang }
1111bc011b1eSHong Zhang 
11122128a86cSHong Zhang PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(Mat A)
11132128a86cSHong Zhang {
11142128a86cSHong Zhang   PetscErrorCode      ierr;
11154c7df5ccSHong Zhang   Mat_SeqAIJ          *a=(Mat_SeqAIJ*)A->data;
111640192850SHong Zhang   Mat_MatMatTransMult *abt=a->abt;
11172128a86cSHong Zhang 
11182128a86cSHong Zhang   PetscFunctionBegin;
111940192850SHong Zhang   ierr = (abt->destroy)(A);CHKERRQ(ierr);
112040192850SHong Zhang   ierr = MatTransposeColoringDestroy(&abt->matcoloring);CHKERRQ(ierr);
112140192850SHong Zhang   ierr = MatDestroy(&abt->Bt_den);CHKERRQ(ierr);
112240192850SHong Zhang   ierr = MatDestroy(&abt->ABt_den);CHKERRQ(ierr);
112340192850SHong Zhang   ierr = PetscFree(abt);CHKERRQ(ierr);
11242128a86cSHong Zhang   PetscFunctionReturn(0);
11252128a86cSHong Zhang }
11262128a86cSHong Zhang 
11276fc122caSHong Zhang PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
1128bc011b1eSHong Zhang {
11295df89d91SHong Zhang   PetscErrorCode      ierr;
113081d82fe4SHong Zhang   Mat                 Bt;
113181d82fe4SHong Zhang   PetscInt            *bti,*btj;
113240192850SHong Zhang   Mat_MatMatTransMult *abt;
11334c7df5ccSHong Zhang   Mat_SeqAIJ          *c;
1134d2853540SHong Zhang 
113581d82fe4SHong Zhang   PetscFunctionBegin;
113681d82fe4SHong Zhang   /* create symbolic Bt */
113781d82fe4SHong Zhang   ierr = MatGetSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
11380298fd71SBarry Smith   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,B->cmap->n,B->rmap->n,bti,btj,NULL,&Bt);CHKERRQ(ierr);
113933d57670SJed Brown   ierr = MatSetBlockSizes(Bt,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr);
114004b858e0SBarry Smith   ierr = MatSetType(Bt,((PetscObject)A)->type_name);CHKERRQ(ierr);
114181d82fe4SHong Zhang 
114281d82fe4SHong Zhang   /* get symbolic C=A*Bt */
114381d82fe4SHong Zhang   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C);CHKERRQ(ierr);
114481d82fe4SHong Zhang 
11452128a86cSHong Zhang   /* create a supporting struct for reuse intermidiate dense matrices with matcoloring */
1146b00a9115SJed Brown   ierr   = PetscNew(&abt);CHKERRQ(ierr);
11474c7df5ccSHong Zhang   c      = (Mat_SeqAIJ*)(*C)->data;
114840192850SHong Zhang   c->abt = abt;
11492128a86cSHong Zhang 
115040192850SHong Zhang   abt->usecoloring = PETSC_FALSE;
115140192850SHong Zhang   abt->destroy     = (*C)->ops->destroy;
11522128a86cSHong Zhang   (*C)->ops->destroy     = MatDestroy_SeqAIJ_MatMatMultTrans;
11532128a86cSHong Zhang 
1154c5929fdfSBarry Smith   ierr = PetscOptionsGetBool(((PetscObject)A)->options,NULL,"-matmattransmult_color",&abt->usecoloring,NULL);CHKERRQ(ierr);
115540192850SHong Zhang   if (abt->usecoloring) {
1156b9af6bddSHong Zhang     /* Create MatTransposeColoring from symbolic C=A*B^T */
1157b9af6bddSHong Zhang     MatTransposeColoring matcoloring;
1158335efc43SPeter Brune     MatColoring          coloring;
11592128a86cSHong Zhang     ISColoring           iscoloring;
11602128a86cSHong Zhang     Mat                  Bt_dense,C_dense;
11614d478ae7SHong Zhang     Mat_SeqAIJ           *c=(Mat_SeqAIJ*)(*C)->data;
11624d478ae7SHong Zhang     /* inode causes memory problem, don't know why */
11634d478ae7SHong Zhang     if (c->inode.use) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_USE_INODES is not supported. Use '-mat_no_inode'");
1164e8354b3eSHong Zhang 
1165335efc43SPeter Brune     ierr = MatColoringCreate(*C,&coloring);CHKERRQ(ierr);
1166335efc43SPeter Brune     ierr = MatColoringSetDistance(coloring,2);CHKERRQ(ierr);
1167335efc43SPeter Brune     ierr = MatColoringSetType(coloring,MATCOLORINGSL);CHKERRQ(ierr);
1168335efc43SPeter Brune     ierr = MatColoringSetFromOptions(coloring);CHKERRQ(ierr);
1169335efc43SPeter Brune     ierr = MatColoringApply(coloring,&iscoloring);CHKERRQ(ierr);
1170335efc43SPeter Brune     ierr = MatColoringDestroy(&coloring);CHKERRQ(ierr);
1171b9af6bddSHong Zhang     ierr = MatTransposeColoringCreate(*C,iscoloring,&matcoloring);CHKERRQ(ierr);
11722205254eSKarl Rupp 
117340192850SHong Zhang     abt->matcoloring = matcoloring;
11742205254eSKarl Rupp 
11752128a86cSHong Zhang     ierr = ISColoringDestroy(&iscoloring);CHKERRQ(ierr);
11762128a86cSHong Zhang 
11772128a86cSHong Zhang     /* Create Bt_dense and C_dense = A*Bt_dense */
11782128a86cSHong Zhang     ierr = MatCreate(PETSC_COMM_SELF,&Bt_dense);CHKERRQ(ierr);
11792128a86cSHong Zhang     ierr = MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors);CHKERRQ(ierr);
11802128a86cSHong Zhang     ierr = MatSetType(Bt_dense,MATSEQDENSE);CHKERRQ(ierr);
11810298fd71SBarry Smith     ierr = MatSeqDenseSetPreallocation(Bt_dense,NULL);CHKERRQ(ierr);
11822205254eSKarl Rupp 
11832128a86cSHong Zhang     Bt_dense->assembled = PETSC_TRUE;
118440192850SHong Zhang     abt->Bt_den   = Bt_dense;
11852128a86cSHong Zhang 
11862128a86cSHong Zhang     ierr = MatCreate(PETSC_COMM_SELF,&C_dense);CHKERRQ(ierr);
11872128a86cSHong Zhang     ierr = MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors);CHKERRQ(ierr);
11882128a86cSHong Zhang     ierr = MatSetType(C_dense,MATSEQDENSE);CHKERRQ(ierr);
11890298fd71SBarry Smith     ierr = MatSeqDenseSetPreallocation(C_dense,NULL);CHKERRQ(ierr);
11902205254eSKarl Rupp 
11912128a86cSHong Zhang     Bt_dense->assembled = PETSC_TRUE;
119240192850SHong Zhang     abt->ABt_den  = C_dense;
1193f94ccd6cSHong Zhang 
1194f94ccd6cSHong Zhang #if defined(PETSC_USE_INFO)
11951ce71dffSSatish Balay     {
1196f94ccd6cSHong Zhang       Mat_SeqAIJ *c = (Mat_SeqAIJ*)(*C)->data;
1197c40ebe3bSHong Zhang       ierr = PetscInfo7(*C,"Use coloring of C=A*B^T; B^T: %D %D, Bt_dense: %D,%D; Cnz %D / (cm*ncolors %D) = %g\n",B->cmap->n,B->rmap->n,Bt_dense->rmap->n,Bt_dense->cmap->n,c->nz,A->rmap->n*matcoloring->ncolors,(PetscReal)(c->nz)/(A->rmap->n*matcoloring->ncolors));CHKERRQ(ierr);
11981ce71dffSSatish Balay     }
1199f94ccd6cSHong Zhang #endif
12002128a86cSHong Zhang   }
1201e99be685SHong Zhang   /* clean up */
1202e99be685SHong Zhang   ierr = MatDestroy(&Bt);CHKERRQ(ierr);
1203e99be685SHong Zhang   ierr = MatRestoreSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
12045df89d91SHong Zhang   PetscFunctionReturn(0);
12055df89d91SHong Zhang }
12065df89d91SHong Zhang 
12076fc122caSHong Zhang PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
12085df89d91SHong Zhang {
12095df89d91SHong Zhang   PetscErrorCode      ierr;
12105df89d91SHong Zhang   Mat_SeqAIJ          *a   =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1211e2cac8caSJed Brown   PetscInt            *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow;
121281d82fe4SHong Zhang   PetscInt            cm   =C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol;
12135df89d91SHong Zhang   PetscLogDouble      flops=0.0;
1214aa1aec99SHong Zhang   MatScalar           *aa  =a->a,*aval,*ba=b->a,*bval,*ca,*cval;
121540192850SHong Zhang   Mat_MatMatTransMult *abt = c->abt;
12165df89d91SHong Zhang 
12175df89d91SHong Zhang   PetscFunctionBegin;
121858ed3ceaSHong Zhang   /* clear old values in C */
121958ed3ceaSHong Zhang   if (!c->a) {
1220854ce69bSBarry Smith     ierr      = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
122158ed3ceaSHong Zhang     c->a      = ca;
122258ed3ceaSHong Zhang     c->free_a = PETSC_TRUE;
122358ed3ceaSHong Zhang   } else {
122458ed3ceaSHong Zhang     ca =  c->a;
122558ed3ceaSHong Zhang   }
122658ed3ceaSHong Zhang   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
122758ed3ceaSHong Zhang 
122840192850SHong Zhang   if (abt->usecoloring) {
122940192850SHong Zhang     MatTransposeColoring matcoloring = abt->matcoloring;
123040192850SHong Zhang     Mat                  Bt_dense,C_dense = abt->ABt_den;
1231c8db22f6SHong Zhang 
1232b9af6bddSHong Zhang     /* Get Bt_dense by Apply MatTransposeColoring to B */
123340192850SHong Zhang     Bt_dense = abt->Bt_den;
1234b9af6bddSHong Zhang     ierr = MatTransColoringApplySpToDen(matcoloring,B,Bt_dense);CHKERRQ(ierr);
1235c8db22f6SHong Zhang 
1236c8db22f6SHong Zhang     /* C_dense = A*Bt_dense */
12372128a86cSHong Zhang     ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense);CHKERRQ(ierr);
1238c8db22f6SHong Zhang 
12392128a86cSHong Zhang     /* Recover C from C_dense */
1240b9af6bddSHong Zhang     ierr = MatTransColoringApplyDenToSp(matcoloring,C_dense,C);CHKERRQ(ierr);
1241c8db22f6SHong Zhang     PetscFunctionReturn(0);
1242c8db22f6SHong Zhang   }
1243c8db22f6SHong Zhang 
12441fa1209cSHong Zhang   for (i=0; i<cm; i++) {
124581d82fe4SHong Zhang     anzi = ai[i+1] - ai[i];
124681d82fe4SHong Zhang     acol = aj + ai[i];
12476973769bSHong Zhang     aval = aa + ai[i];
12481fa1209cSHong Zhang     cnzi = ci[i+1] - ci[i];
12491fa1209cSHong Zhang     ccol = cj + ci[i];
12506973769bSHong Zhang     cval = ca + ci[i];
12511fa1209cSHong Zhang     for (j=0; j<cnzi; j++) {
125281d82fe4SHong Zhang       brow = ccol[j];
125381d82fe4SHong Zhang       bnzj = bi[brow+1] - bi[brow];
125481d82fe4SHong Zhang       bcol = bj + bi[brow];
12556973769bSHong Zhang       bval = ba + bi[brow];
12566973769bSHong Zhang 
125781d82fe4SHong Zhang       /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
125881d82fe4SHong Zhang       nexta = 0; nextb = 0;
125981d82fe4SHong Zhang       while (nexta<anzi && nextb<bnzj) {
12607b6d5e96SMark Adams         while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++;
126181d82fe4SHong Zhang         if (nexta == anzi) break;
12627b6d5e96SMark Adams         while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++;
126381d82fe4SHong Zhang         if (nextb == bnzj) break;
126481d82fe4SHong Zhang         if (acol[nexta] == bcol[nextb]) {
12656973769bSHong Zhang           cval[j] += aval[nexta]*bval[nextb];
126681d82fe4SHong Zhang           nexta++; nextb++;
126781d82fe4SHong Zhang           flops += 2;
12681fa1209cSHong Zhang         }
12691fa1209cSHong Zhang       }
127081d82fe4SHong Zhang     }
127181d82fe4SHong Zhang   }
127281d82fe4SHong Zhang   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
127381d82fe4SHong Zhang   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
127481d82fe4SHong Zhang   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
12755df89d91SHong Zhang   PetscFunctionReturn(0);
12765df89d91SHong Zhang }
12775df89d91SHong Zhang 
12786d373c3eSHong Zhang PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(Mat A)
12796d373c3eSHong Zhang {
12806d373c3eSHong Zhang   PetscErrorCode      ierr;
12816d373c3eSHong Zhang   Mat_SeqAIJ          *a = (Mat_SeqAIJ*)A->data;
12826d373c3eSHong Zhang   Mat_MatTransMatMult *atb = a->atb;
12836d373c3eSHong Zhang 
12846d373c3eSHong Zhang   PetscFunctionBegin;
12856d373c3eSHong Zhang   ierr = MatDestroy(&atb->At);CHKERRQ(ierr);
12866d373c3eSHong Zhang   ierr = (atb->destroy)(A);CHKERRQ(ierr);
12876d373c3eSHong Zhang   ierr = PetscFree(atb);CHKERRQ(ierr);
12886d373c3eSHong Zhang   PetscFunctionReturn(0);
12896d373c3eSHong Zhang }
12906d373c3eSHong Zhang 
12910adebc6cSBarry Smith PetscErrorCode MatTransposeMatMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
12920adebc6cSBarry Smith {
12935df89d91SHong Zhang   PetscErrorCode      ierr;
12946d373c3eSHong Zhang   const char          *algTypes[2] = {"matmatmult","outerproduct"};
12956d373c3eSHong Zhang   PetscInt            alg=0; /* set default algorithm */
12966d373c3eSHong Zhang   Mat                 At;
12976d373c3eSHong Zhang   Mat_MatTransMatMult *atb;
12986d373c3eSHong Zhang   Mat_SeqAIJ          *c;
12995df89d91SHong Zhang 
13005df89d91SHong Zhang   PetscFunctionBegin;
13015df89d91SHong Zhang   if (scall == MAT_INITIAL_MATRIX) {
1302*715a5346SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
13036d373c3eSHong Zhang     ierr = PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,2,algTypes[0],&alg,NULL);CHKERRQ(ierr);
13046d373c3eSHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
13056d373c3eSHong Zhang 
13066d373c3eSHong Zhang     switch (alg) {
13076d373c3eSHong Zhang     case 1:
130875648e8dSHong Zhang       ierr = MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
13096d373c3eSHong Zhang       break;
13106d373c3eSHong Zhang     default:
13116d373c3eSHong Zhang       ierr = PetscNew(&atb);CHKERRQ(ierr);
13126d373c3eSHong Zhang       ierr = MatTranspose_SeqAIJ(A,MAT_INITIAL_MATRIX,&At);CHKERRQ(ierr);
13136d373c3eSHong Zhang       ierr = MatMatMult_SeqAIJ_SeqAIJ(At,B,MAT_INITIAL_MATRIX,fill,C);CHKERRQ(ierr);
13146d373c3eSHong Zhang 
1315618cf492SHong Zhang       c                  = (Mat_SeqAIJ*)(*C)->data;
13166d373c3eSHong Zhang       c->atb             = atb;
13176d373c3eSHong Zhang       atb->At            = At;
13186d373c3eSHong Zhang       atb->destroy       = (*C)->ops->destroy;
13196d373c3eSHong Zhang       (*C)->ops->destroy = MatDestroy_SeqAIJ_MatTransMatMult;
13206d373c3eSHong Zhang 
13216d373c3eSHong Zhang       break;
13225df89d91SHong Zhang     }
13236d373c3eSHong Zhang   }
13246d373c3eSHong Zhang   if (alg) {
13256d373c3eSHong Zhang     ierr = (*(*C)->ops->mattransposemultnumeric)(A,B,*C);CHKERRQ(ierr);
13266d373c3eSHong Zhang   } else if (!alg && scall == MAT_REUSE_MATRIX) {
13276d373c3eSHong Zhang     c   = (Mat_SeqAIJ*)(*C)->data;
13286d373c3eSHong Zhang     atb = c->atb;
13296d373c3eSHong Zhang     At  = atb->At;
13306d373c3eSHong Zhang     ierr = MatTranspose_SeqAIJ(A,MAT_REUSE_MATRIX,&At);CHKERRQ(ierr);
13316d373c3eSHong Zhang     ierr = MatMatMult_SeqAIJ_SeqAIJ(At,B,MAT_REUSE_MATRIX,fill,C);CHKERRQ(ierr);
13326d373c3eSHong Zhang   }
13335df89d91SHong Zhang   PetscFunctionReturn(0);
13345df89d91SHong Zhang }
13355df89d91SHong Zhang 
133675648e8dSHong Zhang PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
13375df89d91SHong Zhang {
1338bc011b1eSHong Zhang   PetscErrorCode ierr;
1339bc011b1eSHong Zhang   Mat            At;
134038baddfdSBarry Smith   PetscInt       *ati,*atj;
1341bc011b1eSHong Zhang 
1342bc011b1eSHong Zhang   PetscFunctionBegin;
1343bc011b1eSHong Zhang   /* create symbolic At */
1344bc011b1eSHong Zhang   ierr = MatGetSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
13450298fd71SBarry Smith   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,A->cmap->n,A->rmap->n,ati,atj,NULL,&At);CHKERRQ(ierr);
134633d57670SJed Brown   ierr = MatSetBlockSizes(At,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr);
134704b858e0SBarry Smith   ierr = MatSetType(At,((PetscObject)A)->type_name);CHKERRQ(ierr);
1348bc011b1eSHong Zhang 
1349bc011b1eSHong Zhang   /* get symbolic C=At*B */
1350bc011b1eSHong Zhang   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(At,B,fill,C);CHKERRQ(ierr);
1351bc011b1eSHong Zhang 
1352bc011b1eSHong Zhang   /* clean up */
13536bf464f9SBarry Smith   ierr = MatDestroy(&At);CHKERRQ(ierr);
1354bc011b1eSHong Zhang   ierr = MatRestoreSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
13556d373c3eSHong Zhang 
13566d373c3eSHong Zhang   (*C)->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ;
1357bc011b1eSHong Zhang   PetscFunctionReturn(0);
1358bc011b1eSHong Zhang }
1359bc011b1eSHong Zhang 
136075648e8dSHong Zhang PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
1361bc011b1eSHong Zhang {
1362bc011b1eSHong Zhang   PetscErrorCode ierr;
13630fbc74f4SHong Zhang   Mat_SeqAIJ     *a   =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1364d0f46423SBarry Smith   PetscInt       am   =A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb;
1365d0f46423SBarry Smith   PetscInt       cm   =C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k;
1366d13dce4bSSatish Balay   PetscLogDouble flops=0.0;
1367aa1aec99SHong Zhang   MatScalar      *aa  =a->a,*ba,*ca,*caj;
1368bc011b1eSHong Zhang 
1369bc011b1eSHong Zhang   PetscFunctionBegin;
1370aa1aec99SHong Zhang   if (!c->a) {
1371854ce69bSBarry Smith     ierr = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
13722205254eSKarl Rupp 
1373aa1aec99SHong Zhang     c->a      = ca;
1374aa1aec99SHong Zhang     c->free_a = PETSC_TRUE;
1375aa1aec99SHong Zhang   } else {
1376aa1aec99SHong Zhang     ca = c->a;
1377aa1aec99SHong Zhang   }
1378bc011b1eSHong Zhang   /* clear old values in C */
1379bc011b1eSHong Zhang   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
1380bc011b1eSHong Zhang 
1381bc011b1eSHong Zhang   /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
1382bc011b1eSHong Zhang   for (i=0; i<am; i++) {
1383bc011b1eSHong Zhang     bj   = b->j + bi[i];
1384bc011b1eSHong Zhang     ba   = b->a + bi[i];
1385bc011b1eSHong Zhang     bnzi = bi[i+1] - bi[i];
1386bc011b1eSHong Zhang     anzi = ai[i+1] - ai[i];
1387bc011b1eSHong Zhang     for (j=0; j<anzi; j++) {
1388bc011b1eSHong Zhang       nextb = 0;
13890fbc74f4SHong Zhang       crow  = *aj++;
1390bc011b1eSHong Zhang       cjj   = cj + ci[crow];
1391bc011b1eSHong Zhang       caj   = ca + ci[crow];
1392bc011b1eSHong Zhang       /* perform sparse axpy operation.  Note cjj includes bj. */
1393bc011b1eSHong Zhang       for (k=0; nextb<bnzi; k++) {
13940fbc74f4SHong Zhang         if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */
13950fbc74f4SHong Zhang           caj[k] += (*aa)*(*(ba+nextb));
1396bc011b1eSHong Zhang           nextb++;
1397bc011b1eSHong Zhang         }
1398bc011b1eSHong Zhang       }
1399bc011b1eSHong Zhang       flops += 2*bnzi;
14000fbc74f4SHong Zhang       aa++;
1401bc011b1eSHong Zhang     }
1402bc011b1eSHong Zhang   }
1403bc011b1eSHong Zhang 
1404bc011b1eSHong Zhang   /* Assemble the final matrix and clean up */
1405bc011b1eSHong Zhang   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1406bc011b1eSHong Zhang   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1407bc011b1eSHong Zhang   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
1408bc011b1eSHong Zhang   PetscFunctionReturn(0);
1409bc011b1eSHong Zhang }
14109123193aSHong Zhang 
1411150d2497SBarry Smith PETSC_INTERN PetscErrorCode MatMatMult_SeqAIJ_SeqDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
14129123193aSHong Zhang {
14139123193aSHong Zhang   PetscErrorCode ierr;
14149123193aSHong Zhang 
14159123193aSHong Zhang   PetscFunctionBegin;
14169123193aSHong Zhang   if (scall == MAT_INITIAL_MATRIX) {
14173ff4c91cSHong Zhang     ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
14189123193aSHong Zhang     ierr = MatMatMultSymbolic_SeqAIJ_SeqDense(A,B,fill,C);CHKERRQ(ierr);
14193ff4c91cSHong Zhang     ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
14209123193aSHong Zhang   }
14213ff4c91cSHong Zhang   ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
14229123193aSHong Zhang   ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,B,*C);CHKERRQ(ierr);
14234614e006SHong Zhang   ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
14249123193aSHong Zhang   PetscFunctionReturn(0);
14259123193aSHong Zhang }
1426edd81eecSMatthew Knepley 
14279123193aSHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat *C)
14289123193aSHong Zhang {
14299123193aSHong Zhang   PetscErrorCode ierr;
14309123193aSHong Zhang 
14319123193aSHong Zhang   PetscFunctionBegin;
14325a586d82SBarry Smith   ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr);
14332205254eSKarl Rupp 
1434d73949e8SHong Zhang   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense;
14359123193aSHong Zhang   PetscFunctionReturn(0);
14369123193aSHong Zhang }
14379123193aSHong Zhang 
14389123193aSHong Zhang PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
14399123193aSHong Zhang {
1440f32d5d43SBarry Smith   Mat_SeqAIJ        *a=(Mat_SeqAIJ*)A->data;
1441612438f1SStefano Zampini   Mat_SeqDense      *bd = (Mat_SeqDense*)B->data;
14429123193aSHong Zhang   PetscErrorCode    ierr;
1443daf57211SHong Zhang   PetscScalar       *c,*b,r1,r2,r3,r4,*c1,*c2,*c3,*c4,aatmp;
1444daf57211SHong Zhang   const PetscScalar *aa,*b1,*b2,*b3,*b4;
1445daf57211SHong Zhang   const PetscInt    *aj;
1446612438f1SStefano Zampini   PetscInt          cm=C->rmap->n,cn=B->cmap->n,bm=bd->lda,am=A->rmap->n;
1447daf57211SHong Zhang   PetscInt          am4=4*am,bm4=4*bm,col,i,j,n,ajtmp;
14489123193aSHong Zhang 
14499123193aSHong Zhang   PetscFunctionBegin;
1450f32d5d43SBarry Smith   if (!cm || !cn) PetscFunctionReturn(0);
1451612438f1SStefano Zampini   if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n);
1452e32f2f54SBarry Smith   if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
1453e32f2f54SBarry Smith   if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
1454612438f1SStefano Zampini   b = bd->v;
14558c778c55SBarry Smith   ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr);
1456f32d5d43SBarry Smith   b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
1457730858b9SHong Zhang   c1 = c; c2 = c1 + am; c3 = c2 + am; c4 = c3 + am;
1458f32d5d43SBarry Smith   for (col=0; col<cn-4; col += 4) {  /* over columns of C */
1459f32d5d43SBarry Smith     for (i=0; i<am; i++) {        /* over rows of C in those columns */
1460f32d5d43SBarry Smith       r1 = r2 = r3 = r4 = 0.0;
1461f32d5d43SBarry Smith       n  = a->i[i+1] - a->i[i];
1462f32d5d43SBarry Smith       aj = a->j + a->i[i];
1463f32d5d43SBarry Smith       aa = a->a + a->i[i];
1464f32d5d43SBarry Smith       for (j=0; j<n; j++) {
1465730858b9SHong Zhang         aatmp = aa[j]; ajtmp = aj[j];
1466730858b9SHong Zhang         r1 += aatmp*b1[ajtmp];
1467730858b9SHong Zhang         r2 += aatmp*b2[ajtmp];
1468730858b9SHong Zhang         r3 += aatmp*b3[ajtmp];
1469730858b9SHong Zhang         r4 += aatmp*b4[ajtmp];
14709123193aSHong Zhang       }
1471730858b9SHong Zhang       c1[i] = r1;
1472730858b9SHong Zhang       c2[i] = r2;
1473730858b9SHong Zhang       c3[i] = r3;
1474730858b9SHong Zhang       c4[i] = r4;
1475f32d5d43SBarry Smith     }
1476730858b9SHong Zhang     b1 += bm4; b2 += bm4; b3 += bm4; b4 += bm4;
1477730858b9SHong Zhang     c1 += am4; c2 += am4; c3 += am4; c4 += am4;
1478f32d5d43SBarry Smith   }
1479f32d5d43SBarry Smith   for (; col<cn; col++) {   /* over extra columns of C */
1480f32d5d43SBarry Smith     for (i=0; i<am; i++) {  /* over rows of C in those columns */
1481f32d5d43SBarry Smith       r1 = 0.0;
1482f32d5d43SBarry Smith       n  = a->i[i+1] - a->i[i];
1483f32d5d43SBarry Smith       aj = a->j + a->i[i];
1484f32d5d43SBarry Smith       aa = a->a + a->i[i];
1485f32d5d43SBarry Smith       for (j=0; j<n; j++) {
1486730858b9SHong Zhang         r1 += aa[j]*b1[aj[j]];
1487f32d5d43SBarry Smith       }
1488730858b9SHong Zhang       c1[i] = r1;
1489f32d5d43SBarry Smith     }
1490f32d5d43SBarry Smith     b1 += bm;
1491730858b9SHong Zhang     c1 += am;
1492f32d5d43SBarry Smith   }
1493dc0b31edSSatish Balay   ierr = PetscLogFlops(cn*(2.0*a->nz));CHKERRQ(ierr);
14948c778c55SBarry Smith   ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr);
1495f32d5d43SBarry Smith   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1496f32d5d43SBarry Smith   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1497f32d5d43SBarry Smith   PetscFunctionReturn(0);
1498f32d5d43SBarry Smith }
1499f32d5d43SBarry Smith 
1500f32d5d43SBarry Smith /*
15014324174eSBarry Smith    Note very similar to MatMult_SeqAIJ(), should generate both codes from same base
1502f32d5d43SBarry Smith */
1503f32d5d43SBarry Smith PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1504f32d5d43SBarry Smith {
1505f32d5d43SBarry Smith   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
150690f5ea3eSStefano Zampini   Mat_SeqDense   *bd = (Mat_SeqDense*)B->data;
1507f32d5d43SBarry Smith   PetscErrorCode ierr;
1508dd6ea824SBarry Smith   PetscScalar    *b,*c,r1,r2,r3,r4,*b1,*b2,*b3,*b4;
1509dd6ea824SBarry Smith   MatScalar      *aa;
151090f5ea3eSStefano Zampini   PetscInt       cm  = C->rmap->n, cn=B->cmap->n, bm=bd->lda, col, i,j,n,*aj, am = A->rmap->n,*ii,arm;
15114324174eSBarry Smith   PetscInt       am2 = 2*am, am3 = 3*am,  bm4 = 4*bm,colam,*ridx;
1512f32d5d43SBarry Smith 
1513f32d5d43SBarry Smith   PetscFunctionBegin;
1514f32d5d43SBarry Smith   if (!cm || !cn) PetscFunctionReturn(0);
151590f5ea3eSStefano Zampini   b = bd->v;
15168c778c55SBarry Smith   ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr);
1517f32d5d43SBarry Smith   b1   = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
15184324174eSBarry Smith 
15194324174eSBarry Smith   if (a->compressedrow.use) { /* use compressed row format */
15204324174eSBarry Smith     for (col=0; col<cn-4; col += 4) {  /* over columns of C */
15214324174eSBarry Smith       colam = col*am;
15224324174eSBarry Smith       arm   = a->compressedrow.nrows;
15234324174eSBarry Smith       ii    = a->compressedrow.i;
15244324174eSBarry Smith       ridx  = a->compressedrow.rindex;
15254324174eSBarry Smith       for (i=0; i<arm; i++) {        /* over rows of C in those columns */
15264324174eSBarry Smith         r1 = r2 = r3 = r4 = 0.0;
15274324174eSBarry Smith         n  = ii[i+1] - ii[i];
15284324174eSBarry Smith         aj = a->j + ii[i];
15294324174eSBarry Smith         aa = a->a + ii[i];
15304324174eSBarry Smith         for (j=0; j<n; j++) {
15314324174eSBarry Smith           r1 += (*aa)*b1[*aj];
15324324174eSBarry Smith           r2 += (*aa)*b2[*aj];
15334324174eSBarry Smith           r3 += (*aa)*b3[*aj];
15344324174eSBarry Smith           r4 += (*aa++)*b4[*aj++];
15354324174eSBarry Smith         }
15364324174eSBarry Smith         c[colam       + ridx[i]] += r1;
15374324174eSBarry Smith         c[colam + am  + ridx[i]] += r2;
15384324174eSBarry Smith         c[colam + am2 + ridx[i]] += r3;
15394324174eSBarry Smith         c[colam + am3 + ridx[i]] += r4;
15404324174eSBarry Smith       }
15414324174eSBarry Smith       b1 += bm4;
15424324174eSBarry Smith       b2 += bm4;
15434324174eSBarry Smith       b3 += bm4;
15444324174eSBarry Smith       b4 += bm4;
15454324174eSBarry Smith     }
15464324174eSBarry Smith     for (; col<cn; col++) {     /* over extra columns of C */
15474324174eSBarry Smith       colam = col*am;
15484324174eSBarry Smith       arm   = a->compressedrow.nrows;
15494324174eSBarry Smith       ii    = a->compressedrow.i;
15504324174eSBarry Smith       ridx  = a->compressedrow.rindex;
15514324174eSBarry Smith       for (i=0; i<arm; i++) {  /* over rows of C in those columns */
15524324174eSBarry Smith         r1 = 0.0;
15534324174eSBarry Smith         n  = ii[i+1] - ii[i];
15544324174eSBarry Smith         aj = a->j + ii[i];
15554324174eSBarry Smith         aa = a->a + ii[i];
15564324174eSBarry Smith 
15574324174eSBarry Smith         for (j=0; j<n; j++) {
15584324174eSBarry Smith           r1 += (*aa++)*b1[*aj++];
15594324174eSBarry Smith         }
1560a2ea699eSBarry Smith         c[colam + ridx[i]] += r1;
15614324174eSBarry Smith       }
15624324174eSBarry Smith       b1 += bm;
15634324174eSBarry Smith     }
15644324174eSBarry Smith   } else {
1565f32d5d43SBarry Smith     for (col=0; col<cn-4; col += 4) {  /* over columns of C */
1566f32d5d43SBarry Smith       colam = col*am;
1567f32d5d43SBarry Smith       for (i=0; i<am; i++) {        /* over rows of C in those columns */
1568f32d5d43SBarry Smith         r1 = r2 = r3 = r4 = 0.0;
1569f32d5d43SBarry Smith         n  = a->i[i+1] - a->i[i];
1570f32d5d43SBarry Smith         aj = a->j + a->i[i];
1571f32d5d43SBarry Smith         aa = a->a + a->i[i];
1572f32d5d43SBarry Smith         for (j=0; j<n; j++) {
1573f32d5d43SBarry Smith           r1 += (*aa)*b1[*aj];
1574f32d5d43SBarry Smith           r2 += (*aa)*b2[*aj];
1575f32d5d43SBarry Smith           r3 += (*aa)*b3[*aj];
1576f32d5d43SBarry Smith           r4 += (*aa++)*b4[*aj++];
1577f32d5d43SBarry Smith         }
1578f32d5d43SBarry Smith         c[colam + i]       += r1;
1579f32d5d43SBarry Smith         c[colam + am + i]  += r2;
1580f32d5d43SBarry Smith         c[colam + am2 + i] += r3;
1581f32d5d43SBarry Smith         c[colam + am3 + i] += r4;
1582f32d5d43SBarry Smith       }
1583f32d5d43SBarry Smith       b1 += bm4;
1584f32d5d43SBarry Smith       b2 += bm4;
1585f32d5d43SBarry Smith       b3 += bm4;
1586f32d5d43SBarry Smith       b4 += bm4;
1587f32d5d43SBarry Smith     }
1588f32d5d43SBarry Smith     for (; col<cn; col++) {     /* over extra columns of C */
1589a2ea699eSBarry Smith       colam = col*am;
1590f32d5d43SBarry Smith       for (i=0; i<am; i++) {  /* over rows of C in those columns */
1591f32d5d43SBarry Smith         r1 = 0.0;
1592f32d5d43SBarry Smith         n  = a->i[i+1] - a->i[i];
1593f32d5d43SBarry Smith         aj = a->j + a->i[i];
1594f32d5d43SBarry Smith         aa = a->a + a->i[i];
1595f32d5d43SBarry Smith 
1596f32d5d43SBarry Smith         for (j=0; j<n; j++) {
1597f32d5d43SBarry Smith           r1 += (*aa++)*b1[*aj++];
1598f32d5d43SBarry Smith         }
1599a2ea699eSBarry Smith         c[colam + i] += r1;
1600f32d5d43SBarry Smith       }
1601f32d5d43SBarry Smith       b1 += bm;
1602f32d5d43SBarry Smith     }
16034324174eSBarry Smith   }
1604dc0b31edSSatish Balay   ierr = PetscLogFlops(cn*2.0*a->nz);CHKERRQ(ierr);
16058c778c55SBarry Smith   ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr);
16069123193aSHong Zhang   PetscFunctionReturn(0);
16079123193aSHong Zhang }
1608b1683b59SHong Zhang 
1609b9af6bddSHong Zhang PetscErrorCode  MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense)
1610c8db22f6SHong Zhang {
1611c8db22f6SHong Zhang   PetscErrorCode ierr;
16122f5f1f90SHong Zhang   Mat_SeqAIJ     *b       = (Mat_SeqAIJ*)B->data;
16132f5f1f90SHong Zhang   Mat_SeqDense   *btdense = (Mat_SeqDense*)Btdense->data;
16142f5f1f90SHong Zhang   PetscInt       *bi      = b->i,*bj=b->j;
16152f5f1f90SHong Zhang   PetscInt       m        = Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns;
16162f5f1f90SHong Zhang   MatScalar      *btval,*btval_den,*ba=b->a;
16172f5f1f90SHong Zhang   PetscInt       *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors;
1618c8db22f6SHong Zhang 
1619c8db22f6SHong Zhang   PetscFunctionBegin;
16202f5f1f90SHong Zhang   btval_den=btdense->v;
16212f5f1f90SHong Zhang   ierr     = PetscMemzero(btval_den,(m*n)*sizeof(MatScalar));CHKERRQ(ierr);
16222f5f1f90SHong Zhang   for (k=0; k<ncolors; k++) {
16232f5f1f90SHong Zhang     ncolumns = coloring->ncolumns[k];
16242f5f1f90SHong Zhang     for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */
1625d2853540SHong Zhang       col   = *(columns + colorforcol[k] + l);
16262f5f1f90SHong Zhang       btcol = bj + bi[col];
16272f5f1f90SHong Zhang       btval = ba + bi[col];
16282f5f1f90SHong Zhang       anz   = bi[col+1] - bi[col];
1629d2853540SHong Zhang       for (j=0; j<anz; j++) {
16302f5f1f90SHong Zhang         brow            = btcol[j];
16312f5f1f90SHong Zhang         btval_den[brow] = btval[j];
1632c8db22f6SHong Zhang       }
1633c8db22f6SHong Zhang     }
16342f5f1f90SHong Zhang     btval_den += m;
1635c8db22f6SHong Zhang   }
1636c8db22f6SHong Zhang   PetscFunctionReturn(0);
1637c8db22f6SHong Zhang }
1638c8db22f6SHong Zhang 
1639b9af6bddSHong Zhang PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp)
16408972f759SHong Zhang {
16418972f759SHong Zhang   PetscErrorCode ierr;
1642b2d2b04fSHong Zhang   Mat_SeqAIJ     *csp = (Mat_SeqAIJ*)Csp->data;
1643077f23c2SHong Zhang   PetscScalar    *ca_den,*ca_den_ptr,*ca=csp->a;
1644f99a636bSHong Zhang   PetscInt       k,l,m=Cden->rmap->n,ncolors=matcoloring->ncolors;
1645e88777f2SHong Zhang   PetscInt       brows=matcoloring->brows,*den2sp=matcoloring->den2sp;
1646077f23c2SHong Zhang   PetscInt       nrows,*row,*idx;
1647077f23c2SHong Zhang   PetscInt       *rows=matcoloring->rows,*colorforrow=matcoloring->colorforrow;
16488972f759SHong Zhang 
16498972f759SHong Zhang   PetscFunctionBegin;
1650a3fe58edSHong Zhang   ierr   = MatDenseGetArray(Cden,&ca_den);CHKERRQ(ierr);
1651f99a636bSHong Zhang 
1652077f23c2SHong Zhang   if (brows > 0) {
1653077f23c2SHong Zhang     PetscInt *lstart,row_end,row_start;
1654980ae229SHong Zhang     lstart = matcoloring->lstart;
1655eeb4fd02SHong Zhang     ierr = PetscMemzero(lstart,ncolors*sizeof(PetscInt));CHKERRQ(ierr);
1656eeb4fd02SHong Zhang 
1657077f23c2SHong Zhang     row_end = brows;
1658eeb4fd02SHong Zhang     if (row_end > m) row_end = m;
1659077f23c2SHong Zhang     for (row_start=0; row_start<m; row_start+=brows) { /* loop over row blocks of Csp */
1660077f23c2SHong Zhang       ca_den_ptr = ca_den;
1661980ae229SHong Zhang       for (k=0; k<ncolors; k++) { /* loop over colors (columns of Cden) */
1662eeb4fd02SHong Zhang         nrows = matcoloring->nrows[k];
1663eeb4fd02SHong Zhang         row   = rows  + colorforrow[k];
1664eeb4fd02SHong Zhang         idx   = den2sp + colorforrow[k];
1665eeb4fd02SHong Zhang         for (l=lstart[k]; l<nrows; l++) {
1666eeb4fd02SHong Zhang           if (row[l] >= row_end) {
1667eeb4fd02SHong Zhang             lstart[k] = l;
1668eeb4fd02SHong Zhang             break;
1669eeb4fd02SHong Zhang           } else {
1670077f23c2SHong Zhang             ca[idx[l]] = ca_den_ptr[row[l]];
1671eeb4fd02SHong Zhang           }
1672eeb4fd02SHong Zhang         }
1673077f23c2SHong Zhang         ca_den_ptr += m;
1674eeb4fd02SHong Zhang       }
1675077f23c2SHong Zhang       row_end += brows;
1676eeb4fd02SHong Zhang       if (row_end > m) row_end = m;
1677eeb4fd02SHong Zhang     }
1678077f23c2SHong Zhang   } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */
1679077f23c2SHong Zhang     ca_den_ptr = ca_den;
1680077f23c2SHong Zhang     for (k=0; k<ncolors; k++) {
1681077f23c2SHong Zhang       nrows = matcoloring->nrows[k];
1682077f23c2SHong Zhang       row   = rows  + colorforrow[k];
1683077f23c2SHong Zhang       idx   = den2sp + colorforrow[k];
1684077f23c2SHong Zhang       for (l=0; l<nrows; l++) {
1685077f23c2SHong Zhang         ca[idx[l]] = ca_den_ptr[row[l]];
1686077f23c2SHong Zhang       }
1687077f23c2SHong Zhang       ca_den_ptr += m;
1688077f23c2SHong Zhang     }
1689f99a636bSHong Zhang   }
1690f99a636bSHong Zhang 
1691a3fe58edSHong Zhang   ierr = MatDenseRestoreArray(Cden,&ca_den);CHKERRQ(ierr);
1692f99a636bSHong Zhang #if defined(PETSC_USE_INFO)
1693077f23c2SHong Zhang   if (matcoloring->brows > 0) {
1694f99a636bSHong Zhang     ierr = PetscInfo1(Csp,"Loop over %D row blocks for den2sp\n",brows);CHKERRQ(ierr);
1695e88777f2SHong Zhang   } else {
1696077f23c2SHong Zhang     ierr = PetscInfo(Csp,"Loop over colors/columns of Cden, inefficient for large sparse matrix product \n");CHKERRQ(ierr);
1697e88777f2SHong Zhang   }
1698f99a636bSHong Zhang #endif
16998972f759SHong Zhang   PetscFunctionReturn(0);
17008972f759SHong Zhang }
17018972f759SHong Zhang 
1702b9af6bddSHong Zhang PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c)
1703b1683b59SHong Zhang {
1704b1683b59SHong Zhang   PetscErrorCode ierr;
1705e88777f2SHong Zhang   PetscInt       i,n,nrows,Nbs,j,k,m,ncols,col,cm;
17061a83f524SJed Brown   const PetscInt *is,*ci,*cj,*row_idx;
1707b28d80bdSHong Zhang   PetscInt       nis = iscoloring->n,*rowhit,bs = 1;
1708b1683b59SHong Zhang   IS             *isa;
1709b28d80bdSHong Zhang   Mat_SeqAIJ     *csp = (Mat_SeqAIJ*)mat->data;
1710e88777f2SHong Zhang   PetscInt       *colorforrow,*rows,*rows_i,*idxhit,*spidx,*den2sp,*den2sp_i;
1711e88777f2SHong Zhang   PetscInt       *colorforcol,*columns,*columns_i,brows;
1712e88777f2SHong Zhang   PetscBool      flg;
1713b1683b59SHong Zhang 
1714b1683b59SHong Zhang   PetscFunctionBegin;
1715b1683b59SHong Zhang   ierr = ISColoringGetIS(iscoloring,PETSC_IGNORE,&isa);CHKERRQ(ierr);
1716e99be685SHong Zhang 
17177ecccc15SHong Zhang   /* bs >1 is not being tested yet! */
1718e88777f2SHong Zhang   Nbs       = mat->cmap->N/bs;
1719b1683b59SHong Zhang   c->M      = mat->rmap->N/bs;  /* set total rows, columns and local rows */
1720e88777f2SHong Zhang   c->N      = Nbs;
1721e88777f2SHong Zhang   c->m      = c->M;
1722b1683b59SHong Zhang   c->rstart = 0;
1723e88777f2SHong Zhang   c->brows  = 100;
1724b1683b59SHong Zhang 
1725b1683b59SHong Zhang   c->ncolors = nis;
1726dcca6d9dSJed Brown   ierr = PetscMalloc3(nis,&c->ncolumns,nis,&c->nrows,nis+1,&colorforrow);CHKERRQ(ierr);
1727854ce69bSBarry Smith   ierr = PetscMalloc1(csp->nz+1,&rows);CHKERRQ(ierr);
1728854ce69bSBarry Smith   ierr = PetscMalloc1(csp->nz+1,&den2sp);CHKERRQ(ierr);
1729e88777f2SHong Zhang 
1730e88777f2SHong Zhang   brows = c->brows;
1731c5929fdfSBarry Smith   ierr = PetscOptionsGetInt(NULL,NULL,"-matden2sp_brows",&brows,&flg);CHKERRQ(ierr);
1732e88777f2SHong Zhang   if (flg) c->brows = brows;
1733eeb4fd02SHong Zhang   if (brows > 0) {
1734854ce69bSBarry Smith     ierr = PetscMalloc1(nis+1,&c->lstart);CHKERRQ(ierr);
1735e88777f2SHong Zhang   }
17362205254eSKarl Rupp 
1737d2853540SHong Zhang   colorforrow[0] = 0;
1738d2853540SHong Zhang   rows_i         = rows;
1739f99a636bSHong Zhang   den2sp_i       = den2sp;
1740b1683b59SHong Zhang 
1741854ce69bSBarry Smith   ierr = PetscMalloc1(nis+1,&colorforcol);CHKERRQ(ierr);
1742854ce69bSBarry Smith   ierr = PetscMalloc1(Nbs+1,&columns);CHKERRQ(ierr);
17432205254eSKarl Rupp 
1744d2853540SHong Zhang   colorforcol[0] = 0;
1745d2853540SHong Zhang   columns_i      = columns;
1746d2853540SHong Zhang 
1747077f23c2SHong Zhang   /* get column-wise storage of mat */
1748077f23c2SHong Zhang   ierr = MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr);
1749b1683b59SHong Zhang 
1750b28d80bdSHong Zhang   cm   = c->m;
1751854ce69bSBarry Smith   ierr = PetscMalloc1(cm+1,&rowhit);CHKERRQ(ierr);
1752854ce69bSBarry Smith   ierr = PetscMalloc1(cm+1,&idxhit);CHKERRQ(ierr);
1753f99a636bSHong Zhang   for (i=0; i<nis; i++) { /* loop over color */
1754b1683b59SHong Zhang     ierr = ISGetLocalSize(isa[i],&n);CHKERRQ(ierr);
1755b1683b59SHong Zhang     ierr = ISGetIndices(isa[i],&is);CHKERRQ(ierr);
17562205254eSKarl Rupp 
1757b1683b59SHong Zhang     c->ncolumns[i] = n;
1758b1683b59SHong Zhang     if (n) {
1759d2853540SHong Zhang       ierr = PetscMemcpy(columns_i,is,n*sizeof(PetscInt));CHKERRQ(ierr);
1760b1683b59SHong Zhang     }
1761d2853540SHong Zhang     colorforcol[i+1] = colorforcol[i] + n;
1762d2853540SHong Zhang     columns_i       += n;
1763b1683b59SHong Zhang 
1764b1683b59SHong Zhang     /* fast, crude version requires O(N*N) work */
1765e8354b3eSHong Zhang     ierr = PetscMemzero(rowhit,cm*sizeof(PetscInt));CHKERRQ(ierr);
1766e99be685SHong Zhang 
1767b7caf3d6SHong Zhang     for (j=0; j<n; j++) { /* loop over columns*/
1768b1683b59SHong Zhang       col     = is[j];
1769d2853540SHong Zhang       row_idx = cj + ci[col];
1770b1683b59SHong Zhang       m       = ci[col+1] - ci[col];
1771b7caf3d6SHong Zhang       for (k=0; k<m; k++) { /* loop over columns marking them in rowhit */
1772e99be685SHong Zhang         idxhit[*row_idx]   = spidx[ci[col] + k];
1773d2853540SHong Zhang         rowhit[*row_idx++] = col + 1;
1774b1683b59SHong Zhang       }
1775b1683b59SHong Zhang     }
1776b1683b59SHong Zhang     /* count the number of hits */
1777b1683b59SHong Zhang     nrows = 0;
1778e8354b3eSHong Zhang     for (j=0; j<cm; j++) {
1779b1683b59SHong Zhang       if (rowhit[j]) nrows++;
1780b1683b59SHong Zhang     }
1781b1683b59SHong Zhang     c->nrows[i]      = nrows;
1782d2853540SHong Zhang     colorforrow[i+1] = colorforrow[i] + nrows;
1783d2853540SHong Zhang 
1784b1683b59SHong Zhang     nrows = 0;
1785b7caf3d6SHong Zhang     for (j=0; j<cm; j++) { /* loop over rows */
1786b1683b59SHong Zhang       if (rowhit[j]) {
1787d2853540SHong Zhang         rows_i[nrows]   = j;
178812b89a43SHong Zhang         den2sp_i[nrows] = idxhit[j];
1789b1683b59SHong Zhang         nrows++;
1790b1683b59SHong Zhang       }
1791b1683b59SHong Zhang     }
1792e88777f2SHong Zhang     den2sp_i += nrows;
1793077f23c2SHong Zhang 
1794b1683b59SHong Zhang     ierr    = ISRestoreIndices(isa[i],&is);CHKERRQ(ierr);
1795f99a636bSHong Zhang     rows_i += nrows;
1796b1683b59SHong Zhang   }
17970298fd71SBarry Smith   ierr = MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr);
1798b28d80bdSHong Zhang   ierr = PetscFree(rowhit);CHKERRQ(ierr);
1799b1683b59SHong Zhang   ierr = ISColoringRestoreIS(iscoloring,&isa);CHKERRQ(ierr);
1800d2853540SHong Zhang   if (csp->nz != colorforrow[nis]) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"csp->nz %d != colorforrow[nis] %d",csp->nz,colorforrow[nis]);
1801b28d80bdSHong Zhang 
1802d2853540SHong Zhang   c->colorforrow = colorforrow;
1803d2853540SHong Zhang   c->rows        = rows;
1804f99a636bSHong Zhang   c->den2sp      = den2sp;
1805d2853540SHong Zhang   c->colorforcol = colorforcol;
1806d2853540SHong Zhang   c->columns     = columns;
18072205254eSKarl Rupp 
1808f94ccd6cSHong Zhang   ierr = PetscFree(idxhit);CHKERRQ(ierr);
1809b1683b59SHong Zhang   PetscFunctionReturn(0);
1810b1683b59SHong Zhang }
1811d013fc79Sandi selinger 
181273b67375Sandi selinger /* This algorithm combines the symbolic and numeric phase of matrix-matrix multiplication. */
1813d013fc79Sandi selinger PetscErrorCode MatMatMult_SeqAIJ_SeqAIJ_Combined(Mat A,Mat B,PetscReal fill,Mat *C)
1814d013fc79Sandi selinger {
1815d013fc79Sandi selinger   PetscErrorCode     ierr;
1816d013fc79Sandi selinger   PetscLogDouble     flops=0.0;
1817d013fc79Sandi selinger   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
18182869b61bSandi selinger   const PetscInt     *ai=a->i,*bi=b->i;
1819d013fc79Sandi selinger   PetscInt           *ci,*cj,*cj_i;
1820d013fc79Sandi selinger   PetscScalar        *ca,*ca_i;
18212869b61bSandi selinger   PetscInt           b_maxmemrow,c_maxmem,a_col;
1822d013fc79Sandi selinger   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
1823d013fc79Sandi selinger   PetscInt           i,k,ndouble=0;
1824d013fc79Sandi selinger   PetscReal          afill;
1825d013fc79Sandi selinger   PetscScalar        *c_row_val_dense;
1826d013fc79Sandi selinger   PetscBool          *c_row_idx_flags;
1827d013fc79Sandi selinger   PetscInt           *aj_i=a->j;
1828d013fc79Sandi selinger   PetscScalar        *aa_i=a->a;
1829d013fc79Sandi selinger 
1830d013fc79Sandi selinger   PetscFunctionBegin;
18312869b61bSandi selinger 
18322869b61bSandi selinger   /* Step 1: Determine upper bounds on memory for C and allocate memory */
18332869b61bSandi selinger   /* This should be enough for almost all matrices. If still more memory is needed, it is reallocated later. */
18342869b61bSandi selinger   c_maxmem    = 8*(ai[am]+bi[bm]);
18352869b61bSandi selinger   b_maxmemrow = PetscMin(bi[bm],bn);
1836d013fc79Sandi selinger   ierr  = PetscMalloc1(am+1,&ci);CHKERRQ(ierr);
1837d013fc79Sandi selinger   ierr  = PetscMalloc1(bn,&c_row_val_dense);CHKERRQ(ierr);
1838d013fc79Sandi selinger   ierr  = PetscMalloc1(bn,&c_row_idx_flags);CHKERRQ(ierr);
1839d013fc79Sandi selinger   ierr  = PetscMalloc1(c_maxmem,&cj);CHKERRQ(ierr);
1840d013fc79Sandi selinger   ierr  = PetscMalloc1(c_maxmem,&ca);CHKERRQ(ierr);
1841d013fc79Sandi selinger   ca_i  = ca;
1842d013fc79Sandi selinger   cj_i  = cj;
1843d013fc79Sandi selinger   ci[0] = 0;
184473b67375Sandi selinger   ierr  = PetscMemzero(c_row_val_dense,bn*sizeof(PetscScalar));CHKERRQ(ierr);
184573b67375Sandi selinger   ierr  = PetscMemzero(c_row_idx_flags,bn*sizeof(PetscBool));CHKERRQ(ierr);
1846d013fc79Sandi selinger   for (i=0; i<am; i++) {
1847d013fc79Sandi selinger     /* Step 2: Initialize the dense row vector for C  */
1848d013fc79Sandi selinger     const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
1849d013fc79Sandi selinger     PetscInt       cnzi = 0;
1850d013fc79Sandi selinger     PetscInt       *bj_i;
1851d013fc79Sandi selinger     PetscScalar    *ba_i;
18522869b61bSandi selinger     /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem  -> allocate more memory
18532869b61bSandi selinger        Usually, there is enough memory in the first place, so this is not executed. */
18542869b61bSandi selinger     while (ci[i] + b_maxmemrow > c_maxmem) {
18552869b61bSandi selinger       c_maxmem *= 2;
18562869b61bSandi selinger       ndouble++;
1857928bb9adSStefano Zampini       ierr = PetscRealloc(sizeof(PetscInt)*c_maxmem,&cj);CHKERRQ(ierr);
1858928bb9adSStefano Zampini       ierr = PetscRealloc(sizeof(PetscScalar)*c_maxmem,&ca);CHKERRQ(ierr);
18592869b61bSandi selinger     }
1860d013fc79Sandi selinger 
1861d013fc79Sandi selinger     /* Step 3: Do the numerical calculations */
1862d013fc79Sandi selinger     for (a_col=0; a_col<anzi; a_col++) {          /* iterate over all non zero values in a row of A */
1863d013fc79Sandi selinger       PetscInt       a_col_index = aj_i[a_col];
1864d013fc79Sandi selinger       const PetscInt bnzi        = bi[a_col_index+1] - bi[a_col_index];
1865d013fc79Sandi selinger       flops += 2*bnzi;
1866d013fc79Sandi selinger       bj_i   = b->j + bi[a_col_index];   /* points to the current row in bj */
1867d013fc79Sandi selinger       ba_i   = b->a + bi[a_col_index];   /* points to the current row in ba */
1868d013fc79Sandi selinger       for (k=0; k<bnzi; ++k) { /* iterate over all non zeros of this row in B */
1869d013fc79Sandi selinger         if (c_row_idx_flags[bj_i[k]] == PETSC_FALSE) {
18702869b61bSandi selinger           cj_i[cnzi++]             = bj_i[k];
1871d013fc79Sandi selinger           c_row_idx_flags[bj_i[k]] = PETSC_TRUE;
1872d013fc79Sandi selinger         }
1873d013fc79Sandi selinger         c_row_val_dense[bj_i[k]] += aa_i[a_col] * ba_i[k];
1874d013fc79Sandi selinger       }
1875d013fc79Sandi selinger     }
1876d013fc79Sandi selinger 
1877d013fc79Sandi selinger     /* Sort array */
18783353a62bSKarl Rupp     ierr = PetscSortInt(cnzi,cj_i);CHKERRQ(ierr);
1879d013fc79Sandi selinger     /* Step 4 */
1880d013fc79Sandi selinger     for (k=0; k<cnzi; k++) {
1881d013fc79Sandi selinger       ca_i[k]                  = c_row_val_dense[cj_i[k]];
1882d013fc79Sandi selinger       c_row_val_dense[cj_i[k]] = 0.;
1883d013fc79Sandi selinger       c_row_idx_flags[cj_i[k]] = PETSC_FALSE;
1884d013fc79Sandi selinger     }
1885d013fc79Sandi selinger     /* terminate current row */
1886d013fc79Sandi selinger     aa_i   += anzi;
1887d013fc79Sandi selinger     aj_i   += anzi;
1888d013fc79Sandi selinger     ca_i   += cnzi;
1889d013fc79Sandi selinger     cj_i   += cnzi;
1890d013fc79Sandi selinger     ci[i+1] = ci[i] + cnzi;
1891d013fc79Sandi selinger     flops  += cnzi;
1892d013fc79Sandi selinger   }
1893d013fc79Sandi selinger 
1894d013fc79Sandi selinger   /* Step 5 */
1895d013fc79Sandi selinger   /* Create the new matrix */
1896d013fc79Sandi selinger   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr);
1897d013fc79Sandi selinger   ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
189802fe1965SBarry Smith   ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr);
1899d013fc79Sandi selinger 
1900d013fc79Sandi selinger   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1901d013fc79Sandi selinger   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1902d013fc79Sandi selinger   c          = (Mat_SeqAIJ*)((*C)->data);
1903d013fc79Sandi selinger   c->a       = ca;
1904d013fc79Sandi selinger   c->free_a  = PETSC_TRUE;
1905d013fc79Sandi selinger   c->free_ij = PETSC_TRUE;
1906d013fc79Sandi selinger   c->nonew   = 0;
1907d013fc79Sandi selinger 
1908d013fc79Sandi selinger   /* set MatInfo */
1909d013fc79Sandi selinger   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
1910d013fc79Sandi selinger   if (afill < 1.0) afill = 1.0;
1911d013fc79Sandi selinger   c->maxnz                     = ci[am];
1912d013fc79Sandi selinger   c->nz                        = ci[am];
1913d013fc79Sandi selinger   (*C)->info.mallocs           = ndouble;
1914d013fc79Sandi selinger   (*C)->info.fill_ratio_given  = fill;
1915d013fc79Sandi selinger   (*C)->info.fill_ratio_needed = afill;
1916d013fc79Sandi selinger 
191773b67375Sandi selinger   ierr = PetscFree(c_row_val_dense);CHKERRQ(ierr);
191873b67375Sandi selinger   ierr = PetscFree(c_row_idx_flags);CHKERRQ(ierr);
1919d013fc79Sandi selinger 
1920d013fc79Sandi selinger   ierr = MatAssemblyBegin(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1921d013fc79Sandi selinger   ierr = MatAssemblyEnd(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1922d013fc79Sandi selinger   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
1923d013fc79Sandi selinger   PetscFunctionReturn(0);
1924d013fc79Sandi selinger }
1925