1be1d678aSKris Buschelman 2d50806bdSBarry Smith /* 32499ec78SHong Zhang Defines matrix-matrix product routines for pairs of SeqAIJ matrices 4d50806bdSBarry Smith C = A * B 5d50806bdSBarry Smith */ 6d50806bdSBarry Smith 7c6db04a5SJed Brown #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 8c6db04a5SJed Brown #include <../src/mat/utils/freespace.h> 9c6db04a5SJed Brown #include <petscbt.h> 10af0996ceSBarry Smith #include <petsc/private/isimpl.h> 1107475bc1SBarry Smith #include <../src/mat/impls/dense/seq/dense.h> 127bab7c10SHong Zhang 135e5acdf2Sstefano_zampini 14150d2497SBarry Smith PETSC_INTERN PetscErrorCode MatMatMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C) 1538baddfdSBarry Smith { 16dfbe8321SBarry Smith PetscErrorCode ierr; 17*df97dc6dSFande Kong 18*df97dc6dSFande Kong PetscFunctionBegin; 19*df97dc6dSFande Kong if (scall == MAT_INITIAL_MATRIX) { 20*df97dc6dSFande Kong ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr); 21*df97dc6dSFande Kong ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr); 22*df97dc6dSFande Kong ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr); 23*df97dc6dSFande Kong } 24*df97dc6dSFande Kong 25*df97dc6dSFande Kong ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr); 26*df97dc6dSFande Kong ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ(A,B,*C);CHKERRQ(ierr); 27*df97dc6dSFande Kong ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr); 28*df97dc6dSFande Kong PetscFunctionReturn(0); 29*df97dc6dSFande Kong } 30*df97dc6dSFande Kong 31*df97dc6dSFande Kong PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C) 32*df97dc6dSFande Kong { 33*df97dc6dSFande Kong PetscErrorCode ierr; 34*df97dc6dSFande Kong 35*df97dc6dSFande Kong PetscFunctionBegin; 36*df97dc6dSFande Kong if (C->ops->matmultnumeric) { 37*df97dc6dSFande Kong ierr = (*C->ops->matmultnumeric)(A,B,C);CHKERRQ(ierr); 38*df97dc6dSFande Kong } else { 39*df97dc6dSFande Kong ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A,B,C);CHKERRQ(ierr); 40*df97dc6dSFande Kong } 41*df97dc6dSFande Kong PetscFunctionReturn(0); 42*df97dc6dSFande Kong } 43*df97dc6dSFande Kong 44*df97dc6dSFande Kong PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C) 45*df97dc6dSFande Kong { 46*df97dc6dSFande Kong PetscErrorCode ierr; 475e5acdf2Sstefano_zampini #if !defined(PETSC_HAVE_HYPRE) 48d7ed1a4dSandi selinger const char *algTypes[8] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","combined","rowmerge"}; 49d013fc79Sandi selinger PetscInt nalg = 8; 50d7ed1a4dSandi selinger #else 51d7ed1a4dSandi selinger const char *algTypes[9] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","combined","rowmerge","hypre"}; 52d7ed1a4dSandi selinger PetscInt nalg = 9; 535e5acdf2Sstefano_zampini #endif 546540a9cdSHong Zhang PetscInt alg = 0; /* set default algorithm */ 555c66b693SKris Buschelman 565c66b693SKris Buschelman PetscFunctionBegin; 57715a5346SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatMatMult","Mat");CHKERRQ(ierr); 585e5acdf2Sstefano_zampini ierr = PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,nalg,algTypes[0],&alg,NULL);CHKERRQ(ierr); 59d8bbc50fSBarry Smith ierr = PetscOptionsEnd();CHKERRQ(ierr); 606540a9cdSHong Zhang switch (alg) { 616540a9cdSHong Zhang case 1: 62aacf854cSBarry Smith ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A,B,fill,C);CHKERRQ(ierr); 636540a9cdSHong Zhang break; 646540a9cdSHong Zhang case 2: 656540a9cdSHong Zhang ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A,B,fill,C);CHKERRQ(ierr); 666540a9cdSHong Zhang break; 676540a9cdSHong Zhang case 3: 680ced3a2bSJed Brown ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A,B,fill,C);CHKERRQ(ierr); 696540a9cdSHong Zhang break; 706540a9cdSHong Zhang case 4: 718a07c6f1SJed Brown ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A,B,fill,C);CHKERRQ(ierr); 726540a9cdSHong Zhang break; 736540a9cdSHong Zhang case 5: 7458cf0668SJed Brown ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A,B,fill,C);CHKERRQ(ierr); 756540a9cdSHong Zhang break; 765e5acdf2Sstefano_zampini case 6: 77d013fc79Sandi selinger ierr = MatMatMult_SeqAIJ_SeqAIJ_Combined(A,B,fill,C);CHKERRQ(ierr); 78d013fc79Sandi selinger break; 79d013fc79Sandi selinger case 7: 80d7ed1a4dSandi selinger ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A,B,fill,C);CHKERRQ(ierr); 81d7ed1a4dSandi selinger break; 82d7ed1a4dSandi selinger #if defined(PETSC_HAVE_HYPRE) 83d7ed1a4dSandi selinger case 8: 845e5acdf2Sstefano_zampini ierr = MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A,B,fill,C);CHKERRQ(ierr); 855e5acdf2Sstefano_zampini break; 865e5acdf2Sstefano_zampini #endif 876540a9cdSHong Zhang default: 88*df97dc6dSFande Kong ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A,B,fill,C);CHKERRQ(ierr); 896540a9cdSHong Zhang break; 9025023028SHong Zhang } 915c66b693SKris Buschelman PetscFunctionReturn(0); 925c66b693SKris Buschelman } 931c24bd37SHong Zhang 94*df97dc6dSFande Kong PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A,Mat B,PetscReal fill,Mat *C) 95b561aa9dSHong Zhang { 96b561aa9dSHong Zhang PetscErrorCode ierr; 97b561aa9dSHong Zhang Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 98971236abSHong Zhang PetscInt *ai=a->i,*bi=b->i,*ci,*cj; 99c1ba5575SJed Brown PetscInt am =A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 100b561aa9dSHong Zhang PetscReal afill; 101eca6b86aSHong Zhang PetscInt i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax; 102eca6b86aSHong Zhang PetscTable ta; 103fb908031SHong Zhang PetscBT lnkbt; 1040298fd71SBarry Smith PetscFreeSpaceList free_space=NULL,current_space=NULL; 105b561aa9dSHong Zhang 106b561aa9dSHong Zhang PetscFunctionBegin; 107bd958071SHong Zhang /* Get ci and cj */ 108bd958071SHong Zhang /*---------------*/ 109fb908031SHong Zhang /* Allocate ci array, arrays for fill computation and */ 110fb908031SHong Zhang /* free space for accumulating nonzero column info */ 111854ce69bSBarry Smith ierr = PetscMalloc1(am+2,&ci);CHKERRQ(ierr); 112fb908031SHong Zhang ci[0] = 0; 113fb908031SHong Zhang 114fb908031SHong Zhang /* create and initialize a linked list */ 115c373ccc6SHong Zhang ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr); 116c373ccc6SHong Zhang MatRowMergeMax_SeqAIJ(b,bm,ta); 117eca6b86aSHong Zhang ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); 118eca6b86aSHong Zhang ierr = PetscTableDestroy(&ta);CHKERRQ(ierr); 119eca6b86aSHong Zhang 120eca6b86aSHong Zhang ierr = PetscLLCondensedCreate(Crmax,bn,&lnk,&lnkbt);CHKERRQ(ierr); 121fb908031SHong Zhang 122fb908031SHong Zhang /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 123f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr); 1242205254eSKarl Rupp 125fb908031SHong Zhang current_space = free_space; 126fb908031SHong Zhang 127fb908031SHong Zhang /* Determine ci and cj */ 128fb908031SHong Zhang for (i=0; i<am; i++) { 129fb908031SHong Zhang anzi = ai[i+1] - ai[i]; 130fb908031SHong Zhang aj = a->j + ai[i]; 131fb908031SHong Zhang for (j=0; j<anzi; j++) { 132fb908031SHong Zhang brow = aj[j]; 133fb908031SHong Zhang bnzj = bi[brow+1] - bi[brow]; 134fb908031SHong Zhang bj = b->j + bi[brow]; 135fb908031SHong Zhang /* add non-zero cols of B into the sorted linked list lnk */ 136fb908031SHong Zhang ierr = PetscLLCondensedAddSorted(bnzj,bj,lnk,lnkbt);CHKERRQ(ierr); 137fb908031SHong Zhang } 138fb908031SHong Zhang cnzi = lnk[0]; 139fb908031SHong Zhang 140fb908031SHong Zhang /* If free space is not available, make more free space */ 141fb908031SHong Zhang /* Double the amount of total space in the list */ 142fb908031SHong Zhang if (current_space->local_remaining<cnzi) { 143f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),¤t_space);CHKERRQ(ierr); 144fb908031SHong Zhang ndouble++; 145fb908031SHong Zhang } 146fb908031SHong Zhang 147fb908031SHong Zhang /* Copy data into free space, then initialize lnk */ 148fb908031SHong Zhang ierr = PetscLLCondensedClean(bn,cnzi,current_space->array,lnk,lnkbt);CHKERRQ(ierr); 1492205254eSKarl Rupp 150fb908031SHong Zhang current_space->array += cnzi; 151fb908031SHong Zhang current_space->local_used += cnzi; 152fb908031SHong Zhang current_space->local_remaining -= cnzi; 1532205254eSKarl Rupp 154fb908031SHong Zhang ci[i+1] = ci[i] + cnzi; 155fb908031SHong Zhang } 156fb908031SHong Zhang 157fb908031SHong Zhang /* Column indices are in the list of free space */ 158fb908031SHong Zhang /* Allocate space for cj, initialize cj, and */ 159fb908031SHong Zhang /* destroy list of free space and other temporary array(s) */ 160854ce69bSBarry Smith ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr); 161fb908031SHong Zhang ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr); 162fb908031SHong Zhang ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr); 163b561aa9dSHong Zhang 16426be0446SHong Zhang /* put together the new symbolic matrix */ 165ce94432eSBarry Smith ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr); 16633d57670SJed Brown ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr); 16702fe1965SBarry Smith ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr); 16858c24d83SHong Zhang 16958c24d83SHong Zhang /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 17058c24d83SHong Zhang /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 17158c24d83SHong Zhang c = (Mat_SeqAIJ*)((*C)->data); 172aa1aec99SHong Zhang c->free_a = PETSC_FALSE; 173e6b907acSBarry Smith c->free_ij = PETSC_TRUE; 17458c24d83SHong Zhang c->nonew = 0; 175*df97dc6dSFande Kong (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; /* fast, needs non-scalable O(bn) array 'abdense' */ 1760b7e3e3dSHong Zhang 1777212da7cSHong Zhang /* set MatInfo */ 1787212da7cSHong Zhang afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 179f2b054eeSHong Zhang if (afill < 1.0) afill = 1.0; 1807212da7cSHong Zhang c->maxnz = ci[am]; 1817212da7cSHong Zhang c->nz = ci[am]; 182fb908031SHong Zhang (*C)->info.mallocs = ndouble; 1837212da7cSHong Zhang (*C)->info.fill_ratio_given = fill; 1847212da7cSHong Zhang (*C)->info.fill_ratio_needed = afill; 1857212da7cSHong Zhang 1867212da7cSHong Zhang #if defined(PETSC_USE_INFO) 1877212da7cSHong Zhang if (ci[am]) { 18857622a8eSBarry Smith ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 18957622a8eSBarry Smith ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 190f2b054eeSHong Zhang } else { 191f2b054eeSHong Zhang ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr); 192be0fcf8dSHong Zhang } 193f2b054eeSHong Zhang #endif 19458c24d83SHong Zhang PetscFunctionReturn(0); 19558c24d83SHong Zhang } 196d50806bdSBarry Smith 197*df97dc6dSFande Kong PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,Mat C) 198d50806bdSBarry Smith { 199dfbe8321SBarry Smith PetscErrorCode ierr; 200d13dce4bSSatish Balay PetscLogDouble flops=0.0; 201d50806bdSBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 202d50806bdSBarry Smith Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 203d50806bdSBarry Smith Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 20438baddfdSBarry Smith PetscInt *ai =a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j; 205c58ca2e3SHong Zhang PetscInt am =A->rmap->n,cm=C->rmap->n; 206519eb980SHong Zhang PetscInt i,j,k,anzi,bnzi,cnzi,brow; 207aa1aec99SHong Zhang PetscScalar *aa=a->a,*ba=b->a,*baj,*ca,valtmp; 208aa1aec99SHong Zhang PetscScalar *ab_dense; 209d50806bdSBarry Smith 210d50806bdSBarry Smith PetscFunctionBegin; 211aa1aec99SHong Zhang if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */ 212854ce69bSBarry Smith ierr = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr); 213aa1aec99SHong Zhang c->a = ca; 214aa1aec99SHong Zhang c->free_a = PETSC_TRUE; 215aa1aec99SHong Zhang } else { 216aa1aec99SHong Zhang ca = c->a; 217d90d86d0SMatthew G. Knepley } 218d90d86d0SMatthew G. Knepley if (!c->matmult_abdense) { 2191795a4d1SJed Brown ierr = PetscCalloc1(B->cmap->N,&ab_dense);CHKERRQ(ierr); 220d90d86d0SMatthew G. Knepley c->matmult_abdense = ab_dense; 221d90d86d0SMatthew G. Knepley } else { 222aa1aec99SHong Zhang ab_dense = c->matmult_abdense; 223aa1aec99SHong Zhang } 224aa1aec99SHong Zhang 225c124e916SHong Zhang /* clean old values in C */ 226c124e916SHong Zhang ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr); 227d50806bdSBarry Smith /* Traverse A row-wise. */ 228d50806bdSBarry Smith /* Build the ith row in C by summing over nonzero columns in A, */ 229d50806bdSBarry Smith /* the rows of B corresponding to nonzeros of A. */ 230d50806bdSBarry Smith for (i=0; i<am; i++) { 231d50806bdSBarry Smith anzi = ai[i+1] - ai[i]; 232d50806bdSBarry Smith for (j=0; j<anzi; j++) { 233519eb980SHong Zhang brow = aj[j]; 234d50806bdSBarry Smith bnzi = bi[brow+1] - bi[brow]; 235d50806bdSBarry Smith bjj = bj + bi[brow]; 236d50806bdSBarry Smith baj = ba + bi[brow]; 237519eb980SHong Zhang /* perform dense axpy */ 23836ec6d2dSHong Zhang valtmp = aa[j]; 239519eb980SHong Zhang for (k=0; k<bnzi; k++) { 24036ec6d2dSHong Zhang ab_dense[bjj[k]] += valtmp*baj[k]; 241519eb980SHong Zhang } 242519eb980SHong Zhang flops += 2*bnzi; 243519eb980SHong Zhang } 244c58ca2e3SHong Zhang aj += anzi; aa += anzi; 245c58ca2e3SHong Zhang 246c58ca2e3SHong Zhang cnzi = ci[i+1] - ci[i]; 247519eb980SHong Zhang for (k=0; k<cnzi; k++) { 248519eb980SHong Zhang ca[k] += ab_dense[cj[k]]; 249519eb980SHong Zhang ab_dense[cj[k]] = 0.0; /* zero ab_dense */ 250519eb980SHong Zhang } 251519eb980SHong Zhang flops += cnzi; 252519eb980SHong Zhang cj += cnzi; ca += cnzi; 253519eb980SHong Zhang } 254c58ca2e3SHong Zhang ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 255c58ca2e3SHong Zhang ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 256c58ca2e3SHong Zhang ierr = PetscLogFlops(flops);CHKERRQ(ierr); 257c58ca2e3SHong Zhang PetscFunctionReturn(0); 258c58ca2e3SHong Zhang } 259c58ca2e3SHong Zhang 26025023028SHong Zhang PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C) 261c58ca2e3SHong Zhang { 262c58ca2e3SHong Zhang PetscErrorCode ierr; 263c58ca2e3SHong Zhang PetscLogDouble flops=0.0; 264c58ca2e3SHong Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 265c58ca2e3SHong Zhang Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 266c58ca2e3SHong Zhang Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 267c58ca2e3SHong Zhang PetscInt *ai = a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j; 268c58ca2e3SHong Zhang PetscInt am = A->rmap->N,cm=C->rmap->N; 269c58ca2e3SHong Zhang PetscInt i,j,k,anzi,bnzi,cnzi,brow; 27036ec6d2dSHong Zhang PetscScalar *aa=a->a,*ba=b->a,*baj,*ca=c->a,valtmp; 271c58ca2e3SHong Zhang PetscInt nextb; 272c58ca2e3SHong Zhang 273c58ca2e3SHong Zhang PetscFunctionBegin; 274cf742fccSHong Zhang if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */ 275cf742fccSHong Zhang ierr = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr); 276cf742fccSHong Zhang c->a = ca; 277cf742fccSHong Zhang c->free_a = PETSC_TRUE; 278cf742fccSHong Zhang } 279cf742fccSHong Zhang 280c58ca2e3SHong Zhang /* clean old values in C */ 281c58ca2e3SHong Zhang ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr); 282c58ca2e3SHong Zhang /* Traverse A row-wise. */ 283c58ca2e3SHong Zhang /* Build the ith row in C by summing over nonzero columns in A, */ 284c58ca2e3SHong Zhang /* the rows of B corresponding to nonzeros of A. */ 285519eb980SHong Zhang for (i=0; i<am; i++) { 286519eb980SHong Zhang anzi = ai[i+1] - ai[i]; 287519eb980SHong Zhang cnzi = ci[i+1] - ci[i]; 288519eb980SHong Zhang for (j=0; j<anzi; j++) { 289519eb980SHong Zhang brow = aj[j]; 290519eb980SHong Zhang bnzi = bi[brow+1] - bi[brow]; 291519eb980SHong Zhang bjj = bj + bi[brow]; 292519eb980SHong Zhang baj = ba + bi[brow]; 293519eb980SHong Zhang /* perform sparse axpy */ 29436ec6d2dSHong Zhang valtmp = aa[j]; 295c124e916SHong Zhang nextb = 0; 296c124e916SHong Zhang for (k=0; nextb<bnzi; k++) { 297c124e916SHong Zhang if (cj[k] == bjj[nextb]) { /* ccol == bcol */ 29836ec6d2dSHong Zhang ca[k] += valtmp*baj[nextb++]; 299c124e916SHong Zhang } 300d50806bdSBarry Smith } 301d50806bdSBarry Smith flops += 2*bnzi; 302d50806bdSBarry Smith } 303519eb980SHong Zhang aj += anzi; aa += anzi; 304519eb980SHong Zhang cj += cnzi; ca += cnzi; 305d50806bdSBarry Smith } 306c58ca2e3SHong Zhang 307716bacf3SKris Buschelman ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 308716bacf3SKris Buschelman ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 309d50806bdSBarry Smith ierr = PetscLogFlops(flops);CHKERRQ(ierr); 310d50806bdSBarry Smith PetscFunctionReturn(0); 311d50806bdSBarry Smith } 312bc011b1eSHong Zhang 3133c50cad2SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A,Mat B,PetscReal fill,Mat *C) 31425296bd5SBarry Smith { 31525296bd5SBarry Smith PetscErrorCode ierr; 31625296bd5SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 31725296bd5SBarry Smith PetscInt *ai = a->i,*bi=b->i,*ci,*cj; 3183c50cad2SHong Zhang PetscInt am = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 31925296bd5SBarry Smith MatScalar *ca; 32025296bd5SBarry Smith PetscReal afill; 321eca6b86aSHong Zhang PetscInt i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax; 322eca6b86aSHong Zhang PetscTable ta; 3230298fd71SBarry Smith PetscFreeSpaceList free_space=NULL,current_space=NULL; 32425296bd5SBarry Smith 32525296bd5SBarry Smith PetscFunctionBegin; 3263c50cad2SHong Zhang /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */ 3273c50cad2SHong Zhang /*-----------------------------------------------------------------------------------------*/ 3283c50cad2SHong Zhang /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 329854ce69bSBarry Smith ierr = PetscMalloc1(am+2,&ci);CHKERRQ(ierr); 3303c50cad2SHong Zhang ci[0] = 0; 3313c50cad2SHong Zhang 3323c50cad2SHong Zhang /* create and initialize a linked list */ 333c373ccc6SHong Zhang ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr); 334c373ccc6SHong Zhang MatRowMergeMax_SeqAIJ(b,bm,ta); 335eca6b86aSHong Zhang ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); 336eca6b86aSHong Zhang ierr = PetscTableDestroy(&ta);CHKERRQ(ierr); 337eca6b86aSHong Zhang 338eca6b86aSHong Zhang ierr = PetscLLCondensedCreate_fast(Crmax,&lnk);CHKERRQ(ierr); 3393c50cad2SHong Zhang 3403c50cad2SHong Zhang /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 341f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr); 3423c50cad2SHong Zhang current_space = free_space; 3433c50cad2SHong Zhang 3443c50cad2SHong Zhang /* Determine ci and cj */ 3453c50cad2SHong Zhang for (i=0; i<am; i++) { 3463c50cad2SHong Zhang anzi = ai[i+1] - ai[i]; 3473c50cad2SHong Zhang aj = a->j + ai[i]; 3483c50cad2SHong Zhang for (j=0; j<anzi; j++) { 3493c50cad2SHong Zhang brow = aj[j]; 3503c50cad2SHong Zhang bnzj = bi[brow+1] - bi[brow]; 3513c50cad2SHong Zhang bj = b->j + bi[brow]; 3523c50cad2SHong Zhang /* add non-zero cols of B into the sorted linked list lnk */ 3533c50cad2SHong Zhang ierr = PetscLLCondensedAddSorted_fast(bnzj,bj,lnk);CHKERRQ(ierr); 3543c50cad2SHong Zhang } 3553c50cad2SHong Zhang cnzi = lnk[1]; 3563c50cad2SHong Zhang 3573c50cad2SHong Zhang /* If free space is not available, make more free space */ 3583c50cad2SHong Zhang /* Double the amount of total space in the list */ 3593c50cad2SHong Zhang if (current_space->local_remaining<cnzi) { 360f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),¤t_space);CHKERRQ(ierr); 3613c50cad2SHong Zhang ndouble++; 3623c50cad2SHong Zhang } 3633c50cad2SHong Zhang 3643c50cad2SHong Zhang /* Copy data into free space, then initialize lnk */ 3653c50cad2SHong Zhang ierr = PetscLLCondensedClean_fast(cnzi,current_space->array,lnk);CHKERRQ(ierr); 3662205254eSKarl Rupp 3673c50cad2SHong Zhang current_space->array += cnzi; 3683c50cad2SHong Zhang current_space->local_used += cnzi; 3693c50cad2SHong Zhang current_space->local_remaining -= cnzi; 3702205254eSKarl Rupp 3713c50cad2SHong Zhang ci[i+1] = ci[i] + cnzi; 3723c50cad2SHong Zhang } 3733c50cad2SHong Zhang 3743c50cad2SHong Zhang /* Column indices are in the list of free space */ 3753c50cad2SHong Zhang /* Allocate space for cj, initialize cj, and */ 3763c50cad2SHong Zhang /* destroy list of free space and other temporary array(s) */ 377854ce69bSBarry Smith ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr); 3783c50cad2SHong Zhang ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr); 3793c50cad2SHong Zhang ierr = PetscLLCondensedDestroy_fast(lnk);CHKERRQ(ierr); 38025296bd5SBarry Smith 38125296bd5SBarry Smith /* Allocate space for ca */ 382854ce69bSBarry Smith ierr = PetscMalloc1(ci[am]+1,&ca);CHKERRQ(ierr); 38325296bd5SBarry Smith ierr = PetscMemzero(ca,(ci[am]+1)*sizeof(MatScalar));CHKERRQ(ierr); 38425296bd5SBarry Smith 38525296bd5SBarry Smith /* put together the new symbolic matrix */ 386ce94432eSBarry Smith ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,C);CHKERRQ(ierr); 38733d57670SJed Brown ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr); 38802fe1965SBarry Smith ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr); 38925296bd5SBarry Smith 39025296bd5SBarry Smith /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 39125296bd5SBarry Smith /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 39225296bd5SBarry Smith c = (Mat_SeqAIJ*)((*C)->data); 39325296bd5SBarry Smith c->free_a = PETSC_TRUE; 39425296bd5SBarry Smith c->free_ij = PETSC_TRUE; 39525296bd5SBarry Smith c->nonew = 0; 3962205254eSKarl Rupp 39725296bd5SBarry Smith (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; /* slower, less memory */ 39825296bd5SBarry Smith 39925296bd5SBarry Smith /* set MatInfo */ 40025296bd5SBarry Smith afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 40125296bd5SBarry Smith if (afill < 1.0) afill = 1.0; 40225296bd5SBarry Smith c->maxnz = ci[am]; 40325296bd5SBarry Smith c->nz = ci[am]; 4043c50cad2SHong Zhang (*C)->info.mallocs = ndouble; 40525296bd5SBarry Smith (*C)->info.fill_ratio_given = fill; 40625296bd5SBarry Smith (*C)->info.fill_ratio_needed = afill; 40725296bd5SBarry Smith 40825296bd5SBarry Smith #if defined(PETSC_USE_INFO) 40925296bd5SBarry Smith if (ci[am]) { 41057622a8eSBarry Smith ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 41157622a8eSBarry Smith ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 41225296bd5SBarry Smith } else { 41325296bd5SBarry Smith ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr); 41425296bd5SBarry Smith } 41525296bd5SBarry Smith #endif 41625296bd5SBarry Smith PetscFunctionReturn(0); 41725296bd5SBarry Smith } 41825296bd5SBarry Smith 41925296bd5SBarry Smith 42025023028SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat *C) 421e9e4536cSHong Zhang { 422e9e4536cSHong Zhang PetscErrorCode ierr; 423e9e4536cSHong Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 424bf9555e6SHong Zhang PetscInt *ai = a->i,*bi=b->i,*ci,*cj; 42525c41797SHong Zhang PetscInt am = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 426e9e4536cSHong Zhang MatScalar *ca; 427e9e4536cSHong Zhang PetscReal afill; 428eca6b86aSHong Zhang PetscInt i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax; 429eca6b86aSHong Zhang PetscTable ta; 4300298fd71SBarry Smith PetscFreeSpaceList free_space=NULL,current_space=NULL; 431e9e4536cSHong Zhang 432e9e4536cSHong Zhang PetscFunctionBegin; 433bd958071SHong Zhang /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */ 434bd958071SHong Zhang /*---------------------------------------------------------------------------------------------*/ 435bd958071SHong Zhang /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 436854ce69bSBarry Smith ierr = PetscMalloc1(am+2,&ci);CHKERRQ(ierr); 437bd958071SHong Zhang ci[0] = 0; 438bd958071SHong Zhang 439bd958071SHong Zhang /* create and initialize a linked list */ 440c373ccc6SHong Zhang ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr); 441c373ccc6SHong Zhang MatRowMergeMax_SeqAIJ(b,bm,ta); 442eca6b86aSHong Zhang ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); 443eca6b86aSHong Zhang ierr = PetscTableDestroy(&ta);CHKERRQ(ierr); 444eca6b86aSHong Zhang ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr); 445bd958071SHong Zhang 446bd958071SHong Zhang /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 447f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr); 448bd958071SHong Zhang current_space = free_space; 449bd958071SHong Zhang 450bd958071SHong Zhang /* Determine ci and cj */ 451bd958071SHong Zhang for (i=0; i<am; i++) { 452bd958071SHong Zhang anzi = ai[i+1] - ai[i]; 453bd958071SHong Zhang aj = a->j + ai[i]; 454bd958071SHong Zhang for (j=0; j<anzi; j++) { 455bd958071SHong Zhang brow = aj[j]; 456bd958071SHong Zhang bnzj = bi[brow+1] - bi[brow]; 457bd958071SHong Zhang bj = b->j + bi[brow]; 458bd958071SHong Zhang /* add non-zero cols of B into the sorted linked list lnk */ 459bd958071SHong Zhang ierr = PetscLLCondensedAddSorted_Scalable(bnzj,bj,lnk);CHKERRQ(ierr); 460bd958071SHong Zhang } 461bd958071SHong Zhang cnzi = lnk[0]; 462bd958071SHong Zhang 463bd958071SHong Zhang /* If free space is not available, make more free space */ 464bd958071SHong Zhang /* Double the amount of total space in the list */ 465bd958071SHong Zhang if (current_space->local_remaining<cnzi) { 466f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),¤t_space);CHKERRQ(ierr); 467bd958071SHong Zhang ndouble++; 468bd958071SHong Zhang } 469bd958071SHong Zhang 470bd958071SHong Zhang /* Copy data into free space, then initialize lnk */ 471bd958071SHong Zhang ierr = PetscLLCondensedClean_Scalable(cnzi,current_space->array,lnk);CHKERRQ(ierr); 4722205254eSKarl Rupp 473bd958071SHong Zhang current_space->array += cnzi; 474bd958071SHong Zhang current_space->local_used += cnzi; 475bd958071SHong Zhang current_space->local_remaining -= cnzi; 4762205254eSKarl Rupp 477bd958071SHong Zhang ci[i+1] = ci[i] + cnzi; 478bd958071SHong Zhang } 479bd958071SHong Zhang 480bd958071SHong Zhang /* Column indices are in the list of free space */ 481bd958071SHong Zhang /* Allocate space for cj, initialize cj, and */ 482bd958071SHong Zhang /* destroy list of free space and other temporary array(s) */ 483854ce69bSBarry Smith ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr); 484bd958071SHong Zhang ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr); 485bd958071SHong Zhang ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr); 486e9e4536cSHong Zhang 487e9e4536cSHong Zhang /* Allocate space for ca */ 488bd958071SHong Zhang /*-----------------------*/ 489854ce69bSBarry Smith ierr = PetscMalloc1(ci[am]+1,&ca);CHKERRQ(ierr); 490e9e4536cSHong Zhang ierr = PetscMemzero(ca,(ci[am]+1)*sizeof(MatScalar));CHKERRQ(ierr); 491e9e4536cSHong Zhang 492e9e4536cSHong Zhang /* put together the new symbolic matrix */ 493ce94432eSBarry Smith ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,C);CHKERRQ(ierr); 49433d57670SJed Brown ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr); 49502fe1965SBarry Smith ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr); 496e9e4536cSHong Zhang 497e9e4536cSHong Zhang /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 498e9e4536cSHong Zhang /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 499e9e4536cSHong Zhang c = (Mat_SeqAIJ*)((*C)->data); 500e9e4536cSHong Zhang c->free_a = PETSC_TRUE; 501e9e4536cSHong Zhang c->free_ij = PETSC_TRUE; 502e9e4536cSHong Zhang c->nonew = 0; 5032205254eSKarl Rupp 50425023028SHong Zhang (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; /* slower, less memory */ 505e9e4536cSHong Zhang 506e9e4536cSHong Zhang /* set MatInfo */ 507e9e4536cSHong Zhang afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 508e9e4536cSHong Zhang if (afill < 1.0) afill = 1.0; 509e9e4536cSHong Zhang c->maxnz = ci[am]; 510e9e4536cSHong Zhang c->nz = ci[am]; 511bd958071SHong Zhang (*C)->info.mallocs = ndouble; 512e9e4536cSHong Zhang (*C)->info.fill_ratio_given = fill; 513e9e4536cSHong Zhang (*C)->info.fill_ratio_needed = afill; 514e9e4536cSHong Zhang 515e9e4536cSHong Zhang #if defined(PETSC_USE_INFO) 516e9e4536cSHong Zhang if (ci[am]) { 51757622a8eSBarry Smith ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 51857622a8eSBarry Smith ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 519e9e4536cSHong Zhang } else { 520e9e4536cSHong Zhang ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr); 521e9e4536cSHong Zhang } 522e9e4536cSHong Zhang #endif 523e9e4536cSHong Zhang PetscFunctionReturn(0); 524e9e4536cSHong Zhang } 525e9e4536cSHong Zhang 5260ced3a2bSJed Brown PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A,Mat B,PetscReal fill,Mat *C) 5270ced3a2bSJed Brown { 5280ced3a2bSJed Brown PetscErrorCode ierr; 5290ced3a2bSJed Brown Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 5300ced3a2bSJed Brown const PetscInt *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j; 5310ced3a2bSJed Brown PetscInt *ci,*cj,*bb; 5320ced3a2bSJed Brown PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 5330ced3a2bSJed Brown PetscReal afill; 5340ced3a2bSJed Brown PetscInt i,j,col,ndouble = 0; 5350298fd71SBarry Smith PetscFreeSpaceList free_space=NULL,current_space=NULL; 5360ced3a2bSJed Brown PetscHeap h; 5370ced3a2bSJed Brown 5380ced3a2bSJed Brown PetscFunctionBegin; 539cd093f1dSJed Brown /* Get ci and cj - by merging sorted rows using a heap */ 5400ced3a2bSJed Brown /*---------------------------------------------------------------------------------------------*/ 5410ced3a2bSJed Brown /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 542854ce69bSBarry Smith ierr = PetscMalloc1(am+2,&ci);CHKERRQ(ierr); 5430ced3a2bSJed Brown ci[0] = 0; 5440ced3a2bSJed Brown 5450ced3a2bSJed Brown /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 546f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr); 5470ced3a2bSJed Brown current_space = free_space; 5480ced3a2bSJed Brown 5490ced3a2bSJed Brown ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr); 550785e854fSJed Brown ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr); 5510ced3a2bSJed Brown 5520ced3a2bSJed Brown /* Determine ci and cj */ 5530ced3a2bSJed Brown for (i=0; i<am; i++) { 5540ced3a2bSJed Brown const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 5550ced3a2bSJed Brown const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 5560ced3a2bSJed Brown ci[i+1] = ci[i]; 5570ced3a2bSJed Brown /* Populate the min heap */ 5580ced3a2bSJed Brown for (j=0; j<anzi; j++) { 5590ced3a2bSJed Brown bb[j] = bi[acol[j]]; /* bb points at the start of the row */ 5600ced3a2bSJed Brown if (bb[j] < bi[acol[j]+1]) { /* Add if row is nonempty */ 5610ced3a2bSJed Brown ierr = PetscHeapAdd(h,j,bj[bb[j]++]);CHKERRQ(ierr); 5620ced3a2bSJed Brown } 5630ced3a2bSJed Brown } 5640ced3a2bSJed Brown /* Pick off the min element, adding it to free space */ 5650ced3a2bSJed Brown ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr); 5660ced3a2bSJed Brown while (j >= 0) { 5670ced3a2bSJed Brown if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */ 568f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),¤t_space);CHKERRQ(ierr); 5690ced3a2bSJed Brown ndouble++; 5700ced3a2bSJed Brown } 5710ced3a2bSJed Brown *(current_space->array++) = col; 5720ced3a2bSJed Brown current_space->local_used++; 5730ced3a2bSJed Brown current_space->local_remaining--; 5740ced3a2bSJed Brown ci[i+1]++; 5750ced3a2bSJed Brown 5760ced3a2bSJed Brown /* stash if anything else remains in this row of B */ 5770ced3a2bSJed Brown if (bb[j] < bi[acol[j]+1]) {ierr = PetscHeapStash(h,j,bj[bb[j]++]);CHKERRQ(ierr);} 5780ced3a2bSJed Brown while (1) { /* pop and stash any other rows of B that also had an entry in this column */ 5790ced3a2bSJed Brown PetscInt j2,col2; 5800ced3a2bSJed Brown ierr = PetscHeapPeek(h,&j2,&col2);CHKERRQ(ierr); 5810ced3a2bSJed Brown if (col2 != col) break; 5820ced3a2bSJed Brown ierr = PetscHeapPop(h,&j2,&col2);CHKERRQ(ierr); 5830ced3a2bSJed Brown if (bb[j2] < bi[acol[j2]+1]) {ierr = PetscHeapStash(h,j2,bj[bb[j2]++]);CHKERRQ(ierr);} 5840ced3a2bSJed Brown } 5850ced3a2bSJed Brown /* Put any stashed elements back into the min heap */ 5860ced3a2bSJed Brown ierr = PetscHeapUnstash(h);CHKERRQ(ierr); 5870ced3a2bSJed Brown ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr); 5880ced3a2bSJed Brown } 5890ced3a2bSJed Brown } 5900ced3a2bSJed Brown ierr = PetscFree(bb);CHKERRQ(ierr); 5910ced3a2bSJed Brown ierr = PetscHeapDestroy(&h);CHKERRQ(ierr); 5920ced3a2bSJed Brown 5930ced3a2bSJed Brown /* Column indices are in the list of free space */ 5940ced3a2bSJed Brown /* Allocate space for cj, initialize cj, and */ 5950ced3a2bSJed Brown /* destroy list of free space and other temporary array(s) */ 596785e854fSJed Brown ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr); 5970ced3a2bSJed Brown ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr); 5980ced3a2bSJed Brown 5990ced3a2bSJed Brown /* put together the new symbolic matrix */ 600ce94432eSBarry Smith ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr); 60133d57670SJed Brown ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr); 60202fe1965SBarry Smith ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr); 6030ced3a2bSJed Brown 6040ced3a2bSJed Brown /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 6050ced3a2bSJed Brown /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 6060ced3a2bSJed Brown c = (Mat_SeqAIJ*)((*C)->data); 6070ced3a2bSJed Brown c->free_a = PETSC_TRUE; 6080ced3a2bSJed Brown c->free_ij = PETSC_TRUE; 6090ced3a2bSJed Brown c->nonew = 0; 61026fbe8dcSKarl Rupp 611*df97dc6dSFande Kong (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 6120ced3a2bSJed Brown 6130ced3a2bSJed Brown /* set MatInfo */ 6140ced3a2bSJed Brown afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 6150ced3a2bSJed Brown if (afill < 1.0) afill = 1.0; 6160ced3a2bSJed Brown c->maxnz = ci[am]; 6170ced3a2bSJed Brown c->nz = ci[am]; 6180ced3a2bSJed Brown (*C)->info.mallocs = ndouble; 6190ced3a2bSJed Brown (*C)->info.fill_ratio_given = fill; 6200ced3a2bSJed Brown (*C)->info.fill_ratio_needed = afill; 6210ced3a2bSJed Brown 6220ced3a2bSJed Brown #if defined(PETSC_USE_INFO) 6230ced3a2bSJed Brown if (ci[am]) { 62457622a8eSBarry Smith ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 62557622a8eSBarry Smith ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 6260ced3a2bSJed Brown } else { 6270ced3a2bSJed Brown ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr); 6280ced3a2bSJed Brown } 6290ced3a2bSJed Brown #endif 6300ced3a2bSJed Brown PetscFunctionReturn(0); 6310ced3a2bSJed Brown } 632e9e4536cSHong Zhang 6338a07c6f1SJed Brown PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A,Mat B,PetscReal fill,Mat *C) 6348a07c6f1SJed Brown { 6358a07c6f1SJed Brown PetscErrorCode ierr; 6368a07c6f1SJed Brown Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 6378a07c6f1SJed Brown const PetscInt *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j; 6388a07c6f1SJed Brown PetscInt *ci,*cj,*bb; 6398a07c6f1SJed Brown PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 6408a07c6f1SJed Brown PetscReal afill; 6418a07c6f1SJed Brown PetscInt i,j,col,ndouble = 0; 6420298fd71SBarry Smith PetscFreeSpaceList free_space=NULL,current_space=NULL; 6438a07c6f1SJed Brown PetscHeap h; 6448a07c6f1SJed Brown PetscBT bt; 6458a07c6f1SJed Brown 6468a07c6f1SJed Brown PetscFunctionBegin; 647cd093f1dSJed Brown /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */ 6488a07c6f1SJed Brown /*---------------------------------------------------------------------------------------------*/ 6498a07c6f1SJed Brown /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 650854ce69bSBarry Smith ierr = PetscMalloc1(am+2,&ci);CHKERRQ(ierr); 6518a07c6f1SJed Brown ci[0] = 0; 6528a07c6f1SJed Brown 6538a07c6f1SJed Brown /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 654f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr); 6552205254eSKarl Rupp 6568a07c6f1SJed Brown current_space = free_space; 6578a07c6f1SJed Brown 6588a07c6f1SJed Brown ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr); 659785e854fSJed Brown ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr); 6608a07c6f1SJed Brown ierr = PetscBTCreate(bn,&bt);CHKERRQ(ierr); 6618a07c6f1SJed Brown 6628a07c6f1SJed Brown /* Determine ci and cj */ 6638a07c6f1SJed Brown for (i=0; i<am; i++) { 6648a07c6f1SJed Brown const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 6658a07c6f1SJed Brown const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 6668a07c6f1SJed Brown const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */ 6678a07c6f1SJed Brown ci[i+1] = ci[i]; 6688a07c6f1SJed Brown /* Populate the min heap */ 6698a07c6f1SJed Brown for (j=0; j<anzi; j++) { 6708a07c6f1SJed Brown PetscInt brow = acol[j]; 6718a07c6f1SJed Brown for (bb[j] = bi[brow]; bb[j] < bi[brow+1]; bb[j]++) { 6728a07c6f1SJed Brown PetscInt bcol = bj[bb[j]]; 6738a07c6f1SJed Brown if (!PetscBTLookupSet(bt,bcol)) { /* new entry */ 6748a07c6f1SJed Brown ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr); 6758a07c6f1SJed Brown bb[j]++; 6768a07c6f1SJed Brown break; 6778a07c6f1SJed Brown } 6788a07c6f1SJed Brown } 6798a07c6f1SJed Brown } 6808a07c6f1SJed Brown /* Pick off the min element, adding it to free space */ 6818a07c6f1SJed Brown ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr); 6828a07c6f1SJed Brown while (j >= 0) { 6838a07c6f1SJed Brown if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */ 6840298fd71SBarry Smith fptr = NULL; /* need PetscBTMemzero */ 685f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),¤t_space);CHKERRQ(ierr); 6868a07c6f1SJed Brown ndouble++; 6878a07c6f1SJed Brown } 6888a07c6f1SJed Brown *(current_space->array++) = col; 6898a07c6f1SJed Brown current_space->local_used++; 6908a07c6f1SJed Brown current_space->local_remaining--; 6918a07c6f1SJed Brown ci[i+1]++; 6928a07c6f1SJed Brown 6938a07c6f1SJed Brown /* stash if anything else remains in this row of B */ 6948a07c6f1SJed Brown for (; bb[j] < bi[acol[j]+1]; bb[j]++) { 6958a07c6f1SJed Brown PetscInt bcol = bj[bb[j]]; 6968a07c6f1SJed Brown if (!PetscBTLookupSet(bt,bcol)) { /* new entry */ 6978a07c6f1SJed Brown ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr); 6988a07c6f1SJed Brown bb[j]++; 6998a07c6f1SJed Brown break; 7008a07c6f1SJed Brown } 7018a07c6f1SJed Brown } 7028a07c6f1SJed Brown ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr); 7038a07c6f1SJed Brown } 7048a07c6f1SJed Brown if (fptr) { /* Clear the bits for this row */ 7058a07c6f1SJed Brown for (; fptr<current_space->array; fptr++) {ierr = PetscBTClear(bt,*fptr);CHKERRQ(ierr);} 7068a07c6f1SJed Brown } else { /* We reallocated so we don't remember (easily) how to clear only the bits we changed */ 7078a07c6f1SJed Brown ierr = PetscBTMemzero(bn,bt);CHKERRQ(ierr); 7088a07c6f1SJed Brown } 7098a07c6f1SJed Brown } 7108a07c6f1SJed Brown ierr = PetscFree(bb);CHKERRQ(ierr); 7118a07c6f1SJed Brown ierr = PetscHeapDestroy(&h);CHKERRQ(ierr); 7128a07c6f1SJed Brown ierr = PetscBTDestroy(&bt);CHKERRQ(ierr); 7138a07c6f1SJed Brown 7148a07c6f1SJed Brown /* Column indices are in the list of free space */ 7158a07c6f1SJed Brown /* Allocate space for cj, initialize cj, and */ 7168a07c6f1SJed Brown /* destroy list of free space and other temporary array(s) */ 717785e854fSJed Brown ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr); 7188a07c6f1SJed Brown ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr); 7198a07c6f1SJed Brown 7208a07c6f1SJed Brown /* put together the new symbolic matrix */ 721ce94432eSBarry Smith ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr); 72233d57670SJed Brown ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr); 72302fe1965SBarry Smith ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr); 7248a07c6f1SJed Brown 7258a07c6f1SJed Brown /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 7268a07c6f1SJed Brown /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 7278a07c6f1SJed Brown c = (Mat_SeqAIJ*)((*C)->data); 7288a07c6f1SJed Brown c->free_a = PETSC_TRUE; 7298a07c6f1SJed Brown c->free_ij = PETSC_TRUE; 7308a07c6f1SJed Brown c->nonew = 0; 73126fbe8dcSKarl Rupp 732*df97dc6dSFande Kong (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 7338a07c6f1SJed Brown 7348a07c6f1SJed Brown /* set MatInfo */ 7358a07c6f1SJed Brown afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 7368a07c6f1SJed Brown if (afill < 1.0) afill = 1.0; 7378a07c6f1SJed Brown c->maxnz = ci[am]; 7388a07c6f1SJed Brown c->nz = ci[am]; 7398a07c6f1SJed Brown (*C)->info.mallocs = ndouble; 7408a07c6f1SJed Brown (*C)->info.fill_ratio_given = fill; 7418a07c6f1SJed Brown (*C)->info.fill_ratio_needed = afill; 7428a07c6f1SJed Brown 7438a07c6f1SJed Brown #if defined(PETSC_USE_INFO) 7448a07c6f1SJed Brown if (ci[am]) { 74557622a8eSBarry Smith ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 74657622a8eSBarry Smith ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 7478a07c6f1SJed Brown } else { 7488a07c6f1SJed Brown ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr); 7498a07c6f1SJed Brown } 7508a07c6f1SJed Brown #endif 7518a07c6f1SJed Brown PetscFunctionReturn(0); 7528a07c6f1SJed Brown } 7538a07c6f1SJed Brown 754d7ed1a4dSandi selinger 755d7ed1a4dSandi selinger PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A,Mat B,PetscReal fill,Mat *C) 756d7ed1a4dSandi selinger { 757d7ed1a4dSandi selinger PetscErrorCode ierr; 758d7ed1a4dSandi selinger Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 759d7ed1a4dSandi selinger const PetscInt *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j,*inputi,*inputj,*inputcol,*inputcol_L1; 760d7ed1a4dSandi selinger PetscInt *ci,*cj,*outputj,worki_L1[9],worki_L2[9]; 761d7ed1a4dSandi selinger PetscInt c_maxmem,a_maxrownnz=0,a_rownnz; 762d7ed1a4dSandi selinger const PetscInt workcol[8]={0,1,2,3,4,5,6,7}; 763d7ed1a4dSandi selinger const PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 764d7ed1a4dSandi selinger const PetscInt *brow_ptr[8],*brow_end[8]; 765d7ed1a4dSandi selinger PetscInt window[8]; 766d7ed1a4dSandi selinger PetscInt window_min,old_window_min,ci_nnz,outputi_nnz=0,L1_nrows,L2_nrows; 767d7ed1a4dSandi selinger PetscInt i,k,ndouble=0,L1_rowsleft,rowsleft; 768d7ed1a4dSandi selinger PetscReal afill; 769f83700f2Sandi selinger PetscInt *workj_L1,*workj_L2,*workj_L3; 7707660c4dbSandi selinger PetscInt L1_nnz,L2_nnz; 771d7ed1a4dSandi selinger 772d7ed1a4dSandi selinger /* Step 1: Get upper bound on memory required for allocation. 773d7ed1a4dSandi selinger Because of the way virtual memory works, 774d7ed1a4dSandi selinger only the memory pages that are actually needed will be physically allocated. */ 775d7ed1a4dSandi selinger PetscFunctionBegin; 776d7ed1a4dSandi selinger ierr = PetscMalloc1(am+1,&ci);CHKERRQ(ierr); 777d7ed1a4dSandi selinger for (i=0; i<am; i++) { 778d7ed1a4dSandi selinger const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 779d7ed1a4dSandi selinger const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 780d7ed1a4dSandi selinger a_rownnz = 0; 781d7ed1a4dSandi selinger for (k=0; k<anzi; ++k) { 782d7ed1a4dSandi selinger a_rownnz += bi[acol[k]+1] - bi[acol[k]]; 783d7ed1a4dSandi selinger if (a_rownnz > bn) { 784d7ed1a4dSandi selinger a_rownnz = bn; 785d7ed1a4dSandi selinger break; 786d7ed1a4dSandi selinger } 787d7ed1a4dSandi selinger } 788d7ed1a4dSandi selinger a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz); 789d7ed1a4dSandi selinger } 790d7ed1a4dSandi selinger /* temporary work areas for merging rows */ 791d7ed1a4dSandi selinger ierr = PetscMalloc1(a_maxrownnz*8,&workj_L1);CHKERRQ(ierr); 792f83700f2Sandi selinger ierr = PetscMalloc1(a_maxrownnz*8,&workj_L2);CHKERRQ(ierr); 793f83700f2Sandi selinger ierr = PetscMalloc1(a_maxrownnz,&workj_L3);CHKERRQ(ierr); 794d7ed1a4dSandi selinger 7957660c4dbSandi selinger /* This should be enough for almost all matrices. If not, memory is reallocated later. */ 7967660c4dbSandi selinger c_maxmem = 8*(ai[am]+bi[bm]); 797d7ed1a4dSandi selinger /* Step 2: Populate pattern for C */ 798d7ed1a4dSandi selinger ierr = PetscMalloc1(c_maxmem,&cj);CHKERRQ(ierr); 799d7ed1a4dSandi selinger 800d7ed1a4dSandi selinger ci_nnz = 0; 801d7ed1a4dSandi selinger ci[0] = 0; 802d7ed1a4dSandi selinger worki_L1[0] = 0; 803d7ed1a4dSandi selinger worki_L2[0] = 0; 804d7ed1a4dSandi selinger for (i=0; i<am; i++) { 805d7ed1a4dSandi selinger const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 806d7ed1a4dSandi selinger const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 807d7ed1a4dSandi selinger rowsleft = anzi; 808d7ed1a4dSandi selinger inputcol_L1 = acol; 809d7ed1a4dSandi selinger L2_nnz = 0; 8107660c4dbSandi selinger L2_nrows = 1; /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1 */ 8117660c4dbSandi selinger worki_L2[1] = 0; 81208fe1b3cSKarl Rupp outputi_nnz = 0; 813d7ed1a4dSandi selinger 814d7ed1a4dSandi selinger /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem -> allocate more memory */ 815d7ed1a4dSandi selinger while (ci_nnz+a_maxrownnz > c_maxmem) { 816d7ed1a4dSandi selinger c_maxmem *= 2; 817d7ed1a4dSandi selinger ndouble++; 818d7ed1a4dSandi selinger ierr = PetscRealloc(sizeof(PetscInt)*c_maxmem,&cj);CHKERRQ(ierr); 819d7ed1a4dSandi selinger } 820d7ed1a4dSandi selinger 821d7ed1a4dSandi selinger while (rowsleft) { 822d7ed1a4dSandi selinger L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */ 823d7ed1a4dSandi selinger L1_nrows = 0; 8247660c4dbSandi selinger L1_nnz = 0; 825d7ed1a4dSandi selinger inputcol = inputcol_L1; 826d7ed1a4dSandi selinger inputi = bi; 827d7ed1a4dSandi selinger inputj = bj; 828d7ed1a4dSandi selinger 829d7ed1a4dSandi selinger /* The following macro is used to specialize for small rows in A. 830d7ed1a4dSandi selinger This helps with compiler unrolling, improving performance substantially. 831f83700f2Sandi selinger Input: inputj inputi inputcol bn 832d7ed1a4dSandi selinger Output: outputj outputi_nnz */ 833d7ed1a4dSandi selinger #define MatMatMultSymbolic_RowMergeMacro(ANNZ) \ 834d7ed1a4dSandi selinger window_min = bn; \ 8357660c4dbSandi selinger outputi_nnz = 0; \ 8367660c4dbSandi selinger for (k=0; k<ANNZ; ++k) { \ 837d7ed1a4dSandi selinger brow_ptr[k] = inputj + inputi[inputcol[k]]; \ 838d7ed1a4dSandi selinger brow_end[k] = inputj + inputi[inputcol[k]+1]; \ 839d7ed1a4dSandi selinger window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \ 840d7ed1a4dSandi selinger window_min = PetscMin(window[k], window_min); \ 841d7ed1a4dSandi selinger } \ 842d7ed1a4dSandi selinger while (window_min < bn) { \ 843d7ed1a4dSandi selinger outputj[outputi_nnz++] = window_min; \ 844d7ed1a4dSandi selinger /* advance front and compute new minimum */ \ 845d7ed1a4dSandi selinger old_window_min = window_min; \ 846d7ed1a4dSandi selinger window_min = bn; \ 847d7ed1a4dSandi selinger for (k=0; k<ANNZ; ++k) { \ 848d7ed1a4dSandi selinger if (window[k] == old_window_min) { \ 849d7ed1a4dSandi selinger brow_ptr[k]++; \ 850d7ed1a4dSandi selinger window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \ 851d7ed1a4dSandi selinger } \ 852d7ed1a4dSandi selinger window_min = PetscMin(window[k], window_min); \ 853d7ed1a4dSandi selinger } \ 854d7ed1a4dSandi selinger } 855d7ed1a4dSandi selinger 856d7ed1a4dSandi selinger /************** L E V E L 1 ***************/ 857d7ed1a4dSandi selinger /* Merge up to 8 rows of B to L1 work array*/ 858d7ed1a4dSandi selinger while (L1_rowsleft) { 8597660c4dbSandi selinger outputi_nnz = 0; 8607660c4dbSandi selinger if (anzi > 8) outputj = workj_L1 + L1_nnz; /* Level 1 rowmerge*/ 8617660c4dbSandi selinger else outputj = cj + ci_nnz; /* Merge directly to C */ 8627660c4dbSandi selinger 863d7ed1a4dSandi selinger switch (L1_rowsleft) { 864d7ed1a4dSandi selinger case 1: brow_ptr[0] = inputj + inputi[inputcol[0]]; 865d7ed1a4dSandi selinger brow_end[0] = inputj + inputi[inputcol[0]+1]; 866d7ed1a4dSandi selinger for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 867d7ed1a4dSandi selinger inputcol += L1_rowsleft; 868d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 869d7ed1a4dSandi selinger L1_rowsleft = 0; 870d7ed1a4dSandi selinger break; 871d7ed1a4dSandi selinger case 2: MatMatMultSymbolic_RowMergeMacro(2); 872d7ed1a4dSandi selinger inputcol += L1_rowsleft; 873d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 874d7ed1a4dSandi selinger L1_rowsleft = 0; 875d7ed1a4dSandi selinger break; 876d7ed1a4dSandi selinger case 3: MatMatMultSymbolic_RowMergeMacro(3); 877d7ed1a4dSandi selinger inputcol += L1_rowsleft; 878d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 879d7ed1a4dSandi selinger L1_rowsleft = 0; 880d7ed1a4dSandi selinger break; 881d7ed1a4dSandi selinger case 4: MatMatMultSymbolic_RowMergeMacro(4); 882d7ed1a4dSandi selinger inputcol += L1_rowsleft; 883d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 884d7ed1a4dSandi selinger L1_rowsleft = 0; 885d7ed1a4dSandi selinger break; 886d7ed1a4dSandi selinger case 5: MatMatMultSymbolic_RowMergeMacro(5); 887d7ed1a4dSandi selinger inputcol += L1_rowsleft; 888d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 889d7ed1a4dSandi selinger L1_rowsleft = 0; 890d7ed1a4dSandi selinger break; 891d7ed1a4dSandi selinger case 6: MatMatMultSymbolic_RowMergeMacro(6); 892d7ed1a4dSandi selinger inputcol += L1_rowsleft; 893d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 894d7ed1a4dSandi selinger L1_rowsleft = 0; 895d7ed1a4dSandi selinger break; 896d7ed1a4dSandi selinger case 7: MatMatMultSymbolic_RowMergeMacro(7); 897d7ed1a4dSandi selinger inputcol += L1_rowsleft; 898d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 899d7ed1a4dSandi selinger L1_rowsleft = 0; 900d7ed1a4dSandi selinger break; 901d7ed1a4dSandi selinger default: MatMatMultSymbolic_RowMergeMacro(8); 902d7ed1a4dSandi selinger inputcol += 8; 903d7ed1a4dSandi selinger rowsleft -= 8; 904d7ed1a4dSandi selinger L1_rowsleft -= 8; 905d7ed1a4dSandi selinger break; 906d7ed1a4dSandi selinger } 907d7ed1a4dSandi selinger inputcol_L1 = inputcol; 9087660c4dbSandi selinger L1_nnz += outputi_nnz; 9097660c4dbSandi selinger worki_L1[++L1_nrows] = L1_nnz; 910d7ed1a4dSandi selinger } 911d7ed1a4dSandi selinger 912d7ed1a4dSandi selinger /********************** L E V E L 2 ************************/ 913d7ed1a4dSandi selinger /* Merge from L1 work array to either C or to L2 work array */ 914d7ed1a4dSandi selinger if (anzi > 8) { 915d7ed1a4dSandi selinger inputi = worki_L1; 916d7ed1a4dSandi selinger inputj = workj_L1; 917d7ed1a4dSandi selinger inputcol = workcol; 918d7ed1a4dSandi selinger outputi_nnz = 0; 919d7ed1a4dSandi selinger 920d7ed1a4dSandi selinger if (anzi <= 64) outputj = cj + ci_nnz; /* Merge from L1 work array to C */ 921d7ed1a4dSandi selinger else outputj = workj_L2 + L2_nnz; /* Merge from L1 work array to L2 work array */ 922d7ed1a4dSandi selinger 923d7ed1a4dSandi selinger switch (L1_nrows) { 924d7ed1a4dSandi selinger case 1: brow_ptr[0] = inputj + inputi[inputcol[0]]; 925d7ed1a4dSandi selinger brow_end[0] = inputj + inputi[inputcol[0]+1]; 926d7ed1a4dSandi selinger for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 927d7ed1a4dSandi selinger break; 928d7ed1a4dSandi selinger case 2: MatMatMultSymbolic_RowMergeMacro(2); break; 929d7ed1a4dSandi selinger case 3: MatMatMultSymbolic_RowMergeMacro(3); break; 930d7ed1a4dSandi selinger case 4: MatMatMultSymbolic_RowMergeMacro(4); break; 931d7ed1a4dSandi selinger case 5: MatMatMultSymbolic_RowMergeMacro(5); break; 932d7ed1a4dSandi selinger case 6: MatMatMultSymbolic_RowMergeMacro(6); break; 933d7ed1a4dSandi selinger case 7: MatMatMultSymbolic_RowMergeMacro(7); break; 934d7ed1a4dSandi selinger case 8: MatMatMultSymbolic_RowMergeMacro(8); break; 935d7ed1a4dSandi selinger default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L1 work array!"); 936d7ed1a4dSandi selinger } 937d7ed1a4dSandi selinger L2_nnz += outputi_nnz; 938d7ed1a4dSandi selinger worki_L2[++L2_nrows] = L2_nnz; 939d7ed1a4dSandi selinger 9407660c4dbSandi selinger /************************ L E V E L 3 **********************/ 9417660c4dbSandi selinger /* Merge from L2 work array to either C or to L2 work array */ 942d7ed1a4dSandi selinger if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) { 943d7ed1a4dSandi selinger inputi = worki_L2; 944d7ed1a4dSandi selinger inputj = workj_L2; 945d7ed1a4dSandi selinger inputcol = workcol; 946d7ed1a4dSandi selinger outputi_nnz = 0; 947f83700f2Sandi selinger if (rowsleft) outputj = workj_L3; 948d7ed1a4dSandi selinger else outputj = cj + ci_nnz; 949d7ed1a4dSandi selinger switch (L2_nrows) { 950d7ed1a4dSandi selinger case 1: brow_ptr[0] = inputj + inputi[inputcol[0]]; 951d7ed1a4dSandi selinger brow_end[0] = inputj + inputi[inputcol[0]+1]; 952d7ed1a4dSandi selinger for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 953d7ed1a4dSandi selinger break; 954d7ed1a4dSandi selinger case 2: MatMatMultSymbolic_RowMergeMacro(2); break; 955d7ed1a4dSandi selinger case 3: MatMatMultSymbolic_RowMergeMacro(3); break; 956d7ed1a4dSandi selinger case 4: MatMatMultSymbolic_RowMergeMacro(4); break; 957d7ed1a4dSandi selinger case 5: MatMatMultSymbolic_RowMergeMacro(5); break; 958d7ed1a4dSandi selinger case 6: MatMatMultSymbolic_RowMergeMacro(6); break; 959d7ed1a4dSandi selinger case 7: MatMatMultSymbolic_RowMergeMacro(7); break; 960d7ed1a4dSandi selinger case 8: MatMatMultSymbolic_RowMergeMacro(8); break; 961d7ed1a4dSandi selinger default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L2 work array!"); 962d7ed1a4dSandi selinger } 963d7ed1a4dSandi selinger L2_nrows = 1; 9647660c4dbSandi selinger L2_nnz = outputi_nnz; 9657660c4dbSandi selinger worki_L2[1] = outputi_nnz; 9667660c4dbSandi selinger /* Copy to workj_L2 */ 967d7ed1a4dSandi selinger if (rowsleft) { 9687660c4dbSandi selinger for (k=0; k<outputi_nnz; ++k) workj_L2[k] = outputj[k]; 969d7ed1a4dSandi selinger } 970d7ed1a4dSandi selinger } 971d7ed1a4dSandi selinger } 972d7ed1a4dSandi selinger } /* while (rowsleft) */ 973d7ed1a4dSandi selinger #undef MatMatMultSymbolic_RowMergeMacro 974d7ed1a4dSandi selinger 975d7ed1a4dSandi selinger /* terminate current row */ 976d7ed1a4dSandi selinger ci_nnz += outputi_nnz; 977d7ed1a4dSandi selinger ci[i+1] = ci_nnz; 978d7ed1a4dSandi selinger } 979d7ed1a4dSandi selinger 980d7ed1a4dSandi selinger /* Step 3: Create the new symbolic matrix */ 981d7ed1a4dSandi selinger ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr); 982d7ed1a4dSandi selinger ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr); 983f83700f2Sandi selinger ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr); 984d7ed1a4dSandi selinger 985d7ed1a4dSandi selinger /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 986d7ed1a4dSandi selinger /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 987d7ed1a4dSandi selinger c = (Mat_SeqAIJ*)((*C)->data); 988d7ed1a4dSandi selinger c->free_a = PETSC_TRUE; 989d7ed1a4dSandi selinger c->free_ij = PETSC_TRUE; 990d7ed1a4dSandi selinger c->nonew = 0; 991d7ed1a4dSandi selinger 992*df97dc6dSFande Kong (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 993d7ed1a4dSandi selinger 994d7ed1a4dSandi selinger /* set MatInfo */ 995d7ed1a4dSandi selinger afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 996d7ed1a4dSandi selinger if (afill < 1.0) afill = 1.0; 997d7ed1a4dSandi selinger c->maxnz = ci[am]; 998d7ed1a4dSandi selinger c->nz = ci[am]; 999d7ed1a4dSandi selinger (*C)->info.mallocs = ndouble; 1000d7ed1a4dSandi selinger (*C)->info.fill_ratio_given = fill; 1001d7ed1a4dSandi selinger (*C)->info.fill_ratio_needed = afill; 1002d7ed1a4dSandi selinger 1003d7ed1a4dSandi selinger #if defined(PETSC_USE_INFO) 1004d7ed1a4dSandi selinger if (ci[am]) { 1005d7ed1a4dSandi selinger ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 1006d7ed1a4dSandi selinger ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 1007d7ed1a4dSandi selinger } else { 1008d7ed1a4dSandi selinger ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr); 1009d7ed1a4dSandi selinger } 1010d7ed1a4dSandi selinger #endif 1011d7ed1a4dSandi selinger 1012d7ed1a4dSandi selinger /* Step 4: Free temporary work areas */ 1013d7ed1a4dSandi selinger ierr = PetscFree(workj_L1);CHKERRQ(ierr); 1014d7ed1a4dSandi selinger ierr = PetscFree(workj_L2);CHKERRQ(ierr); 1015f83700f2Sandi selinger ierr = PetscFree(workj_L3);CHKERRQ(ierr); 1016d7ed1a4dSandi selinger PetscFunctionReturn(0); 1017d7ed1a4dSandi selinger } 1018d7ed1a4dSandi selinger 1019cd093f1dSJed Brown /* concatenate unique entries and then sort */ 1020*df97dc6dSFande Kong PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,PetscReal fill,Mat *C) 1021cd093f1dSJed Brown { 1022cd093f1dSJed Brown PetscErrorCode ierr; 1023cd093f1dSJed Brown Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 1024cd093f1dSJed Brown const PetscInt *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j; 1025cd093f1dSJed Brown PetscInt *ci,*cj; 1026cd093f1dSJed Brown PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 1027cd093f1dSJed Brown PetscReal afill; 1028cd093f1dSJed Brown PetscInt i,j,ndouble = 0; 1029cd093f1dSJed Brown PetscSegBuffer seg,segrow; 1030cd093f1dSJed Brown char *seen; 1031cd093f1dSJed Brown 1032cd093f1dSJed Brown PetscFunctionBegin; 1033854ce69bSBarry Smith ierr = PetscMalloc1(am+1,&ci);CHKERRQ(ierr); 1034cd093f1dSJed Brown ci[0] = 0; 1035cd093f1dSJed Brown 1036cd093f1dSJed Brown /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 1037cd093f1dSJed Brown ierr = PetscSegBufferCreate(sizeof(PetscInt),(PetscInt)(fill*(ai[am]+bi[bm])),&seg);CHKERRQ(ierr); 1038cd093f1dSJed Brown ierr = PetscSegBufferCreate(sizeof(PetscInt),100,&segrow);CHKERRQ(ierr); 1039785e854fSJed Brown ierr = PetscMalloc1(bn,&seen);CHKERRQ(ierr); 1040cd093f1dSJed Brown ierr = PetscMemzero(seen,bn*sizeof(char));CHKERRQ(ierr); 1041cd093f1dSJed Brown 1042cd093f1dSJed Brown /* Determine ci and cj */ 1043cd093f1dSJed Brown for (i=0; i<am; i++) { 1044cd093f1dSJed Brown const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 1045cd093f1dSJed Brown const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 1046cd093f1dSJed Brown PetscInt packlen = 0,*PETSC_RESTRICT crow; 1047cd093f1dSJed Brown /* Pack segrow */ 1048cd093f1dSJed Brown for (j=0; j<anzi; j++) { 1049cd093f1dSJed Brown PetscInt brow = acol[j],bjstart = bi[brow],bjend = bi[brow+1],k; 1050cd093f1dSJed Brown for (k=bjstart; k<bjend; k++) { 1051cd093f1dSJed Brown PetscInt bcol = bj[k]; 1052cd093f1dSJed Brown if (!seen[bcol]) { /* new entry */ 1053cd093f1dSJed Brown PetscInt *PETSC_RESTRICT slot; 1054cd093f1dSJed Brown ierr = PetscSegBufferGetInts(segrow,1,&slot);CHKERRQ(ierr); 1055cd093f1dSJed Brown *slot = bcol; 1056cd093f1dSJed Brown seen[bcol] = 1; 1057cd093f1dSJed Brown packlen++; 1058cd093f1dSJed Brown } 1059cd093f1dSJed Brown } 1060cd093f1dSJed Brown } 1061cd093f1dSJed Brown ierr = PetscSegBufferGetInts(seg,packlen,&crow);CHKERRQ(ierr); 1062cd093f1dSJed Brown ierr = PetscSegBufferExtractTo(segrow,crow);CHKERRQ(ierr); 1063cd093f1dSJed Brown ierr = PetscSortInt(packlen,crow);CHKERRQ(ierr); 1064cd093f1dSJed Brown ci[i+1] = ci[i] + packlen; 1065cd093f1dSJed Brown for (j=0; j<packlen; j++) seen[crow[j]] = 0; 1066cd093f1dSJed Brown } 1067cd093f1dSJed Brown ierr = PetscSegBufferDestroy(&segrow);CHKERRQ(ierr); 1068cd093f1dSJed Brown ierr = PetscFree(seen);CHKERRQ(ierr); 1069cd093f1dSJed Brown 1070cd093f1dSJed Brown /* Column indices are in the segmented buffer */ 1071cd093f1dSJed Brown ierr = PetscSegBufferExtractAlloc(seg,&cj);CHKERRQ(ierr); 1072cd093f1dSJed Brown ierr = PetscSegBufferDestroy(&seg);CHKERRQ(ierr); 1073cd093f1dSJed Brown 1074cd093f1dSJed Brown /* put together the new symbolic matrix */ 1075cd093f1dSJed Brown ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr); 107633d57670SJed Brown ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr); 107702fe1965SBarry Smith ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr); 1078cd093f1dSJed Brown 1079cd093f1dSJed Brown /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 1080cd093f1dSJed Brown /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 1081cd093f1dSJed Brown c = (Mat_SeqAIJ*)((*C)->data); 1082cd093f1dSJed Brown c->free_a = PETSC_TRUE; 1083cd093f1dSJed Brown c->free_ij = PETSC_TRUE; 1084cd093f1dSJed Brown c->nonew = 0; 1085cd093f1dSJed Brown 1086*df97dc6dSFande Kong (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 1087cd093f1dSJed Brown 1088cd093f1dSJed Brown /* set MatInfo */ 1089cd093f1dSJed Brown afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 1090cd093f1dSJed Brown if (afill < 1.0) afill = 1.0; 1091cd093f1dSJed Brown c->maxnz = ci[am]; 1092cd093f1dSJed Brown c->nz = ci[am]; 1093cd093f1dSJed Brown (*C)->info.mallocs = ndouble; 1094cd093f1dSJed Brown (*C)->info.fill_ratio_given = fill; 1095cd093f1dSJed Brown (*C)->info.fill_ratio_needed = afill; 1096cd093f1dSJed Brown 1097cd093f1dSJed Brown #if defined(PETSC_USE_INFO) 1098cd093f1dSJed Brown if (ci[am]) { 109957622a8eSBarry Smith ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 110057622a8eSBarry Smith ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 1101cd093f1dSJed Brown } else { 1102cd093f1dSJed Brown ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr); 1103cd093f1dSJed Brown } 1104cd093f1dSJed Brown #endif 1105cd093f1dSJed Brown PetscFunctionReturn(0); 1106cd093f1dSJed Brown } 1107cd093f1dSJed Brown 1108d2853540SHong Zhang /* This routine is not used. Should be removed! */ 11096fc122caSHong Zhang PetscErrorCode MatMatTransposeMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C) 11105df89d91SHong Zhang { 1111bc011b1eSHong Zhang PetscErrorCode ierr; 1112bc011b1eSHong Zhang 1113bc011b1eSHong Zhang PetscFunctionBegin; 1114bc011b1eSHong Zhang if (scall == MAT_INITIAL_MATRIX) { 11153ff4c91cSHong Zhang ierr = PetscLogEventBegin(MAT_MatTransposeMultSymbolic,A,B,0,0);CHKERRQ(ierr); 11166fc122caSHong Zhang ierr = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr); 11173ff4c91cSHong Zhang ierr = PetscLogEventEnd(MAT_MatTransposeMultSymbolic,A,B,0,0);CHKERRQ(ierr); 1118bc011b1eSHong Zhang } 11193ff4c91cSHong Zhang ierr = PetscLogEventBegin(MAT_MatTransposeMultNumeric,A,B,0,0);CHKERRQ(ierr); 11206fc122caSHong Zhang ierr = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(A,B,*C);CHKERRQ(ierr); 11213ff4c91cSHong Zhang ierr = PetscLogEventEnd(MAT_MatTransposeMultNumeric,A,B,0,0);CHKERRQ(ierr); 1122bc011b1eSHong Zhang PetscFunctionReturn(0); 1123bc011b1eSHong Zhang } 1124bc011b1eSHong Zhang 11252128a86cSHong Zhang PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(Mat A) 11262128a86cSHong Zhang { 11272128a86cSHong Zhang PetscErrorCode ierr; 11284c7df5ccSHong Zhang Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 112940192850SHong Zhang Mat_MatMatTransMult *abt=a->abt; 11302128a86cSHong Zhang 11312128a86cSHong Zhang PetscFunctionBegin; 113240192850SHong Zhang ierr = (abt->destroy)(A);CHKERRQ(ierr); 113340192850SHong Zhang ierr = MatTransposeColoringDestroy(&abt->matcoloring);CHKERRQ(ierr); 113440192850SHong Zhang ierr = MatDestroy(&abt->Bt_den);CHKERRQ(ierr); 113540192850SHong Zhang ierr = MatDestroy(&abt->ABt_den);CHKERRQ(ierr); 113640192850SHong Zhang ierr = PetscFree(abt);CHKERRQ(ierr); 11372128a86cSHong Zhang PetscFunctionReturn(0); 11382128a86cSHong Zhang } 11392128a86cSHong Zhang 11406fc122caSHong Zhang PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C) 1141bc011b1eSHong Zhang { 11425df89d91SHong Zhang PetscErrorCode ierr; 114381d82fe4SHong Zhang Mat Bt; 114481d82fe4SHong Zhang PetscInt *bti,*btj; 114540192850SHong Zhang Mat_MatMatTransMult *abt; 11464c7df5ccSHong Zhang Mat_SeqAIJ *c; 1147d2853540SHong Zhang 114881d82fe4SHong Zhang PetscFunctionBegin; 114981d82fe4SHong Zhang /* create symbolic Bt */ 115081d82fe4SHong Zhang ierr = MatGetSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr); 11510298fd71SBarry Smith ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,B->cmap->n,B->rmap->n,bti,btj,NULL,&Bt);CHKERRQ(ierr); 115233d57670SJed Brown ierr = MatSetBlockSizes(Bt,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr); 115304b858e0SBarry Smith ierr = MatSetType(Bt,((PetscObject)A)->type_name);CHKERRQ(ierr); 115481d82fe4SHong Zhang 115581d82fe4SHong Zhang /* get symbolic C=A*Bt */ 115681d82fe4SHong Zhang ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C);CHKERRQ(ierr); 115781d82fe4SHong Zhang 11582128a86cSHong Zhang /* create a supporting struct for reuse intermidiate dense matrices with matcoloring */ 1159b00a9115SJed Brown ierr = PetscNew(&abt);CHKERRQ(ierr); 11604c7df5ccSHong Zhang c = (Mat_SeqAIJ*)(*C)->data; 116140192850SHong Zhang c->abt = abt; 11622128a86cSHong Zhang 116340192850SHong Zhang abt->usecoloring = PETSC_FALSE; 116440192850SHong Zhang abt->destroy = (*C)->ops->destroy; 11652128a86cSHong Zhang (*C)->ops->destroy = MatDestroy_SeqAIJ_MatMatMultTrans; 11662128a86cSHong Zhang 1167c5929fdfSBarry Smith ierr = PetscOptionsGetBool(((PetscObject)A)->options,NULL,"-matmattransmult_color",&abt->usecoloring,NULL);CHKERRQ(ierr); 116840192850SHong Zhang if (abt->usecoloring) { 1169b9af6bddSHong Zhang /* Create MatTransposeColoring from symbolic C=A*B^T */ 1170b9af6bddSHong Zhang MatTransposeColoring matcoloring; 1171335efc43SPeter Brune MatColoring coloring; 11722128a86cSHong Zhang ISColoring iscoloring; 11732128a86cSHong Zhang Mat Bt_dense,C_dense; 11744d478ae7SHong Zhang Mat_SeqAIJ *c=(Mat_SeqAIJ*)(*C)->data; 11754d478ae7SHong Zhang /* inode causes memory problem, don't know why */ 11764d478ae7SHong Zhang if (c->inode.use) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_USE_INODES is not supported. Use '-mat_no_inode'"); 1177e8354b3eSHong Zhang 1178335efc43SPeter Brune ierr = MatColoringCreate(*C,&coloring);CHKERRQ(ierr); 1179335efc43SPeter Brune ierr = MatColoringSetDistance(coloring,2);CHKERRQ(ierr); 1180335efc43SPeter Brune ierr = MatColoringSetType(coloring,MATCOLORINGSL);CHKERRQ(ierr); 1181335efc43SPeter Brune ierr = MatColoringSetFromOptions(coloring);CHKERRQ(ierr); 1182335efc43SPeter Brune ierr = MatColoringApply(coloring,&iscoloring);CHKERRQ(ierr); 1183335efc43SPeter Brune ierr = MatColoringDestroy(&coloring);CHKERRQ(ierr); 1184b9af6bddSHong Zhang ierr = MatTransposeColoringCreate(*C,iscoloring,&matcoloring);CHKERRQ(ierr); 11852205254eSKarl Rupp 118640192850SHong Zhang abt->matcoloring = matcoloring; 11872205254eSKarl Rupp 11882128a86cSHong Zhang ierr = ISColoringDestroy(&iscoloring);CHKERRQ(ierr); 11892128a86cSHong Zhang 11902128a86cSHong Zhang /* Create Bt_dense and C_dense = A*Bt_dense */ 11912128a86cSHong Zhang ierr = MatCreate(PETSC_COMM_SELF,&Bt_dense);CHKERRQ(ierr); 11922128a86cSHong Zhang ierr = MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors);CHKERRQ(ierr); 11932128a86cSHong Zhang ierr = MatSetType(Bt_dense,MATSEQDENSE);CHKERRQ(ierr); 11940298fd71SBarry Smith ierr = MatSeqDenseSetPreallocation(Bt_dense,NULL);CHKERRQ(ierr); 11952205254eSKarl Rupp 11962128a86cSHong Zhang Bt_dense->assembled = PETSC_TRUE; 119740192850SHong Zhang abt->Bt_den = Bt_dense; 11982128a86cSHong Zhang 11992128a86cSHong Zhang ierr = MatCreate(PETSC_COMM_SELF,&C_dense);CHKERRQ(ierr); 12002128a86cSHong Zhang ierr = MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors);CHKERRQ(ierr); 12012128a86cSHong Zhang ierr = MatSetType(C_dense,MATSEQDENSE);CHKERRQ(ierr); 12020298fd71SBarry Smith ierr = MatSeqDenseSetPreallocation(C_dense,NULL);CHKERRQ(ierr); 12032205254eSKarl Rupp 12042128a86cSHong Zhang Bt_dense->assembled = PETSC_TRUE; 120540192850SHong Zhang abt->ABt_den = C_dense; 1206f94ccd6cSHong Zhang 1207f94ccd6cSHong Zhang #if defined(PETSC_USE_INFO) 12081ce71dffSSatish Balay { 1209f94ccd6cSHong Zhang Mat_SeqAIJ *c = (Mat_SeqAIJ*)(*C)->data; 1210c40ebe3bSHong Zhang ierr = PetscInfo7(*C,"Use coloring of C=A*B^T; B^T: %D %D, Bt_dense: %D,%D; Cnz %D / (cm*ncolors %D) = %g\n",B->cmap->n,B->rmap->n,Bt_dense->rmap->n,Bt_dense->cmap->n,c->nz,A->rmap->n*matcoloring->ncolors,(PetscReal)(c->nz)/(A->rmap->n*matcoloring->ncolors));CHKERRQ(ierr); 12111ce71dffSSatish Balay } 1212f94ccd6cSHong Zhang #endif 12132128a86cSHong Zhang } 1214e99be685SHong Zhang /* clean up */ 1215e99be685SHong Zhang ierr = MatDestroy(&Bt);CHKERRQ(ierr); 1216e99be685SHong Zhang ierr = MatRestoreSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr); 12175df89d91SHong Zhang PetscFunctionReturn(0); 12185df89d91SHong Zhang } 12195df89d91SHong Zhang 12206fc122caSHong Zhang PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C) 12215df89d91SHong Zhang { 12225df89d91SHong Zhang PetscErrorCode ierr; 12235df89d91SHong Zhang Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data; 1224e2cac8caSJed Brown PetscInt *ai =a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow; 122581d82fe4SHong Zhang PetscInt cm =C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol; 12265df89d91SHong Zhang PetscLogDouble flops=0.0; 1227aa1aec99SHong Zhang MatScalar *aa =a->a,*aval,*ba=b->a,*bval,*ca,*cval; 122840192850SHong Zhang Mat_MatMatTransMult *abt = c->abt; 12295df89d91SHong Zhang 12305df89d91SHong Zhang PetscFunctionBegin; 123158ed3ceaSHong Zhang /* clear old values in C */ 123258ed3ceaSHong Zhang if (!c->a) { 1233854ce69bSBarry Smith ierr = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr); 123458ed3ceaSHong Zhang c->a = ca; 123558ed3ceaSHong Zhang c->free_a = PETSC_TRUE; 123658ed3ceaSHong Zhang } else { 123758ed3ceaSHong Zhang ca = c->a; 123858ed3ceaSHong Zhang } 123958ed3ceaSHong Zhang ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr); 124058ed3ceaSHong Zhang 124140192850SHong Zhang if (abt->usecoloring) { 124240192850SHong Zhang MatTransposeColoring matcoloring = abt->matcoloring; 124340192850SHong Zhang Mat Bt_dense,C_dense = abt->ABt_den; 1244c8db22f6SHong Zhang 1245b9af6bddSHong Zhang /* Get Bt_dense by Apply MatTransposeColoring to B */ 124640192850SHong Zhang Bt_dense = abt->Bt_den; 1247b9af6bddSHong Zhang ierr = MatTransColoringApplySpToDen(matcoloring,B,Bt_dense);CHKERRQ(ierr); 1248c8db22f6SHong Zhang 1249c8db22f6SHong Zhang /* C_dense = A*Bt_dense */ 12502128a86cSHong Zhang ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense);CHKERRQ(ierr); 1251c8db22f6SHong Zhang 12522128a86cSHong Zhang /* Recover C from C_dense */ 1253b9af6bddSHong Zhang ierr = MatTransColoringApplyDenToSp(matcoloring,C_dense,C);CHKERRQ(ierr); 1254c8db22f6SHong Zhang PetscFunctionReturn(0); 1255c8db22f6SHong Zhang } 1256c8db22f6SHong Zhang 12571fa1209cSHong Zhang for (i=0; i<cm; i++) { 125881d82fe4SHong Zhang anzi = ai[i+1] - ai[i]; 125981d82fe4SHong Zhang acol = aj + ai[i]; 12606973769bSHong Zhang aval = aa + ai[i]; 12611fa1209cSHong Zhang cnzi = ci[i+1] - ci[i]; 12621fa1209cSHong Zhang ccol = cj + ci[i]; 12636973769bSHong Zhang cval = ca + ci[i]; 12641fa1209cSHong Zhang for (j=0; j<cnzi; j++) { 126581d82fe4SHong Zhang brow = ccol[j]; 126681d82fe4SHong Zhang bnzj = bi[brow+1] - bi[brow]; 126781d82fe4SHong Zhang bcol = bj + bi[brow]; 12686973769bSHong Zhang bval = ba + bi[brow]; 12696973769bSHong Zhang 127081d82fe4SHong Zhang /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */ 127181d82fe4SHong Zhang nexta = 0; nextb = 0; 127281d82fe4SHong Zhang while (nexta<anzi && nextb<bnzj) { 12737b6d5e96SMark Adams while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++; 127481d82fe4SHong Zhang if (nexta == anzi) break; 12757b6d5e96SMark Adams while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++; 127681d82fe4SHong Zhang if (nextb == bnzj) break; 127781d82fe4SHong Zhang if (acol[nexta] == bcol[nextb]) { 12786973769bSHong Zhang cval[j] += aval[nexta]*bval[nextb]; 127981d82fe4SHong Zhang nexta++; nextb++; 128081d82fe4SHong Zhang flops += 2; 12811fa1209cSHong Zhang } 12821fa1209cSHong Zhang } 128381d82fe4SHong Zhang } 128481d82fe4SHong Zhang } 128581d82fe4SHong Zhang ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 128681d82fe4SHong Zhang ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 128781d82fe4SHong Zhang ierr = PetscLogFlops(flops);CHKERRQ(ierr); 12885df89d91SHong Zhang PetscFunctionReturn(0); 12895df89d91SHong Zhang } 12905df89d91SHong Zhang 12916d373c3eSHong Zhang PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(Mat A) 12926d373c3eSHong Zhang { 12936d373c3eSHong Zhang PetscErrorCode ierr; 12946d373c3eSHong Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 12956d373c3eSHong Zhang Mat_MatTransMatMult *atb = a->atb; 12966d373c3eSHong Zhang 12976d373c3eSHong Zhang PetscFunctionBegin; 12986473ade5SStefano Zampini if (atb) { 12996d373c3eSHong Zhang ierr = MatDestroy(&atb->At);CHKERRQ(ierr); 13006473ade5SStefano Zampini ierr = (*atb->destroy)(A);CHKERRQ(ierr); 13016473ade5SStefano Zampini } 13026d373c3eSHong Zhang ierr = PetscFree(atb);CHKERRQ(ierr); 13036d373c3eSHong Zhang PetscFunctionReturn(0); 13046d373c3eSHong Zhang } 13056d373c3eSHong Zhang 13060adebc6cSBarry Smith PetscErrorCode MatTransposeMatMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C) 13070adebc6cSBarry Smith { 13085df89d91SHong Zhang PetscErrorCode ierr; 13096d373c3eSHong Zhang const char *algTypes[2] = {"matmatmult","outerproduct"}; 13106d373c3eSHong Zhang PetscInt alg=0; /* set default algorithm */ 13116d373c3eSHong Zhang Mat At; 13126d373c3eSHong Zhang Mat_MatTransMatMult *atb; 13136d373c3eSHong Zhang Mat_SeqAIJ *c; 13145df89d91SHong Zhang 13155df89d91SHong Zhang PetscFunctionBegin; 13165df89d91SHong Zhang if (scall == MAT_INITIAL_MATRIX) { 1317715a5346SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr); 13186d373c3eSHong Zhang ierr = PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,2,algTypes[0],&alg,NULL);CHKERRQ(ierr); 13196d373c3eSHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 13206d373c3eSHong Zhang 13216d373c3eSHong Zhang switch (alg) { 13226d373c3eSHong Zhang case 1: 132375648e8dSHong Zhang ierr = MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr); 13246d373c3eSHong Zhang break; 13256d373c3eSHong Zhang default: 13266d373c3eSHong Zhang ierr = PetscNew(&atb);CHKERRQ(ierr); 13276d373c3eSHong Zhang ierr = MatTranspose_SeqAIJ(A,MAT_INITIAL_MATRIX,&At);CHKERRQ(ierr); 13286d373c3eSHong Zhang ierr = MatMatMult_SeqAIJ_SeqAIJ(At,B,MAT_INITIAL_MATRIX,fill,C);CHKERRQ(ierr); 13296d373c3eSHong Zhang 1330618cf492SHong Zhang c = (Mat_SeqAIJ*)(*C)->data; 13316d373c3eSHong Zhang c->atb = atb; 13326d373c3eSHong Zhang atb->At = At; 13336d373c3eSHong Zhang atb->destroy = (*C)->ops->destroy; 13346d373c3eSHong Zhang (*C)->ops->destroy = MatDestroy_SeqAIJ_MatTransMatMult; 13356d373c3eSHong Zhang 13366d373c3eSHong Zhang break; 13375df89d91SHong Zhang } 13386d373c3eSHong Zhang } 13396d373c3eSHong Zhang if (alg) { 13406d373c3eSHong Zhang ierr = (*(*C)->ops->mattransposemultnumeric)(A,B,*C);CHKERRQ(ierr); 13416d373c3eSHong Zhang } else if (!alg && scall == MAT_REUSE_MATRIX) { 13426d373c3eSHong Zhang c = (Mat_SeqAIJ*)(*C)->data; 13436d373c3eSHong Zhang atb = c->atb; 13446d373c3eSHong Zhang At = atb->At; 13456d373c3eSHong Zhang ierr = MatTranspose_SeqAIJ(A,MAT_REUSE_MATRIX,&At);CHKERRQ(ierr); 13466d373c3eSHong Zhang ierr = MatMatMult_SeqAIJ_SeqAIJ(At,B,MAT_REUSE_MATRIX,fill,C);CHKERRQ(ierr); 13476d373c3eSHong Zhang } 13485df89d91SHong Zhang PetscFunctionReturn(0); 13495df89d91SHong Zhang } 13505df89d91SHong Zhang 135175648e8dSHong Zhang PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C) 13525df89d91SHong Zhang { 1353bc011b1eSHong Zhang PetscErrorCode ierr; 1354bc011b1eSHong Zhang Mat At; 135538baddfdSBarry Smith PetscInt *ati,*atj; 1356bc011b1eSHong Zhang 1357bc011b1eSHong Zhang PetscFunctionBegin; 1358bc011b1eSHong Zhang /* create symbolic At */ 1359bc011b1eSHong Zhang ierr = MatGetSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr); 13600298fd71SBarry Smith ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,A->cmap->n,A->rmap->n,ati,atj,NULL,&At);CHKERRQ(ierr); 136133d57670SJed Brown ierr = MatSetBlockSizes(At,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr); 136204b858e0SBarry Smith ierr = MatSetType(At,((PetscObject)A)->type_name);CHKERRQ(ierr); 1363bc011b1eSHong Zhang 1364bc011b1eSHong Zhang /* get symbolic C=At*B */ 1365bc011b1eSHong Zhang ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(At,B,fill,C);CHKERRQ(ierr); 1366bc011b1eSHong Zhang 1367bc011b1eSHong Zhang /* clean up */ 13686bf464f9SBarry Smith ierr = MatDestroy(&At);CHKERRQ(ierr); 1369bc011b1eSHong Zhang ierr = MatRestoreSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr); 13706d373c3eSHong Zhang 13716d373c3eSHong Zhang (*C)->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; 1372bc011b1eSHong Zhang PetscFunctionReturn(0); 1373bc011b1eSHong Zhang } 1374bc011b1eSHong Zhang 137575648e8dSHong Zhang PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C) 1376bc011b1eSHong Zhang { 1377bc011b1eSHong Zhang PetscErrorCode ierr; 13780fbc74f4SHong Zhang Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data; 1379d0f46423SBarry Smith PetscInt am =A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb; 1380d0f46423SBarry Smith PetscInt cm =C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k; 1381d13dce4bSSatish Balay PetscLogDouble flops=0.0; 1382aa1aec99SHong Zhang MatScalar *aa =a->a,*ba,*ca,*caj; 1383bc011b1eSHong Zhang 1384bc011b1eSHong Zhang PetscFunctionBegin; 1385aa1aec99SHong Zhang if (!c->a) { 1386854ce69bSBarry Smith ierr = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr); 13872205254eSKarl Rupp 1388aa1aec99SHong Zhang c->a = ca; 1389aa1aec99SHong Zhang c->free_a = PETSC_TRUE; 1390aa1aec99SHong Zhang } else { 1391aa1aec99SHong Zhang ca = c->a; 1392aa1aec99SHong Zhang } 1393bc011b1eSHong Zhang /* clear old values in C */ 1394bc011b1eSHong Zhang ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr); 1395bc011b1eSHong Zhang 1396bc011b1eSHong Zhang /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */ 1397bc011b1eSHong Zhang for (i=0; i<am; i++) { 1398bc011b1eSHong Zhang bj = b->j + bi[i]; 1399bc011b1eSHong Zhang ba = b->a + bi[i]; 1400bc011b1eSHong Zhang bnzi = bi[i+1] - bi[i]; 1401bc011b1eSHong Zhang anzi = ai[i+1] - ai[i]; 1402bc011b1eSHong Zhang for (j=0; j<anzi; j++) { 1403bc011b1eSHong Zhang nextb = 0; 14040fbc74f4SHong Zhang crow = *aj++; 1405bc011b1eSHong Zhang cjj = cj + ci[crow]; 1406bc011b1eSHong Zhang caj = ca + ci[crow]; 1407bc011b1eSHong Zhang /* perform sparse axpy operation. Note cjj includes bj. */ 1408bc011b1eSHong Zhang for (k=0; nextb<bnzi; k++) { 14090fbc74f4SHong Zhang if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */ 14100fbc74f4SHong Zhang caj[k] += (*aa)*(*(ba+nextb)); 1411bc011b1eSHong Zhang nextb++; 1412bc011b1eSHong Zhang } 1413bc011b1eSHong Zhang } 1414bc011b1eSHong Zhang flops += 2*bnzi; 14150fbc74f4SHong Zhang aa++; 1416bc011b1eSHong Zhang } 1417bc011b1eSHong Zhang } 1418bc011b1eSHong Zhang 1419bc011b1eSHong Zhang /* Assemble the final matrix and clean up */ 1420bc011b1eSHong Zhang ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1421bc011b1eSHong Zhang ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1422bc011b1eSHong Zhang ierr = PetscLogFlops(flops);CHKERRQ(ierr); 1423bc011b1eSHong Zhang PetscFunctionReturn(0); 1424bc011b1eSHong Zhang } 14259123193aSHong Zhang 1426150d2497SBarry Smith PETSC_INTERN PetscErrorCode MatMatMult_SeqAIJ_SeqDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C) 14279123193aSHong Zhang { 14289123193aSHong Zhang PetscErrorCode ierr; 14299123193aSHong Zhang 14309123193aSHong Zhang PetscFunctionBegin; 14319123193aSHong Zhang if (scall == MAT_INITIAL_MATRIX) { 14323ff4c91cSHong Zhang ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr); 14339123193aSHong Zhang ierr = MatMatMultSymbolic_SeqAIJ_SeqDense(A,B,fill,C);CHKERRQ(ierr); 14343ff4c91cSHong Zhang ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr); 14359123193aSHong Zhang } 14363ff4c91cSHong Zhang ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr); 14379123193aSHong Zhang ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,B,*C);CHKERRQ(ierr); 14384614e006SHong Zhang ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr); 14399123193aSHong Zhang PetscFunctionReturn(0); 14409123193aSHong Zhang } 1441edd81eecSMatthew Knepley 14429123193aSHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat *C) 14439123193aSHong Zhang { 14449123193aSHong Zhang PetscErrorCode ierr; 14459123193aSHong Zhang 14469123193aSHong Zhang PetscFunctionBegin; 14475a586d82SBarry Smith ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr); 14482205254eSKarl Rupp 1449d73949e8SHong Zhang (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense; 14509123193aSHong Zhang PetscFunctionReturn(0); 14519123193aSHong Zhang } 14529123193aSHong Zhang 14539123193aSHong Zhang PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C) 14549123193aSHong Zhang { 1455f32d5d43SBarry Smith Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 1456612438f1SStefano Zampini Mat_SeqDense *bd = (Mat_SeqDense*)B->data; 14579123193aSHong Zhang PetscErrorCode ierr; 1458daf57211SHong Zhang PetscScalar *c,*b,r1,r2,r3,r4,*c1,*c2,*c3,*c4,aatmp; 1459daf57211SHong Zhang const PetscScalar *aa,*b1,*b2,*b3,*b4; 1460daf57211SHong Zhang const PetscInt *aj; 1461612438f1SStefano Zampini PetscInt cm=C->rmap->n,cn=B->cmap->n,bm=bd->lda,am=A->rmap->n; 1462daf57211SHong Zhang PetscInt am4=4*am,bm4=4*bm,col,i,j,n,ajtmp; 14639123193aSHong Zhang 14649123193aSHong Zhang PetscFunctionBegin; 1465f32d5d43SBarry Smith if (!cm || !cn) PetscFunctionReturn(0); 1466612438f1SStefano Zampini if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n); 1467e32f2f54SBarry Smith if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n); 1468e32f2f54SBarry Smith if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n); 1469612438f1SStefano Zampini b = bd->v; 14708c778c55SBarry Smith ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr); 1471f32d5d43SBarry Smith b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm; 1472730858b9SHong Zhang c1 = c; c2 = c1 + am; c3 = c2 + am; c4 = c3 + am; 1473f32d5d43SBarry Smith for (col=0; col<cn-4; col += 4) { /* over columns of C */ 1474f32d5d43SBarry Smith for (i=0; i<am; i++) { /* over rows of C in those columns */ 1475f32d5d43SBarry Smith r1 = r2 = r3 = r4 = 0.0; 1476f32d5d43SBarry Smith n = a->i[i+1] - a->i[i]; 1477f32d5d43SBarry Smith aj = a->j + a->i[i]; 1478f32d5d43SBarry Smith aa = a->a + a->i[i]; 1479f32d5d43SBarry Smith for (j=0; j<n; j++) { 1480730858b9SHong Zhang aatmp = aa[j]; ajtmp = aj[j]; 1481730858b9SHong Zhang r1 += aatmp*b1[ajtmp]; 1482730858b9SHong Zhang r2 += aatmp*b2[ajtmp]; 1483730858b9SHong Zhang r3 += aatmp*b3[ajtmp]; 1484730858b9SHong Zhang r4 += aatmp*b4[ajtmp]; 14859123193aSHong Zhang } 1486730858b9SHong Zhang c1[i] = r1; 1487730858b9SHong Zhang c2[i] = r2; 1488730858b9SHong Zhang c3[i] = r3; 1489730858b9SHong Zhang c4[i] = r4; 1490f32d5d43SBarry Smith } 1491730858b9SHong Zhang b1 += bm4; b2 += bm4; b3 += bm4; b4 += bm4; 1492730858b9SHong Zhang c1 += am4; c2 += am4; c3 += am4; c4 += am4; 1493f32d5d43SBarry Smith } 1494f32d5d43SBarry Smith for (; col<cn; col++) { /* over extra columns of C */ 1495f32d5d43SBarry Smith for (i=0; i<am; i++) { /* over rows of C in those columns */ 1496f32d5d43SBarry Smith r1 = 0.0; 1497f32d5d43SBarry Smith n = a->i[i+1] - a->i[i]; 1498f32d5d43SBarry Smith aj = a->j + a->i[i]; 1499f32d5d43SBarry Smith aa = a->a + a->i[i]; 1500f32d5d43SBarry Smith for (j=0; j<n; j++) { 1501730858b9SHong Zhang r1 += aa[j]*b1[aj[j]]; 1502f32d5d43SBarry Smith } 1503730858b9SHong Zhang c1[i] = r1; 1504f32d5d43SBarry Smith } 1505f32d5d43SBarry Smith b1 += bm; 1506730858b9SHong Zhang c1 += am; 1507f32d5d43SBarry Smith } 1508dc0b31edSSatish Balay ierr = PetscLogFlops(cn*(2.0*a->nz));CHKERRQ(ierr); 15098c778c55SBarry Smith ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr); 1510f32d5d43SBarry Smith ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1511f32d5d43SBarry Smith ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1512f32d5d43SBarry Smith PetscFunctionReturn(0); 1513f32d5d43SBarry Smith } 1514f32d5d43SBarry Smith 1515f32d5d43SBarry Smith /* 15164324174eSBarry Smith Note very similar to MatMult_SeqAIJ(), should generate both codes from same base 1517f32d5d43SBarry Smith */ 1518f32d5d43SBarry Smith PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C) 1519f32d5d43SBarry Smith { 1520f32d5d43SBarry Smith Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 152190f5ea3eSStefano Zampini Mat_SeqDense *bd = (Mat_SeqDense*)B->data; 1522f32d5d43SBarry Smith PetscErrorCode ierr; 1523dd6ea824SBarry Smith PetscScalar *b,*c,r1,r2,r3,r4,*b1,*b2,*b3,*b4; 1524dd6ea824SBarry Smith MatScalar *aa; 152590f5ea3eSStefano Zampini PetscInt cm = C->rmap->n, cn=B->cmap->n, bm=bd->lda, col, i,j,n,*aj, am = A->rmap->n,*ii,arm; 15264324174eSBarry Smith PetscInt am2 = 2*am, am3 = 3*am, bm4 = 4*bm,colam,*ridx; 1527f32d5d43SBarry Smith 1528f32d5d43SBarry Smith PetscFunctionBegin; 1529f32d5d43SBarry Smith if (!cm || !cn) PetscFunctionReturn(0); 153090f5ea3eSStefano Zampini b = bd->v; 15318c778c55SBarry Smith ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr); 1532f32d5d43SBarry Smith b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm; 15334324174eSBarry Smith 15344324174eSBarry Smith if (a->compressedrow.use) { /* use compressed row format */ 15354324174eSBarry Smith for (col=0; col<cn-4; col += 4) { /* over columns of C */ 15364324174eSBarry Smith colam = col*am; 15374324174eSBarry Smith arm = a->compressedrow.nrows; 15384324174eSBarry Smith ii = a->compressedrow.i; 15394324174eSBarry Smith ridx = a->compressedrow.rindex; 15404324174eSBarry Smith for (i=0; i<arm; i++) { /* over rows of C in those columns */ 15414324174eSBarry Smith r1 = r2 = r3 = r4 = 0.0; 15424324174eSBarry Smith n = ii[i+1] - ii[i]; 15434324174eSBarry Smith aj = a->j + ii[i]; 15444324174eSBarry Smith aa = a->a + ii[i]; 15454324174eSBarry Smith for (j=0; j<n; j++) { 15464324174eSBarry Smith r1 += (*aa)*b1[*aj]; 15474324174eSBarry Smith r2 += (*aa)*b2[*aj]; 15484324174eSBarry Smith r3 += (*aa)*b3[*aj]; 15494324174eSBarry Smith r4 += (*aa++)*b4[*aj++]; 15504324174eSBarry Smith } 15514324174eSBarry Smith c[colam + ridx[i]] += r1; 15524324174eSBarry Smith c[colam + am + ridx[i]] += r2; 15534324174eSBarry Smith c[colam + am2 + ridx[i]] += r3; 15544324174eSBarry Smith c[colam + am3 + ridx[i]] += r4; 15554324174eSBarry Smith } 15564324174eSBarry Smith b1 += bm4; 15574324174eSBarry Smith b2 += bm4; 15584324174eSBarry Smith b3 += bm4; 15594324174eSBarry Smith b4 += bm4; 15604324174eSBarry Smith } 15614324174eSBarry Smith for (; col<cn; col++) { /* over extra columns of C */ 15624324174eSBarry Smith colam = col*am; 15634324174eSBarry Smith arm = a->compressedrow.nrows; 15644324174eSBarry Smith ii = a->compressedrow.i; 15654324174eSBarry Smith ridx = a->compressedrow.rindex; 15664324174eSBarry Smith for (i=0; i<arm; i++) { /* over rows of C in those columns */ 15674324174eSBarry Smith r1 = 0.0; 15684324174eSBarry Smith n = ii[i+1] - ii[i]; 15694324174eSBarry Smith aj = a->j + ii[i]; 15704324174eSBarry Smith aa = a->a + ii[i]; 15714324174eSBarry Smith 15724324174eSBarry Smith for (j=0; j<n; j++) { 15734324174eSBarry Smith r1 += (*aa++)*b1[*aj++]; 15744324174eSBarry Smith } 1575a2ea699eSBarry Smith c[colam + ridx[i]] += r1; 15764324174eSBarry Smith } 15774324174eSBarry Smith b1 += bm; 15784324174eSBarry Smith } 15794324174eSBarry Smith } else { 1580f32d5d43SBarry Smith for (col=0; col<cn-4; col += 4) { /* over columns of C */ 1581f32d5d43SBarry Smith colam = col*am; 1582f32d5d43SBarry Smith for (i=0; i<am; i++) { /* over rows of C in those columns */ 1583f32d5d43SBarry Smith r1 = r2 = r3 = r4 = 0.0; 1584f32d5d43SBarry Smith n = a->i[i+1] - a->i[i]; 1585f32d5d43SBarry Smith aj = a->j + a->i[i]; 1586f32d5d43SBarry Smith aa = a->a + a->i[i]; 1587f32d5d43SBarry Smith for (j=0; j<n; j++) { 1588f32d5d43SBarry Smith r1 += (*aa)*b1[*aj]; 1589f32d5d43SBarry Smith r2 += (*aa)*b2[*aj]; 1590f32d5d43SBarry Smith r3 += (*aa)*b3[*aj]; 1591f32d5d43SBarry Smith r4 += (*aa++)*b4[*aj++]; 1592f32d5d43SBarry Smith } 1593f32d5d43SBarry Smith c[colam + i] += r1; 1594f32d5d43SBarry Smith c[colam + am + i] += r2; 1595f32d5d43SBarry Smith c[colam + am2 + i] += r3; 1596f32d5d43SBarry Smith c[colam + am3 + i] += r4; 1597f32d5d43SBarry Smith } 1598f32d5d43SBarry Smith b1 += bm4; 1599f32d5d43SBarry Smith b2 += bm4; 1600f32d5d43SBarry Smith b3 += bm4; 1601f32d5d43SBarry Smith b4 += bm4; 1602f32d5d43SBarry Smith } 1603f32d5d43SBarry Smith for (; col<cn; col++) { /* over extra columns of C */ 1604a2ea699eSBarry Smith colam = col*am; 1605f32d5d43SBarry Smith for (i=0; i<am; i++) { /* over rows of C in those columns */ 1606f32d5d43SBarry Smith r1 = 0.0; 1607f32d5d43SBarry Smith n = a->i[i+1] - a->i[i]; 1608f32d5d43SBarry Smith aj = a->j + a->i[i]; 1609f32d5d43SBarry Smith aa = a->a + a->i[i]; 1610f32d5d43SBarry Smith 1611f32d5d43SBarry Smith for (j=0; j<n; j++) { 1612f32d5d43SBarry Smith r1 += (*aa++)*b1[*aj++]; 1613f32d5d43SBarry Smith } 1614a2ea699eSBarry Smith c[colam + i] += r1; 1615f32d5d43SBarry Smith } 1616f32d5d43SBarry Smith b1 += bm; 1617f32d5d43SBarry Smith } 16184324174eSBarry Smith } 1619dc0b31edSSatish Balay ierr = PetscLogFlops(cn*2.0*a->nz);CHKERRQ(ierr); 16208c778c55SBarry Smith ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr); 16219123193aSHong Zhang PetscFunctionReturn(0); 16229123193aSHong Zhang } 1623b1683b59SHong Zhang 1624b9af6bddSHong Zhang PetscErrorCode MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense) 1625c8db22f6SHong Zhang { 1626c8db22f6SHong Zhang PetscErrorCode ierr; 16272f5f1f90SHong Zhang Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 16282f5f1f90SHong Zhang Mat_SeqDense *btdense = (Mat_SeqDense*)Btdense->data; 16292f5f1f90SHong Zhang PetscInt *bi = b->i,*bj=b->j; 16302f5f1f90SHong Zhang PetscInt m = Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns; 16312f5f1f90SHong Zhang MatScalar *btval,*btval_den,*ba=b->a; 16322f5f1f90SHong Zhang PetscInt *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors; 1633c8db22f6SHong Zhang 1634c8db22f6SHong Zhang PetscFunctionBegin; 16352f5f1f90SHong Zhang btval_den=btdense->v; 16362f5f1f90SHong Zhang ierr = PetscMemzero(btval_den,(m*n)*sizeof(MatScalar));CHKERRQ(ierr); 16372f5f1f90SHong Zhang for (k=0; k<ncolors; k++) { 16382f5f1f90SHong Zhang ncolumns = coloring->ncolumns[k]; 16392f5f1f90SHong Zhang for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */ 1640d2853540SHong Zhang col = *(columns + colorforcol[k] + l); 16412f5f1f90SHong Zhang btcol = bj + bi[col]; 16422f5f1f90SHong Zhang btval = ba + bi[col]; 16432f5f1f90SHong Zhang anz = bi[col+1] - bi[col]; 1644d2853540SHong Zhang for (j=0; j<anz; j++) { 16452f5f1f90SHong Zhang brow = btcol[j]; 16462f5f1f90SHong Zhang btval_den[brow] = btval[j]; 1647c8db22f6SHong Zhang } 1648c8db22f6SHong Zhang } 16492f5f1f90SHong Zhang btval_den += m; 1650c8db22f6SHong Zhang } 1651c8db22f6SHong Zhang PetscFunctionReturn(0); 1652c8db22f6SHong Zhang } 1653c8db22f6SHong Zhang 1654b9af6bddSHong Zhang PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp) 16558972f759SHong Zhang { 16568972f759SHong Zhang PetscErrorCode ierr; 1657b2d2b04fSHong Zhang Mat_SeqAIJ *csp = (Mat_SeqAIJ*)Csp->data; 1658077f23c2SHong Zhang PetscScalar *ca_den,*ca_den_ptr,*ca=csp->a; 1659f99a636bSHong Zhang PetscInt k,l,m=Cden->rmap->n,ncolors=matcoloring->ncolors; 1660e88777f2SHong Zhang PetscInt brows=matcoloring->brows,*den2sp=matcoloring->den2sp; 1661077f23c2SHong Zhang PetscInt nrows,*row,*idx; 1662077f23c2SHong Zhang PetscInt *rows=matcoloring->rows,*colorforrow=matcoloring->colorforrow; 16638972f759SHong Zhang 16648972f759SHong Zhang PetscFunctionBegin; 1665a3fe58edSHong Zhang ierr = MatDenseGetArray(Cden,&ca_den);CHKERRQ(ierr); 1666f99a636bSHong Zhang 1667077f23c2SHong Zhang if (brows > 0) { 1668077f23c2SHong Zhang PetscInt *lstart,row_end,row_start; 1669980ae229SHong Zhang lstart = matcoloring->lstart; 1670eeb4fd02SHong Zhang ierr = PetscMemzero(lstart,ncolors*sizeof(PetscInt));CHKERRQ(ierr); 1671eeb4fd02SHong Zhang 1672077f23c2SHong Zhang row_end = brows; 1673eeb4fd02SHong Zhang if (row_end > m) row_end = m; 1674077f23c2SHong Zhang for (row_start=0; row_start<m; row_start+=brows) { /* loop over row blocks of Csp */ 1675077f23c2SHong Zhang ca_den_ptr = ca_den; 1676980ae229SHong Zhang for (k=0; k<ncolors; k++) { /* loop over colors (columns of Cden) */ 1677eeb4fd02SHong Zhang nrows = matcoloring->nrows[k]; 1678eeb4fd02SHong Zhang row = rows + colorforrow[k]; 1679eeb4fd02SHong Zhang idx = den2sp + colorforrow[k]; 1680eeb4fd02SHong Zhang for (l=lstart[k]; l<nrows; l++) { 1681eeb4fd02SHong Zhang if (row[l] >= row_end) { 1682eeb4fd02SHong Zhang lstart[k] = l; 1683eeb4fd02SHong Zhang break; 1684eeb4fd02SHong Zhang } else { 1685077f23c2SHong Zhang ca[idx[l]] = ca_den_ptr[row[l]]; 1686eeb4fd02SHong Zhang } 1687eeb4fd02SHong Zhang } 1688077f23c2SHong Zhang ca_den_ptr += m; 1689eeb4fd02SHong Zhang } 1690077f23c2SHong Zhang row_end += brows; 1691eeb4fd02SHong Zhang if (row_end > m) row_end = m; 1692eeb4fd02SHong Zhang } 1693077f23c2SHong Zhang } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */ 1694077f23c2SHong Zhang ca_den_ptr = ca_den; 1695077f23c2SHong Zhang for (k=0; k<ncolors; k++) { 1696077f23c2SHong Zhang nrows = matcoloring->nrows[k]; 1697077f23c2SHong Zhang row = rows + colorforrow[k]; 1698077f23c2SHong Zhang idx = den2sp + colorforrow[k]; 1699077f23c2SHong Zhang for (l=0; l<nrows; l++) { 1700077f23c2SHong Zhang ca[idx[l]] = ca_den_ptr[row[l]]; 1701077f23c2SHong Zhang } 1702077f23c2SHong Zhang ca_den_ptr += m; 1703077f23c2SHong Zhang } 1704f99a636bSHong Zhang } 1705f99a636bSHong Zhang 1706a3fe58edSHong Zhang ierr = MatDenseRestoreArray(Cden,&ca_den);CHKERRQ(ierr); 1707f99a636bSHong Zhang #if defined(PETSC_USE_INFO) 1708077f23c2SHong Zhang if (matcoloring->brows > 0) { 1709f99a636bSHong Zhang ierr = PetscInfo1(Csp,"Loop over %D row blocks for den2sp\n",brows);CHKERRQ(ierr); 1710e88777f2SHong Zhang } else { 1711077f23c2SHong Zhang ierr = PetscInfo(Csp,"Loop over colors/columns of Cden, inefficient for large sparse matrix product \n");CHKERRQ(ierr); 1712e88777f2SHong Zhang } 1713f99a636bSHong Zhang #endif 17148972f759SHong Zhang PetscFunctionReturn(0); 17158972f759SHong Zhang } 17168972f759SHong Zhang 1717b9af6bddSHong Zhang PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c) 1718b1683b59SHong Zhang { 1719b1683b59SHong Zhang PetscErrorCode ierr; 1720e88777f2SHong Zhang PetscInt i,n,nrows,Nbs,j,k,m,ncols,col,cm; 17211a83f524SJed Brown const PetscInt *is,*ci,*cj,*row_idx; 1722b28d80bdSHong Zhang PetscInt nis = iscoloring->n,*rowhit,bs = 1; 1723b1683b59SHong Zhang IS *isa; 1724b28d80bdSHong Zhang Mat_SeqAIJ *csp = (Mat_SeqAIJ*)mat->data; 1725e88777f2SHong Zhang PetscInt *colorforrow,*rows,*rows_i,*idxhit,*spidx,*den2sp,*den2sp_i; 1726e88777f2SHong Zhang PetscInt *colorforcol,*columns,*columns_i,brows; 1727e88777f2SHong Zhang PetscBool flg; 1728b1683b59SHong Zhang 1729b1683b59SHong Zhang PetscFunctionBegin; 1730b1683b59SHong Zhang ierr = ISColoringGetIS(iscoloring,PETSC_IGNORE,&isa);CHKERRQ(ierr); 1731e99be685SHong Zhang 17327ecccc15SHong Zhang /* bs >1 is not being tested yet! */ 1733e88777f2SHong Zhang Nbs = mat->cmap->N/bs; 1734b1683b59SHong Zhang c->M = mat->rmap->N/bs; /* set total rows, columns and local rows */ 1735e88777f2SHong Zhang c->N = Nbs; 1736e88777f2SHong Zhang c->m = c->M; 1737b1683b59SHong Zhang c->rstart = 0; 1738e88777f2SHong Zhang c->brows = 100; 1739b1683b59SHong Zhang 1740b1683b59SHong Zhang c->ncolors = nis; 1741dcca6d9dSJed Brown ierr = PetscMalloc3(nis,&c->ncolumns,nis,&c->nrows,nis+1,&colorforrow);CHKERRQ(ierr); 1742854ce69bSBarry Smith ierr = PetscMalloc1(csp->nz+1,&rows);CHKERRQ(ierr); 1743854ce69bSBarry Smith ierr = PetscMalloc1(csp->nz+1,&den2sp);CHKERRQ(ierr); 1744e88777f2SHong Zhang 1745e88777f2SHong Zhang brows = c->brows; 1746c5929fdfSBarry Smith ierr = PetscOptionsGetInt(NULL,NULL,"-matden2sp_brows",&brows,&flg);CHKERRQ(ierr); 1747e88777f2SHong Zhang if (flg) c->brows = brows; 1748eeb4fd02SHong Zhang if (brows > 0) { 1749854ce69bSBarry Smith ierr = PetscMalloc1(nis+1,&c->lstart);CHKERRQ(ierr); 1750e88777f2SHong Zhang } 17512205254eSKarl Rupp 1752d2853540SHong Zhang colorforrow[0] = 0; 1753d2853540SHong Zhang rows_i = rows; 1754f99a636bSHong Zhang den2sp_i = den2sp; 1755b1683b59SHong Zhang 1756854ce69bSBarry Smith ierr = PetscMalloc1(nis+1,&colorforcol);CHKERRQ(ierr); 1757854ce69bSBarry Smith ierr = PetscMalloc1(Nbs+1,&columns);CHKERRQ(ierr); 17582205254eSKarl Rupp 1759d2853540SHong Zhang colorforcol[0] = 0; 1760d2853540SHong Zhang columns_i = columns; 1761d2853540SHong Zhang 1762077f23c2SHong Zhang /* get column-wise storage of mat */ 1763077f23c2SHong Zhang ierr = MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr); 1764b1683b59SHong Zhang 1765b28d80bdSHong Zhang cm = c->m; 1766854ce69bSBarry Smith ierr = PetscMalloc1(cm+1,&rowhit);CHKERRQ(ierr); 1767854ce69bSBarry Smith ierr = PetscMalloc1(cm+1,&idxhit);CHKERRQ(ierr); 1768f99a636bSHong Zhang for (i=0; i<nis; i++) { /* loop over color */ 1769b1683b59SHong Zhang ierr = ISGetLocalSize(isa[i],&n);CHKERRQ(ierr); 1770b1683b59SHong Zhang ierr = ISGetIndices(isa[i],&is);CHKERRQ(ierr); 17712205254eSKarl Rupp 1772b1683b59SHong Zhang c->ncolumns[i] = n; 1773b1683b59SHong Zhang if (n) { 1774d2853540SHong Zhang ierr = PetscMemcpy(columns_i,is,n*sizeof(PetscInt));CHKERRQ(ierr); 1775b1683b59SHong Zhang } 1776d2853540SHong Zhang colorforcol[i+1] = colorforcol[i] + n; 1777d2853540SHong Zhang columns_i += n; 1778b1683b59SHong Zhang 1779b1683b59SHong Zhang /* fast, crude version requires O(N*N) work */ 1780e8354b3eSHong Zhang ierr = PetscMemzero(rowhit,cm*sizeof(PetscInt));CHKERRQ(ierr); 1781e99be685SHong Zhang 1782b7caf3d6SHong Zhang for (j=0; j<n; j++) { /* loop over columns*/ 1783b1683b59SHong Zhang col = is[j]; 1784d2853540SHong Zhang row_idx = cj + ci[col]; 1785b1683b59SHong Zhang m = ci[col+1] - ci[col]; 1786b7caf3d6SHong Zhang for (k=0; k<m; k++) { /* loop over columns marking them in rowhit */ 1787e99be685SHong Zhang idxhit[*row_idx] = spidx[ci[col] + k]; 1788d2853540SHong Zhang rowhit[*row_idx++] = col + 1; 1789b1683b59SHong Zhang } 1790b1683b59SHong Zhang } 1791b1683b59SHong Zhang /* count the number of hits */ 1792b1683b59SHong Zhang nrows = 0; 1793e8354b3eSHong Zhang for (j=0; j<cm; j++) { 1794b1683b59SHong Zhang if (rowhit[j]) nrows++; 1795b1683b59SHong Zhang } 1796b1683b59SHong Zhang c->nrows[i] = nrows; 1797d2853540SHong Zhang colorforrow[i+1] = colorforrow[i] + nrows; 1798d2853540SHong Zhang 1799b1683b59SHong Zhang nrows = 0; 1800b7caf3d6SHong Zhang for (j=0; j<cm; j++) { /* loop over rows */ 1801b1683b59SHong Zhang if (rowhit[j]) { 1802d2853540SHong Zhang rows_i[nrows] = j; 180312b89a43SHong Zhang den2sp_i[nrows] = idxhit[j]; 1804b1683b59SHong Zhang nrows++; 1805b1683b59SHong Zhang } 1806b1683b59SHong Zhang } 1807e88777f2SHong Zhang den2sp_i += nrows; 1808077f23c2SHong Zhang 1809b1683b59SHong Zhang ierr = ISRestoreIndices(isa[i],&is);CHKERRQ(ierr); 1810f99a636bSHong Zhang rows_i += nrows; 1811b1683b59SHong Zhang } 18120298fd71SBarry Smith ierr = MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr); 1813b28d80bdSHong Zhang ierr = PetscFree(rowhit);CHKERRQ(ierr); 1814b1683b59SHong Zhang ierr = ISColoringRestoreIS(iscoloring,&isa);CHKERRQ(ierr); 1815d2853540SHong Zhang if (csp->nz != colorforrow[nis]) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"csp->nz %d != colorforrow[nis] %d",csp->nz,colorforrow[nis]); 1816b28d80bdSHong Zhang 1817d2853540SHong Zhang c->colorforrow = colorforrow; 1818d2853540SHong Zhang c->rows = rows; 1819f99a636bSHong Zhang c->den2sp = den2sp; 1820d2853540SHong Zhang c->colorforcol = colorforcol; 1821d2853540SHong Zhang c->columns = columns; 18222205254eSKarl Rupp 1823f94ccd6cSHong Zhang ierr = PetscFree(idxhit);CHKERRQ(ierr); 1824b1683b59SHong Zhang PetscFunctionReturn(0); 1825b1683b59SHong Zhang } 1826d013fc79Sandi selinger 1827*df97dc6dSFande Kong /* The combine method has done the symbolic and numeric in the first phase, and so we just return here */ 1828*df97dc6dSFande Kong PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Combined(Mat A,Mat B,Mat C) 1829*df97dc6dSFande Kong { 1830*df97dc6dSFande Kong PetscFunctionBegin; 1831*df97dc6dSFande Kong /* Empty function */ 1832*df97dc6dSFande Kong PetscFunctionReturn(0); 1833*df97dc6dSFande Kong } 1834*df97dc6dSFande Kong 183573b67375Sandi selinger /* This algorithm combines the symbolic and numeric phase of matrix-matrix multiplication. */ 1836d013fc79Sandi selinger PetscErrorCode MatMatMult_SeqAIJ_SeqAIJ_Combined(Mat A,Mat B,PetscReal fill,Mat *C) 1837d013fc79Sandi selinger { 1838d013fc79Sandi selinger PetscErrorCode ierr; 1839d013fc79Sandi selinger PetscLogDouble flops=0.0; 1840d013fc79Sandi selinger Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 18412869b61bSandi selinger const PetscInt *ai=a->i,*bi=b->i; 1842d013fc79Sandi selinger PetscInt *ci,*cj,*cj_i; 1843d013fc79Sandi selinger PetscScalar *ca,*ca_i; 18442869b61bSandi selinger PetscInt b_maxmemrow,c_maxmem,a_col; 1845d013fc79Sandi selinger PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 1846d013fc79Sandi selinger PetscInt i,k,ndouble=0; 1847d013fc79Sandi selinger PetscReal afill; 1848d013fc79Sandi selinger PetscScalar *c_row_val_dense; 1849d013fc79Sandi selinger PetscBool *c_row_idx_flags; 1850d013fc79Sandi selinger PetscInt *aj_i=a->j; 1851d013fc79Sandi selinger PetscScalar *aa_i=a->a; 1852d013fc79Sandi selinger 1853d013fc79Sandi selinger PetscFunctionBegin; 18542869b61bSandi selinger 18552869b61bSandi selinger /* Step 1: Determine upper bounds on memory for C and allocate memory */ 18562869b61bSandi selinger /* This should be enough for almost all matrices. If still more memory is needed, it is reallocated later. */ 18572869b61bSandi selinger c_maxmem = 8*(ai[am]+bi[bm]); 18582869b61bSandi selinger b_maxmemrow = PetscMin(bi[bm],bn); 1859d013fc79Sandi selinger ierr = PetscMalloc1(am+1,&ci);CHKERRQ(ierr); 1860d013fc79Sandi selinger ierr = PetscMalloc1(bn,&c_row_val_dense);CHKERRQ(ierr); 1861d013fc79Sandi selinger ierr = PetscMalloc1(bn,&c_row_idx_flags);CHKERRQ(ierr); 1862d013fc79Sandi selinger ierr = PetscMalloc1(c_maxmem,&cj);CHKERRQ(ierr); 1863d013fc79Sandi selinger ierr = PetscMalloc1(c_maxmem,&ca);CHKERRQ(ierr); 1864d013fc79Sandi selinger ca_i = ca; 1865d013fc79Sandi selinger cj_i = cj; 1866d013fc79Sandi selinger ci[0] = 0; 186773b67375Sandi selinger ierr = PetscMemzero(c_row_val_dense,bn*sizeof(PetscScalar));CHKERRQ(ierr); 186873b67375Sandi selinger ierr = PetscMemzero(c_row_idx_flags,bn*sizeof(PetscBool));CHKERRQ(ierr); 1869d013fc79Sandi selinger for (i=0; i<am; i++) { 1870d013fc79Sandi selinger /* Step 2: Initialize the dense row vector for C */ 1871d013fc79Sandi selinger const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 1872d013fc79Sandi selinger PetscInt cnzi = 0; 1873d013fc79Sandi selinger PetscInt *bj_i; 1874d013fc79Sandi selinger PetscScalar *ba_i; 18752869b61bSandi selinger /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem -> allocate more memory 18762869b61bSandi selinger Usually, there is enough memory in the first place, so this is not executed. */ 18772869b61bSandi selinger while (ci[i] + b_maxmemrow > c_maxmem) { 18782869b61bSandi selinger c_maxmem *= 2; 18792869b61bSandi selinger ndouble++; 1880928bb9adSStefano Zampini ierr = PetscRealloc(sizeof(PetscInt)*c_maxmem,&cj);CHKERRQ(ierr); 1881928bb9adSStefano Zampini ierr = PetscRealloc(sizeof(PetscScalar)*c_maxmem,&ca);CHKERRQ(ierr); 18822869b61bSandi selinger } 1883d013fc79Sandi selinger 1884d013fc79Sandi selinger /* Step 3: Do the numerical calculations */ 1885d013fc79Sandi selinger for (a_col=0; a_col<anzi; a_col++) { /* iterate over all non zero values in a row of A */ 1886d013fc79Sandi selinger PetscInt a_col_index = aj_i[a_col]; 1887d013fc79Sandi selinger const PetscInt bnzi = bi[a_col_index+1] - bi[a_col_index]; 1888d013fc79Sandi selinger flops += 2*bnzi; 1889d013fc79Sandi selinger bj_i = b->j + bi[a_col_index]; /* points to the current row in bj */ 1890d013fc79Sandi selinger ba_i = b->a + bi[a_col_index]; /* points to the current row in ba */ 1891d013fc79Sandi selinger for (k=0; k<bnzi; ++k) { /* iterate over all non zeros of this row in B */ 1892d013fc79Sandi selinger if (c_row_idx_flags[bj_i[k]] == PETSC_FALSE) { 18932869b61bSandi selinger cj_i[cnzi++] = bj_i[k]; 1894d013fc79Sandi selinger c_row_idx_flags[bj_i[k]] = PETSC_TRUE; 1895d013fc79Sandi selinger } 1896d013fc79Sandi selinger c_row_val_dense[bj_i[k]] += aa_i[a_col] * ba_i[k]; 1897d013fc79Sandi selinger } 1898d013fc79Sandi selinger } 1899d013fc79Sandi selinger 1900d013fc79Sandi selinger /* Sort array */ 19013353a62bSKarl Rupp ierr = PetscSortInt(cnzi,cj_i);CHKERRQ(ierr); 1902d013fc79Sandi selinger /* Step 4 */ 1903d013fc79Sandi selinger for (k=0; k<cnzi; k++) { 1904d013fc79Sandi selinger ca_i[k] = c_row_val_dense[cj_i[k]]; 1905d013fc79Sandi selinger c_row_val_dense[cj_i[k]] = 0.; 1906d013fc79Sandi selinger c_row_idx_flags[cj_i[k]] = PETSC_FALSE; 1907d013fc79Sandi selinger } 1908d013fc79Sandi selinger /* terminate current row */ 1909d013fc79Sandi selinger aa_i += anzi; 1910d013fc79Sandi selinger aj_i += anzi; 1911d013fc79Sandi selinger ca_i += cnzi; 1912d013fc79Sandi selinger cj_i += cnzi; 1913d013fc79Sandi selinger ci[i+1] = ci[i] + cnzi; 1914d013fc79Sandi selinger flops += cnzi; 1915d013fc79Sandi selinger } 1916d013fc79Sandi selinger 1917d013fc79Sandi selinger /* Step 5 */ 1918d013fc79Sandi selinger /* Create the new matrix */ 1919d013fc79Sandi selinger ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr); 1920d013fc79Sandi selinger ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr); 192102fe1965SBarry Smith ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr); 1922d013fc79Sandi selinger 1923d013fc79Sandi selinger /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 1924d013fc79Sandi selinger /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 1925d013fc79Sandi selinger c = (Mat_SeqAIJ*)((*C)->data); 1926d013fc79Sandi selinger c->a = ca; 1927d013fc79Sandi selinger c->free_a = PETSC_TRUE; 1928d013fc79Sandi selinger c->free_ij = PETSC_TRUE; 1929d013fc79Sandi selinger c->nonew = 0; 1930d013fc79Sandi selinger 1931d013fc79Sandi selinger /* set MatInfo */ 1932d013fc79Sandi selinger afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 1933d013fc79Sandi selinger if (afill < 1.0) afill = 1.0; 1934d013fc79Sandi selinger c->maxnz = ci[am]; 1935d013fc79Sandi selinger c->nz = ci[am]; 1936d013fc79Sandi selinger (*C)->info.mallocs = ndouble; 1937d013fc79Sandi selinger (*C)->info.fill_ratio_given = fill; 1938d013fc79Sandi selinger (*C)->info.fill_ratio_needed = afill; 1939*df97dc6dSFande Kong (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Combined; 1940d013fc79Sandi selinger 194173b67375Sandi selinger ierr = PetscFree(c_row_val_dense);CHKERRQ(ierr); 194273b67375Sandi selinger ierr = PetscFree(c_row_idx_flags);CHKERRQ(ierr); 1943d013fc79Sandi selinger 1944d013fc79Sandi selinger ierr = MatAssemblyBegin(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1945d013fc79Sandi selinger ierr = MatAssemblyEnd(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1946d013fc79Sandi selinger ierr = PetscLogFlops(flops);CHKERRQ(ierr); 1947d013fc79Sandi selinger PetscFunctionReturn(0); 1948d013fc79Sandi selinger } 1949