1be1d678aSKris Buschelman 2d50806bdSBarry Smith /* 32499ec78SHong Zhang Defines matrix-matrix product routines for pairs of SeqAIJ matrices 4d50806bdSBarry Smith C = A * B 5d50806bdSBarry Smith */ 6d50806bdSBarry Smith 7c6db04a5SJed Brown #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 8c6db04a5SJed Brown #include <../src/mat/utils/freespace.h> 9c6db04a5SJed Brown #include <petscbt.h> 10af0996ceSBarry Smith #include <petsc/private/isimpl.h> 1107475bc1SBarry Smith #include <../src/mat/impls/dense/seq/dense.h> 127bab7c10SHong Zhang 1358cf0668SJed Brown static PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat,Mat,PetscReal,Mat*); 14cd093f1dSJed Brown 155e5acdf2Sstefano_zampini #if defined(PETSC_HAVE_HYPRE) 165e5acdf2Sstefano_zampini PETSC_INTERN PetscErrorCode MatMatMultSymbolic_AIJ_AIJ_wHYPRE(Mat,Mat,PetscReal,Mat*); 175e5acdf2Sstefano_zampini #endif 185e5acdf2Sstefano_zampini 19150d2497SBarry Smith PETSC_INTERN PetscErrorCode MatMatMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C) 2038baddfdSBarry Smith { 21dfbe8321SBarry Smith PetscErrorCode ierr; 225e5acdf2Sstefano_zampini #if !defined(PETSC_HAVE_HYPRE) 23*d7ed1a4dSandi selinger const char *algTypes[8] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","combined","rowmerge"}; 24d013fc79Sandi selinger PetscInt nalg = 8; 25*d7ed1a4dSandi selinger #else 26*d7ed1a4dSandi selinger const char *algTypes[9] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","combined","rowmerge","hypre"}; 27*d7ed1a4dSandi selinger PetscInt nalg = 9; 285e5acdf2Sstefano_zampini #endif 296540a9cdSHong Zhang PetscInt alg = 0; /* set default algorithm */ 30d013fc79Sandi selinger PetscBool combined = PETSC_FALSE; /* Indicates whether the symbolic stage already computed the numerical values. */ 315c66b693SKris Buschelman 325c66b693SKris Buschelman PetscFunctionBegin; 3326be0446SHong Zhang if (scall == MAT_INITIAL_MATRIX) { 34152983bfSHong Zhang ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr); 3568eacb73SHong Zhang PetscOptionsObject->alreadyprinted = PETSC_FALSE; /* a hack to ensure the option shows in '-help' */ 365e5acdf2Sstefano_zampini ierr = PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,nalg,algTypes[0],&alg,NULL);CHKERRQ(ierr); 37d8bbc50fSBarry Smith ierr = PetscOptionsEnd();CHKERRQ(ierr); 383ff4c91cSHong Zhang ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr); 396540a9cdSHong Zhang switch (alg) { 406540a9cdSHong Zhang case 1: 41aacf854cSBarry Smith ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A,B,fill,C);CHKERRQ(ierr); 426540a9cdSHong Zhang break; 436540a9cdSHong Zhang case 2: 446540a9cdSHong Zhang ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A,B,fill,C);CHKERRQ(ierr); 456540a9cdSHong Zhang break; 466540a9cdSHong Zhang case 3: 470ced3a2bSJed Brown ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A,B,fill,C);CHKERRQ(ierr); 486540a9cdSHong Zhang break; 496540a9cdSHong Zhang case 4: 508a07c6f1SJed Brown ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A,B,fill,C);CHKERRQ(ierr); 516540a9cdSHong Zhang break; 526540a9cdSHong Zhang case 5: 5358cf0668SJed Brown ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A,B,fill,C);CHKERRQ(ierr); 546540a9cdSHong Zhang break; 555e5acdf2Sstefano_zampini case 6: 56d013fc79Sandi selinger ierr = MatMatMult_SeqAIJ_SeqAIJ_Combined(A,B,fill,C);CHKERRQ(ierr); 57d013fc79Sandi selinger combined = PETSC_TRUE; 58d013fc79Sandi selinger break; 59d013fc79Sandi selinger case 7: 60*d7ed1a4dSandi selinger ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A,B,fill,C);CHKERRQ(ierr); 61*d7ed1a4dSandi selinger break; 62*d7ed1a4dSandi selinger #if defined(PETSC_HAVE_HYPRE) 63*d7ed1a4dSandi selinger case 8: 645e5acdf2Sstefano_zampini ierr = MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A,B,fill,C);CHKERRQ(ierr); 655e5acdf2Sstefano_zampini break; 665e5acdf2Sstefano_zampini #endif 676540a9cdSHong Zhang default: 6826be0446SHong Zhang ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr); 696540a9cdSHong Zhang break; 7025023028SHong Zhang } 713ff4c91cSHong Zhang ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr); 7226be0446SHong Zhang } 735c913ed7SHong Zhang 743ff4c91cSHong Zhang ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr); 75d013fc79Sandi selinger if (!combined) { 7601e47db4SHong Zhang ierr = (*(*C)->ops->matmultnumeric)(A,B,*C);CHKERRQ(ierr); 77d013fc79Sandi selinger } 783ff4c91cSHong Zhang ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr); 795c66b693SKris Buschelman PetscFunctionReturn(0); 805c66b693SKris Buschelman } 811c24bd37SHong Zhang 8258cf0668SJed Brown static PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A,Mat B,PetscReal fill,Mat *C) 83b561aa9dSHong Zhang { 84b561aa9dSHong Zhang PetscErrorCode ierr; 85b561aa9dSHong Zhang Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 86971236abSHong Zhang PetscInt *ai=a->i,*bi=b->i,*ci,*cj; 87c1ba5575SJed Brown PetscInt am =A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 88b561aa9dSHong Zhang PetscReal afill; 89eca6b86aSHong Zhang PetscInt i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax; 90eca6b86aSHong Zhang PetscTable ta; 91fb908031SHong Zhang PetscBT lnkbt; 920298fd71SBarry Smith PetscFreeSpaceList free_space=NULL,current_space=NULL; 93b561aa9dSHong Zhang 94b561aa9dSHong Zhang PetscFunctionBegin; 95bd958071SHong Zhang /* Get ci and cj */ 96bd958071SHong Zhang /*---------------*/ 97fb908031SHong Zhang /* Allocate ci array, arrays for fill computation and */ 98fb908031SHong Zhang /* free space for accumulating nonzero column info */ 99854ce69bSBarry Smith ierr = PetscMalloc1(am+2,&ci);CHKERRQ(ierr); 100fb908031SHong Zhang ci[0] = 0; 101fb908031SHong Zhang 102fb908031SHong Zhang /* create and initialize a linked list */ 103c373ccc6SHong Zhang ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr); 104c373ccc6SHong Zhang MatRowMergeMax_SeqAIJ(b,bm,ta); 105eca6b86aSHong Zhang ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); 106eca6b86aSHong Zhang ierr = PetscTableDestroy(&ta);CHKERRQ(ierr); 107eca6b86aSHong Zhang 108eca6b86aSHong Zhang ierr = PetscLLCondensedCreate(Crmax,bn,&lnk,&lnkbt);CHKERRQ(ierr); 109fb908031SHong Zhang 110fb908031SHong Zhang /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 111f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr); 1122205254eSKarl Rupp 113fb908031SHong Zhang current_space = free_space; 114fb908031SHong Zhang 115fb908031SHong Zhang /* Determine ci and cj */ 116fb908031SHong Zhang for (i=0; i<am; i++) { 117fb908031SHong Zhang anzi = ai[i+1] - ai[i]; 118fb908031SHong Zhang aj = a->j + ai[i]; 119fb908031SHong Zhang for (j=0; j<anzi; j++) { 120fb908031SHong Zhang brow = aj[j]; 121fb908031SHong Zhang bnzj = bi[brow+1] - bi[brow]; 122fb908031SHong Zhang bj = b->j + bi[brow]; 123fb908031SHong Zhang /* add non-zero cols of B into the sorted linked list lnk */ 124fb908031SHong Zhang ierr = PetscLLCondensedAddSorted(bnzj,bj,lnk,lnkbt);CHKERRQ(ierr); 125fb908031SHong Zhang } 126fb908031SHong Zhang cnzi = lnk[0]; 127fb908031SHong Zhang 128fb908031SHong Zhang /* If free space is not available, make more free space */ 129fb908031SHong Zhang /* Double the amount of total space in the list */ 130fb908031SHong Zhang if (current_space->local_remaining<cnzi) { 131f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),¤t_space);CHKERRQ(ierr); 132fb908031SHong Zhang ndouble++; 133fb908031SHong Zhang } 134fb908031SHong Zhang 135fb908031SHong Zhang /* Copy data into free space, then initialize lnk */ 136fb908031SHong Zhang ierr = PetscLLCondensedClean(bn,cnzi,current_space->array,lnk,lnkbt);CHKERRQ(ierr); 1372205254eSKarl Rupp 138fb908031SHong Zhang current_space->array += cnzi; 139fb908031SHong Zhang current_space->local_used += cnzi; 140fb908031SHong Zhang current_space->local_remaining -= cnzi; 1412205254eSKarl Rupp 142fb908031SHong Zhang ci[i+1] = ci[i] + cnzi; 143fb908031SHong Zhang } 144fb908031SHong Zhang 145fb908031SHong Zhang /* Column indices are in the list of free space */ 146fb908031SHong Zhang /* Allocate space for cj, initialize cj, and */ 147fb908031SHong Zhang /* destroy list of free space and other temporary array(s) */ 148854ce69bSBarry Smith ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr); 149fb908031SHong Zhang ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr); 150fb908031SHong Zhang ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr); 151b561aa9dSHong Zhang 15226be0446SHong Zhang /* put together the new symbolic matrix */ 153ce94432eSBarry Smith ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr); 15433d57670SJed Brown ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr); 15502fe1965SBarry Smith ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr); 15658c24d83SHong Zhang 15758c24d83SHong Zhang /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 15858c24d83SHong Zhang /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 15958c24d83SHong Zhang c = (Mat_SeqAIJ*)((*C)->data); 160aa1aec99SHong Zhang c->free_a = PETSC_FALSE; 161e6b907acSBarry Smith c->free_ij = PETSC_TRUE; 16258c24d83SHong Zhang c->nonew = 0; 163002e8affSHong Zhang (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ; /* fast, needs non-scalable O(bn) array 'abdense' */ 1640b7e3e3dSHong Zhang 1657212da7cSHong Zhang /* set MatInfo */ 1667212da7cSHong Zhang afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 167f2b054eeSHong Zhang if (afill < 1.0) afill = 1.0; 1687212da7cSHong Zhang c->maxnz = ci[am]; 1697212da7cSHong Zhang c->nz = ci[am]; 170fb908031SHong Zhang (*C)->info.mallocs = ndouble; 1717212da7cSHong Zhang (*C)->info.fill_ratio_given = fill; 1727212da7cSHong Zhang (*C)->info.fill_ratio_needed = afill; 1737212da7cSHong Zhang 1747212da7cSHong Zhang #if defined(PETSC_USE_INFO) 1757212da7cSHong Zhang if (ci[am]) { 17657622a8eSBarry Smith ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 17757622a8eSBarry Smith ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 178f2b054eeSHong Zhang } else { 179f2b054eeSHong Zhang ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr); 180be0fcf8dSHong Zhang } 181f2b054eeSHong Zhang #endif 18258c24d83SHong Zhang PetscFunctionReturn(0); 18358c24d83SHong Zhang } 184d50806bdSBarry Smith 185dfbe8321SBarry Smith PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C) 186d50806bdSBarry Smith { 187dfbe8321SBarry Smith PetscErrorCode ierr; 188d13dce4bSSatish Balay PetscLogDouble flops=0.0; 189d50806bdSBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 190d50806bdSBarry Smith Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 191d50806bdSBarry Smith Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 19238baddfdSBarry Smith PetscInt *ai =a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j; 193c58ca2e3SHong Zhang PetscInt am =A->rmap->n,cm=C->rmap->n; 194519eb980SHong Zhang PetscInt i,j,k,anzi,bnzi,cnzi,brow; 195aa1aec99SHong Zhang PetscScalar *aa=a->a,*ba=b->a,*baj,*ca,valtmp; 196aa1aec99SHong Zhang PetscScalar *ab_dense; 197d50806bdSBarry Smith 198d50806bdSBarry Smith PetscFunctionBegin; 199aa1aec99SHong Zhang if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */ 200854ce69bSBarry Smith ierr = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr); 201aa1aec99SHong Zhang c->a = ca; 202aa1aec99SHong Zhang c->free_a = PETSC_TRUE; 203aa1aec99SHong Zhang } else { 204aa1aec99SHong Zhang ca = c->a; 205d90d86d0SMatthew G. Knepley } 206d90d86d0SMatthew G. Knepley if (!c->matmult_abdense) { 2071795a4d1SJed Brown ierr = PetscCalloc1(B->cmap->N,&ab_dense);CHKERRQ(ierr); 208d90d86d0SMatthew G. Knepley c->matmult_abdense = ab_dense; 209d90d86d0SMatthew G. Knepley } else { 210aa1aec99SHong Zhang ab_dense = c->matmult_abdense; 211aa1aec99SHong Zhang } 212aa1aec99SHong Zhang 213c124e916SHong Zhang /* clean old values in C */ 214c124e916SHong Zhang ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr); 215d50806bdSBarry Smith /* Traverse A row-wise. */ 216d50806bdSBarry Smith /* Build the ith row in C by summing over nonzero columns in A, */ 217d50806bdSBarry Smith /* the rows of B corresponding to nonzeros of A. */ 218d50806bdSBarry Smith for (i=0; i<am; i++) { 219d50806bdSBarry Smith anzi = ai[i+1] - ai[i]; 220d50806bdSBarry Smith for (j=0; j<anzi; j++) { 221519eb980SHong Zhang brow = aj[j]; 222d50806bdSBarry Smith bnzi = bi[brow+1] - bi[brow]; 223d50806bdSBarry Smith bjj = bj + bi[brow]; 224d50806bdSBarry Smith baj = ba + bi[brow]; 225519eb980SHong Zhang /* perform dense axpy */ 22636ec6d2dSHong Zhang valtmp = aa[j]; 227519eb980SHong Zhang for (k=0; k<bnzi; k++) { 22836ec6d2dSHong Zhang ab_dense[bjj[k]] += valtmp*baj[k]; 229519eb980SHong Zhang } 230519eb980SHong Zhang flops += 2*bnzi; 231519eb980SHong Zhang } 232c58ca2e3SHong Zhang aj += anzi; aa += anzi; 233c58ca2e3SHong Zhang 234c58ca2e3SHong Zhang cnzi = ci[i+1] - ci[i]; 235519eb980SHong Zhang for (k=0; k<cnzi; k++) { 236519eb980SHong Zhang ca[k] += ab_dense[cj[k]]; 237519eb980SHong Zhang ab_dense[cj[k]] = 0.0; /* zero ab_dense */ 238519eb980SHong Zhang } 239519eb980SHong Zhang flops += cnzi; 240519eb980SHong Zhang cj += cnzi; ca += cnzi; 241519eb980SHong Zhang } 242c58ca2e3SHong Zhang ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 243c58ca2e3SHong Zhang ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 244c58ca2e3SHong Zhang ierr = PetscLogFlops(flops);CHKERRQ(ierr); 245c58ca2e3SHong Zhang PetscFunctionReturn(0); 246c58ca2e3SHong Zhang } 247c58ca2e3SHong Zhang 24825023028SHong Zhang PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C) 249c58ca2e3SHong Zhang { 250c58ca2e3SHong Zhang PetscErrorCode ierr; 251c58ca2e3SHong Zhang PetscLogDouble flops=0.0; 252c58ca2e3SHong Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 253c58ca2e3SHong Zhang Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 254c58ca2e3SHong Zhang Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 255c58ca2e3SHong Zhang PetscInt *ai = a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j; 256c58ca2e3SHong Zhang PetscInt am = A->rmap->N,cm=C->rmap->N; 257c58ca2e3SHong Zhang PetscInt i,j,k,anzi,bnzi,cnzi,brow; 25836ec6d2dSHong Zhang PetscScalar *aa=a->a,*ba=b->a,*baj,*ca=c->a,valtmp; 259c58ca2e3SHong Zhang PetscInt nextb; 260c58ca2e3SHong Zhang 261c58ca2e3SHong Zhang PetscFunctionBegin; 262cf742fccSHong Zhang if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */ 263cf742fccSHong Zhang ierr = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr); 264cf742fccSHong Zhang c->a = ca; 265cf742fccSHong Zhang c->free_a = PETSC_TRUE; 266cf742fccSHong Zhang } 267cf742fccSHong Zhang 268c58ca2e3SHong Zhang /* clean old values in C */ 269c58ca2e3SHong Zhang ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr); 270c58ca2e3SHong Zhang /* Traverse A row-wise. */ 271c58ca2e3SHong Zhang /* Build the ith row in C by summing over nonzero columns in A, */ 272c58ca2e3SHong Zhang /* the rows of B corresponding to nonzeros of A. */ 273519eb980SHong Zhang for (i=0; i<am; i++) { 274519eb980SHong Zhang anzi = ai[i+1] - ai[i]; 275519eb980SHong Zhang cnzi = ci[i+1] - ci[i]; 276519eb980SHong Zhang for (j=0; j<anzi; j++) { 277519eb980SHong Zhang brow = aj[j]; 278519eb980SHong Zhang bnzi = bi[brow+1] - bi[brow]; 279519eb980SHong Zhang bjj = bj + bi[brow]; 280519eb980SHong Zhang baj = ba + bi[brow]; 281519eb980SHong Zhang /* perform sparse axpy */ 28236ec6d2dSHong Zhang valtmp = aa[j]; 283c124e916SHong Zhang nextb = 0; 284c124e916SHong Zhang for (k=0; nextb<bnzi; k++) { 285c124e916SHong Zhang if (cj[k] == bjj[nextb]) { /* ccol == bcol */ 28636ec6d2dSHong Zhang ca[k] += valtmp*baj[nextb++]; 287c124e916SHong Zhang } 288d50806bdSBarry Smith } 289d50806bdSBarry Smith flops += 2*bnzi; 290d50806bdSBarry Smith } 291519eb980SHong Zhang aj += anzi; aa += anzi; 292519eb980SHong Zhang cj += cnzi; ca += cnzi; 293d50806bdSBarry Smith } 294c58ca2e3SHong Zhang 295716bacf3SKris Buschelman ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 296716bacf3SKris Buschelman ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 297d50806bdSBarry Smith ierr = PetscLogFlops(flops);CHKERRQ(ierr); 298d50806bdSBarry Smith PetscFunctionReturn(0); 299d50806bdSBarry Smith } 300bc011b1eSHong Zhang 3013c50cad2SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A,Mat B,PetscReal fill,Mat *C) 30225296bd5SBarry Smith { 30325296bd5SBarry Smith PetscErrorCode ierr; 30425296bd5SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 30525296bd5SBarry Smith PetscInt *ai = a->i,*bi=b->i,*ci,*cj; 3063c50cad2SHong Zhang PetscInt am = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 30725296bd5SBarry Smith MatScalar *ca; 30825296bd5SBarry Smith PetscReal afill; 309eca6b86aSHong Zhang PetscInt i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax; 310eca6b86aSHong Zhang PetscTable ta; 3110298fd71SBarry Smith PetscFreeSpaceList free_space=NULL,current_space=NULL; 31225296bd5SBarry Smith 31325296bd5SBarry Smith PetscFunctionBegin; 3143c50cad2SHong Zhang /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */ 3153c50cad2SHong Zhang /*-----------------------------------------------------------------------------------------*/ 3163c50cad2SHong Zhang /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 317854ce69bSBarry Smith ierr = PetscMalloc1(am+2,&ci);CHKERRQ(ierr); 3183c50cad2SHong Zhang ci[0] = 0; 3193c50cad2SHong Zhang 3203c50cad2SHong Zhang /* create and initialize a linked list */ 321c373ccc6SHong Zhang ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr); 322c373ccc6SHong Zhang MatRowMergeMax_SeqAIJ(b,bm,ta); 323eca6b86aSHong Zhang ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); 324eca6b86aSHong Zhang ierr = PetscTableDestroy(&ta);CHKERRQ(ierr); 325eca6b86aSHong Zhang 326eca6b86aSHong Zhang ierr = PetscLLCondensedCreate_fast(Crmax,&lnk);CHKERRQ(ierr); 3273c50cad2SHong Zhang 3283c50cad2SHong Zhang /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 329f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr); 3303c50cad2SHong Zhang current_space = free_space; 3313c50cad2SHong Zhang 3323c50cad2SHong Zhang /* Determine ci and cj */ 3333c50cad2SHong Zhang for (i=0; i<am; i++) { 3343c50cad2SHong Zhang anzi = ai[i+1] - ai[i]; 3353c50cad2SHong Zhang aj = a->j + ai[i]; 3363c50cad2SHong Zhang for (j=0; j<anzi; j++) { 3373c50cad2SHong Zhang brow = aj[j]; 3383c50cad2SHong Zhang bnzj = bi[brow+1] - bi[brow]; 3393c50cad2SHong Zhang bj = b->j + bi[brow]; 3403c50cad2SHong Zhang /* add non-zero cols of B into the sorted linked list lnk */ 3413c50cad2SHong Zhang ierr = PetscLLCondensedAddSorted_fast(bnzj,bj,lnk);CHKERRQ(ierr); 3423c50cad2SHong Zhang } 3433c50cad2SHong Zhang cnzi = lnk[1]; 3443c50cad2SHong Zhang 3453c50cad2SHong Zhang /* If free space is not available, make more free space */ 3463c50cad2SHong Zhang /* Double the amount of total space in the list */ 3473c50cad2SHong Zhang if (current_space->local_remaining<cnzi) { 348f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),¤t_space);CHKERRQ(ierr); 3493c50cad2SHong Zhang ndouble++; 3503c50cad2SHong Zhang } 3513c50cad2SHong Zhang 3523c50cad2SHong Zhang /* Copy data into free space, then initialize lnk */ 3533c50cad2SHong Zhang ierr = PetscLLCondensedClean_fast(cnzi,current_space->array,lnk);CHKERRQ(ierr); 3542205254eSKarl Rupp 3553c50cad2SHong Zhang current_space->array += cnzi; 3563c50cad2SHong Zhang current_space->local_used += cnzi; 3573c50cad2SHong Zhang current_space->local_remaining -= cnzi; 3582205254eSKarl Rupp 3593c50cad2SHong Zhang ci[i+1] = ci[i] + cnzi; 3603c50cad2SHong Zhang } 3613c50cad2SHong Zhang 3623c50cad2SHong Zhang /* Column indices are in the list of free space */ 3633c50cad2SHong Zhang /* Allocate space for cj, initialize cj, and */ 3643c50cad2SHong Zhang /* destroy list of free space and other temporary array(s) */ 365854ce69bSBarry Smith ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr); 3663c50cad2SHong Zhang ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr); 3673c50cad2SHong Zhang ierr = PetscLLCondensedDestroy_fast(lnk);CHKERRQ(ierr); 36825296bd5SBarry Smith 36925296bd5SBarry Smith /* Allocate space for ca */ 370854ce69bSBarry Smith ierr = PetscMalloc1(ci[am]+1,&ca);CHKERRQ(ierr); 37125296bd5SBarry Smith ierr = PetscMemzero(ca,(ci[am]+1)*sizeof(MatScalar));CHKERRQ(ierr); 37225296bd5SBarry Smith 37325296bd5SBarry Smith /* put together the new symbolic matrix */ 374ce94432eSBarry Smith ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,C);CHKERRQ(ierr); 37533d57670SJed Brown ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr); 37602fe1965SBarry Smith ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr); 37725296bd5SBarry Smith 37825296bd5SBarry Smith /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 37925296bd5SBarry Smith /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 38025296bd5SBarry Smith c = (Mat_SeqAIJ*)((*C)->data); 38125296bd5SBarry Smith c->free_a = PETSC_TRUE; 38225296bd5SBarry Smith c->free_ij = PETSC_TRUE; 38325296bd5SBarry Smith c->nonew = 0; 3842205254eSKarl Rupp 38525296bd5SBarry Smith (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; /* slower, less memory */ 38625296bd5SBarry Smith 38725296bd5SBarry Smith /* set MatInfo */ 38825296bd5SBarry Smith afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 38925296bd5SBarry Smith if (afill < 1.0) afill = 1.0; 39025296bd5SBarry Smith c->maxnz = ci[am]; 39125296bd5SBarry Smith c->nz = ci[am]; 3923c50cad2SHong Zhang (*C)->info.mallocs = ndouble; 39325296bd5SBarry Smith (*C)->info.fill_ratio_given = fill; 39425296bd5SBarry Smith (*C)->info.fill_ratio_needed = afill; 39525296bd5SBarry Smith 39625296bd5SBarry Smith #if defined(PETSC_USE_INFO) 39725296bd5SBarry Smith if (ci[am]) { 39857622a8eSBarry Smith ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 39957622a8eSBarry Smith ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 40025296bd5SBarry Smith } else { 40125296bd5SBarry Smith ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr); 40225296bd5SBarry Smith } 40325296bd5SBarry Smith #endif 40425296bd5SBarry Smith PetscFunctionReturn(0); 40525296bd5SBarry Smith } 40625296bd5SBarry Smith 40725296bd5SBarry Smith 40825023028SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat *C) 409e9e4536cSHong Zhang { 410e9e4536cSHong Zhang PetscErrorCode ierr; 411e9e4536cSHong Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 412bf9555e6SHong Zhang PetscInt *ai = a->i,*bi=b->i,*ci,*cj; 41325c41797SHong Zhang PetscInt am = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 414e9e4536cSHong Zhang MatScalar *ca; 415e9e4536cSHong Zhang PetscReal afill; 416eca6b86aSHong Zhang PetscInt i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax; 417eca6b86aSHong Zhang PetscTable ta; 4180298fd71SBarry Smith PetscFreeSpaceList free_space=NULL,current_space=NULL; 419e9e4536cSHong Zhang 420e9e4536cSHong Zhang PetscFunctionBegin; 421bd958071SHong Zhang /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */ 422bd958071SHong Zhang /*---------------------------------------------------------------------------------------------*/ 423bd958071SHong Zhang /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 424854ce69bSBarry Smith ierr = PetscMalloc1(am+2,&ci);CHKERRQ(ierr); 425bd958071SHong Zhang ci[0] = 0; 426bd958071SHong Zhang 427bd958071SHong Zhang /* create and initialize a linked list */ 428c373ccc6SHong Zhang ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr); 429c373ccc6SHong Zhang MatRowMergeMax_SeqAIJ(b,bm,ta); 430eca6b86aSHong Zhang ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); 431eca6b86aSHong Zhang ierr = PetscTableDestroy(&ta);CHKERRQ(ierr); 432eca6b86aSHong Zhang ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr); 433bd958071SHong Zhang 434bd958071SHong Zhang /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 435f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr); 436bd958071SHong Zhang current_space = free_space; 437bd958071SHong Zhang 438bd958071SHong Zhang /* Determine ci and cj */ 439bd958071SHong Zhang for (i=0; i<am; i++) { 440bd958071SHong Zhang anzi = ai[i+1] - ai[i]; 441bd958071SHong Zhang aj = a->j + ai[i]; 442bd958071SHong Zhang for (j=0; j<anzi; j++) { 443bd958071SHong Zhang brow = aj[j]; 444bd958071SHong Zhang bnzj = bi[brow+1] - bi[brow]; 445bd958071SHong Zhang bj = b->j + bi[brow]; 446bd958071SHong Zhang /* add non-zero cols of B into the sorted linked list lnk */ 447bd958071SHong Zhang ierr = PetscLLCondensedAddSorted_Scalable(bnzj,bj,lnk);CHKERRQ(ierr); 448bd958071SHong Zhang } 449bd958071SHong Zhang cnzi = lnk[0]; 450bd958071SHong Zhang 451bd958071SHong Zhang /* If free space is not available, make more free space */ 452bd958071SHong Zhang /* Double the amount of total space in the list */ 453bd958071SHong Zhang if (current_space->local_remaining<cnzi) { 454f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),¤t_space);CHKERRQ(ierr); 455bd958071SHong Zhang ndouble++; 456bd958071SHong Zhang } 457bd958071SHong Zhang 458bd958071SHong Zhang /* Copy data into free space, then initialize lnk */ 459bd958071SHong Zhang ierr = PetscLLCondensedClean_Scalable(cnzi,current_space->array,lnk);CHKERRQ(ierr); 4602205254eSKarl Rupp 461bd958071SHong Zhang current_space->array += cnzi; 462bd958071SHong Zhang current_space->local_used += cnzi; 463bd958071SHong Zhang current_space->local_remaining -= cnzi; 4642205254eSKarl Rupp 465bd958071SHong Zhang ci[i+1] = ci[i] + cnzi; 466bd958071SHong Zhang } 467bd958071SHong Zhang 468bd958071SHong Zhang /* Column indices are in the list of free space */ 469bd958071SHong Zhang /* Allocate space for cj, initialize cj, and */ 470bd958071SHong Zhang /* destroy list of free space and other temporary array(s) */ 471854ce69bSBarry Smith ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr); 472bd958071SHong Zhang ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr); 473bd958071SHong Zhang ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr); 474e9e4536cSHong Zhang 475e9e4536cSHong Zhang /* Allocate space for ca */ 476bd958071SHong Zhang /*-----------------------*/ 477854ce69bSBarry Smith ierr = PetscMalloc1(ci[am]+1,&ca);CHKERRQ(ierr); 478e9e4536cSHong Zhang ierr = PetscMemzero(ca,(ci[am]+1)*sizeof(MatScalar));CHKERRQ(ierr); 479e9e4536cSHong Zhang 480e9e4536cSHong Zhang /* put together the new symbolic matrix */ 481ce94432eSBarry Smith ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,C);CHKERRQ(ierr); 48233d57670SJed Brown ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr); 48302fe1965SBarry Smith ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr); 484e9e4536cSHong Zhang 485e9e4536cSHong Zhang /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 486e9e4536cSHong Zhang /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 487e9e4536cSHong Zhang c = (Mat_SeqAIJ*)((*C)->data); 488e9e4536cSHong Zhang c->free_a = PETSC_TRUE; 489e9e4536cSHong Zhang c->free_ij = PETSC_TRUE; 490e9e4536cSHong Zhang c->nonew = 0; 4912205254eSKarl Rupp 49225023028SHong Zhang (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; /* slower, less memory */ 493e9e4536cSHong Zhang 494e9e4536cSHong Zhang /* set MatInfo */ 495e9e4536cSHong Zhang afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 496e9e4536cSHong Zhang if (afill < 1.0) afill = 1.0; 497e9e4536cSHong Zhang c->maxnz = ci[am]; 498e9e4536cSHong Zhang c->nz = ci[am]; 499bd958071SHong Zhang (*C)->info.mallocs = ndouble; 500e9e4536cSHong Zhang (*C)->info.fill_ratio_given = fill; 501e9e4536cSHong Zhang (*C)->info.fill_ratio_needed = afill; 502e9e4536cSHong Zhang 503e9e4536cSHong Zhang #if defined(PETSC_USE_INFO) 504e9e4536cSHong Zhang if (ci[am]) { 50557622a8eSBarry Smith ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 50657622a8eSBarry Smith ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 507e9e4536cSHong Zhang } else { 508e9e4536cSHong Zhang ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr); 509e9e4536cSHong Zhang } 510e9e4536cSHong Zhang #endif 511e9e4536cSHong Zhang PetscFunctionReturn(0); 512e9e4536cSHong Zhang } 513e9e4536cSHong Zhang 5140ced3a2bSJed Brown PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A,Mat B,PetscReal fill,Mat *C) 5150ced3a2bSJed Brown { 5160ced3a2bSJed Brown PetscErrorCode ierr; 5170ced3a2bSJed Brown Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 5180ced3a2bSJed Brown const PetscInt *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j; 5190ced3a2bSJed Brown PetscInt *ci,*cj,*bb; 5200ced3a2bSJed Brown PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 5210ced3a2bSJed Brown PetscReal afill; 5220ced3a2bSJed Brown PetscInt i,j,col,ndouble = 0; 5230298fd71SBarry Smith PetscFreeSpaceList free_space=NULL,current_space=NULL; 5240ced3a2bSJed Brown PetscHeap h; 5250ced3a2bSJed Brown 5260ced3a2bSJed Brown PetscFunctionBegin; 527cd093f1dSJed Brown /* Get ci and cj - by merging sorted rows using a heap */ 5280ced3a2bSJed Brown /*---------------------------------------------------------------------------------------------*/ 5290ced3a2bSJed Brown /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 530854ce69bSBarry Smith ierr = PetscMalloc1(am+2,&ci);CHKERRQ(ierr); 5310ced3a2bSJed Brown ci[0] = 0; 5320ced3a2bSJed Brown 5330ced3a2bSJed Brown /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 534f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr); 5350ced3a2bSJed Brown current_space = free_space; 5360ced3a2bSJed Brown 5370ced3a2bSJed Brown ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr); 538785e854fSJed Brown ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr); 5390ced3a2bSJed Brown 5400ced3a2bSJed Brown /* Determine ci and cj */ 5410ced3a2bSJed Brown for (i=0; i<am; i++) { 5420ced3a2bSJed Brown const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 5430ced3a2bSJed Brown const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 5440ced3a2bSJed Brown ci[i+1] = ci[i]; 5450ced3a2bSJed Brown /* Populate the min heap */ 5460ced3a2bSJed Brown for (j=0; j<anzi; j++) { 5470ced3a2bSJed Brown bb[j] = bi[acol[j]]; /* bb points at the start of the row */ 5480ced3a2bSJed Brown if (bb[j] < bi[acol[j]+1]) { /* Add if row is nonempty */ 5490ced3a2bSJed Brown ierr = PetscHeapAdd(h,j,bj[bb[j]++]);CHKERRQ(ierr); 5500ced3a2bSJed Brown } 5510ced3a2bSJed Brown } 5520ced3a2bSJed Brown /* Pick off the min element, adding it to free space */ 5530ced3a2bSJed Brown ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr); 5540ced3a2bSJed Brown while (j >= 0) { 5550ced3a2bSJed Brown if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */ 556f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),¤t_space);CHKERRQ(ierr); 5570ced3a2bSJed Brown ndouble++; 5580ced3a2bSJed Brown } 5590ced3a2bSJed Brown *(current_space->array++) = col; 5600ced3a2bSJed Brown current_space->local_used++; 5610ced3a2bSJed Brown current_space->local_remaining--; 5620ced3a2bSJed Brown ci[i+1]++; 5630ced3a2bSJed Brown 5640ced3a2bSJed Brown /* stash if anything else remains in this row of B */ 5650ced3a2bSJed Brown if (bb[j] < bi[acol[j]+1]) {ierr = PetscHeapStash(h,j,bj[bb[j]++]);CHKERRQ(ierr);} 5660ced3a2bSJed Brown while (1) { /* pop and stash any other rows of B that also had an entry in this column */ 5670ced3a2bSJed Brown PetscInt j2,col2; 5680ced3a2bSJed Brown ierr = PetscHeapPeek(h,&j2,&col2);CHKERRQ(ierr); 5690ced3a2bSJed Brown if (col2 != col) break; 5700ced3a2bSJed Brown ierr = PetscHeapPop(h,&j2,&col2);CHKERRQ(ierr); 5710ced3a2bSJed Brown if (bb[j2] < bi[acol[j2]+1]) {ierr = PetscHeapStash(h,j2,bj[bb[j2]++]);CHKERRQ(ierr);} 5720ced3a2bSJed Brown } 5730ced3a2bSJed Brown /* Put any stashed elements back into the min heap */ 5740ced3a2bSJed Brown ierr = PetscHeapUnstash(h);CHKERRQ(ierr); 5750ced3a2bSJed Brown ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr); 5760ced3a2bSJed Brown } 5770ced3a2bSJed Brown } 5780ced3a2bSJed Brown ierr = PetscFree(bb);CHKERRQ(ierr); 5790ced3a2bSJed Brown ierr = PetscHeapDestroy(&h);CHKERRQ(ierr); 5800ced3a2bSJed Brown 5810ced3a2bSJed Brown /* Column indices are in the list of free space */ 5820ced3a2bSJed Brown /* Allocate space for cj, initialize cj, and */ 5830ced3a2bSJed Brown /* destroy list of free space and other temporary array(s) */ 584785e854fSJed Brown ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr); 5850ced3a2bSJed Brown ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr); 5860ced3a2bSJed Brown 5870ced3a2bSJed Brown /* put together the new symbolic matrix */ 588ce94432eSBarry Smith ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr); 58933d57670SJed Brown ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr); 59002fe1965SBarry Smith ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr); 5910ced3a2bSJed Brown 5920ced3a2bSJed Brown /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 5930ced3a2bSJed Brown /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 5940ced3a2bSJed Brown c = (Mat_SeqAIJ*)((*C)->data); 5950ced3a2bSJed Brown c->free_a = PETSC_TRUE; 5960ced3a2bSJed Brown c->free_ij = PETSC_TRUE; 5970ced3a2bSJed Brown c->nonew = 0; 59826fbe8dcSKarl Rupp 59989d95c1aSJed Brown (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ; 6000ced3a2bSJed Brown 6010ced3a2bSJed Brown /* set MatInfo */ 6020ced3a2bSJed Brown afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 6030ced3a2bSJed Brown if (afill < 1.0) afill = 1.0; 6040ced3a2bSJed Brown c->maxnz = ci[am]; 6050ced3a2bSJed Brown c->nz = ci[am]; 6060ced3a2bSJed Brown (*C)->info.mallocs = ndouble; 6070ced3a2bSJed Brown (*C)->info.fill_ratio_given = fill; 6080ced3a2bSJed Brown (*C)->info.fill_ratio_needed = afill; 6090ced3a2bSJed Brown 6100ced3a2bSJed Brown #if defined(PETSC_USE_INFO) 6110ced3a2bSJed Brown if (ci[am]) { 61257622a8eSBarry Smith ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 61357622a8eSBarry Smith ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 6140ced3a2bSJed Brown } else { 6150ced3a2bSJed Brown ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr); 6160ced3a2bSJed Brown } 6170ced3a2bSJed Brown #endif 6180ced3a2bSJed Brown PetscFunctionReturn(0); 6190ced3a2bSJed Brown } 620e9e4536cSHong Zhang 6218a07c6f1SJed Brown PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A,Mat B,PetscReal fill,Mat *C) 6228a07c6f1SJed Brown { 6238a07c6f1SJed Brown PetscErrorCode ierr; 6248a07c6f1SJed Brown Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 6258a07c6f1SJed Brown const PetscInt *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j; 6268a07c6f1SJed Brown PetscInt *ci,*cj,*bb; 6278a07c6f1SJed Brown PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 6288a07c6f1SJed Brown PetscReal afill; 6298a07c6f1SJed Brown PetscInt i,j,col,ndouble = 0; 6300298fd71SBarry Smith PetscFreeSpaceList free_space=NULL,current_space=NULL; 6318a07c6f1SJed Brown PetscHeap h; 6328a07c6f1SJed Brown PetscBT bt; 6338a07c6f1SJed Brown 6348a07c6f1SJed Brown PetscFunctionBegin; 635cd093f1dSJed Brown /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */ 6368a07c6f1SJed Brown /*---------------------------------------------------------------------------------------------*/ 6378a07c6f1SJed Brown /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 638854ce69bSBarry Smith ierr = PetscMalloc1(am+2,&ci);CHKERRQ(ierr); 6398a07c6f1SJed Brown ci[0] = 0; 6408a07c6f1SJed Brown 6418a07c6f1SJed Brown /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 642f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr); 6432205254eSKarl Rupp 6448a07c6f1SJed Brown current_space = free_space; 6458a07c6f1SJed Brown 6468a07c6f1SJed Brown ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr); 647785e854fSJed Brown ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr); 6488a07c6f1SJed Brown ierr = PetscBTCreate(bn,&bt);CHKERRQ(ierr); 6498a07c6f1SJed Brown 6508a07c6f1SJed Brown /* Determine ci and cj */ 6518a07c6f1SJed Brown for (i=0; i<am; i++) { 6528a07c6f1SJed Brown const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 6538a07c6f1SJed Brown const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 6548a07c6f1SJed Brown const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */ 6558a07c6f1SJed Brown ci[i+1] = ci[i]; 6568a07c6f1SJed Brown /* Populate the min heap */ 6578a07c6f1SJed Brown for (j=0; j<anzi; j++) { 6588a07c6f1SJed Brown PetscInt brow = acol[j]; 6598a07c6f1SJed Brown for (bb[j] = bi[brow]; bb[j] < bi[brow+1]; bb[j]++) { 6608a07c6f1SJed Brown PetscInt bcol = bj[bb[j]]; 6618a07c6f1SJed Brown if (!PetscBTLookupSet(bt,bcol)) { /* new entry */ 6628a07c6f1SJed Brown ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr); 6638a07c6f1SJed Brown bb[j]++; 6648a07c6f1SJed Brown break; 6658a07c6f1SJed Brown } 6668a07c6f1SJed Brown } 6678a07c6f1SJed Brown } 6688a07c6f1SJed Brown /* Pick off the min element, adding it to free space */ 6698a07c6f1SJed Brown ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr); 6708a07c6f1SJed Brown while (j >= 0) { 6718a07c6f1SJed Brown if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */ 6720298fd71SBarry Smith fptr = NULL; /* need PetscBTMemzero */ 673f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),¤t_space);CHKERRQ(ierr); 6748a07c6f1SJed Brown ndouble++; 6758a07c6f1SJed Brown } 6768a07c6f1SJed Brown *(current_space->array++) = col; 6778a07c6f1SJed Brown current_space->local_used++; 6788a07c6f1SJed Brown current_space->local_remaining--; 6798a07c6f1SJed Brown ci[i+1]++; 6808a07c6f1SJed Brown 6818a07c6f1SJed Brown /* stash if anything else remains in this row of B */ 6828a07c6f1SJed Brown for (; bb[j] < bi[acol[j]+1]; bb[j]++) { 6838a07c6f1SJed Brown PetscInt bcol = bj[bb[j]]; 6848a07c6f1SJed Brown if (!PetscBTLookupSet(bt,bcol)) { /* new entry */ 6858a07c6f1SJed Brown ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr); 6868a07c6f1SJed Brown bb[j]++; 6878a07c6f1SJed Brown break; 6888a07c6f1SJed Brown } 6898a07c6f1SJed Brown } 6908a07c6f1SJed Brown ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr); 6918a07c6f1SJed Brown } 6928a07c6f1SJed Brown if (fptr) { /* Clear the bits for this row */ 6938a07c6f1SJed Brown for (; fptr<current_space->array; fptr++) {ierr = PetscBTClear(bt,*fptr);CHKERRQ(ierr);} 6948a07c6f1SJed Brown } else { /* We reallocated so we don't remember (easily) how to clear only the bits we changed */ 6958a07c6f1SJed Brown ierr = PetscBTMemzero(bn,bt);CHKERRQ(ierr); 6968a07c6f1SJed Brown } 6978a07c6f1SJed Brown } 6988a07c6f1SJed Brown ierr = PetscFree(bb);CHKERRQ(ierr); 6998a07c6f1SJed Brown ierr = PetscHeapDestroy(&h);CHKERRQ(ierr); 7008a07c6f1SJed Brown ierr = PetscBTDestroy(&bt);CHKERRQ(ierr); 7018a07c6f1SJed Brown 7028a07c6f1SJed Brown /* Column indices are in the list of free space */ 7038a07c6f1SJed Brown /* Allocate space for cj, initialize cj, and */ 7048a07c6f1SJed Brown /* destroy list of free space and other temporary array(s) */ 705785e854fSJed Brown ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr); 7068a07c6f1SJed Brown ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr); 7078a07c6f1SJed Brown 7088a07c6f1SJed Brown /* put together the new symbolic matrix */ 709ce94432eSBarry Smith ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr); 71033d57670SJed Brown ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr); 71102fe1965SBarry Smith ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr); 7128a07c6f1SJed Brown 7138a07c6f1SJed Brown /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 7148a07c6f1SJed Brown /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 7158a07c6f1SJed Brown c = (Mat_SeqAIJ*)((*C)->data); 7168a07c6f1SJed Brown c->free_a = PETSC_TRUE; 7178a07c6f1SJed Brown c->free_ij = PETSC_TRUE; 7188a07c6f1SJed Brown c->nonew = 0; 71926fbe8dcSKarl Rupp 72089d95c1aSJed Brown (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ; 7218a07c6f1SJed Brown 7228a07c6f1SJed Brown /* set MatInfo */ 7238a07c6f1SJed Brown afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 7248a07c6f1SJed Brown if (afill < 1.0) afill = 1.0; 7258a07c6f1SJed Brown c->maxnz = ci[am]; 7268a07c6f1SJed Brown c->nz = ci[am]; 7278a07c6f1SJed Brown (*C)->info.mallocs = ndouble; 7288a07c6f1SJed Brown (*C)->info.fill_ratio_given = fill; 7298a07c6f1SJed Brown (*C)->info.fill_ratio_needed = afill; 7308a07c6f1SJed Brown 7318a07c6f1SJed Brown #if defined(PETSC_USE_INFO) 7328a07c6f1SJed Brown if (ci[am]) { 73357622a8eSBarry Smith ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 73457622a8eSBarry Smith ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 7358a07c6f1SJed Brown } else { 7368a07c6f1SJed Brown ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr); 7378a07c6f1SJed Brown } 7388a07c6f1SJed Brown #endif 7398a07c6f1SJed Brown PetscFunctionReturn(0); 7408a07c6f1SJed Brown } 7418a07c6f1SJed Brown 742*d7ed1a4dSandi selinger 743*d7ed1a4dSandi selinger PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A,Mat B,PetscReal fill,Mat *C) 744*d7ed1a4dSandi selinger { 745*d7ed1a4dSandi selinger PetscErrorCode ierr; 746*d7ed1a4dSandi selinger Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 747*d7ed1a4dSandi selinger const PetscInt *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j,*inputi,*inputj,*inputcol,*inputcol_L1; 748*d7ed1a4dSandi selinger PetscInt *ci,*cj,*outputj,worki_L1[9],worki_L2[9]; 749*d7ed1a4dSandi selinger PetscInt c_maxmem,a_maxrownnz=0,a_rownnz; 750*d7ed1a4dSandi selinger const PetscInt workcol[8] = {0,1,2,3,4,5,6,7}; 751*d7ed1a4dSandi selinger const PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 752*d7ed1a4dSandi selinger const PetscInt *brow_ptr[8],*brow_end[8]; 753*d7ed1a4dSandi selinger PetscInt window[8]; 754*d7ed1a4dSandi selinger PetscInt window_min,old_window_min,ci_nnz,outputi_nnz=0,L1_nrows,L2_nrows; 755*d7ed1a4dSandi selinger PetscInt i,k,ndouble = 0,L1_rowsleft,rowsleft; 756*d7ed1a4dSandi selinger PetscReal afill; 757*d7ed1a4dSandi selinger PetscInt *workj_L1,*workj_L2,*workj_L3_in,*workj_L3_out; 758*d7ed1a4dSandi selinger PetscInt L2_nnz,L3_nnz; 759*d7ed1a4dSandi selinger PetscBool merge_from_2_arrays = PETSC_FALSE; 760*d7ed1a4dSandi selinger 761*d7ed1a4dSandi selinger /* Step 1: Get upper bound on memory required for allocation. 762*d7ed1a4dSandi selinger Because of the way virtual memory works, 763*d7ed1a4dSandi selinger only the memory pages that are actually needed will be physically allocated. */ 764*d7ed1a4dSandi selinger PetscFunctionBegin; 765*d7ed1a4dSandi selinger ierr = PetscMalloc1(am+1,&ci);CHKERRQ(ierr); 766*d7ed1a4dSandi selinger 767*d7ed1a4dSandi selinger for (i=0; i<am; i++) { 768*d7ed1a4dSandi selinger const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 769*d7ed1a4dSandi selinger const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 770*d7ed1a4dSandi selinger a_rownnz = 0; 771*d7ed1a4dSandi selinger for (k=0; k<anzi; ++k) { 772*d7ed1a4dSandi selinger a_rownnz += bi[acol[k]+1] - bi[acol[k]]; 773*d7ed1a4dSandi selinger if (a_rownnz > bn) { 774*d7ed1a4dSandi selinger a_rownnz = bn; 775*d7ed1a4dSandi selinger break; 776*d7ed1a4dSandi selinger } 777*d7ed1a4dSandi selinger } 778*d7ed1a4dSandi selinger a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz); 779*d7ed1a4dSandi selinger } 780*d7ed1a4dSandi selinger /* This should be enough for almost all matrices. If not, memory is reallocated later. */ 781*d7ed1a4dSandi selinger c_maxmem = 4*(ai[am]+bi[bm]); 782*d7ed1a4dSandi selinger 783*d7ed1a4dSandi selinger /* temporary work areas for merging rows */ 784*d7ed1a4dSandi selinger ierr = PetscMalloc1(a_maxrownnz*8,&workj_L1);CHKERRQ(ierr); 785*d7ed1a4dSandi selinger ierr = PetscMalloc1(a_maxrownnz*8,&workj_L2);CHKERRQ(ierr); 786*d7ed1a4dSandi selinger ierr = PetscMalloc1(a_maxrownnz,&workj_L3_in);CHKERRQ(ierr); 787*d7ed1a4dSandi selinger ierr = PetscMalloc1(a_maxrownnz,&workj_L3_out);CHKERRQ(ierr); 788*d7ed1a4dSandi selinger 789*d7ed1a4dSandi selinger /* Step 2: Populate pattern for C */ 790*d7ed1a4dSandi selinger ierr = PetscMalloc1(c_maxmem,&cj);CHKERRQ(ierr); 791*d7ed1a4dSandi selinger 792*d7ed1a4dSandi selinger ci_nnz = 0; 793*d7ed1a4dSandi selinger ci[0] = 0; 794*d7ed1a4dSandi selinger worki_L1[0] = 0; 795*d7ed1a4dSandi selinger worki_L2[0] = 0; 796*d7ed1a4dSandi selinger worki_L2[1] = 0; 797*d7ed1a4dSandi selinger for (i=0; i<am; i++) { 798*d7ed1a4dSandi selinger const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 799*d7ed1a4dSandi selinger const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 800*d7ed1a4dSandi selinger rowsleft = anzi; 801*d7ed1a4dSandi selinger inputcol_L1 = acol; 802*d7ed1a4dSandi selinger L2_nnz = 0; 803*d7ed1a4dSandi selinger L2_nrows = 1; /* Number of rows to be merged on Level 3. workj_L3_in already exists -> initial value 1 */ 804*d7ed1a4dSandi selinger L3_nnz = 0; 805*d7ed1a4dSandi selinger 806*d7ed1a4dSandi selinger /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem -> allocate more memory */ 807*d7ed1a4dSandi selinger while (ci_nnz+a_maxrownnz > c_maxmem) { 808*d7ed1a4dSandi selinger c_maxmem *= 2; 809*d7ed1a4dSandi selinger ndouble++; 810*d7ed1a4dSandi selinger ierr = PetscRealloc(sizeof(PetscInt)*c_maxmem,&cj);CHKERRQ(ierr); 811*d7ed1a4dSandi selinger } 812*d7ed1a4dSandi selinger 813*d7ed1a4dSandi selinger while (rowsleft) { 814*d7ed1a4dSandi selinger L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */ 815*d7ed1a4dSandi selinger L1_nrows = 0; 816*d7ed1a4dSandi selinger outputi_nnz = 0; 817*d7ed1a4dSandi selinger inputcol = inputcol_L1; 818*d7ed1a4dSandi selinger inputi = bi; 819*d7ed1a4dSandi selinger inputj = bj; 820*d7ed1a4dSandi selinger 821*d7ed1a4dSandi selinger if (anzi > 8) outputj = workj_L1; /* Level 1 rowmerge*/ 822*d7ed1a4dSandi selinger else outputj = cj + ci_nnz; /* Merge directly to C */ 823*d7ed1a4dSandi selinger 824*d7ed1a4dSandi selinger /* The following macro is used to specialize for small rows in A. 825*d7ed1a4dSandi selinger This helps with compiler unrolling, improving performance substantially. 826*d7ed1a4dSandi selinger Input: inputj inputi workj_L3_in L3_nnz inputcol bn 827*d7ed1a4dSandi selinger Output: outputj outputi_nnz */ 828*d7ed1a4dSandi selinger #define MatMatMultSymbolic_RowMergeMacro(ANNZ) \ 829*d7ed1a4dSandi selinger window_min = bn; \ 830*d7ed1a4dSandi selinger if (merge_from_2_arrays) { \ 831*d7ed1a4dSandi selinger brow_ptr[0] = workj_L3_in; \ 832*d7ed1a4dSandi selinger brow_end[0] = workj_L3_in + L3_nnz; \ 833*d7ed1a4dSandi selinger } else { \ 834*d7ed1a4dSandi selinger brow_ptr[0] = inputj + inputi[inputcol[0]]; \ 835*d7ed1a4dSandi selinger brow_end[0] = inputj + inputi[inputcol[0]+1]; \ 836*d7ed1a4dSandi selinger } \ 837*d7ed1a4dSandi selinger for (k=1; k<ANNZ; ++k) { \ 838*d7ed1a4dSandi selinger brow_ptr[k] = inputj + inputi[inputcol[k]]; \ 839*d7ed1a4dSandi selinger brow_end[k] = inputj + inputi[inputcol[k]+1]; \ 840*d7ed1a4dSandi selinger } \ 841*d7ed1a4dSandi selinger for (k=0; k<ANNZ; ++k) { \ 842*d7ed1a4dSandi selinger window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \ 843*d7ed1a4dSandi selinger window_min = PetscMin(window[k], window_min); \ 844*d7ed1a4dSandi selinger } \ 845*d7ed1a4dSandi selinger while (window_min < bn) { \ 846*d7ed1a4dSandi selinger outputj[outputi_nnz++] = window_min; \ 847*d7ed1a4dSandi selinger /* advance front and compute new minimum */ \ 848*d7ed1a4dSandi selinger old_window_min = window_min; \ 849*d7ed1a4dSandi selinger window_min = bn; \ 850*d7ed1a4dSandi selinger for (k=0; k<ANNZ; ++k) { \ 851*d7ed1a4dSandi selinger if (window[k] == old_window_min) { \ 852*d7ed1a4dSandi selinger brow_ptr[k]++; \ 853*d7ed1a4dSandi selinger window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \ 854*d7ed1a4dSandi selinger } \ 855*d7ed1a4dSandi selinger window_min = PetscMin(window[k], window_min); \ 856*d7ed1a4dSandi selinger } \ 857*d7ed1a4dSandi selinger } 858*d7ed1a4dSandi selinger 859*d7ed1a4dSandi selinger /************** L E V E L 1 ***************/ 860*d7ed1a4dSandi selinger /* Merge up to 8 rows of B to L1 work array*/ 861*d7ed1a4dSandi selinger while (L1_rowsleft) { 862*d7ed1a4dSandi selinger switch (L1_rowsleft) { 863*d7ed1a4dSandi selinger case 1: brow_ptr[0] = inputj + inputi[inputcol[0]]; 864*d7ed1a4dSandi selinger brow_end[0] = inputj + inputi[inputcol[0]+1]; 865*d7ed1a4dSandi selinger for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 866*d7ed1a4dSandi selinger inputcol += L1_rowsleft; 867*d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 868*d7ed1a4dSandi selinger L1_rowsleft = 0; 869*d7ed1a4dSandi selinger break; 870*d7ed1a4dSandi selinger case 2: MatMatMultSymbolic_RowMergeMacro(2); 871*d7ed1a4dSandi selinger inputcol += L1_rowsleft; 872*d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 873*d7ed1a4dSandi selinger L1_rowsleft = 0; 874*d7ed1a4dSandi selinger break; 875*d7ed1a4dSandi selinger case 3: MatMatMultSymbolic_RowMergeMacro(3); 876*d7ed1a4dSandi selinger inputcol += L1_rowsleft; 877*d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 878*d7ed1a4dSandi selinger L1_rowsleft = 0; 879*d7ed1a4dSandi selinger break; 880*d7ed1a4dSandi selinger case 4: MatMatMultSymbolic_RowMergeMacro(4); 881*d7ed1a4dSandi selinger inputcol += L1_rowsleft; 882*d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 883*d7ed1a4dSandi selinger L1_rowsleft = 0; 884*d7ed1a4dSandi selinger break; 885*d7ed1a4dSandi selinger case 5: MatMatMultSymbolic_RowMergeMacro(5); 886*d7ed1a4dSandi selinger inputcol += L1_rowsleft; 887*d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 888*d7ed1a4dSandi selinger L1_rowsleft = 0; 889*d7ed1a4dSandi selinger break; 890*d7ed1a4dSandi selinger case 6: MatMatMultSymbolic_RowMergeMacro(6); 891*d7ed1a4dSandi selinger inputcol += L1_rowsleft; 892*d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 893*d7ed1a4dSandi selinger L1_rowsleft = 0; 894*d7ed1a4dSandi selinger break; 895*d7ed1a4dSandi selinger case 7: MatMatMultSymbolic_RowMergeMacro(7); 896*d7ed1a4dSandi selinger inputcol += L1_rowsleft; 897*d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 898*d7ed1a4dSandi selinger L1_rowsleft = 0; 899*d7ed1a4dSandi selinger break; 900*d7ed1a4dSandi selinger default: MatMatMultSymbolic_RowMergeMacro(8); 901*d7ed1a4dSandi selinger inputcol += 8; 902*d7ed1a4dSandi selinger rowsleft -= 8; 903*d7ed1a4dSandi selinger L1_rowsleft -= 8; 904*d7ed1a4dSandi selinger break; 905*d7ed1a4dSandi selinger } 906*d7ed1a4dSandi selinger inputcol_L1 = inputcol; 907*d7ed1a4dSandi selinger if (anzi > 8) worki_L1[++L1_nrows] = outputi_nnz; 908*d7ed1a4dSandi selinger } 909*d7ed1a4dSandi selinger 910*d7ed1a4dSandi selinger /********************** L E V E L 2 ************************/ 911*d7ed1a4dSandi selinger /* Merge from L1 work array to either C or to L2 work array */ 912*d7ed1a4dSandi selinger if (anzi > 8) { 913*d7ed1a4dSandi selinger inputi = worki_L1; 914*d7ed1a4dSandi selinger inputj = workj_L1; 915*d7ed1a4dSandi selinger inputcol = workcol; 916*d7ed1a4dSandi selinger outputi_nnz = 0; 917*d7ed1a4dSandi selinger 918*d7ed1a4dSandi selinger if (anzi <= 64) outputj = cj + ci_nnz; /* Merge from L1 work array to C */ 919*d7ed1a4dSandi selinger else outputj = workj_L2 + L2_nnz; /* Merge from L1 work array to L2 work array */ 920*d7ed1a4dSandi selinger 921*d7ed1a4dSandi selinger switch (L1_nrows) { 922*d7ed1a4dSandi selinger case 1: brow_ptr[0] = inputj + inputi[inputcol[0]]; 923*d7ed1a4dSandi selinger brow_end[0] = inputj + inputi[inputcol[0]+1]; 924*d7ed1a4dSandi selinger for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 925*d7ed1a4dSandi selinger break; 926*d7ed1a4dSandi selinger case 2: MatMatMultSymbolic_RowMergeMacro(2); break; 927*d7ed1a4dSandi selinger case 3: MatMatMultSymbolic_RowMergeMacro(3); break; 928*d7ed1a4dSandi selinger case 4: MatMatMultSymbolic_RowMergeMacro(4); break; 929*d7ed1a4dSandi selinger case 5: MatMatMultSymbolic_RowMergeMacro(5); break; 930*d7ed1a4dSandi selinger case 6: MatMatMultSymbolic_RowMergeMacro(6); break; 931*d7ed1a4dSandi selinger case 7: MatMatMultSymbolic_RowMergeMacro(7); break; 932*d7ed1a4dSandi selinger case 8: MatMatMultSymbolic_RowMergeMacro(8); break; 933*d7ed1a4dSandi selinger default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L1 work array!"); 934*d7ed1a4dSandi selinger } 935*d7ed1a4dSandi selinger L2_nnz += outputi_nnz; 936*d7ed1a4dSandi selinger worki_L2[++L2_nrows] = L2_nnz; 937*d7ed1a4dSandi selinger 938*d7ed1a4dSandi selinger /******************************* L E V E L 3 *******************************/ 939*d7ed1a4dSandi selinger /* Merge from L2 work array and workj_L3_in to either C or to L3 work array */ 940*d7ed1a4dSandi selinger if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) { 941*d7ed1a4dSandi selinger inputi = worki_L2; 942*d7ed1a4dSandi selinger inputj = workj_L2; 943*d7ed1a4dSandi selinger inputcol = workcol; 944*d7ed1a4dSandi selinger outputi_nnz = 0; 945*d7ed1a4dSandi selinger if (rowsleft) outputj = workj_L3_out; 946*d7ed1a4dSandi selinger else outputj = cj + ci_nnz; 947*d7ed1a4dSandi selinger merge_from_2_arrays = PETSC_TRUE; /* Instead of merging only from the array inputj, workj_L3_in is also used now. */ 948*d7ed1a4dSandi selinger switch (L2_nrows) { 949*d7ed1a4dSandi selinger case 1: brow_ptr[0] = inputj + inputi[inputcol[0]]; 950*d7ed1a4dSandi selinger brow_end[0] = inputj + inputi[inputcol[0]+1]; 951*d7ed1a4dSandi selinger for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 952*d7ed1a4dSandi selinger break; 953*d7ed1a4dSandi selinger case 2: MatMatMultSymbolic_RowMergeMacro(2); break; 954*d7ed1a4dSandi selinger case 3: MatMatMultSymbolic_RowMergeMacro(3); break; 955*d7ed1a4dSandi selinger case 4: MatMatMultSymbolic_RowMergeMacro(4); break; 956*d7ed1a4dSandi selinger case 5: MatMatMultSymbolic_RowMergeMacro(5); break; 957*d7ed1a4dSandi selinger case 6: MatMatMultSymbolic_RowMergeMacro(6); break; 958*d7ed1a4dSandi selinger case 7: MatMatMultSymbolic_RowMergeMacro(7); break; 959*d7ed1a4dSandi selinger case 8: MatMatMultSymbolic_RowMergeMacro(8); break; 960*d7ed1a4dSandi selinger default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L2 work array!"); 961*d7ed1a4dSandi selinger } 962*d7ed1a4dSandi selinger merge_from_2_arrays = PETSC_FALSE; 963*d7ed1a4dSandi selinger L2_nrows = 1; 964*d7ed1a4dSandi selinger L2_nnz = 0; 965*d7ed1a4dSandi selinger L3_nnz = outputi_nnz; 966*d7ed1a4dSandi selinger /* Copy to workj_L3_in */ 967*d7ed1a4dSandi selinger if (rowsleft) { 968*d7ed1a4dSandi selinger for (k=0; k<outputi_nnz; ++k) workj_L3_in[k] = outputj[k]; 969*d7ed1a4dSandi selinger } 970*d7ed1a4dSandi selinger } 971*d7ed1a4dSandi selinger } 972*d7ed1a4dSandi selinger } /* while (rowsleft) */ 973*d7ed1a4dSandi selinger #undef MatMatMultSymbolic_RowMergeMacro 974*d7ed1a4dSandi selinger 975*d7ed1a4dSandi selinger /* terminate current row */ 976*d7ed1a4dSandi selinger ci_nnz += outputi_nnz; 977*d7ed1a4dSandi selinger ci[i+1] = ci_nnz; 978*d7ed1a4dSandi selinger } 979*d7ed1a4dSandi selinger 980*d7ed1a4dSandi selinger /* Step 3: Create the new symbolic matrix */ 981*d7ed1a4dSandi selinger ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr); 982*d7ed1a4dSandi selinger ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr); 983*d7ed1a4dSandi selinger 984*d7ed1a4dSandi selinger /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 985*d7ed1a4dSandi selinger /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 986*d7ed1a4dSandi selinger c = (Mat_SeqAIJ*)((*C)->data); 987*d7ed1a4dSandi selinger c->free_a = PETSC_TRUE; 988*d7ed1a4dSandi selinger c->free_ij = PETSC_TRUE; 989*d7ed1a4dSandi selinger c->nonew = 0; 990*d7ed1a4dSandi selinger 991*d7ed1a4dSandi selinger (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ; 992*d7ed1a4dSandi selinger 993*d7ed1a4dSandi selinger /* set MatInfo */ 994*d7ed1a4dSandi selinger afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 995*d7ed1a4dSandi selinger if (afill < 1.0) afill = 1.0; 996*d7ed1a4dSandi selinger c->maxnz = ci[am]; 997*d7ed1a4dSandi selinger c->nz = ci[am]; 998*d7ed1a4dSandi selinger (*C)->info.mallocs = ndouble; 999*d7ed1a4dSandi selinger (*C)->info.fill_ratio_given = fill; 1000*d7ed1a4dSandi selinger (*C)->info.fill_ratio_needed = afill; 1001*d7ed1a4dSandi selinger 1002*d7ed1a4dSandi selinger #if defined(PETSC_USE_INFO) 1003*d7ed1a4dSandi selinger if (ci[am]) { 1004*d7ed1a4dSandi selinger ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 1005*d7ed1a4dSandi selinger ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 1006*d7ed1a4dSandi selinger } else { 1007*d7ed1a4dSandi selinger ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr); 1008*d7ed1a4dSandi selinger } 1009*d7ed1a4dSandi selinger #endif 1010*d7ed1a4dSandi selinger 1011*d7ed1a4dSandi selinger /* Step 4: Free temporary work areas */ 1012*d7ed1a4dSandi selinger ierr = PetscFree(workj_L1);CHKERRQ(ierr); 1013*d7ed1a4dSandi selinger ierr = PetscFree(workj_L2);CHKERRQ(ierr); 1014*d7ed1a4dSandi selinger ierr = PetscFree(workj_L3_in);CHKERRQ(ierr); 1015*d7ed1a4dSandi selinger ierr = PetscFree(workj_L3_out);CHKERRQ(ierr); 1016*d7ed1a4dSandi selinger PetscFunctionReturn(0); 1017*d7ed1a4dSandi selinger } 1018*d7ed1a4dSandi selinger 1019cd093f1dSJed Brown /* concatenate unique entries and then sort */ 102058cf0668SJed Brown PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C) 1021cd093f1dSJed Brown { 1022cd093f1dSJed Brown PetscErrorCode ierr; 1023cd093f1dSJed Brown Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 1024cd093f1dSJed Brown const PetscInt *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j; 1025cd093f1dSJed Brown PetscInt *ci,*cj; 1026cd093f1dSJed Brown PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 1027cd093f1dSJed Brown PetscReal afill; 1028cd093f1dSJed Brown PetscInt i,j,ndouble = 0; 1029cd093f1dSJed Brown PetscSegBuffer seg,segrow; 1030cd093f1dSJed Brown char *seen; 1031cd093f1dSJed Brown 1032cd093f1dSJed Brown PetscFunctionBegin; 1033854ce69bSBarry Smith ierr = PetscMalloc1(am+1,&ci);CHKERRQ(ierr); 1034cd093f1dSJed Brown ci[0] = 0; 1035cd093f1dSJed Brown 1036cd093f1dSJed Brown /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 1037cd093f1dSJed Brown ierr = PetscSegBufferCreate(sizeof(PetscInt),(PetscInt)(fill*(ai[am]+bi[bm])),&seg);CHKERRQ(ierr); 1038cd093f1dSJed Brown ierr = PetscSegBufferCreate(sizeof(PetscInt),100,&segrow);CHKERRQ(ierr); 1039785e854fSJed Brown ierr = PetscMalloc1(bn,&seen);CHKERRQ(ierr); 1040cd093f1dSJed Brown ierr = PetscMemzero(seen,bn*sizeof(char));CHKERRQ(ierr); 1041cd093f1dSJed Brown 1042cd093f1dSJed Brown /* Determine ci and cj */ 1043cd093f1dSJed Brown for (i=0; i<am; i++) { 1044cd093f1dSJed Brown const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 1045cd093f1dSJed Brown const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 1046cd093f1dSJed Brown PetscInt packlen = 0,*PETSC_RESTRICT crow; 1047cd093f1dSJed Brown /* Pack segrow */ 1048cd093f1dSJed Brown for (j=0; j<anzi; j++) { 1049cd093f1dSJed Brown PetscInt brow = acol[j],bjstart = bi[brow],bjend = bi[brow+1],k; 1050cd093f1dSJed Brown for (k=bjstart; k<bjend; k++) { 1051cd093f1dSJed Brown PetscInt bcol = bj[k]; 1052cd093f1dSJed Brown if (!seen[bcol]) { /* new entry */ 1053cd093f1dSJed Brown PetscInt *PETSC_RESTRICT slot; 1054cd093f1dSJed Brown ierr = PetscSegBufferGetInts(segrow,1,&slot);CHKERRQ(ierr); 1055cd093f1dSJed Brown *slot = bcol; 1056cd093f1dSJed Brown seen[bcol] = 1; 1057cd093f1dSJed Brown packlen++; 1058cd093f1dSJed Brown } 1059cd093f1dSJed Brown } 1060cd093f1dSJed Brown } 1061cd093f1dSJed Brown ierr = PetscSegBufferGetInts(seg,packlen,&crow);CHKERRQ(ierr); 1062cd093f1dSJed Brown ierr = PetscSegBufferExtractTo(segrow,crow);CHKERRQ(ierr); 1063cd093f1dSJed Brown ierr = PetscSortInt(packlen,crow);CHKERRQ(ierr); 1064cd093f1dSJed Brown ci[i+1] = ci[i] + packlen; 1065cd093f1dSJed Brown for (j=0; j<packlen; j++) seen[crow[j]] = 0; 1066cd093f1dSJed Brown } 1067cd093f1dSJed Brown ierr = PetscSegBufferDestroy(&segrow);CHKERRQ(ierr); 1068cd093f1dSJed Brown ierr = PetscFree(seen);CHKERRQ(ierr); 1069cd093f1dSJed Brown 1070cd093f1dSJed Brown /* Column indices are in the segmented buffer */ 1071cd093f1dSJed Brown ierr = PetscSegBufferExtractAlloc(seg,&cj);CHKERRQ(ierr); 1072cd093f1dSJed Brown ierr = PetscSegBufferDestroy(&seg);CHKERRQ(ierr); 1073cd093f1dSJed Brown 1074cd093f1dSJed Brown /* put together the new symbolic matrix */ 1075cd093f1dSJed Brown ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr); 107633d57670SJed Brown ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr); 107702fe1965SBarry Smith ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr); 1078cd093f1dSJed Brown 1079cd093f1dSJed Brown /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 1080cd093f1dSJed Brown /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 1081cd093f1dSJed Brown c = (Mat_SeqAIJ*)((*C)->data); 1082cd093f1dSJed Brown c->free_a = PETSC_TRUE; 1083cd093f1dSJed Brown c->free_ij = PETSC_TRUE; 1084cd093f1dSJed Brown c->nonew = 0; 1085cd093f1dSJed Brown 1086cd093f1dSJed Brown (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ; 1087cd093f1dSJed Brown 1088cd093f1dSJed Brown /* set MatInfo */ 1089cd093f1dSJed Brown afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 1090cd093f1dSJed Brown if (afill < 1.0) afill = 1.0; 1091cd093f1dSJed Brown c->maxnz = ci[am]; 1092cd093f1dSJed Brown c->nz = ci[am]; 1093cd093f1dSJed Brown (*C)->info.mallocs = ndouble; 1094cd093f1dSJed Brown (*C)->info.fill_ratio_given = fill; 1095cd093f1dSJed Brown (*C)->info.fill_ratio_needed = afill; 1096cd093f1dSJed Brown 1097cd093f1dSJed Brown #if defined(PETSC_USE_INFO) 1098cd093f1dSJed Brown if (ci[am]) { 109957622a8eSBarry Smith ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 110057622a8eSBarry Smith ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 1101cd093f1dSJed Brown } else { 1102cd093f1dSJed Brown ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr); 1103cd093f1dSJed Brown } 1104cd093f1dSJed Brown #endif 1105cd093f1dSJed Brown PetscFunctionReturn(0); 1106cd093f1dSJed Brown } 1107cd093f1dSJed Brown 1108d2853540SHong Zhang /* This routine is not used. Should be removed! */ 11096fc122caSHong Zhang PetscErrorCode MatMatTransposeMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C) 11105df89d91SHong Zhang { 1111bc011b1eSHong Zhang PetscErrorCode ierr; 1112bc011b1eSHong Zhang 1113bc011b1eSHong Zhang PetscFunctionBegin; 1114bc011b1eSHong Zhang if (scall == MAT_INITIAL_MATRIX) { 11153ff4c91cSHong Zhang ierr = PetscLogEventBegin(MAT_MatTransposeMultSymbolic,A,B,0,0);CHKERRQ(ierr); 11166fc122caSHong Zhang ierr = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr); 11173ff4c91cSHong Zhang ierr = PetscLogEventEnd(MAT_MatTransposeMultSymbolic,A,B,0,0);CHKERRQ(ierr); 1118bc011b1eSHong Zhang } 11193ff4c91cSHong Zhang ierr = PetscLogEventBegin(MAT_MatTransposeMultNumeric,A,B,0,0);CHKERRQ(ierr); 11206fc122caSHong Zhang ierr = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(A,B,*C);CHKERRQ(ierr); 11213ff4c91cSHong Zhang ierr = PetscLogEventEnd(MAT_MatTransposeMultNumeric,A,B,0,0);CHKERRQ(ierr); 1122bc011b1eSHong Zhang PetscFunctionReturn(0); 1123bc011b1eSHong Zhang } 1124bc011b1eSHong Zhang 11252128a86cSHong Zhang PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(Mat A) 11262128a86cSHong Zhang { 11272128a86cSHong Zhang PetscErrorCode ierr; 11284c7df5ccSHong Zhang Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 112940192850SHong Zhang Mat_MatMatTransMult *abt=a->abt; 11302128a86cSHong Zhang 11312128a86cSHong Zhang PetscFunctionBegin; 113240192850SHong Zhang ierr = (abt->destroy)(A);CHKERRQ(ierr); 113340192850SHong Zhang ierr = MatTransposeColoringDestroy(&abt->matcoloring);CHKERRQ(ierr); 113440192850SHong Zhang ierr = MatDestroy(&abt->Bt_den);CHKERRQ(ierr); 113540192850SHong Zhang ierr = MatDestroy(&abt->ABt_den);CHKERRQ(ierr); 113640192850SHong Zhang ierr = PetscFree(abt);CHKERRQ(ierr); 11372128a86cSHong Zhang PetscFunctionReturn(0); 11382128a86cSHong Zhang } 11392128a86cSHong Zhang 11406fc122caSHong Zhang PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C) 1141bc011b1eSHong Zhang { 11425df89d91SHong Zhang PetscErrorCode ierr; 114381d82fe4SHong Zhang Mat Bt; 114481d82fe4SHong Zhang PetscInt *bti,*btj; 114540192850SHong Zhang Mat_MatMatTransMult *abt; 11464c7df5ccSHong Zhang Mat_SeqAIJ *c; 1147d2853540SHong Zhang 114881d82fe4SHong Zhang PetscFunctionBegin; 114981d82fe4SHong Zhang /* create symbolic Bt */ 115081d82fe4SHong Zhang ierr = MatGetSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr); 11510298fd71SBarry Smith ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,B->cmap->n,B->rmap->n,bti,btj,NULL,&Bt);CHKERRQ(ierr); 115233d57670SJed Brown ierr = MatSetBlockSizes(Bt,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr); 115304b858e0SBarry Smith ierr = MatSetType(Bt,((PetscObject)A)->type_name);CHKERRQ(ierr); 115481d82fe4SHong Zhang 115581d82fe4SHong Zhang /* get symbolic C=A*Bt */ 115681d82fe4SHong Zhang ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C);CHKERRQ(ierr); 115781d82fe4SHong Zhang 11582128a86cSHong Zhang /* create a supporting struct for reuse intermidiate dense matrices with matcoloring */ 1159b00a9115SJed Brown ierr = PetscNew(&abt);CHKERRQ(ierr); 11604c7df5ccSHong Zhang c = (Mat_SeqAIJ*)(*C)->data; 116140192850SHong Zhang c->abt = abt; 11622128a86cSHong Zhang 116340192850SHong Zhang abt->usecoloring = PETSC_FALSE; 116440192850SHong Zhang abt->destroy = (*C)->ops->destroy; 11652128a86cSHong Zhang (*C)->ops->destroy = MatDestroy_SeqAIJ_MatMatMultTrans; 11662128a86cSHong Zhang 1167c5929fdfSBarry Smith ierr = PetscOptionsGetBool(((PetscObject)A)->options,NULL,"-matmattransmult_color",&abt->usecoloring,NULL);CHKERRQ(ierr); 116840192850SHong Zhang if (abt->usecoloring) { 1169b9af6bddSHong Zhang /* Create MatTransposeColoring from symbolic C=A*B^T */ 1170b9af6bddSHong Zhang MatTransposeColoring matcoloring; 1171335efc43SPeter Brune MatColoring coloring; 11722128a86cSHong Zhang ISColoring iscoloring; 11732128a86cSHong Zhang Mat Bt_dense,C_dense; 11744d478ae7SHong Zhang Mat_SeqAIJ *c=(Mat_SeqAIJ*)(*C)->data; 11754d478ae7SHong Zhang /* inode causes memory problem, don't know why */ 11764d478ae7SHong Zhang if (c->inode.use) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_USE_INODES is not supported. Use '-mat_no_inode'"); 1177e8354b3eSHong Zhang 1178335efc43SPeter Brune ierr = MatColoringCreate(*C,&coloring);CHKERRQ(ierr); 1179335efc43SPeter Brune ierr = MatColoringSetDistance(coloring,2);CHKERRQ(ierr); 1180335efc43SPeter Brune ierr = MatColoringSetType(coloring,MATCOLORINGSL);CHKERRQ(ierr); 1181335efc43SPeter Brune ierr = MatColoringSetFromOptions(coloring);CHKERRQ(ierr); 1182335efc43SPeter Brune ierr = MatColoringApply(coloring,&iscoloring);CHKERRQ(ierr); 1183335efc43SPeter Brune ierr = MatColoringDestroy(&coloring);CHKERRQ(ierr); 1184b9af6bddSHong Zhang ierr = MatTransposeColoringCreate(*C,iscoloring,&matcoloring);CHKERRQ(ierr); 11852205254eSKarl Rupp 118640192850SHong Zhang abt->matcoloring = matcoloring; 11872205254eSKarl Rupp 11882128a86cSHong Zhang ierr = ISColoringDestroy(&iscoloring);CHKERRQ(ierr); 11892128a86cSHong Zhang 11902128a86cSHong Zhang /* Create Bt_dense and C_dense = A*Bt_dense */ 11912128a86cSHong Zhang ierr = MatCreate(PETSC_COMM_SELF,&Bt_dense);CHKERRQ(ierr); 11922128a86cSHong Zhang ierr = MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors);CHKERRQ(ierr); 11932128a86cSHong Zhang ierr = MatSetType(Bt_dense,MATSEQDENSE);CHKERRQ(ierr); 11940298fd71SBarry Smith ierr = MatSeqDenseSetPreallocation(Bt_dense,NULL);CHKERRQ(ierr); 11952205254eSKarl Rupp 11962128a86cSHong Zhang Bt_dense->assembled = PETSC_TRUE; 119740192850SHong Zhang abt->Bt_den = Bt_dense; 11982128a86cSHong Zhang 11992128a86cSHong Zhang ierr = MatCreate(PETSC_COMM_SELF,&C_dense);CHKERRQ(ierr); 12002128a86cSHong Zhang ierr = MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors);CHKERRQ(ierr); 12012128a86cSHong Zhang ierr = MatSetType(C_dense,MATSEQDENSE);CHKERRQ(ierr); 12020298fd71SBarry Smith ierr = MatSeqDenseSetPreallocation(C_dense,NULL);CHKERRQ(ierr); 12032205254eSKarl Rupp 12042128a86cSHong Zhang Bt_dense->assembled = PETSC_TRUE; 120540192850SHong Zhang abt->ABt_den = C_dense; 1206f94ccd6cSHong Zhang 1207f94ccd6cSHong Zhang #if defined(PETSC_USE_INFO) 12081ce71dffSSatish Balay { 1209f94ccd6cSHong Zhang Mat_SeqAIJ *c = (Mat_SeqAIJ*)(*C)->data; 1210c40ebe3bSHong Zhang ierr = PetscInfo7(*C,"Use coloring of C=A*B^T; B^T: %D %D, Bt_dense: %D,%D; Cnz %D / (cm*ncolors %D) = %g\n",B->cmap->n,B->rmap->n,Bt_dense->rmap->n,Bt_dense->cmap->n,c->nz,A->rmap->n*matcoloring->ncolors,(PetscReal)(c->nz)/(A->rmap->n*matcoloring->ncolors));CHKERRQ(ierr); 12111ce71dffSSatish Balay } 1212f94ccd6cSHong Zhang #endif 12132128a86cSHong Zhang } 1214e99be685SHong Zhang /* clean up */ 1215e99be685SHong Zhang ierr = MatDestroy(&Bt);CHKERRQ(ierr); 1216e99be685SHong Zhang ierr = MatRestoreSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr); 12175df89d91SHong Zhang PetscFunctionReturn(0); 12185df89d91SHong Zhang } 12195df89d91SHong Zhang 12206fc122caSHong Zhang PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C) 12215df89d91SHong Zhang { 12225df89d91SHong Zhang PetscErrorCode ierr; 12235df89d91SHong Zhang Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data; 1224e2cac8caSJed Brown PetscInt *ai =a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow; 122581d82fe4SHong Zhang PetscInt cm =C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol; 12265df89d91SHong Zhang PetscLogDouble flops=0.0; 1227aa1aec99SHong Zhang MatScalar *aa =a->a,*aval,*ba=b->a,*bval,*ca,*cval; 122840192850SHong Zhang Mat_MatMatTransMult *abt = c->abt; 12295df89d91SHong Zhang 12305df89d91SHong Zhang PetscFunctionBegin; 123158ed3ceaSHong Zhang /* clear old values in C */ 123258ed3ceaSHong Zhang if (!c->a) { 1233854ce69bSBarry Smith ierr = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr); 123458ed3ceaSHong Zhang c->a = ca; 123558ed3ceaSHong Zhang c->free_a = PETSC_TRUE; 123658ed3ceaSHong Zhang } else { 123758ed3ceaSHong Zhang ca = c->a; 123858ed3ceaSHong Zhang } 123958ed3ceaSHong Zhang ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr); 124058ed3ceaSHong Zhang 124140192850SHong Zhang if (abt->usecoloring) { 124240192850SHong Zhang MatTransposeColoring matcoloring = abt->matcoloring; 124340192850SHong Zhang Mat Bt_dense,C_dense = abt->ABt_den; 1244c8db22f6SHong Zhang 1245b9af6bddSHong Zhang /* Get Bt_dense by Apply MatTransposeColoring to B */ 124640192850SHong Zhang Bt_dense = abt->Bt_den; 1247b9af6bddSHong Zhang ierr = MatTransColoringApplySpToDen(matcoloring,B,Bt_dense);CHKERRQ(ierr); 1248c8db22f6SHong Zhang 1249c8db22f6SHong Zhang /* C_dense = A*Bt_dense */ 12502128a86cSHong Zhang ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense);CHKERRQ(ierr); 1251c8db22f6SHong Zhang 12522128a86cSHong Zhang /* Recover C from C_dense */ 1253b9af6bddSHong Zhang ierr = MatTransColoringApplyDenToSp(matcoloring,C_dense,C);CHKERRQ(ierr); 1254c8db22f6SHong Zhang PetscFunctionReturn(0); 1255c8db22f6SHong Zhang } 1256c8db22f6SHong Zhang 12571fa1209cSHong Zhang for (i=0; i<cm; i++) { 125881d82fe4SHong Zhang anzi = ai[i+1] - ai[i]; 125981d82fe4SHong Zhang acol = aj + ai[i]; 12606973769bSHong Zhang aval = aa + ai[i]; 12611fa1209cSHong Zhang cnzi = ci[i+1] - ci[i]; 12621fa1209cSHong Zhang ccol = cj + ci[i]; 12636973769bSHong Zhang cval = ca + ci[i]; 12641fa1209cSHong Zhang for (j=0; j<cnzi; j++) { 126581d82fe4SHong Zhang brow = ccol[j]; 126681d82fe4SHong Zhang bnzj = bi[brow+1] - bi[brow]; 126781d82fe4SHong Zhang bcol = bj + bi[brow]; 12686973769bSHong Zhang bval = ba + bi[brow]; 12696973769bSHong Zhang 127081d82fe4SHong Zhang /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */ 127181d82fe4SHong Zhang nexta = 0; nextb = 0; 127281d82fe4SHong Zhang while (nexta<anzi && nextb<bnzj) { 12737b6d5e96SMark Adams while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++; 127481d82fe4SHong Zhang if (nexta == anzi) break; 12757b6d5e96SMark Adams while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++; 127681d82fe4SHong Zhang if (nextb == bnzj) break; 127781d82fe4SHong Zhang if (acol[nexta] == bcol[nextb]) { 12786973769bSHong Zhang cval[j] += aval[nexta]*bval[nextb]; 127981d82fe4SHong Zhang nexta++; nextb++; 128081d82fe4SHong Zhang flops += 2; 12811fa1209cSHong Zhang } 12821fa1209cSHong Zhang } 128381d82fe4SHong Zhang } 128481d82fe4SHong Zhang } 128581d82fe4SHong Zhang ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 128681d82fe4SHong Zhang ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 128781d82fe4SHong Zhang ierr = PetscLogFlops(flops);CHKERRQ(ierr); 12885df89d91SHong Zhang PetscFunctionReturn(0); 12895df89d91SHong Zhang } 12905df89d91SHong Zhang 12916d373c3eSHong Zhang PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(Mat A) 12926d373c3eSHong Zhang { 12936d373c3eSHong Zhang PetscErrorCode ierr; 12946d373c3eSHong Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 12956d373c3eSHong Zhang Mat_MatTransMatMult *atb = a->atb; 12966d373c3eSHong Zhang 12976d373c3eSHong Zhang PetscFunctionBegin; 12986d373c3eSHong Zhang ierr = MatDestroy(&atb->At);CHKERRQ(ierr); 12996d373c3eSHong Zhang ierr = (atb->destroy)(A);CHKERRQ(ierr); 13006d373c3eSHong Zhang ierr = PetscFree(atb);CHKERRQ(ierr); 13016d373c3eSHong Zhang PetscFunctionReturn(0); 13026d373c3eSHong Zhang } 13036d373c3eSHong Zhang 13040adebc6cSBarry Smith PetscErrorCode MatTransposeMatMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C) 13050adebc6cSBarry Smith { 13065df89d91SHong Zhang PetscErrorCode ierr; 13076d373c3eSHong Zhang const char *algTypes[2] = {"matmatmult","outerproduct"}; 13086d373c3eSHong Zhang PetscInt alg=0; /* set default algorithm */ 13096d373c3eSHong Zhang Mat At; 13106d373c3eSHong Zhang Mat_MatTransMatMult *atb; 13116d373c3eSHong Zhang Mat_SeqAIJ *c; 13125df89d91SHong Zhang 13135df89d91SHong Zhang PetscFunctionBegin; 13145df89d91SHong Zhang if (scall == MAT_INITIAL_MATRIX) { 13156d373c3eSHong Zhang ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr); 13166d373c3eSHong Zhang PetscOptionsObject->alreadyprinted = PETSC_FALSE; /* a hack to ensure the option shows in '-help' */ 13176d373c3eSHong Zhang ierr = PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,2,algTypes[0],&alg,NULL);CHKERRQ(ierr); 13186d373c3eSHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 13196d373c3eSHong Zhang 13206d373c3eSHong Zhang switch (alg) { 13216d373c3eSHong Zhang case 1: 132275648e8dSHong Zhang ierr = MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr); 13236d373c3eSHong Zhang break; 13246d373c3eSHong Zhang default: 13256d373c3eSHong Zhang ierr = PetscNew(&atb);CHKERRQ(ierr); 13266d373c3eSHong Zhang ierr = MatTranspose_SeqAIJ(A,MAT_INITIAL_MATRIX,&At);CHKERRQ(ierr); 13276d373c3eSHong Zhang ierr = MatMatMult_SeqAIJ_SeqAIJ(At,B,MAT_INITIAL_MATRIX,fill,C);CHKERRQ(ierr); 13286d373c3eSHong Zhang 1329618cf492SHong Zhang c = (Mat_SeqAIJ*)(*C)->data; 13306d373c3eSHong Zhang c->atb = atb; 13316d373c3eSHong Zhang atb->At = At; 13326d373c3eSHong Zhang atb->destroy = (*C)->ops->destroy; 13336d373c3eSHong Zhang (*C)->ops->destroy = MatDestroy_SeqAIJ_MatTransMatMult; 13346d373c3eSHong Zhang 13356d373c3eSHong Zhang break; 13365df89d91SHong Zhang } 13376d373c3eSHong Zhang } 13386d373c3eSHong Zhang if (alg) { 13396d373c3eSHong Zhang ierr = (*(*C)->ops->mattransposemultnumeric)(A,B,*C);CHKERRQ(ierr); 13406d373c3eSHong Zhang } else if (!alg && scall == MAT_REUSE_MATRIX) { 13416d373c3eSHong Zhang c = (Mat_SeqAIJ*)(*C)->data; 13426d373c3eSHong Zhang atb = c->atb; 13436d373c3eSHong Zhang At = atb->At; 13446d373c3eSHong Zhang ierr = MatTranspose_SeqAIJ(A,MAT_REUSE_MATRIX,&At);CHKERRQ(ierr); 13456d373c3eSHong Zhang ierr = MatMatMult_SeqAIJ_SeqAIJ(At,B,MAT_REUSE_MATRIX,fill,C);CHKERRQ(ierr); 13466d373c3eSHong Zhang } 13475df89d91SHong Zhang PetscFunctionReturn(0); 13485df89d91SHong Zhang } 13495df89d91SHong Zhang 135075648e8dSHong Zhang PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C) 13515df89d91SHong Zhang { 1352bc011b1eSHong Zhang PetscErrorCode ierr; 1353bc011b1eSHong Zhang Mat At; 135438baddfdSBarry Smith PetscInt *ati,*atj; 1355bc011b1eSHong Zhang 1356bc011b1eSHong Zhang PetscFunctionBegin; 1357bc011b1eSHong Zhang /* create symbolic At */ 1358bc011b1eSHong Zhang ierr = MatGetSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr); 13590298fd71SBarry Smith ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,A->cmap->n,A->rmap->n,ati,atj,NULL,&At);CHKERRQ(ierr); 136033d57670SJed Brown ierr = MatSetBlockSizes(At,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr); 136104b858e0SBarry Smith ierr = MatSetType(At,((PetscObject)A)->type_name);CHKERRQ(ierr); 1362bc011b1eSHong Zhang 1363bc011b1eSHong Zhang /* get symbolic C=At*B */ 1364bc011b1eSHong Zhang ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(At,B,fill,C);CHKERRQ(ierr); 1365bc011b1eSHong Zhang 1366bc011b1eSHong Zhang /* clean up */ 13676bf464f9SBarry Smith ierr = MatDestroy(&At);CHKERRQ(ierr); 1368bc011b1eSHong Zhang ierr = MatRestoreSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr); 13696d373c3eSHong Zhang 13706d373c3eSHong Zhang (*C)->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; 1371bc011b1eSHong Zhang PetscFunctionReturn(0); 1372bc011b1eSHong Zhang } 1373bc011b1eSHong Zhang 137475648e8dSHong Zhang PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C) 1375bc011b1eSHong Zhang { 1376bc011b1eSHong Zhang PetscErrorCode ierr; 13770fbc74f4SHong Zhang Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data; 1378d0f46423SBarry Smith PetscInt am =A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb; 1379d0f46423SBarry Smith PetscInt cm =C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k; 1380d13dce4bSSatish Balay PetscLogDouble flops=0.0; 1381aa1aec99SHong Zhang MatScalar *aa =a->a,*ba,*ca,*caj; 1382bc011b1eSHong Zhang 1383bc011b1eSHong Zhang PetscFunctionBegin; 1384aa1aec99SHong Zhang if (!c->a) { 1385854ce69bSBarry Smith ierr = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr); 13862205254eSKarl Rupp 1387aa1aec99SHong Zhang c->a = ca; 1388aa1aec99SHong Zhang c->free_a = PETSC_TRUE; 1389aa1aec99SHong Zhang } else { 1390aa1aec99SHong Zhang ca = c->a; 1391aa1aec99SHong Zhang } 1392bc011b1eSHong Zhang /* clear old values in C */ 1393bc011b1eSHong Zhang ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr); 1394bc011b1eSHong Zhang 1395bc011b1eSHong Zhang /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */ 1396bc011b1eSHong Zhang for (i=0; i<am; i++) { 1397bc011b1eSHong Zhang bj = b->j + bi[i]; 1398bc011b1eSHong Zhang ba = b->a + bi[i]; 1399bc011b1eSHong Zhang bnzi = bi[i+1] - bi[i]; 1400bc011b1eSHong Zhang anzi = ai[i+1] - ai[i]; 1401bc011b1eSHong Zhang for (j=0; j<anzi; j++) { 1402bc011b1eSHong Zhang nextb = 0; 14030fbc74f4SHong Zhang crow = *aj++; 1404bc011b1eSHong Zhang cjj = cj + ci[crow]; 1405bc011b1eSHong Zhang caj = ca + ci[crow]; 1406bc011b1eSHong Zhang /* perform sparse axpy operation. Note cjj includes bj. */ 1407bc011b1eSHong Zhang for (k=0; nextb<bnzi; k++) { 14080fbc74f4SHong Zhang if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */ 14090fbc74f4SHong Zhang caj[k] += (*aa)*(*(ba+nextb)); 1410bc011b1eSHong Zhang nextb++; 1411bc011b1eSHong Zhang } 1412bc011b1eSHong Zhang } 1413bc011b1eSHong Zhang flops += 2*bnzi; 14140fbc74f4SHong Zhang aa++; 1415bc011b1eSHong Zhang } 1416bc011b1eSHong Zhang } 1417bc011b1eSHong Zhang 1418bc011b1eSHong Zhang /* Assemble the final matrix and clean up */ 1419bc011b1eSHong Zhang ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1420bc011b1eSHong Zhang ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1421bc011b1eSHong Zhang ierr = PetscLogFlops(flops);CHKERRQ(ierr); 1422bc011b1eSHong Zhang PetscFunctionReturn(0); 1423bc011b1eSHong Zhang } 14249123193aSHong Zhang 1425150d2497SBarry Smith PETSC_INTERN PetscErrorCode MatMatMult_SeqAIJ_SeqDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C) 14269123193aSHong Zhang { 14279123193aSHong Zhang PetscErrorCode ierr; 14289123193aSHong Zhang 14299123193aSHong Zhang PetscFunctionBegin; 14309123193aSHong Zhang if (scall == MAT_INITIAL_MATRIX) { 14313ff4c91cSHong Zhang ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr); 14329123193aSHong Zhang ierr = MatMatMultSymbolic_SeqAIJ_SeqDense(A,B,fill,C);CHKERRQ(ierr); 14333ff4c91cSHong Zhang ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr); 14349123193aSHong Zhang } 14353ff4c91cSHong Zhang ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr); 14369123193aSHong Zhang ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,B,*C);CHKERRQ(ierr); 14374614e006SHong Zhang ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr); 14389123193aSHong Zhang PetscFunctionReturn(0); 14399123193aSHong Zhang } 1440edd81eecSMatthew Knepley 14419123193aSHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat *C) 14429123193aSHong Zhang { 14439123193aSHong Zhang PetscErrorCode ierr; 14449123193aSHong Zhang 14459123193aSHong Zhang PetscFunctionBegin; 14465a586d82SBarry Smith ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr); 14472205254eSKarl Rupp 1448d73949e8SHong Zhang (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense; 14499123193aSHong Zhang PetscFunctionReturn(0); 14509123193aSHong Zhang } 14519123193aSHong Zhang 14529123193aSHong Zhang PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C) 14539123193aSHong Zhang { 1454f32d5d43SBarry Smith Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 1455612438f1SStefano Zampini Mat_SeqDense *bd = (Mat_SeqDense*)B->data; 14569123193aSHong Zhang PetscErrorCode ierr; 1457daf57211SHong Zhang PetscScalar *c,*b,r1,r2,r3,r4,*c1,*c2,*c3,*c4,aatmp; 1458daf57211SHong Zhang const PetscScalar *aa,*b1,*b2,*b3,*b4; 1459daf57211SHong Zhang const PetscInt *aj; 1460612438f1SStefano Zampini PetscInt cm=C->rmap->n,cn=B->cmap->n,bm=bd->lda,am=A->rmap->n; 1461daf57211SHong Zhang PetscInt am4=4*am,bm4=4*bm,col,i,j,n,ajtmp; 14629123193aSHong Zhang 14639123193aSHong Zhang PetscFunctionBegin; 1464f32d5d43SBarry Smith if (!cm || !cn) PetscFunctionReturn(0); 1465612438f1SStefano Zampini if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n); 1466e32f2f54SBarry Smith if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n); 1467e32f2f54SBarry Smith if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n); 1468612438f1SStefano Zampini b = bd->v; 14698c778c55SBarry Smith ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr); 1470f32d5d43SBarry Smith b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm; 1471730858b9SHong Zhang c1 = c; c2 = c1 + am; c3 = c2 + am; c4 = c3 + am; 1472f32d5d43SBarry Smith for (col=0; col<cn-4; col += 4) { /* over columns of C */ 1473f32d5d43SBarry Smith for (i=0; i<am; i++) { /* over rows of C in those columns */ 1474f32d5d43SBarry Smith r1 = r2 = r3 = r4 = 0.0; 1475f32d5d43SBarry Smith n = a->i[i+1] - a->i[i]; 1476f32d5d43SBarry Smith aj = a->j + a->i[i]; 1477f32d5d43SBarry Smith aa = a->a + a->i[i]; 1478f32d5d43SBarry Smith for (j=0; j<n; j++) { 1479730858b9SHong Zhang aatmp = aa[j]; ajtmp = aj[j]; 1480730858b9SHong Zhang r1 += aatmp*b1[ajtmp]; 1481730858b9SHong Zhang r2 += aatmp*b2[ajtmp]; 1482730858b9SHong Zhang r3 += aatmp*b3[ajtmp]; 1483730858b9SHong Zhang r4 += aatmp*b4[ajtmp]; 14849123193aSHong Zhang } 1485730858b9SHong Zhang c1[i] = r1; 1486730858b9SHong Zhang c2[i] = r2; 1487730858b9SHong Zhang c3[i] = r3; 1488730858b9SHong Zhang c4[i] = r4; 1489f32d5d43SBarry Smith } 1490730858b9SHong Zhang b1 += bm4; b2 += bm4; b3 += bm4; b4 += bm4; 1491730858b9SHong Zhang c1 += am4; c2 += am4; c3 += am4; c4 += am4; 1492f32d5d43SBarry Smith } 1493f32d5d43SBarry Smith for (; col<cn; col++) { /* over extra columns of C */ 1494f32d5d43SBarry Smith for (i=0; i<am; i++) { /* over rows of C in those columns */ 1495f32d5d43SBarry Smith r1 = 0.0; 1496f32d5d43SBarry Smith n = a->i[i+1] - a->i[i]; 1497f32d5d43SBarry Smith aj = a->j + a->i[i]; 1498f32d5d43SBarry Smith aa = a->a + a->i[i]; 1499f32d5d43SBarry Smith for (j=0; j<n; j++) { 1500730858b9SHong Zhang r1 += aa[j]*b1[aj[j]]; 1501f32d5d43SBarry Smith } 1502730858b9SHong Zhang c1[i] = r1; 1503f32d5d43SBarry Smith } 1504f32d5d43SBarry Smith b1 += bm; 1505730858b9SHong Zhang c1 += am; 1506f32d5d43SBarry Smith } 1507dc0b31edSSatish Balay ierr = PetscLogFlops(cn*(2.0*a->nz));CHKERRQ(ierr); 15088c778c55SBarry Smith ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr); 1509f32d5d43SBarry Smith ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1510f32d5d43SBarry Smith ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1511f32d5d43SBarry Smith PetscFunctionReturn(0); 1512f32d5d43SBarry Smith } 1513f32d5d43SBarry Smith 1514f32d5d43SBarry Smith /* 15154324174eSBarry Smith Note very similar to MatMult_SeqAIJ(), should generate both codes from same base 1516f32d5d43SBarry Smith */ 1517f32d5d43SBarry Smith PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C) 1518f32d5d43SBarry Smith { 1519f32d5d43SBarry Smith Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 152090f5ea3eSStefano Zampini Mat_SeqDense *bd = (Mat_SeqDense*)B->data; 1521f32d5d43SBarry Smith PetscErrorCode ierr; 1522dd6ea824SBarry Smith PetscScalar *b,*c,r1,r2,r3,r4,*b1,*b2,*b3,*b4; 1523dd6ea824SBarry Smith MatScalar *aa; 152490f5ea3eSStefano Zampini PetscInt cm = C->rmap->n, cn=B->cmap->n, bm=bd->lda, col, i,j,n,*aj, am = A->rmap->n,*ii,arm; 15254324174eSBarry Smith PetscInt am2 = 2*am, am3 = 3*am, bm4 = 4*bm,colam,*ridx; 1526f32d5d43SBarry Smith 1527f32d5d43SBarry Smith PetscFunctionBegin; 1528f32d5d43SBarry Smith if (!cm || !cn) PetscFunctionReturn(0); 152990f5ea3eSStefano Zampini b = bd->v; 15308c778c55SBarry Smith ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr); 1531f32d5d43SBarry Smith b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm; 15324324174eSBarry Smith 15334324174eSBarry Smith if (a->compressedrow.use) { /* use compressed row format */ 15344324174eSBarry Smith for (col=0; col<cn-4; col += 4) { /* over columns of C */ 15354324174eSBarry Smith colam = col*am; 15364324174eSBarry Smith arm = a->compressedrow.nrows; 15374324174eSBarry Smith ii = a->compressedrow.i; 15384324174eSBarry Smith ridx = a->compressedrow.rindex; 15394324174eSBarry Smith for (i=0; i<arm; i++) { /* over rows of C in those columns */ 15404324174eSBarry Smith r1 = r2 = r3 = r4 = 0.0; 15414324174eSBarry Smith n = ii[i+1] - ii[i]; 15424324174eSBarry Smith aj = a->j + ii[i]; 15434324174eSBarry Smith aa = a->a + ii[i]; 15444324174eSBarry Smith for (j=0; j<n; j++) { 15454324174eSBarry Smith r1 += (*aa)*b1[*aj]; 15464324174eSBarry Smith r2 += (*aa)*b2[*aj]; 15474324174eSBarry Smith r3 += (*aa)*b3[*aj]; 15484324174eSBarry Smith r4 += (*aa++)*b4[*aj++]; 15494324174eSBarry Smith } 15504324174eSBarry Smith c[colam + ridx[i]] += r1; 15514324174eSBarry Smith c[colam + am + ridx[i]] += r2; 15524324174eSBarry Smith c[colam + am2 + ridx[i]] += r3; 15534324174eSBarry Smith c[colam + am3 + ridx[i]] += r4; 15544324174eSBarry Smith } 15554324174eSBarry Smith b1 += bm4; 15564324174eSBarry Smith b2 += bm4; 15574324174eSBarry Smith b3 += bm4; 15584324174eSBarry Smith b4 += bm4; 15594324174eSBarry Smith } 15604324174eSBarry Smith for (; col<cn; col++) { /* over extra columns of C */ 15614324174eSBarry Smith colam = col*am; 15624324174eSBarry Smith arm = a->compressedrow.nrows; 15634324174eSBarry Smith ii = a->compressedrow.i; 15644324174eSBarry Smith ridx = a->compressedrow.rindex; 15654324174eSBarry Smith for (i=0; i<arm; i++) { /* over rows of C in those columns */ 15664324174eSBarry Smith r1 = 0.0; 15674324174eSBarry Smith n = ii[i+1] - ii[i]; 15684324174eSBarry Smith aj = a->j + ii[i]; 15694324174eSBarry Smith aa = a->a + ii[i]; 15704324174eSBarry Smith 15714324174eSBarry Smith for (j=0; j<n; j++) { 15724324174eSBarry Smith r1 += (*aa++)*b1[*aj++]; 15734324174eSBarry Smith } 1574a2ea699eSBarry Smith c[colam + ridx[i]] += r1; 15754324174eSBarry Smith } 15764324174eSBarry Smith b1 += bm; 15774324174eSBarry Smith } 15784324174eSBarry Smith } else { 1579f32d5d43SBarry Smith for (col=0; col<cn-4; col += 4) { /* over columns of C */ 1580f32d5d43SBarry Smith colam = col*am; 1581f32d5d43SBarry Smith for (i=0; i<am; i++) { /* over rows of C in those columns */ 1582f32d5d43SBarry Smith r1 = r2 = r3 = r4 = 0.0; 1583f32d5d43SBarry Smith n = a->i[i+1] - a->i[i]; 1584f32d5d43SBarry Smith aj = a->j + a->i[i]; 1585f32d5d43SBarry Smith aa = a->a + a->i[i]; 1586f32d5d43SBarry Smith for (j=0; j<n; j++) { 1587f32d5d43SBarry Smith r1 += (*aa)*b1[*aj]; 1588f32d5d43SBarry Smith r2 += (*aa)*b2[*aj]; 1589f32d5d43SBarry Smith r3 += (*aa)*b3[*aj]; 1590f32d5d43SBarry Smith r4 += (*aa++)*b4[*aj++]; 1591f32d5d43SBarry Smith } 1592f32d5d43SBarry Smith c[colam + i] += r1; 1593f32d5d43SBarry Smith c[colam + am + i] += r2; 1594f32d5d43SBarry Smith c[colam + am2 + i] += r3; 1595f32d5d43SBarry Smith c[colam + am3 + i] += r4; 1596f32d5d43SBarry Smith } 1597f32d5d43SBarry Smith b1 += bm4; 1598f32d5d43SBarry Smith b2 += bm4; 1599f32d5d43SBarry Smith b3 += bm4; 1600f32d5d43SBarry Smith b4 += bm4; 1601f32d5d43SBarry Smith } 1602f32d5d43SBarry Smith for (; col<cn; col++) { /* over extra columns of C */ 1603a2ea699eSBarry Smith colam = col*am; 1604f32d5d43SBarry Smith for (i=0; i<am; i++) { /* over rows of C in those columns */ 1605f32d5d43SBarry Smith r1 = 0.0; 1606f32d5d43SBarry Smith n = a->i[i+1] - a->i[i]; 1607f32d5d43SBarry Smith aj = a->j + a->i[i]; 1608f32d5d43SBarry Smith aa = a->a + a->i[i]; 1609f32d5d43SBarry Smith 1610f32d5d43SBarry Smith for (j=0; j<n; j++) { 1611f32d5d43SBarry Smith r1 += (*aa++)*b1[*aj++]; 1612f32d5d43SBarry Smith } 1613a2ea699eSBarry Smith c[colam + i] += r1; 1614f32d5d43SBarry Smith } 1615f32d5d43SBarry Smith b1 += bm; 1616f32d5d43SBarry Smith } 16174324174eSBarry Smith } 1618dc0b31edSSatish Balay ierr = PetscLogFlops(cn*2.0*a->nz);CHKERRQ(ierr); 16198c778c55SBarry Smith ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr); 16209123193aSHong Zhang PetscFunctionReturn(0); 16219123193aSHong Zhang } 1622b1683b59SHong Zhang 1623b9af6bddSHong Zhang PetscErrorCode MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense) 1624c8db22f6SHong Zhang { 1625c8db22f6SHong Zhang PetscErrorCode ierr; 16262f5f1f90SHong Zhang Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 16272f5f1f90SHong Zhang Mat_SeqDense *btdense = (Mat_SeqDense*)Btdense->data; 16282f5f1f90SHong Zhang PetscInt *bi = b->i,*bj=b->j; 16292f5f1f90SHong Zhang PetscInt m = Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns; 16302f5f1f90SHong Zhang MatScalar *btval,*btval_den,*ba=b->a; 16312f5f1f90SHong Zhang PetscInt *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors; 1632c8db22f6SHong Zhang 1633c8db22f6SHong Zhang PetscFunctionBegin; 16342f5f1f90SHong Zhang btval_den=btdense->v; 16352f5f1f90SHong Zhang ierr = PetscMemzero(btval_den,(m*n)*sizeof(MatScalar));CHKERRQ(ierr); 16362f5f1f90SHong Zhang for (k=0; k<ncolors; k++) { 16372f5f1f90SHong Zhang ncolumns = coloring->ncolumns[k]; 16382f5f1f90SHong Zhang for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */ 1639d2853540SHong Zhang col = *(columns + colorforcol[k] + l); 16402f5f1f90SHong Zhang btcol = bj + bi[col]; 16412f5f1f90SHong Zhang btval = ba + bi[col]; 16422f5f1f90SHong Zhang anz = bi[col+1] - bi[col]; 1643d2853540SHong Zhang for (j=0; j<anz; j++) { 16442f5f1f90SHong Zhang brow = btcol[j]; 16452f5f1f90SHong Zhang btval_den[brow] = btval[j]; 1646c8db22f6SHong Zhang } 1647c8db22f6SHong Zhang } 16482f5f1f90SHong Zhang btval_den += m; 1649c8db22f6SHong Zhang } 1650c8db22f6SHong Zhang PetscFunctionReturn(0); 1651c8db22f6SHong Zhang } 1652c8db22f6SHong Zhang 1653b9af6bddSHong Zhang PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp) 16548972f759SHong Zhang { 16558972f759SHong Zhang PetscErrorCode ierr; 1656b2d2b04fSHong Zhang Mat_SeqAIJ *csp = (Mat_SeqAIJ*)Csp->data; 1657077f23c2SHong Zhang PetscScalar *ca_den,*ca_den_ptr,*ca=csp->a; 1658f99a636bSHong Zhang PetscInt k,l,m=Cden->rmap->n,ncolors=matcoloring->ncolors; 1659e88777f2SHong Zhang PetscInt brows=matcoloring->brows,*den2sp=matcoloring->den2sp; 1660077f23c2SHong Zhang PetscInt nrows,*row,*idx; 1661077f23c2SHong Zhang PetscInt *rows=matcoloring->rows,*colorforrow=matcoloring->colorforrow; 16628972f759SHong Zhang 16638972f759SHong Zhang PetscFunctionBegin; 1664a3fe58edSHong Zhang ierr = MatDenseGetArray(Cden,&ca_den);CHKERRQ(ierr); 1665f99a636bSHong Zhang 1666077f23c2SHong Zhang if (brows > 0) { 1667077f23c2SHong Zhang PetscInt *lstart,row_end,row_start; 1668980ae229SHong Zhang lstart = matcoloring->lstart; 1669eeb4fd02SHong Zhang ierr = PetscMemzero(lstart,ncolors*sizeof(PetscInt));CHKERRQ(ierr); 1670eeb4fd02SHong Zhang 1671077f23c2SHong Zhang row_end = brows; 1672eeb4fd02SHong Zhang if (row_end > m) row_end = m; 1673077f23c2SHong Zhang for (row_start=0; row_start<m; row_start+=brows) { /* loop over row blocks of Csp */ 1674077f23c2SHong Zhang ca_den_ptr = ca_den; 1675980ae229SHong Zhang for (k=0; k<ncolors; k++) { /* loop over colors (columns of Cden) */ 1676eeb4fd02SHong Zhang nrows = matcoloring->nrows[k]; 1677eeb4fd02SHong Zhang row = rows + colorforrow[k]; 1678eeb4fd02SHong Zhang idx = den2sp + colorforrow[k]; 1679eeb4fd02SHong Zhang for (l=lstart[k]; l<nrows; l++) { 1680eeb4fd02SHong Zhang if (row[l] >= row_end) { 1681eeb4fd02SHong Zhang lstart[k] = l; 1682eeb4fd02SHong Zhang break; 1683eeb4fd02SHong Zhang } else { 1684077f23c2SHong Zhang ca[idx[l]] = ca_den_ptr[row[l]]; 1685eeb4fd02SHong Zhang } 1686eeb4fd02SHong Zhang } 1687077f23c2SHong Zhang ca_den_ptr += m; 1688eeb4fd02SHong Zhang } 1689077f23c2SHong Zhang row_end += brows; 1690eeb4fd02SHong Zhang if (row_end > m) row_end = m; 1691eeb4fd02SHong Zhang } 1692077f23c2SHong Zhang } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */ 1693077f23c2SHong Zhang ca_den_ptr = ca_den; 1694077f23c2SHong Zhang for (k=0; k<ncolors; k++) { 1695077f23c2SHong Zhang nrows = matcoloring->nrows[k]; 1696077f23c2SHong Zhang row = rows + colorforrow[k]; 1697077f23c2SHong Zhang idx = den2sp + colorforrow[k]; 1698077f23c2SHong Zhang for (l=0; l<nrows; l++) { 1699077f23c2SHong Zhang ca[idx[l]] = ca_den_ptr[row[l]]; 1700077f23c2SHong Zhang } 1701077f23c2SHong Zhang ca_den_ptr += m; 1702077f23c2SHong Zhang } 1703f99a636bSHong Zhang } 1704f99a636bSHong Zhang 1705a3fe58edSHong Zhang ierr = MatDenseRestoreArray(Cden,&ca_den);CHKERRQ(ierr); 1706f99a636bSHong Zhang #if defined(PETSC_USE_INFO) 1707077f23c2SHong Zhang if (matcoloring->brows > 0) { 1708f99a636bSHong Zhang ierr = PetscInfo1(Csp,"Loop over %D row blocks for den2sp\n",brows);CHKERRQ(ierr); 1709e88777f2SHong Zhang } else { 1710077f23c2SHong Zhang ierr = PetscInfo(Csp,"Loop over colors/columns of Cden, inefficient for large sparse matrix product \n");CHKERRQ(ierr); 1711e88777f2SHong Zhang } 1712f99a636bSHong Zhang #endif 17138972f759SHong Zhang PetscFunctionReturn(0); 17148972f759SHong Zhang } 17158972f759SHong Zhang 1716b9af6bddSHong Zhang PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c) 1717b1683b59SHong Zhang { 1718b1683b59SHong Zhang PetscErrorCode ierr; 1719e88777f2SHong Zhang PetscInt i,n,nrows,Nbs,j,k,m,ncols,col,cm; 17201a83f524SJed Brown const PetscInt *is,*ci,*cj,*row_idx; 1721b28d80bdSHong Zhang PetscInt nis = iscoloring->n,*rowhit,bs = 1; 1722b1683b59SHong Zhang IS *isa; 1723b28d80bdSHong Zhang Mat_SeqAIJ *csp = (Mat_SeqAIJ*)mat->data; 1724e88777f2SHong Zhang PetscInt *colorforrow,*rows,*rows_i,*idxhit,*spidx,*den2sp,*den2sp_i; 1725e88777f2SHong Zhang PetscInt *colorforcol,*columns,*columns_i,brows; 1726e88777f2SHong Zhang PetscBool flg; 1727b1683b59SHong Zhang 1728b1683b59SHong Zhang PetscFunctionBegin; 1729b1683b59SHong Zhang ierr = ISColoringGetIS(iscoloring,PETSC_IGNORE,&isa);CHKERRQ(ierr); 1730e99be685SHong Zhang 17317ecccc15SHong Zhang /* bs >1 is not being tested yet! */ 1732e88777f2SHong Zhang Nbs = mat->cmap->N/bs; 1733b1683b59SHong Zhang c->M = mat->rmap->N/bs; /* set total rows, columns and local rows */ 1734e88777f2SHong Zhang c->N = Nbs; 1735e88777f2SHong Zhang c->m = c->M; 1736b1683b59SHong Zhang c->rstart = 0; 1737e88777f2SHong Zhang c->brows = 100; 1738b1683b59SHong Zhang 1739b1683b59SHong Zhang c->ncolors = nis; 1740dcca6d9dSJed Brown ierr = PetscMalloc3(nis,&c->ncolumns,nis,&c->nrows,nis+1,&colorforrow);CHKERRQ(ierr); 1741854ce69bSBarry Smith ierr = PetscMalloc1(csp->nz+1,&rows);CHKERRQ(ierr); 1742854ce69bSBarry Smith ierr = PetscMalloc1(csp->nz+1,&den2sp);CHKERRQ(ierr); 1743e88777f2SHong Zhang 1744e88777f2SHong Zhang brows = c->brows; 1745c5929fdfSBarry Smith ierr = PetscOptionsGetInt(NULL,NULL,"-matden2sp_brows",&brows,&flg);CHKERRQ(ierr); 1746e88777f2SHong Zhang if (flg) c->brows = brows; 1747eeb4fd02SHong Zhang if (brows > 0) { 1748854ce69bSBarry Smith ierr = PetscMalloc1(nis+1,&c->lstart);CHKERRQ(ierr); 1749e88777f2SHong Zhang } 17502205254eSKarl Rupp 1751d2853540SHong Zhang colorforrow[0] = 0; 1752d2853540SHong Zhang rows_i = rows; 1753f99a636bSHong Zhang den2sp_i = den2sp; 1754b1683b59SHong Zhang 1755854ce69bSBarry Smith ierr = PetscMalloc1(nis+1,&colorforcol);CHKERRQ(ierr); 1756854ce69bSBarry Smith ierr = PetscMalloc1(Nbs+1,&columns);CHKERRQ(ierr); 17572205254eSKarl Rupp 1758d2853540SHong Zhang colorforcol[0] = 0; 1759d2853540SHong Zhang columns_i = columns; 1760d2853540SHong Zhang 1761077f23c2SHong Zhang /* get column-wise storage of mat */ 1762077f23c2SHong Zhang ierr = MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr); 1763b1683b59SHong Zhang 1764b28d80bdSHong Zhang cm = c->m; 1765854ce69bSBarry Smith ierr = PetscMalloc1(cm+1,&rowhit);CHKERRQ(ierr); 1766854ce69bSBarry Smith ierr = PetscMalloc1(cm+1,&idxhit);CHKERRQ(ierr); 1767f99a636bSHong Zhang for (i=0; i<nis; i++) { /* loop over color */ 1768b1683b59SHong Zhang ierr = ISGetLocalSize(isa[i],&n);CHKERRQ(ierr); 1769b1683b59SHong Zhang ierr = ISGetIndices(isa[i],&is);CHKERRQ(ierr); 17702205254eSKarl Rupp 1771b1683b59SHong Zhang c->ncolumns[i] = n; 1772b1683b59SHong Zhang if (n) { 1773d2853540SHong Zhang ierr = PetscMemcpy(columns_i,is,n*sizeof(PetscInt));CHKERRQ(ierr); 1774b1683b59SHong Zhang } 1775d2853540SHong Zhang colorforcol[i+1] = colorforcol[i] + n; 1776d2853540SHong Zhang columns_i += n; 1777b1683b59SHong Zhang 1778b1683b59SHong Zhang /* fast, crude version requires O(N*N) work */ 1779e8354b3eSHong Zhang ierr = PetscMemzero(rowhit,cm*sizeof(PetscInt));CHKERRQ(ierr); 1780e99be685SHong Zhang 1781b7caf3d6SHong Zhang for (j=0; j<n; j++) { /* loop over columns*/ 1782b1683b59SHong Zhang col = is[j]; 1783d2853540SHong Zhang row_idx = cj + ci[col]; 1784b1683b59SHong Zhang m = ci[col+1] - ci[col]; 1785b7caf3d6SHong Zhang for (k=0; k<m; k++) { /* loop over columns marking them in rowhit */ 1786e99be685SHong Zhang idxhit[*row_idx] = spidx[ci[col] + k]; 1787d2853540SHong Zhang rowhit[*row_idx++] = col + 1; 1788b1683b59SHong Zhang } 1789b1683b59SHong Zhang } 1790b1683b59SHong Zhang /* count the number of hits */ 1791b1683b59SHong Zhang nrows = 0; 1792e8354b3eSHong Zhang for (j=0; j<cm; j++) { 1793b1683b59SHong Zhang if (rowhit[j]) nrows++; 1794b1683b59SHong Zhang } 1795b1683b59SHong Zhang c->nrows[i] = nrows; 1796d2853540SHong Zhang colorforrow[i+1] = colorforrow[i] + nrows; 1797d2853540SHong Zhang 1798b1683b59SHong Zhang nrows = 0; 1799b7caf3d6SHong Zhang for (j=0; j<cm; j++) { /* loop over rows */ 1800b1683b59SHong Zhang if (rowhit[j]) { 1801d2853540SHong Zhang rows_i[nrows] = j; 180212b89a43SHong Zhang den2sp_i[nrows] = idxhit[j]; 1803b1683b59SHong Zhang nrows++; 1804b1683b59SHong Zhang } 1805b1683b59SHong Zhang } 1806e88777f2SHong Zhang den2sp_i += nrows; 1807077f23c2SHong Zhang 1808b1683b59SHong Zhang ierr = ISRestoreIndices(isa[i],&is);CHKERRQ(ierr); 1809f99a636bSHong Zhang rows_i += nrows; 1810b1683b59SHong Zhang } 18110298fd71SBarry Smith ierr = MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr); 1812b28d80bdSHong Zhang ierr = PetscFree(rowhit);CHKERRQ(ierr); 1813b1683b59SHong Zhang ierr = ISColoringRestoreIS(iscoloring,&isa);CHKERRQ(ierr); 1814d2853540SHong Zhang if (csp->nz != colorforrow[nis]) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"csp->nz %d != colorforrow[nis] %d",csp->nz,colorforrow[nis]); 1815b28d80bdSHong Zhang 1816d2853540SHong Zhang c->colorforrow = colorforrow; 1817d2853540SHong Zhang c->rows = rows; 1818f99a636bSHong Zhang c->den2sp = den2sp; 1819d2853540SHong Zhang c->colorforcol = colorforcol; 1820d2853540SHong Zhang c->columns = columns; 18212205254eSKarl Rupp 1822f94ccd6cSHong Zhang ierr = PetscFree(idxhit);CHKERRQ(ierr); 1823b1683b59SHong Zhang PetscFunctionReturn(0); 1824b1683b59SHong Zhang } 1825d013fc79Sandi selinger 1826d013fc79Sandi selinger /* Needed for MatMatMult_SeqAIJ_SeqAIJ_Combined() */ 1827d013fc79Sandi selinger /* Append value to an array if the value is not present yet. A bitarray */ 1828d013fc79Sandi selinger /* was used to determine if there is already an entry at this position. */ 1829d013fc79Sandi selinger void appendToArray(PetscInt val, PetscInt *array, PetscInt *cnzi) 1830d013fc79Sandi selinger { 1831d013fc79Sandi selinger array[(*cnzi)++] = val; 1832d013fc79Sandi selinger } 1833d013fc79Sandi selinger 183473b67375Sandi selinger /* This algorithm combines the symbolic and numeric phase of matrix-matrix multiplication. */ 1835d013fc79Sandi selinger PetscErrorCode MatMatMult_SeqAIJ_SeqAIJ_Combined(Mat A,Mat B,PetscReal fill,Mat *C) 1836d013fc79Sandi selinger { 1837d013fc79Sandi selinger PetscErrorCode ierr; 1838d013fc79Sandi selinger PetscLogDouble flops=0.0; 1839d013fc79Sandi selinger Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c; 1840d013fc79Sandi selinger const PetscInt *ai = a->i,*bi = b->i, *aj = a->j; 1841d013fc79Sandi selinger PetscInt *ci,*cj,*cj_i; 1842d013fc79Sandi selinger PetscScalar *ca, *ca_i; 1843d013fc79Sandi selinger PetscInt c_maxmem = 0, a_maxrownnz = 0, a_rownnz, a_col; 1844d013fc79Sandi selinger PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 1845d013fc79Sandi selinger PetscInt i, k, ndouble = 0; 1846d013fc79Sandi selinger PetscReal afill; 1847d013fc79Sandi selinger PetscScalar *c_row_val_dense; 1848d013fc79Sandi selinger PetscBool *c_row_idx_flags; 1849d013fc79Sandi selinger PetscInt *aj_i = a->j; 1850d013fc79Sandi selinger PetscScalar *aa_i = a->a; 1851d013fc79Sandi selinger 1852d013fc79Sandi selinger PetscFunctionBegin; 1853d013fc79Sandi selinger /* Step 1: Determine upper bounds on memory for C */ 185473b67375Sandi selinger for (i=0; i<am; i++) { /* iterate over all rows of A */ 1855d013fc79Sandi selinger const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 1856d013fc79Sandi selinger const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 1857d013fc79Sandi selinger a_rownnz = 0; 1858d013fc79Sandi selinger for (k=0;k<anzi;++k) a_rownnz += bi[acol[k]+1] - bi[acol[k]]; 1859d013fc79Sandi selinger a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz); 1860d013fc79Sandi selinger c_maxmem += a_rownnz; 1861d013fc79Sandi selinger } 1862d013fc79Sandi selinger ierr = PetscMalloc1(am+1, &ci); CHKERRQ(ierr); 1863d013fc79Sandi selinger ierr = PetscMalloc1(bn, &c_row_val_dense); CHKERRQ(ierr); 1864d013fc79Sandi selinger ierr = PetscMalloc1(bn, &c_row_idx_flags); CHKERRQ(ierr); 1865d013fc79Sandi selinger ierr = PetscMalloc1(c_maxmem,&cj); CHKERRQ(ierr); 1866d013fc79Sandi selinger ierr = PetscMalloc1(c_maxmem,&ca); CHKERRQ(ierr); 1867d013fc79Sandi selinger ca_i = ca; 1868d013fc79Sandi selinger cj_i = cj; 1869d013fc79Sandi selinger ci[0] = 0; 187073b67375Sandi selinger ierr = PetscMemzero(c_row_val_dense, bn * sizeof(PetscScalar));CHKERRQ(ierr); 187173b67375Sandi selinger ierr = PetscMemzero(c_row_idx_flags, bn * sizeof(PetscBool));CHKERRQ(ierr); 1872d013fc79Sandi selinger for (i=0; i<am; i++) { 1873d013fc79Sandi selinger /* Step 2: Initialize the dense row vector for C */ 1874d013fc79Sandi selinger const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 1875d013fc79Sandi selinger PetscInt cnzi = 0; 1876d013fc79Sandi selinger PetscInt *bj_i; 1877d013fc79Sandi selinger PetscScalar *ba_i; 1878d013fc79Sandi selinger 1879d013fc79Sandi selinger /* Step 3: Do the numerical calculations */ 1880d013fc79Sandi selinger for (a_col=0; a_col<anzi; a_col++) { /* iterate over all non zero values in a row of A */ 1881d013fc79Sandi selinger PetscInt a_col_index = aj_i[a_col]; 1882d013fc79Sandi selinger const PetscInt bnzi = bi[a_col_index+1] - bi[a_col_index]; 1883d013fc79Sandi selinger flops += 2*bnzi; 1884d013fc79Sandi selinger bj_i = b->j + bi[a_col_index]; /* points to the current row in bj */ 1885d013fc79Sandi selinger ba_i = b->a + bi[a_col_index]; /* points to the current row in ba */ 1886d013fc79Sandi selinger for (k=0; k<bnzi; ++k) { /* iterate over all non zeros of this row in B */ 1887d013fc79Sandi selinger if (c_row_idx_flags[ bj_i[k] ] == PETSC_FALSE) { 1888d013fc79Sandi selinger appendToArray(bj_i[k], cj_i, &cnzi); 1889d013fc79Sandi selinger c_row_idx_flags[ bj_i[k] ] = PETSC_TRUE; 1890d013fc79Sandi selinger } 1891d013fc79Sandi selinger c_row_val_dense[ bj_i[k] ] += aa_i[a_col] * ba_i[k]; 1892d013fc79Sandi selinger } 1893d013fc79Sandi selinger } 1894d013fc79Sandi selinger 1895d013fc79Sandi selinger /* Sort array */ 18963353a62bSKarl Rupp ierr = PetscSortInt(cnzi, cj_i);CHKERRQ(ierr); 1897d013fc79Sandi selinger /* Step 4 */ 1898d013fc79Sandi selinger for (k=0; k < cnzi; k++) { 1899d013fc79Sandi selinger ca_i[k] = c_row_val_dense[cj_i[k]]; 1900d013fc79Sandi selinger c_row_val_dense[cj_i[k]] = 0.; 1901d013fc79Sandi selinger c_row_idx_flags[cj_i[k]] = PETSC_FALSE; 1902d013fc79Sandi selinger } 1903d013fc79Sandi selinger /* terminate current row */ 1904d013fc79Sandi selinger aa_i += anzi; 1905d013fc79Sandi selinger aj_i += anzi; 1906d013fc79Sandi selinger ca_i += cnzi; 1907d013fc79Sandi selinger cj_i += cnzi; 1908d013fc79Sandi selinger ci[i+1] = ci[i] + cnzi; 1909d013fc79Sandi selinger flops += cnzi; 1910d013fc79Sandi selinger } 1911d013fc79Sandi selinger 1912d013fc79Sandi selinger /* Step 5 */ 1913d013fc79Sandi selinger /* Create the new matrix */ 1914d013fc79Sandi selinger ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr); 1915d013fc79Sandi selinger ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr); 191602fe1965SBarry Smith ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr); 1917d013fc79Sandi selinger 1918d013fc79Sandi selinger /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 1919d013fc79Sandi selinger /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 1920d013fc79Sandi selinger c = (Mat_SeqAIJ*)((*C)->data); 1921d013fc79Sandi selinger c->a = ca; 1922d013fc79Sandi selinger c->free_a = PETSC_TRUE; 1923d013fc79Sandi selinger c->free_ij = PETSC_TRUE; 1924d013fc79Sandi selinger c->nonew = 0; 1925d013fc79Sandi selinger 1926d013fc79Sandi selinger /* set MatInfo */ 1927d013fc79Sandi selinger afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 1928d013fc79Sandi selinger if (afill < 1.0) afill = 1.0; 1929d013fc79Sandi selinger c->maxnz = ci[am]; 1930d013fc79Sandi selinger c->nz = ci[am]; 1931d013fc79Sandi selinger (*C)->info.mallocs = ndouble; 1932d013fc79Sandi selinger (*C)->info.fill_ratio_given = fill; 1933d013fc79Sandi selinger (*C)->info.fill_ratio_needed = afill; 1934d013fc79Sandi selinger 193573b67375Sandi selinger ierr = PetscFree(c_row_val_dense);CHKERRQ(ierr); 193673b67375Sandi selinger ierr = PetscFree(c_row_idx_flags);CHKERRQ(ierr); 1937d013fc79Sandi selinger 1938d013fc79Sandi selinger ierr = MatAssemblyBegin(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1939d013fc79Sandi selinger ierr = MatAssemblyEnd(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1940d013fc79Sandi selinger ierr = PetscLogFlops(flops);CHKERRQ(ierr); 1941d013fc79Sandi selinger PetscFunctionReturn(0); 1942d013fc79Sandi selinger } 1943