1be1d678aSKris Buschelman 2d50806bdSBarry Smith /* 32499ec78SHong Zhang Defines matrix-matrix product routines for pairs of SeqAIJ matrices 4d50806bdSBarry Smith C = A * B 5d50806bdSBarry Smith */ 6d50806bdSBarry Smith 7c6db04a5SJed Brown #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 8c6db04a5SJed Brown #include <../src/mat/utils/freespace.h> 9c6db04a5SJed Brown #include <petscbt.h> 10af0996ceSBarry Smith #include <petsc/private/isimpl.h> 1107475bc1SBarry Smith #include <../src/mat/impls/dense/seq/dense.h> 127bab7c10SHong Zhang 1358cf0668SJed Brown static PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat,Mat,PetscReal,Mat*); 14cd093f1dSJed Brown 155e5acdf2Sstefano_zampini #if defined(PETSC_HAVE_HYPRE) 165e5acdf2Sstefano_zampini PETSC_INTERN PetscErrorCode MatMatMultSymbolic_AIJ_AIJ_wHYPRE(Mat,Mat,PetscReal,Mat*); 175e5acdf2Sstefano_zampini #endif 185e5acdf2Sstefano_zampini 19150d2497SBarry Smith PETSC_INTERN PetscErrorCode MatMatMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C) 2038baddfdSBarry Smith { 21dfbe8321SBarry Smith PetscErrorCode ierr; 225e5acdf2Sstefano_zampini #if !defined(PETSC_HAVE_HYPRE) 23d7ed1a4dSandi selinger const char *algTypes[8] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","combined","rowmerge"}; 24d013fc79Sandi selinger PetscInt nalg = 8; 25d7ed1a4dSandi selinger #else 26d7ed1a4dSandi selinger const char *algTypes[9] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","combined","rowmerge","hypre"}; 27d7ed1a4dSandi selinger PetscInt nalg = 9; 285e5acdf2Sstefano_zampini #endif 296540a9cdSHong Zhang PetscInt alg = 0; /* set default algorithm */ 30d013fc79Sandi selinger PetscBool combined = PETSC_FALSE; /* Indicates whether the symbolic stage already computed the numerical values. */ 315c66b693SKris Buschelman 325c66b693SKris Buschelman PetscFunctionBegin; 3326be0446SHong Zhang if (scall == MAT_INITIAL_MATRIX) { 34152983bfSHong Zhang ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr); 3568eacb73SHong Zhang PetscOptionsObject->alreadyprinted = PETSC_FALSE; /* a hack to ensure the option shows in '-help' */ 365e5acdf2Sstefano_zampini ierr = PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,nalg,algTypes[0],&alg,NULL);CHKERRQ(ierr); 37d8bbc50fSBarry Smith ierr = PetscOptionsEnd();CHKERRQ(ierr); 383ff4c91cSHong Zhang ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr); 396540a9cdSHong Zhang switch (alg) { 406540a9cdSHong Zhang case 1: 41aacf854cSBarry Smith ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A,B,fill,C);CHKERRQ(ierr); 426540a9cdSHong Zhang break; 436540a9cdSHong Zhang case 2: 446540a9cdSHong Zhang ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A,B,fill,C);CHKERRQ(ierr); 456540a9cdSHong Zhang break; 466540a9cdSHong Zhang case 3: 470ced3a2bSJed Brown ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A,B,fill,C);CHKERRQ(ierr); 486540a9cdSHong Zhang break; 496540a9cdSHong Zhang case 4: 508a07c6f1SJed Brown ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A,B,fill,C);CHKERRQ(ierr); 516540a9cdSHong Zhang break; 526540a9cdSHong Zhang case 5: 5358cf0668SJed Brown ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A,B,fill,C);CHKERRQ(ierr); 546540a9cdSHong Zhang break; 555e5acdf2Sstefano_zampini case 6: 56d013fc79Sandi selinger ierr = MatMatMult_SeqAIJ_SeqAIJ_Combined(A,B,fill,C);CHKERRQ(ierr); 57d013fc79Sandi selinger combined = PETSC_TRUE; 58d013fc79Sandi selinger break; 59d013fc79Sandi selinger case 7: 60d7ed1a4dSandi selinger ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A,B,fill,C);CHKERRQ(ierr); 61d7ed1a4dSandi selinger break; 62d7ed1a4dSandi selinger #if defined(PETSC_HAVE_HYPRE) 63d7ed1a4dSandi selinger case 8: 645e5acdf2Sstefano_zampini ierr = MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A,B,fill,C);CHKERRQ(ierr); 655e5acdf2Sstefano_zampini break; 665e5acdf2Sstefano_zampini #endif 676540a9cdSHong Zhang default: 6826be0446SHong Zhang ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr); 696540a9cdSHong Zhang break; 7025023028SHong Zhang } 713ff4c91cSHong Zhang ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr); 7226be0446SHong Zhang } 735c913ed7SHong Zhang 743ff4c91cSHong Zhang ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr); 75d013fc79Sandi selinger if (!combined) { 7601e47db4SHong Zhang ierr = (*(*C)->ops->matmultnumeric)(A,B,*C);CHKERRQ(ierr); 77d013fc79Sandi selinger } 783ff4c91cSHong Zhang ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr); 795c66b693SKris Buschelman PetscFunctionReturn(0); 805c66b693SKris Buschelman } 811c24bd37SHong Zhang 8258cf0668SJed Brown static PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A,Mat B,PetscReal fill,Mat *C) 83b561aa9dSHong Zhang { 84b561aa9dSHong Zhang PetscErrorCode ierr; 85b561aa9dSHong Zhang Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 86971236abSHong Zhang PetscInt *ai=a->i,*bi=b->i,*ci,*cj; 87c1ba5575SJed Brown PetscInt am =A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 88b561aa9dSHong Zhang PetscReal afill; 89eca6b86aSHong Zhang PetscInt i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax; 90eca6b86aSHong Zhang PetscTable ta; 91fb908031SHong Zhang PetscBT lnkbt; 920298fd71SBarry Smith PetscFreeSpaceList free_space=NULL,current_space=NULL; 93b561aa9dSHong Zhang 94b561aa9dSHong Zhang PetscFunctionBegin; 95bd958071SHong Zhang /* Get ci and cj */ 96bd958071SHong Zhang /*---------------*/ 97fb908031SHong Zhang /* Allocate ci array, arrays for fill computation and */ 98fb908031SHong Zhang /* free space for accumulating nonzero column info */ 99854ce69bSBarry Smith ierr = PetscMalloc1(am+2,&ci);CHKERRQ(ierr); 100fb908031SHong Zhang ci[0] = 0; 101fb908031SHong Zhang 102fb908031SHong Zhang /* create and initialize a linked list */ 103c373ccc6SHong Zhang ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr); 104c373ccc6SHong Zhang MatRowMergeMax_SeqAIJ(b,bm,ta); 105eca6b86aSHong Zhang ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); 106eca6b86aSHong Zhang ierr = PetscTableDestroy(&ta);CHKERRQ(ierr); 107eca6b86aSHong Zhang 108eca6b86aSHong Zhang ierr = PetscLLCondensedCreate(Crmax,bn,&lnk,&lnkbt);CHKERRQ(ierr); 109fb908031SHong Zhang 110fb908031SHong Zhang /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 111f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr); 1122205254eSKarl Rupp 113fb908031SHong Zhang current_space = free_space; 114fb908031SHong Zhang 115fb908031SHong Zhang /* Determine ci and cj */ 116fb908031SHong Zhang for (i=0; i<am; i++) { 117fb908031SHong Zhang anzi = ai[i+1] - ai[i]; 118fb908031SHong Zhang aj = a->j + ai[i]; 119fb908031SHong Zhang for (j=0; j<anzi; j++) { 120fb908031SHong Zhang brow = aj[j]; 121fb908031SHong Zhang bnzj = bi[brow+1] - bi[brow]; 122fb908031SHong Zhang bj = b->j + bi[brow]; 123fb908031SHong Zhang /* add non-zero cols of B into the sorted linked list lnk */ 124fb908031SHong Zhang ierr = PetscLLCondensedAddSorted(bnzj,bj,lnk,lnkbt);CHKERRQ(ierr); 125fb908031SHong Zhang } 126fb908031SHong Zhang cnzi = lnk[0]; 127fb908031SHong Zhang 128fb908031SHong Zhang /* If free space is not available, make more free space */ 129fb908031SHong Zhang /* Double the amount of total space in the list */ 130fb908031SHong Zhang if (current_space->local_remaining<cnzi) { 131f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),¤t_space);CHKERRQ(ierr); 132fb908031SHong Zhang ndouble++; 133fb908031SHong Zhang } 134fb908031SHong Zhang 135fb908031SHong Zhang /* Copy data into free space, then initialize lnk */ 136fb908031SHong Zhang ierr = PetscLLCondensedClean(bn,cnzi,current_space->array,lnk,lnkbt);CHKERRQ(ierr); 1372205254eSKarl Rupp 138fb908031SHong Zhang current_space->array += cnzi; 139fb908031SHong Zhang current_space->local_used += cnzi; 140fb908031SHong Zhang current_space->local_remaining -= cnzi; 1412205254eSKarl Rupp 142fb908031SHong Zhang ci[i+1] = ci[i] + cnzi; 143fb908031SHong Zhang } 144fb908031SHong Zhang 145fb908031SHong Zhang /* Column indices are in the list of free space */ 146fb908031SHong Zhang /* Allocate space for cj, initialize cj, and */ 147fb908031SHong Zhang /* destroy list of free space and other temporary array(s) */ 148854ce69bSBarry Smith ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr); 149fb908031SHong Zhang ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr); 150fb908031SHong Zhang ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr); 151b561aa9dSHong Zhang 15226be0446SHong Zhang /* put together the new symbolic matrix */ 153ce94432eSBarry Smith ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr); 15433d57670SJed Brown ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr); 15502fe1965SBarry Smith ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr); 15658c24d83SHong Zhang 15758c24d83SHong Zhang /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 15858c24d83SHong Zhang /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 15958c24d83SHong Zhang c = (Mat_SeqAIJ*)((*C)->data); 160aa1aec99SHong Zhang c->free_a = PETSC_FALSE; 161e6b907acSBarry Smith c->free_ij = PETSC_TRUE; 16258c24d83SHong Zhang c->nonew = 0; 163002e8affSHong Zhang (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ; /* fast, needs non-scalable O(bn) array 'abdense' */ 1640b7e3e3dSHong Zhang 1657212da7cSHong Zhang /* set MatInfo */ 1667212da7cSHong Zhang afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 167f2b054eeSHong Zhang if (afill < 1.0) afill = 1.0; 1687212da7cSHong Zhang c->maxnz = ci[am]; 1697212da7cSHong Zhang c->nz = ci[am]; 170fb908031SHong Zhang (*C)->info.mallocs = ndouble; 1717212da7cSHong Zhang (*C)->info.fill_ratio_given = fill; 1727212da7cSHong Zhang (*C)->info.fill_ratio_needed = afill; 1737212da7cSHong Zhang 1747212da7cSHong Zhang #if defined(PETSC_USE_INFO) 1757212da7cSHong Zhang if (ci[am]) { 17657622a8eSBarry Smith ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 17757622a8eSBarry Smith ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 178f2b054eeSHong Zhang } else { 179f2b054eeSHong Zhang ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr); 180be0fcf8dSHong Zhang } 181f2b054eeSHong Zhang #endif 18258c24d83SHong Zhang PetscFunctionReturn(0); 18358c24d83SHong Zhang } 184d50806bdSBarry Smith 185dfbe8321SBarry Smith PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C) 186d50806bdSBarry Smith { 187dfbe8321SBarry Smith PetscErrorCode ierr; 188d13dce4bSSatish Balay PetscLogDouble flops=0.0; 189d50806bdSBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 190d50806bdSBarry Smith Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 191d50806bdSBarry Smith Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 19238baddfdSBarry Smith PetscInt *ai =a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j; 193c58ca2e3SHong Zhang PetscInt am =A->rmap->n,cm=C->rmap->n; 194519eb980SHong Zhang PetscInt i,j,k,anzi,bnzi,cnzi,brow; 195aa1aec99SHong Zhang PetscScalar *aa=a->a,*ba=b->a,*baj,*ca,valtmp; 196aa1aec99SHong Zhang PetscScalar *ab_dense; 197d50806bdSBarry Smith 198d50806bdSBarry Smith PetscFunctionBegin; 199aa1aec99SHong Zhang if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */ 200854ce69bSBarry Smith ierr = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr); 201aa1aec99SHong Zhang c->a = ca; 202aa1aec99SHong Zhang c->free_a = PETSC_TRUE; 203aa1aec99SHong Zhang } else { 204aa1aec99SHong Zhang ca = c->a; 205d90d86d0SMatthew G. Knepley } 206d90d86d0SMatthew G. Knepley if (!c->matmult_abdense) { 2071795a4d1SJed Brown ierr = PetscCalloc1(B->cmap->N,&ab_dense);CHKERRQ(ierr); 208d90d86d0SMatthew G. Knepley c->matmult_abdense = ab_dense; 209d90d86d0SMatthew G. Knepley } else { 210aa1aec99SHong Zhang ab_dense = c->matmult_abdense; 211aa1aec99SHong Zhang } 212aa1aec99SHong Zhang 213c124e916SHong Zhang /* clean old values in C */ 214c124e916SHong Zhang ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr); 215d50806bdSBarry Smith /* Traverse A row-wise. */ 216d50806bdSBarry Smith /* Build the ith row in C by summing over nonzero columns in A, */ 217d50806bdSBarry Smith /* the rows of B corresponding to nonzeros of A. */ 218d50806bdSBarry Smith for (i=0; i<am; i++) { 219d50806bdSBarry Smith anzi = ai[i+1] - ai[i]; 220d50806bdSBarry Smith for (j=0; j<anzi; j++) { 221519eb980SHong Zhang brow = aj[j]; 222d50806bdSBarry Smith bnzi = bi[brow+1] - bi[brow]; 223d50806bdSBarry Smith bjj = bj + bi[brow]; 224d50806bdSBarry Smith baj = ba + bi[brow]; 225519eb980SHong Zhang /* perform dense axpy */ 22636ec6d2dSHong Zhang valtmp = aa[j]; 227519eb980SHong Zhang for (k=0; k<bnzi; k++) { 22836ec6d2dSHong Zhang ab_dense[bjj[k]] += valtmp*baj[k]; 229519eb980SHong Zhang } 230519eb980SHong Zhang flops += 2*bnzi; 231519eb980SHong Zhang } 232c58ca2e3SHong Zhang aj += anzi; aa += anzi; 233c58ca2e3SHong Zhang 234c58ca2e3SHong Zhang cnzi = ci[i+1] - ci[i]; 235519eb980SHong Zhang for (k=0; k<cnzi; k++) { 236519eb980SHong Zhang ca[k] += ab_dense[cj[k]]; 237519eb980SHong Zhang ab_dense[cj[k]] = 0.0; /* zero ab_dense */ 238519eb980SHong Zhang } 239519eb980SHong Zhang flops += cnzi; 240519eb980SHong Zhang cj += cnzi; ca += cnzi; 241519eb980SHong Zhang } 242c58ca2e3SHong Zhang ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 243c58ca2e3SHong Zhang ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 244c58ca2e3SHong Zhang ierr = PetscLogFlops(flops);CHKERRQ(ierr); 245c58ca2e3SHong Zhang PetscFunctionReturn(0); 246c58ca2e3SHong Zhang } 247c58ca2e3SHong Zhang 24825023028SHong Zhang PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C) 249c58ca2e3SHong Zhang { 250c58ca2e3SHong Zhang PetscErrorCode ierr; 251c58ca2e3SHong Zhang PetscLogDouble flops=0.0; 252c58ca2e3SHong Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 253c58ca2e3SHong Zhang Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 254c58ca2e3SHong Zhang Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 255c58ca2e3SHong Zhang PetscInt *ai = a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j; 256c58ca2e3SHong Zhang PetscInt am = A->rmap->N,cm=C->rmap->N; 257c58ca2e3SHong Zhang PetscInt i,j,k,anzi,bnzi,cnzi,brow; 25836ec6d2dSHong Zhang PetscScalar *aa=a->a,*ba=b->a,*baj,*ca=c->a,valtmp; 259c58ca2e3SHong Zhang PetscInt nextb; 260c58ca2e3SHong Zhang 261c58ca2e3SHong Zhang PetscFunctionBegin; 262cf742fccSHong Zhang if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */ 263cf742fccSHong Zhang ierr = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr); 264cf742fccSHong Zhang c->a = ca; 265cf742fccSHong Zhang c->free_a = PETSC_TRUE; 266cf742fccSHong Zhang } 267cf742fccSHong Zhang 268c58ca2e3SHong Zhang /* clean old values in C */ 269c58ca2e3SHong Zhang ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr); 270c58ca2e3SHong Zhang /* Traverse A row-wise. */ 271c58ca2e3SHong Zhang /* Build the ith row in C by summing over nonzero columns in A, */ 272c58ca2e3SHong Zhang /* the rows of B corresponding to nonzeros of A. */ 273519eb980SHong Zhang for (i=0; i<am; i++) { 274519eb980SHong Zhang anzi = ai[i+1] - ai[i]; 275519eb980SHong Zhang cnzi = ci[i+1] - ci[i]; 276519eb980SHong Zhang for (j=0; j<anzi; j++) { 277519eb980SHong Zhang brow = aj[j]; 278519eb980SHong Zhang bnzi = bi[brow+1] - bi[brow]; 279519eb980SHong Zhang bjj = bj + bi[brow]; 280519eb980SHong Zhang baj = ba + bi[brow]; 281519eb980SHong Zhang /* perform sparse axpy */ 28236ec6d2dSHong Zhang valtmp = aa[j]; 283c124e916SHong Zhang nextb = 0; 284c124e916SHong Zhang for (k=0; nextb<bnzi; k++) { 285c124e916SHong Zhang if (cj[k] == bjj[nextb]) { /* ccol == bcol */ 28636ec6d2dSHong Zhang ca[k] += valtmp*baj[nextb++]; 287c124e916SHong Zhang } 288d50806bdSBarry Smith } 289d50806bdSBarry Smith flops += 2*bnzi; 290d50806bdSBarry Smith } 291519eb980SHong Zhang aj += anzi; aa += anzi; 292519eb980SHong Zhang cj += cnzi; ca += cnzi; 293d50806bdSBarry Smith } 294c58ca2e3SHong Zhang 295716bacf3SKris Buschelman ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 296716bacf3SKris Buschelman ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 297d50806bdSBarry Smith ierr = PetscLogFlops(flops);CHKERRQ(ierr); 298d50806bdSBarry Smith PetscFunctionReturn(0); 299d50806bdSBarry Smith } 300bc011b1eSHong Zhang 3013c50cad2SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A,Mat B,PetscReal fill,Mat *C) 30225296bd5SBarry Smith { 30325296bd5SBarry Smith PetscErrorCode ierr; 30425296bd5SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 30525296bd5SBarry Smith PetscInt *ai = a->i,*bi=b->i,*ci,*cj; 3063c50cad2SHong Zhang PetscInt am = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 30725296bd5SBarry Smith MatScalar *ca; 30825296bd5SBarry Smith PetscReal afill; 309eca6b86aSHong Zhang PetscInt i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax; 310eca6b86aSHong Zhang PetscTable ta; 3110298fd71SBarry Smith PetscFreeSpaceList free_space=NULL,current_space=NULL; 31225296bd5SBarry Smith 31325296bd5SBarry Smith PetscFunctionBegin; 3143c50cad2SHong Zhang /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */ 3153c50cad2SHong Zhang /*-----------------------------------------------------------------------------------------*/ 3163c50cad2SHong Zhang /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 317854ce69bSBarry Smith ierr = PetscMalloc1(am+2,&ci);CHKERRQ(ierr); 3183c50cad2SHong Zhang ci[0] = 0; 3193c50cad2SHong Zhang 3203c50cad2SHong Zhang /* create and initialize a linked list */ 321c373ccc6SHong Zhang ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr); 322c373ccc6SHong Zhang MatRowMergeMax_SeqAIJ(b,bm,ta); 323eca6b86aSHong Zhang ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); 324eca6b86aSHong Zhang ierr = PetscTableDestroy(&ta);CHKERRQ(ierr); 325eca6b86aSHong Zhang 326eca6b86aSHong Zhang ierr = PetscLLCondensedCreate_fast(Crmax,&lnk);CHKERRQ(ierr); 3273c50cad2SHong Zhang 3283c50cad2SHong Zhang /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 329f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr); 3303c50cad2SHong Zhang current_space = free_space; 3313c50cad2SHong Zhang 3323c50cad2SHong Zhang /* Determine ci and cj */ 3333c50cad2SHong Zhang for (i=0; i<am; i++) { 3343c50cad2SHong Zhang anzi = ai[i+1] - ai[i]; 3353c50cad2SHong Zhang aj = a->j + ai[i]; 3363c50cad2SHong Zhang for (j=0; j<anzi; j++) { 3373c50cad2SHong Zhang brow = aj[j]; 3383c50cad2SHong Zhang bnzj = bi[brow+1] - bi[brow]; 3393c50cad2SHong Zhang bj = b->j + bi[brow]; 3403c50cad2SHong Zhang /* add non-zero cols of B into the sorted linked list lnk */ 3413c50cad2SHong Zhang ierr = PetscLLCondensedAddSorted_fast(bnzj,bj,lnk);CHKERRQ(ierr); 3423c50cad2SHong Zhang } 3433c50cad2SHong Zhang cnzi = lnk[1]; 3443c50cad2SHong Zhang 3453c50cad2SHong Zhang /* If free space is not available, make more free space */ 3463c50cad2SHong Zhang /* Double the amount of total space in the list */ 3473c50cad2SHong Zhang if (current_space->local_remaining<cnzi) { 348f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),¤t_space);CHKERRQ(ierr); 3493c50cad2SHong Zhang ndouble++; 3503c50cad2SHong Zhang } 3513c50cad2SHong Zhang 3523c50cad2SHong Zhang /* Copy data into free space, then initialize lnk */ 3533c50cad2SHong Zhang ierr = PetscLLCondensedClean_fast(cnzi,current_space->array,lnk);CHKERRQ(ierr); 3542205254eSKarl Rupp 3553c50cad2SHong Zhang current_space->array += cnzi; 3563c50cad2SHong Zhang current_space->local_used += cnzi; 3573c50cad2SHong Zhang current_space->local_remaining -= cnzi; 3582205254eSKarl Rupp 3593c50cad2SHong Zhang ci[i+1] = ci[i] + cnzi; 3603c50cad2SHong Zhang } 3613c50cad2SHong Zhang 3623c50cad2SHong Zhang /* Column indices are in the list of free space */ 3633c50cad2SHong Zhang /* Allocate space for cj, initialize cj, and */ 3643c50cad2SHong Zhang /* destroy list of free space and other temporary array(s) */ 365854ce69bSBarry Smith ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr); 3663c50cad2SHong Zhang ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr); 3673c50cad2SHong Zhang ierr = PetscLLCondensedDestroy_fast(lnk);CHKERRQ(ierr); 36825296bd5SBarry Smith 36925296bd5SBarry Smith /* Allocate space for ca */ 370854ce69bSBarry Smith ierr = PetscMalloc1(ci[am]+1,&ca);CHKERRQ(ierr); 37125296bd5SBarry Smith ierr = PetscMemzero(ca,(ci[am]+1)*sizeof(MatScalar));CHKERRQ(ierr); 37225296bd5SBarry Smith 37325296bd5SBarry Smith /* put together the new symbolic matrix */ 374ce94432eSBarry Smith ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,C);CHKERRQ(ierr); 37533d57670SJed Brown ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr); 37602fe1965SBarry Smith ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr); 37725296bd5SBarry Smith 37825296bd5SBarry Smith /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 37925296bd5SBarry Smith /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 38025296bd5SBarry Smith c = (Mat_SeqAIJ*)((*C)->data); 38125296bd5SBarry Smith c->free_a = PETSC_TRUE; 38225296bd5SBarry Smith c->free_ij = PETSC_TRUE; 38325296bd5SBarry Smith c->nonew = 0; 3842205254eSKarl Rupp 38525296bd5SBarry Smith (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; /* slower, less memory */ 38625296bd5SBarry Smith 38725296bd5SBarry Smith /* set MatInfo */ 38825296bd5SBarry Smith afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 38925296bd5SBarry Smith if (afill < 1.0) afill = 1.0; 39025296bd5SBarry Smith c->maxnz = ci[am]; 39125296bd5SBarry Smith c->nz = ci[am]; 3923c50cad2SHong Zhang (*C)->info.mallocs = ndouble; 39325296bd5SBarry Smith (*C)->info.fill_ratio_given = fill; 39425296bd5SBarry Smith (*C)->info.fill_ratio_needed = afill; 39525296bd5SBarry Smith 39625296bd5SBarry Smith #if defined(PETSC_USE_INFO) 39725296bd5SBarry Smith if (ci[am]) { 39857622a8eSBarry Smith ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 39957622a8eSBarry Smith ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 40025296bd5SBarry Smith } else { 40125296bd5SBarry Smith ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr); 40225296bd5SBarry Smith } 40325296bd5SBarry Smith #endif 40425296bd5SBarry Smith PetscFunctionReturn(0); 40525296bd5SBarry Smith } 40625296bd5SBarry Smith 40725296bd5SBarry Smith 40825023028SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat *C) 409e9e4536cSHong Zhang { 410e9e4536cSHong Zhang PetscErrorCode ierr; 411e9e4536cSHong Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 412bf9555e6SHong Zhang PetscInt *ai = a->i,*bi=b->i,*ci,*cj; 41325c41797SHong Zhang PetscInt am = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 414e9e4536cSHong Zhang MatScalar *ca; 415e9e4536cSHong Zhang PetscReal afill; 416eca6b86aSHong Zhang PetscInt i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax; 417eca6b86aSHong Zhang PetscTable ta; 4180298fd71SBarry Smith PetscFreeSpaceList free_space=NULL,current_space=NULL; 419e9e4536cSHong Zhang 420e9e4536cSHong Zhang PetscFunctionBegin; 421bd958071SHong Zhang /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */ 422bd958071SHong Zhang /*---------------------------------------------------------------------------------------------*/ 423bd958071SHong Zhang /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 424854ce69bSBarry Smith ierr = PetscMalloc1(am+2,&ci);CHKERRQ(ierr); 425bd958071SHong Zhang ci[0] = 0; 426bd958071SHong Zhang 427bd958071SHong Zhang /* create and initialize a linked list */ 428c373ccc6SHong Zhang ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr); 429c373ccc6SHong Zhang MatRowMergeMax_SeqAIJ(b,bm,ta); 430eca6b86aSHong Zhang ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); 431eca6b86aSHong Zhang ierr = PetscTableDestroy(&ta);CHKERRQ(ierr); 432eca6b86aSHong Zhang ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr); 433bd958071SHong Zhang 434bd958071SHong Zhang /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 435f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr); 436bd958071SHong Zhang current_space = free_space; 437bd958071SHong Zhang 438bd958071SHong Zhang /* Determine ci and cj */ 439bd958071SHong Zhang for (i=0; i<am; i++) { 440bd958071SHong Zhang anzi = ai[i+1] - ai[i]; 441bd958071SHong Zhang aj = a->j + ai[i]; 442bd958071SHong Zhang for (j=0; j<anzi; j++) { 443bd958071SHong Zhang brow = aj[j]; 444bd958071SHong Zhang bnzj = bi[brow+1] - bi[brow]; 445bd958071SHong Zhang bj = b->j + bi[brow]; 446bd958071SHong Zhang /* add non-zero cols of B into the sorted linked list lnk */ 447bd958071SHong Zhang ierr = PetscLLCondensedAddSorted_Scalable(bnzj,bj,lnk);CHKERRQ(ierr); 448bd958071SHong Zhang } 449bd958071SHong Zhang cnzi = lnk[0]; 450bd958071SHong Zhang 451bd958071SHong Zhang /* If free space is not available, make more free space */ 452bd958071SHong Zhang /* Double the amount of total space in the list */ 453bd958071SHong Zhang if (current_space->local_remaining<cnzi) { 454f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),¤t_space);CHKERRQ(ierr); 455bd958071SHong Zhang ndouble++; 456bd958071SHong Zhang } 457bd958071SHong Zhang 458bd958071SHong Zhang /* Copy data into free space, then initialize lnk */ 459bd958071SHong Zhang ierr = PetscLLCondensedClean_Scalable(cnzi,current_space->array,lnk);CHKERRQ(ierr); 4602205254eSKarl Rupp 461bd958071SHong Zhang current_space->array += cnzi; 462bd958071SHong Zhang current_space->local_used += cnzi; 463bd958071SHong Zhang current_space->local_remaining -= cnzi; 4642205254eSKarl Rupp 465bd958071SHong Zhang ci[i+1] = ci[i] + cnzi; 466bd958071SHong Zhang } 467bd958071SHong Zhang 468bd958071SHong Zhang /* Column indices are in the list of free space */ 469bd958071SHong Zhang /* Allocate space for cj, initialize cj, and */ 470bd958071SHong Zhang /* destroy list of free space and other temporary array(s) */ 471854ce69bSBarry Smith ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr); 472bd958071SHong Zhang ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr); 473bd958071SHong Zhang ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr); 474e9e4536cSHong Zhang 475e9e4536cSHong Zhang /* Allocate space for ca */ 476bd958071SHong Zhang /*-----------------------*/ 477854ce69bSBarry Smith ierr = PetscMalloc1(ci[am]+1,&ca);CHKERRQ(ierr); 478e9e4536cSHong Zhang ierr = PetscMemzero(ca,(ci[am]+1)*sizeof(MatScalar));CHKERRQ(ierr); 479e9e4536cSHong Zhang 480e9e4536cSHong Zhang /* put together the new symbolic matrix */ 481ce94432eSBarry Smith ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,C);CHKERRQ(ierr); 48233d57670SJed Brown ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr); 48302fe1965SBarry Smith ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr); 484e9e4536cSHong Zhang 485e9e4536cSHong Zhang /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 486e9e4536cSHong Zhang /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 487e9e4536cSHong Zhang c = (Mat_SeqAIJ*)((*C)->data); 488e9e4536cSHong Zhang c->free_a = PETSC_TRUE; 489e9e4536cSHong Zhang c->free_ij = PETSC_TRUE; 490e9e4536cSHong Zhang c->nonew = 0; 4912205254eSKarl Rupp 49225023028SHong Zhang (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; /* slower, less memory */ 493e9e4536cSHong Zhang 494e9e4536cSHong Zhang /* set MatInfo */ 495e9e4536cSHong Zhang afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 496e9e4536cSHong Zhang if (afill < 1.0) afill = 1.0; 497e9e4536cSHong Zhang c->maxnz = ci[am]; 498e9e4536cSHong Zhang c->nz = ci[am]; 499bd958071SHong Zhang (*C)->info.mallocs = ndouble; 500e9e4536cSHong Zhang (*C)->info.fill_ratio_given = fill; 501e9e4536cSHong Zhang (*C)->info.fill_ratio_needed = afill; 502e9e4536cSHong Zhang 503e9e4536cSHong Zhang #if defined(PETSC_USE_INFO) 504e9e4536cSHong Zhang if (ci[am]) { 50557622a8eSBarry Smith ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 50657622a8eSBarry Smith ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 507e9e4536cSHong Zhang } else { 508e9e4536cSHong Zhang ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr); 509e9e4536cSHong Zhang } 510e9e4536cSHong Zhang #endif 511e9e4536cSHong Zhang PetscFunctionReturn(0); 512e9e4536cSHong Zhang } 513e9e4536cSHong Zhang 5140ced3a2bSJed Brown PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A,Mat B,PetscReal fill,Mat *C) 5150ced3a2bSJed Brown { 5160ced3a2bSJed Brown PetscErrorCode ierr; 5170ced3a2bSJed Brown Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 5180ced3a2bSJed Brown const PetscInt *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j; 5190ced3a2bSJed Brown PetscInt *ci,*cj,*bb; 5200ced3a2bSJed Brown PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 5210ced3a2bSJed Brown PetscReal afill; 5220ced3a2bSJed Brown PetscInt i,j,col,ndouble = 0; 5230298fd71SBarry Smith PetscFreeSpaceList free_space=NULL,current_space=NULL; 5240ced3a2bSJed Brown PetscHeap h; 5250ced3a2bSJed Brown 5260ced3a2bSJed Brown PetscFunctionBegin; 527cd093f1dSJed Brown /* Get ci and cj - by merging sorted rows using a heap */ 5280ced3a2bSJed Brown /*---------------------------------------------------------------------------------------------*/ 5290ced3a2bSJed Brown /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 530854ce69bSBarry Smith ierr = PetscMalloc1(am+2,&ci);CHKERRQ(ierr); 5310ced3a2bSJed Brown ci[0] = 0; 5320ced3a2bSJed Brown 5330ced3a2bSJed Brown /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 534f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr); 5350ced3a2bSJed Brown current_space = free_space; 5360ced3a2bSJed Brown 5370ced3a2bSJed Brown ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr); 538785e854fSJed Brown ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr); 5390ced3a2bSJed Brown 5400ced3a2bSJed Brown /* Determine ci and cj */ 5410ced3a2bSJed Brown for (i=0; i<am; i++) { 5420ced3a2bSJed Brown const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 5430ced3a2bSJed Brown const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 5440ced3a2bSJed Brown ci[i+1] = ci[i]; 5450ced3a2bSJed Brown /* Populate the min heap */ 5460ced3a2bSJed Brown for (j=0; j<anzi; j++) { 5470ced3a2bSJed Brown bb[j] = bi[acol[j]]; /* bb points at the start of the row */ 5480ced3a2bSJed Brown if (bb[j] < bi[acol[j]+1]) { /* Add if row is nonempty */ 5490ced3a2bSJed Brown ierr = PetscHeapAdd(h,j,bj[bb[j]++]);CHKERRQ(ierr); 5500ced3a2bSJed Brown } 5510ced3a2bSJed Brown } 5520ced3a2bSJed Brown /* Pick off the min element, adding it to free space */ 5530ced3a2bSJed Brown ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr); 5540ced3a2bSJed Brown while (j >= 0) { 5550ced3a2bSJed Brown if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */ 556f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),¤t_space);CHKERRQ(ierr); 5570ced3a2bSJed Brown ndouble++; 5580ced3a2bSJed Brown } 5590ced3a2bSJed Brown *(current_space->array++) = col; 5600ced3a2bSJed Brown current_space->local_used++; 5610ced3a2bSJed Brown current_space->local_remaining--; 5620ced3a2bSJed Brown ci[i+1]++; 5630ced3a2bSJed Brown 5640ced3a2bSJed Brown /* stash if anything else remains in this row of B */ 5650ced3a2bSJed Brown if (bb[j] < bi[acol[j]+1]) {ierr = PetscHeapStash(h,j,bj[bb[j]++]);CHKERRQ(ierr);} 5660ced3a2bSJed Brown while (1) { /* pop and stash any other rows of B that also had an entry in this column */ 5670ced3a2bSJed Brown PetscInt j2,col2; 5680ced3a2bSJed Brown ierr = PetscHeapPeek(h,&j2,&col2);CHKERRQ(ierr); 5690ced3a2bSJed Brown if (col2 != col) break; 5700ced3a2bSJed Brown ierr = PetscHeapPop(h,&j2,&col2);CHKERRQ(ierr); 5710ced3a2bSJed Brown if (bb[j2] < bi[acol[j2]+1]) {ierr = PetscHeapStash(h,j2,bj[bb[j2]++]);CHKERRQ(ierr);} 5720ced3a2bSJed Brown } 5730ced3a2bSJed Brown /* Put any stashed elements back into the min heap */ 5740ced3a2bSJed Brown ierr = PetscHeapUnstash(h);CHKERRQ(ierr); 5750ced3a2bSJed Brown ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr); 5760ced3a2bSJed Brown } 5770ced3a2bSJed Brown } 5780ced3a2bSJed Brown ierr = PetscFree(bb);CHKERRQ(ierr); 5790ced3a2bSJed Brown ierr = PetscHeapDestroy(&h);CHKERRQ(ierr); 5800ced3a2bSJed Brown 5810ced3a2bSJed Brown /* Column indices are in the list of free space */ 5820ced3a2bSJed Brown /* Allocate space for cj, initialize cj, and */ 5830ced3a2bSJed Brown /* destroy list of free space and other temporary array(s) */ 584785e854fSJed Brown ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr); 5850ced3a2bSJed Brown ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr); 5860ced3a2bSJed Brown 5870ced3a2bSJed Brown /* put together the new symbolic matrix */ 588ce94432eSBarry Smith ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr); 58933d57670SJed Brown ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr); 59002fe1965SBarry Smith ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr); 5910ced3a2bSJed Brown 5920ced3a2bSJed Brown /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 5930ced3a2bSJed Brown /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 5940ced3a2bSJed Brown c = (Mat_SeqAIJ*)((*C)->data); 5950ced3a2bSJed Brown c->free_a = PETSC_TRUE; 5960ced3a2bSJed Brown c->free_ij = PETSC_TRUE; 5970ced3a2bSJed Brown c->nonew = 0; 59826fbe8dcSKarl Rupp 59989d95c1aSJed Brown (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ; 6000ced3a2bSJed Brown 6010ced3a2bSJed Brown /* set MatInfo */ 6020ced3a2bSJed Brown afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 6030ced3a2bSJed Brown if (afill < 1.0) afill = 1.0; 6040ced3a2bSJed Brown c->maxnz = ci[am]; 6050ced3a2bSJed Brown c->nz = ci[am]; 6060ced3a2bSJed Brown (*C)->info.mallocs = ndouble; 6070ced3a2bSJed Brown (*C)->info.fill_ratio_given = fill; 6080ced3a2bSJed Brown (*C)->info.fill_ratio_needed = afill; 6090ced3a2bSJed Brown 6100ced3a2bSJed Brown #if defined(PETSC_USE_INFO) 6110ced3a2bSJed Brown if (ci[am]) { 61257622a8eSBarry Smith ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 61357622a8eSBarry Smith ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 6140ced3a2bSJed Brown } else { 6150ced3a2bSJed Brown ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr); 6160ced3a2bSJed Brown } 6170ced3a2bSJed Brown #endif 6180ced3a2bSJed Brown PetscFunctionReturn(0); 6190ced3a2bSJed Brown } 620e9e4536cSHong Zhang 6218a07c6f1SJed Brown PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A,Mat B,PetscReal fill,Mat *C) 6228a07c6f1SJed Brown { 6238a07c6f1SJed Brown PetscErrorCode ierr; 6248a07c6f1SJed Brown Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 6258a07c6f1SJed Brown const PetscInt *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j; 6268a07c6f1SJed Brown PetscInt *ci,*cj,*bb; 6278a07c6f1SJed Brown PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 6288a07c6f1SJed Brown PetscReal afill; 6298a07c6f1SJed Brown PetscInt i,j,col,ndouble = 0; 6300298fd71SBarry Smith PetscFreeSpaceList free_space=NULL,current_space=NULL; 6318a07c6f1SJed Brown PetscHeap h; 6328a07c6f1SJed Brown PetscBT bt; 6338a07c6f1SJed Brown 6348a07c6f1SJed Brown PetscFunctionBegin; 635cd093f1dSJed Brown /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */ 6368a07c6f1SJed Brown /*---------------------------------------------------------------------------------------------*/ 6378a07c6f1SJed Brown /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 638854ce69bSBarry Smith ierr = PetscMalloc1(am+2,&ci);CHKERRQ(ierr); 6398a07c6f1SJed Brown ci[0] = 0; 6408a07c6f1SJed Brown 6418a07c6f1SJed Brown /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 642f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr); 6432205254eSKarl Rupp 6448a07c6f1SJed Brown current_space = free_space; 6458a07c6f1SJed Brown 6468a07c6f1SJed Brown ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr); 647785e854fSJed Brown ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr); 6488a07c6f1SJed Brown ierr = PetscBTCreate(bn,&bt);CHKERRQ(ierr); 6498a07c6f1SJed Brown 6508a07c6f1SJed Brown /* Determine ci and cj */ 6518a07c6f1SJed Brown for (i=0; i<am; i++) { 6528a07c6f1SJed Brown const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 6538a07c6f1SJed Brown const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 6548a07c6f1SJed Brown const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */ 6558a07c6f1SJed Brown ci[i+1] = ci[i]; 6568a07c6f1SJed Brown /* Populate the min heap */ 6578a07c6f1SJed Brown for (j=0; j<anzi; j++) { 6588a07c6f1SJed Brown PetscInt brow = acol[j]; 6598a07c6f1SJed Brown for (bb[j] = bi[brow]; bb[j] < bi[brow+1]; bb[j]++) { 6608a07c6f1SJed Brown PetscInt bcol = bj[bb[j]]; 6618a07c6f1SJed Brown if (!PetscBTLookupSet(bt,bcol)) { /* new entry */ 6628a07c6f1SJed Brown ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr); 6638a07c6f1SJed Brown bb[j]++; 6648a07c6f1SJed Brown break; 6658a07c6f1SJed Brown } 6668a07c6f1SJed Brown } 6678a07c6f1SJed Brown } 6688a07c6f1SJed Brown /* Pick off the min element, adding it to free space */ 6698a07c6f1SJed Brown ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr); 6708a07c6f1SJed Brown while (j >= 0) { 6718a07c6f1SJed Brown if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */ 6720298fd71SBarry Smith fptr = NULL; /* need PetscBTMemzero */ 673f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),¤t_space);CHKERRQ(ierr); 6748a07c6f1SJed Brown ndouble++; 6758a07c6f1SJed Brown } 6768a07c6f1SJed Brown *(current_space->array++) = col; 6778a07c6f1SJed Brown current_space->local_used++; 6788a07c6f1SJed Brown current_space->local_remaining--; 6798a07c6f1SJed Brown ci[i+1]++; 6808a07c6f1SJed Brown 6818a07c6f1SJed Brown /* stash if anything else remains in this row of B */ 6828a07c6f1SJed Brown for (; bb[j] < bi[acol[j]+1]; bb[j]++) { 6838a07c6f1SJed Brown PetscInt bcol = bj[bb[j]]; 6848a07c6f1SJed Brown if (!PetscBTLookupSet(bt,bcol)) { /* new entry */ 6858a07c6f1SJed Brown ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr); 6868a07c6f1SJed Brown bb[j]++; 6878a07c6f1SJed Brown break; 6888a07c6f1SJed Brown } 6898a07c6f1SJed Brown } 6908a07c6f1SJed Brown ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr); 6918a07c6f1SJed Brown } 6928a07c6f1SJed Brown if (fptr) { /* Clear the bits for this row */ 6938a07c6f1SJed Brown for (; fptr<current_space->array; fptr++) {ierr = PetscBTClear(bt,*fptr);CHKERRQ(ierr);} 6948a07c6f1SJed Brown } else { /* We reallocated so we don't remember (easily) how to clear only the bits we changed */ 6958a07c6f1SJed Brown ierr = PetscBTMemzero(bn,bt);CHKERRQ(ierr); 6968a07c6f1SJed Brown } 6978a07c6f1SJed Brown } 6988a07c6f1SJed Brown ierr = PetscFree(bb);CHKERRQ(ierr); 6998a07c6f1SJed Brown ierr = PetscHeapDestroy(&h);CHKERRQ(ierr); 7008a07c6f1SJed Brown ierr = PetscBTDestroy(&bt);CHKERRQ(ierr); 7018a07c6f1SJed Brown 7028a07c6f1SJed Brown /* Column indices are in the list of free space */ 7038a07c6f1SJed Brown /* Allocate space for cj, initialize cj, and */ 7048a07c6f1SJed Brown /* destroy list of free space and other temporary array(s) */ 705785e854fSJed Brown ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr); 7068a07c6f1SJed Brown ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr); 7078a07c6f1SJed Brown 7088a07c6f1SJed Brown /* put together the new symbolic matrix */ 709ce94432eSBarry Smith ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr); 71033d57670SJed Brown ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr); 71102fe1965SBarry Smith ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr); 7128a07c6f1SJed Brown 7138a07c6f1SJed Brown /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 7148a07c6f1SJed Brown /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 7158a07c6f1SJed Brown c = (Mat_SeqAIJ*)((*C)->data); 7168a07c6f1SJed Brown c->free_a = PETSC_TRUE; 7178a07c6f1SJed Brown c->free_ij = PETSC_TRUE; 7188a07c6f1SJed Brown c->nonew = 0; 71926fbe8dcSKarl Rupp 72089d95c1aSJed Brown (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ; 7218a07c6f1SJed Brown 7228a07c6f1SJed Brown /* set MatInfo */ 7238a07c6f1SJed Brown afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 7248a07c6f1SJed Brown if (afill < 1.0) afill = 1.0; 7258a07c6f1SJed Brown c->maxnz = ci[am]; 7268a07c6f1SJed Brown c->nz = ci[am]; 7278a07c6f1SJed Brown (*C)->info.mallocs = ndouble; 7288a07c6f1SJed Brown (*C)->info.fill_ratio_given = fill; 7298a07c6f1SJed Brown (*C)->info.fill_ratio_needed = afill; 7308a07c6f1SJed Brown 7318a07c6f1SJed Brown #if defined(PETSC_USE_INFO) 7328a07c6f1SJed Brown if (ci[am]) { 73357622a8eSBarry Smith ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 73457622a8eSBarry Smith ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 7358a07c6f1SJed Brown } else { 7368a07c6f1SJed Brown ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr); 7378a07c6f1SJed Brown } 7388a07c6f1SJed Brown #endif 7398a07c6f1SJed Brown PetscFunctionReturn(0); 7408a07c6f1SJed Brown } 7418a07c6f1SJed Brown 742d7ed1a4dSandi selinger 743d7ed1a4dSandi selinger PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A,Mat B,PetscReal fill,Mat *C) 744d7ed1a4dSandi selinger { 745d7ed1a4dSandi selinger PetscErrorCode ierr; 746d7ed1a4dSandi selinger Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 747d7ed1a4dSandi selinger const PetscInt *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j,*inputi,*inputj,*inputcol,*inputcol_L1; 748d7ed1a4dSandi selinger PetscInt *ci,*cj,*outputj,worki_L1[9],worki_L2[9]; 749d7ed1a4dSandi selinger PetscInt c_maxmem,a_maxrownnz=0,a_rownnz; 750d7ed1a4dSandi selinger const PetscInt workcol[8]={0,1,2,3,4,5,6,7}; 751d7ed1a4dSandi selinger const PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 752d7ed1a4dSandi selinger const PetscInt *brow_ptr[8],*brow_end[8]; 753d7ed1a4dSandi selinger PetscInt window[8]; 754d7ed1a4dSandi selinger PetscInt window_min,old_window_min,ci_nnz,outputi_nnz=0,L1_nrows,L2_nrows; 755d7ed1a4dSandi selinger PetscInt i,k,ndouble=0,L1_rowsleft,rowsleft; 756d7ed1a4dSandi selinger PetscReal afill; 757f83700f2Sandi selinger PetscInt *workj_L1,*workj_L2,*workj_L3; 7587660c4dbSandi selinger PetscInt L1_nnz,L2_nnz; 759d7ed1a4dSandi selinger 760d7ed1a4dSandi selinger /* Step 1: Get upper bound on memory required for allocation. 761d7ed1a4dSandi selinger Because of the way virtual memory works, 762d7ed1a4dSandi selinger only the memory pages that are actually needed will be physically allocated. */ 763d7ed1a4dSandi selinger PetscFunctionBegin; 764d7ed1a4dSandi selinger ierr = PetscMalloc1(am+1,&ci);CHKERRQ(ierr); 765d7ed1a4dSandi selinger for (i=0; i<am; i++) { 766d7ed1a4dSandi selinger const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 767d7ed1a4dSandi selinger const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 768d7ed1a4dSandi selinger a_rownnz = 0; 769d7ed1a4dSandi selinger for (k=0; k<anzi; ++k) { 770d7ed1a4dSandi selinger a_rownnz += bi[acol[k]+1] - bi[acol[k]]; 771d7ed1a4dSandi selinger if (a_rownnz > bn) { 772d7ed1a4dSandi selinger a_rownnz = bn; 773d7ed1a4dSandi selinger break; 774d7ed1a4dSandi selinger } 775d7ed1a4dSandi selinger } 776d7ed1a4dSandi selinger a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz); 777d7ed1a4dSandi selinger } 778d7ed1a4dSandi selinger /* temporary work areas for merging rows */ 779d7ed1a4dSandi selinger ierr = PetscMalloc1(a_maxrownnz*8,&workj_L1);CHKERRQ(ierr); 780f83700f2Sandi selinger ierr = PetscMalloc1(a_maxrownnz*8,&workj_L2);CHKERRQ(ierr); 781f83700f2Sandi selinger ierr = PetscMalloc1(a_maxrownnz,&workj_L3);CHKERRQ(ierr); 782d7ed1a4dSandi selinger 7837660c4dbSandi selinger /* This should be enough for almost all matrices. If not, memory is reallocated later. */ 7847660c4dbSandi selinger c_maxmem = 8*(ai[am]+bi[bm]); 785d7ed1a4dSandi selinger /* Step 2: Populate pattern for C */ 786d7ed1a4dSandi selinger ierr = PetscMalloc1(c_maxmem,&cj);CHKERRQ(ierr); 787d7ed1a4dSandi selinger 788d7ed1a4dSandi selinger ci_nnz = 0; 789d7ed1a4dSandi selinger ci[0] = 0; 790d7ed1a4dSandi selinger worki_L1[0] = 0; 791d7ed1a4dSandi selinger worki_L2[0] = 0; 792d7ed1a4dSandi selinger for (i=0; i<am; i++) { 793d7ed1a4dSandi selinger const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 794d7ed1a4dSandi selinger const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 795d7ed1a4dSandi selinger rowsleft = anzi; 796d7ed1a4dSandi selinger inputcol_L1 = acol; 797d7ed1a4dSandi selinger L2_nnz = 0; 7987660c4dbSandi selinger L2_nrows = 1; /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1 */ 7997660c4dbSandi selinger worki_L2[1] = 0; 800*08fe1b3cSKarl Rupp outputi_nnz = 0; 801d7ed1a4dSandi selinger 802d7ed1a4dSandi selinger /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem -> allocate more memory */ 803d7ed1a4dSandi selinger while (ci_nnz+a_maxrownnz > c_maxmem) { 804d7ed1a4dSandi selinger c_maxmem *= 2; 805d7ed1a4dSandi selinger ndouble++; 806d7ed1a4dSandi selinger ierr = PetscRealloc(sizeof(PetscInt)*c_maxmem,&cj);CHKERRQ(ierr); 807d7ed1a4dSandi selinger } 808d7ed1a4dSandi selinger 809d7ed1a4dSandi selinger while (rowsleft) { 810d7ed1a4dSandi selinger L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */ 811d7ed1a4dSandi selinger L1_nrows = 0; 8127660c4dbSandi selinger L1_nnz = 0; 813d7ed1a4dSandi selinger inputcol = inputcol_L1; 814d7ed1a4dSandi selinger inputi = bi; 815d7ed1a4dSandi selinger inputj = bj; 816d7ed1a4dSandi selinger 817d7ed1a4dSandi selinger /* The following macro is used to specialize for small rows in A. 818d7ed1a4dSandi selinger This helps with compiler unrolling, improving performance substantially. 819f83700f2Sandi selinger Input: inputj inputi inputcol bn 820d7ed1a4dSandi selinger Output: outputj outputi_nnz */ 821d7ed1a4dSandi selinger #define MatMatMultSymbolic_RowMergeMacro(ANNZ) \ 822d7ed1a4dSandi selinger window_min = bn; \ 8237660c4dbSandi selinger outputi_nnz = 0; \ 8247660c4dbSandi selinger for (k=0; k<ANNZ; ++k) { \ 825d7ed1a4dSandi selinger brow_ptr[k] = inputj + inputi[inputcol[k]]; \ 826d7ed1a4dSandi selinger brow_end[k] = inputj + inputi[inputcol[k]+1]; \ 827d7ed1a4dSandi selinger window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \ 828d7ed1a4dSandi selinger window_min = PetscMin(window[k], window_min); \ 829d7ed1a4dSandi selinger } \ 830d7ed1a4dSandi selinger while (window_min < bn) { \ 831d7ed1a4dSandi selinger outputj[outputi_nnz++] = window_min; \ 832d7ed1a4dSandi selinger /* advance front and compute new minimum */ \ 833d7ed1a4dSandi selinger old_window_min = window_min; \ 834d7ed1a4dSandi selinger window_min = bn; \ 835d7ed1a4dSandi selinger for (k=0; k<ANNZ; ++k) { \ 836d7ed1a4dSandi selinger if (window[k] == old_window_min) { \ 837d7ed1a4dSandi selinger brow_ptr[k]++; \ 838d7ed1a4dSandi selinger window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \ 839d7ed1a4dSandi selinger } \ 840d7ed1a4dSandi selinger window_min = PetscMin(window[k], window_min); \ 841d7ed1a4dSandi selinger } \ 842d7ed1a4dSandi selinger } 843d7ed1a4dSandi selinger 844d7ed1a4dSandi selinger /************** L E V E L 1 ***************/ 845d7ed1a4dSandi selinger /* Merge up to 8 rows of B to L1 work array*/ 846d7ed1a4dSandi selinger while (L1_rowsleft) { 8477660c4dbSandi selinger outputi_nnz = 0; 8487660c4dbSandi selinger if (anzi > 8) outputj = workj_L1 + L1_nnz; /* Level 1 rowmerge*/ 8497660c4dbSandi selinger else outputj = cj + ci_nnz; /* Merge directly to C */ 8507660c4dbSandi selinger 851d7ed1a4dSandi selinger switch (L1_rowsleft) { 852d7ed1a4dSandi selinger case 1: brow_ptr[0] = inputj + inputi[inputcol[0]]; 853d7ed1a4dSandi selinger brow_end[0] = inputj + inputi[inputcol[0]+1]; 854d7ed1a4dSandi selinger for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 855d7ed1a4dSandi selinger inputcol += L1_rowsleft; 856d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 857d7ed1a4dSandi selinger L1_rowsleft = 0; 858d7ed1a4dSandi selinger break; 859d7ed1a4dSandi selinger case 2: MatMatMultSymbolic_RowMergeMacro(2); 860d7ed1a4dSandi selinger inputcol += L1_rowsleft; 861d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 862d7ed1a4dSandi selinger L1_rowsleft = 0; 863d7ed1a4dSandi selinger break; 864d7ed1a4dSandi selinger case 3: MatMatMultSymbolic_RowMergeMacro(3); 865d7ed1a4dSandi selinger inputcol += L1_rowsleft; 866d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 867d7ed1a4dSandi selinger L1_rowsleft = 0; 868d7ed1a4dSandi selinger break; 869d7ed1a4dSandi selinger case 4: MatMatMultSymbolic_RowMergeMacro(4); 870d7ed1a4dSandi selinger inputcol += L1_rowsleft; 871d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 872d7ed1a4dSandi selinger L1_rowsleft = 0; 873d7ed1a4dSandi selinger break; 874d7ed1a4dSandi selinger case 5: MatMatMultSymbolic_RowMergeMacro(5); 875d7ed1a4dSandi selinger inputcol += L1_rowsleft; 876d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 877d7ed1a4dSandi selinger L1_rowsleft = 0; 878d7ed1a4dSandi selinger break; 879d7ed1a4dSandi selinger case 6: MatMatMultSymbolic_RowMergeMacro(6); 880d7ed1a4dSandi selinger inputcol += L1_rowsleft; 881d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 882d7ed1a4dSandi selinger L1_rowsleft = 0; 883d7ed1a4dSandi selinger break; 884d7ed1a4dSandi selinger case 7: MatMatMultSymbolic_RowMergeMacro(7); 885d7ed1a4dSandi selinger inputcol += L1_rowsleft; 886d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 887d7ed1a4dSandi selinger L1_rowsleft = 0; 888d7ed1a4dSandi selinger break; 889d7ed1a4dSandi selinger default: MatMatMultSymbolic_RowMergeMacro(8); 890d7ed1a4dSandi selinger inputcol += 8; 891d7ed1a4dSandi selinger rowsleft -= 8; 892d7ed1a4dSandi selinger L1_rowsleft -= 8; 893d7ed1a4dSandi selinger break; 894d7ed1a4dSandi selinger } 895d7ed1a4dSandi selinger inputcol_L1 = inputcol; 8967660c4dbSandi selinger L1_nnz += outputi_nnz; 8977660c4dbSandi selinger worki_L1[++L1_nrows] = L1_nnz; 898d7ed1a4dSandi selinger } 899d7ed1a4dSandi selinger 900d7ed1a4dSandi selinger /********************** L E V E L 2 ************************/ 901d7ed1a4dSandi selinger /* Merge from L1 work array to either C or to L2 work array */ 902d7ed1a4dSandi selinger if (anzi > 8) { 903d7ed1a4dSandi selinger inputi = worki_L1; 904d7ed1a4dSandi selinger inputj = workj_L1; 905d7ed1a4dSandi selinger inputcol = workcol; 906d7ed1a4dSandi selinger outputi_nnz = 0; 907d7ed1a4dSandi selinger 908d7ed1a4dSandi selinger if (anzi <= 64) outputj = cj + ci_nnz; /* Merge from L1 work array to C */ 909d7ed1a4dSandi selinger else outputj = workj_L2 + L2_nnz; /* Merge from L1 work array to L2 work array */ 910d7ed1a4dSandi selinger 911d7ed1a4dSandi selinger switch (L1_nrows) { 912d7ed1a4dSandi selinger case 1: brow_ptr[0] = inputj + inputi[inputcol[0]]; 913d7ed1a4dSandi selinger brow_end[0] = inputj + inputi[inputcol[0]+1]; 914d7ed1a4dSandi selinger for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 915d7ed1a4dSandi selinger break; 916d7ed1a4dSandi selinger case 2: MatMatMultSymbolic_RowMergeMacro(2); break; 917d7ed1a4dSandi selinger case 3: MatMatMultSymbolic_RowMergeMacro(3); break; 918d7ed1a4dSandi selinger case 4: MatMatMultSymbolic_RowMergeMacro(4); break; 919d7ed1a4dSandi selinger case 5: MatMatMultSymbolic_RowMergeMacro(5); break; 920d7ed1a4dSandi selinger case 6: MatMatMultSymbolic_RowMergeMacro(6); break; 921d7ed1a4dSandi selinger case 7: MatMatMultSymbolic_RowMergeMacro(7); break; 922d7ed1a4dSandi selinger case 8: MatMatMultSymbolic_RowMergeMacro(8); break; 923d7ed1a4dSandi selinger default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L1 work array!"); 924d7ed1a4dSandi selinger } 925d7ed1a4dSandi selinger L2_nnz += outputi_nnz; 926d7ed1a4dSandi selinger worki_L2[++L2_nrows] = L2_nnz; 927d7ed1a4dSandi selinger 9287660c4dbSandi selinger /************************ L E V E L 3 **********************/ 9297660c4dbSandi selinger /* Merge from L2 work array to either C or to L2 work array */ 930d7ed1a4dSandi selinger if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) { 931d7ed1a4dSandi selinger inputi = worki_L2; 932d7ed1a4dSandi selinger inputj = workj_L2; 933d7ed1a4dSandi selinger inputcol = workcol; 934d7ed1a4dSandi selinger outputi_nnz = 0; 935f83700f2Sandi selinger if (rowsleft) outputj = workj_L3; 936d7ed1a4dSandi selinger else outputj = cj + ci_nnz; 937d7ed1a4dSandi selinger switch (L2_nrows) { 938d7ed1a4dSandi selinger case 1: brow_ptr[0] = inputj + inputi[inputcol[0]]; 939d7ed1a4dSandi selinger brow_end[0] = inputj + inputi[inputcol[0]+1]; 940d7ed1a4dSandi selinger for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 941d7ed1a4dSandi selinger break; 942d7ed1a4dSandi selinger case 2: MatMatMultSymbolic_RowMergeMacro(2); break; 943d7ed1a4dSandi selinger case 3: MatMatMultSymbolic_RowMergeMacro(3); break; 944d7ed1a4dSandi selinger case 4: MatMatMultSymbolic_RowMergeMacro(4); break; 945d7ed1a4dSandi selinger case 5: MatMatMultSymbolic_RowMergeMacro(5); break; 946d7ed1a4dSandi selinger case 6: MatMatMultSymbolic_RowMergeMacro(6); break; 947d7ed1a4dSandi selinger case 7: MatMatMultSymbolic_RowMergeMacro(7); break; 948d7ed1a4dSandi selinger case 8: MatMatMultSymbolic_RowMergeMacro(8); break; 949d7ed1a4dSandi selinger default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L2 work array!"); 950d7ed1a4dSandi selinger } 951d7ed1a4dSandi selinger L2_nrows = 1; 9527660c4dbSandi selinger L2_nnz = outputi_nnz; 9537660c4dbSandi selinger worki_L2[1] = outputi_nnz; 9547660c4dbSandi selinger /* Copy to workj_L2 */ 955d7ed1a4dSandi selinger if (rowsleft) { 9567660c4dbSandi selinger for (k=0; k<outputi_nnz; ++k) workj_L2[k] = outputj[k]; 957d7ed1a4dSandi selinger } 958d7ed1a4dSandi selinger } 959d7ed1a4dSandi selinger } 960d7ed1a4dSandi selinger } /* while (rowsleft) */ 961d7ed1a4dSandi selinger #undef MatMatMultSymbolic_RowMergeMacro 962d7ed1a4dSandi selinger 963d7ed1a4dSandi selinger /* terminate current row */ 964d7ed1a4dSandi selinger ci_nnz += outputi_nnz; 965d7ed1a4dSandi selinger ci[i+1] = ci_nnz; 966d7ed1a4dSandi selinger } 967d7ed1a4dSandi selinger 968d7ed1a4dSandi selinger /* Step 3: Create the new symbolic matrix */ 969d7ed1a4dSandi selinger ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr); 970d7ed1a4dSandi selinger ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr); 971f83700f2Sandi selinger ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr); 972d7ed1a4dSandi selinger 973d7ed1a4dSandi selinger /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 974d7ed1a4dSandi selinger /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 975d7ed1a4dSandi selinger c = (Mat_SeqAIJ*)((*C)->data); 976d7ed1a4dSandi selinger c->free_a = PETSC_TRUE; 977d7ed1a4dSandi selinger c->free_ij = PETSC_TRUE; 978d7ed1a4dSandi selinger c->nonew = 0; 979d7ed1a4dSandi selinger 980d7ed1a4dSandi selinger (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ; 981d7ed1a4dSandi selinger 982d7ed1a4dSandi selinger /* set MatInfo */ 983d7ed1a4dSandi selinger afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 984d7ed1a4dSandi selinger if (afill < 1.0) afill = 1.0; 985d7ed1a4dSandi selinger c->maxnz = ci[am]; 986d7ed1a4dSandi selinger c->nz = ci[am]; 987d7ed1a4dSandi selinger (*C)->info.mallocs = ndouble; 988d7ed1a4dSandi selinger (*C)->info.fill_ratio_given = fill; 989d7ed1a4dSandi selinger (*C)->info.fill_ratio_needed = afill; 990d7ed1a4dSandi selinger 991d7ed1a4dSandi selinger #if defined(PETSC_USE_INFO) 992d7ed1a4dSandi selinger if (ci[am]) { 993d7ed1a4dSandi selinger ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 994d7ed1a4dSandi selinger ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 995d7ed1a4dSandi selinger } else { 996d7ed1a4dSandi selinger ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr); 997d7ed1a4dSandi selinger } 998d7ed1a4dSandi selinger #endif 999d7ed1a4dSandi selinger 1000d7ed1a4dSandi selinger /* Step 4: Free temporary work areas */ 1001d7ed1a4dSandi selinger ierr = PetscFree(workj_L1);CHKERRQ(ierr); 1002d7ed1a4dSandi selinger ierr = PetscFree(workj_L2);CHKERRQ(ierr); 1003f83700f2Sandi selinger ierr = PetscFree(workj_L3);CHKERRQ(ierr); 1004d7ed1a4dSandi selinger PetscFunctionReturn(0); 1005d7ed1a4dSandi selinger } 1006d7ed1a4dSandi selinger 1007cd093f1dSJed Brown /* concatenate unique entries and then sort */ 100858cf0668SJed Brown PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C) 1009cd093f1dSJed Brown { 1010cd093f1dSJed Brown PetscErrorCode ierr; 1011cd093f1dSJed Brown Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 1012cd093f1dSJed Brown const PetscInt *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j; 1013cd093f1dSJed Brown PetscInt *ci,*cj; 1014cd093f1dSJed Brown PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 1015cd093f1dSJed Brown PetscReal afill; 1016cd093f1dSJed Brown PetscInt i,j,ndouble = 0; 1017cd093f1dSJed Brown PetscSegBuffer seg,segrow; 1018cd093f1dSJed Brown char *seen; 1019cd093f1dSJed Brown 1020cd093f1dSJed Brown PetscFunctionBegin; 1021854ce69bSBarry Smith ierr = PetscMalloc1(am+1,&ci);CHKERRQ(ierr); 1022cd093f1dSJed Brown ci[0] = 0; 1023cd093f1dSJed Brown 1024cd093f1dSJed Brown /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 1025cd093f1dSJed Brown ierr = PetscSegBufferCreate(sizeof(PetscInt),(PetscInt)(fill*(ai[am]+bi[bm])),&seg);CHKERRQ(ierr); 1026cd093f1dSJed Brown ierr = PetscSegBufferCreate(sizeof(PetscInt),100,&segrow);CHKERRQ(ierr); 1027785e854fSJed Brown ierr = PetscMalloc1(bn,&seen);CHKERRQ(ierr); 1028cd093f1dSJed Brown ierr = PetscMemzero(seen,bn*sizeof(char));CHKERRQ(ierr); 1029cd093f1dSJed Brown 1030cd093f1dSJed Brown /* Determine ci and cj */ 1031cd093f1dSJed Brown for (i=0; i<am; i++) { 1032cd093f1dSJed Brown const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 1033cd093f1dSJed Brown const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 1034cd093f1dSJed Brown PetscInt packlen = 0,*PETSC_RESTRICT crow; 1035cd093f1dSJed Brown /* Pack segrow */ 1036cd093f1dSJed Brown for (j=0; j<anzi; j++) { 1037cd093f1dSJed Brown PetscInt brow = acol[j],bjstart = bi[brow],bjend = bi[brow+1],k; 1038cd093f1dSJed Brown for (k=bjstart; k<bjend; k++) { 1039cd093f1dSJed Brown PetscInt bcol = bj[k]; 1040cd093f1dSJed Brown if (!seen[bcol]) { /* new entry */ 1041cd093f1dSJed Brown PetscInt *PETSC_RESTRICT slot; 1042cd093f1dSJed Brown ierr = PetscSegBufferGetInts(segrow,1,&slot);CHKERRQ(ierr); 1043cd093f1dSJed Brown *slot = bcol; 1044cd093f1dSJed Brown seen[bcol] = 1; 1045cd093f1dSJed Brown packlen++; 1046cd093f1dSJed Brown } 1047cd093f1dSJed Brown } 1048cd093f1dSJed Brown } 1049cd093f1dSJed Brown ierr = PetscSegBufferGetInts(seg,packlen,&crow);CHKERRQ(ierr); 1050cd093f1dSJed Brown ierr = PetscSegBufferExtractTo(segrow,crow);CHKERRQ(ierr); 1051cd093f1dSJed Brown ierr = PetscSortInt(packlen,crow);CHKERRQ(ierr); 1052cd093f1dSJed Brown ci[i+1] = ci[i] + packlen; 1053cd093f1dSJed Brown for (j=0; j<packlen; j++) seen[crow[j]] = 0; 1054cd093f1dSJed Brown } 1055cd093f1dSJed Brown ierr = PetscSegBufferDestroy(&segrow);CHKERRQ(ierr); 1056cd093f1dSJed Brown ierr = PetscFree(seen);CHKERRQ(ierr); 1057cd093f1dSJed Brown 1058cd093f1dSJed Brown /* Column indices are in the segmented buffer */ 1059cd093f1dSJed Brown ierr = PetscSegBufferExtractAlloc(seg,&cj);CHKERRQ(ierr); 1060cd093f1dSJed Brown ierr = PetscSegBufferDestroy(&seg);CHKERRQ(ierr); 1061cd093f1dSJed Brown 1062cd093f1dSJed Brown /* put together the new symbolic matrix */ 1063cd093f1dSJed Brown ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr); 106433d57670SJed Brown ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr); 106502fe1965SBarry Smith ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr); 1066cd093f1dSJed Brown 1067cd093f1dSJed Brown /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 1068cd093f1dSJed Brown /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 1069cd093f1dSJed Brown c = (Mat_SeqAIJ*)((*C)->data); 1070cd093f1dSJed Brown c->free_a = PETSC_TRUE; 1071cd093f1dSJed Brown c->free_ij = PETSC_TRUE; 1072cd093f1dSJed Brown c->nonew = 0; 1073cd093f1dSJed Brown 1074cd093f1dSJed Brown (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ; 1075cd093f1dSJed Brown 1076cd093f1dSJed Brown /* set MatInfo */ 1077cd093f1dSJed Brown afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 1078cd093f1dSJed Brown if (afill < 1.0) afill = 1.0; 1079cd093f1dSJed Brown c->maxnz = ci[am]; 1080cd093f1dSJed Brown c->nz = ci[am]; 1081cd093f1dSJed Brown (*C)->info.mallocs = ndouble; 1082cd093f1dSJed Brown (*C)->info.fill_ratio_given = fill; 1083cd093f1dSJed Brown (*C)->info.fill_ratio_needed = afill; 1084cd093f1dSJed Brown 1085cd093f1dSJed Brown #if defined(PETSC_USE_INFO) 1086cd093f1dSJed Brown if (ci[am]) { 108757622a8eSBarry Smith ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 108857622a8eSBarry Smith ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 1089cd093f1dSJed Brown } else { 1090cd093f1dSJed Brown ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr); 1091cd093f1dSJed Brown } 1092cd093f1dSJed Brown #endif 1093cd093f1dSJed Brown PetscFunctionReturn(0); 1094cd093f1dSJed Brown } 1095cd093f1dSJed Brown 1096d2853540SHong Zhang /* This routine is not used. Should be removed! */ 10976fc122caSHong Zhang PetscErrorCode MatMatTransposeMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C) 10985df89d91SHong Zhang { 1099bc011b1eSHong Zhang PetscErrorCode ierr; 1100bc011b1eSHong Zhang 1101bc011b1eSHong Zhang PetscFunctionBegin; 1102bc011b1eSHong Zhang if (scall == MAT_INITIAL_MATRIX) { 11033ff4c91cSHong Zhang ierr = PetscLogEventBegin(MAT_MatTransposeMultSymbolic,A,B,0,0);CHKERRQ(ierr); 11046fc122caSHong Zhang ierr = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr); 11053ff4c91cSHong Zhang ierr = PetscLogEventEnd(MAT_MatTransposeMultSymbolic,A,B,0,0);CHKERRQ(ierr); 1106bc011b1eSHong Zhang } 11073ff4c91cSHong Zhang ierr = PetscLogEventBegin(MAT_MatTransposeMultNumeric,A,B,0,0);CHKERRQ(ierr); 11086fc122caSHong Zhang ierr = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(A,B,*C);CHKERRQ(ierr); 11093ff4c91cSHong Zhang ierr = PetscLogEventEnd(MAT_MatTransposeMultNumeric,A,B,0,0);CHKERRQ(ierr); 1110bc011b1eSHong Zhang PetscFunctionReturn(0); 1111bc011b1eSHong Zhang } 1112bc011b1eSHong Zhang 11132128a86cSHong Zhang PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(Mat A) 11142128a86cSHong Zhang { 11152128a86cSHong Zhang PetscErrorCode ierr; 11164c7df5ccSHong Zhang Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 111740192850SHong Zhang Mat_MatMatTransMult *abt=a->abt; 11182128a86cSHong Zhang 11192128a86cSHong Zhang PetscFunctionBegin; 112040192850SHong Zhang ierr = (abt->destroy)(A);CHKERRQ(ierr); 112140192850SHong Zhang ierr = MatTransposeColoringDestroy(&abt->matcoloring);CHKERRQ(ierr); 112240192850SHong Zhang ierr = MatDestroy(&abt->Bt_den);CHKERRQ(ierr); 112340192850SHong Zhang ierr = MatDestroy(&abt->ABt_den);CHKERRQ(ierr); 112440192850SHong Zhang ierr = PetscFree(abt);CHKERRQ(ierr); 11252128a86cSHong Zhang PetscFunctionReturn(0); 11262128a86cSHong Zhang } 11272128a86cSHong Zhang 11286fc122caSHong Zhang PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C) 1129bc011b1eSHong Zhang { 11305df89d91SHong Zhang PetscErrorCode ierr; 113181d82fe4SHong Zhang Mat Bt; 113281d82fe4SHong Zhang PetscInt *bti,*btj; 113340192850SHong Zhang Mat_MatMatTransMult *abt; 11344c7df5ccSHong Zhang Mat_SeqAIJ *c; 1135d2853540SHong Zhang 113681d82fe4SHong Zhang PetscFunctionBegin; 113781d82fe4SHong Zhang /* create symbolic Bt */ 113881d82fe4SHong Zhang ierr = MatGetSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr); 11390298fd71SBarry Smith ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,B->cmap->n,B->rmap->n,bti,btj,NULL,&Bt);CHKERRQ(ierr); 114033d57670SJed Brown ierr = MatSetBlockSizes(Bt,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr); 114104b858e0SBarry Smith ierr = MatSetType(Bt,((PetscObject)A)->type_name);CHKERRQ(ierr); 114281d82fe4SHong Zhang 114381d82fe4SHong Zhang /* get symbolic C=A*Bt */ 114481d82fe4SHong Zhang ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C);CHKERRQ(ierr); 114581d82fe4SHong Zhang 11462128a86cSHong Zhang /* create a supporting struct for reuse intermidiate dense matrices with matcoloring */ 1147b00a9115SJed Brown ierr = PetscNew(&abt);CHKERRQ(ierr); 11484c7df5ccSHong Zhang c = (Mat_SeqAIJ*)(*C)->data; 114940192850SHong Zhang c->abt = abt; 11502128a86cSHong Zhang 115140192850SHong Zhang abt->usecoloring = PETSC_FALSE; 115240192850SHong Zhang abt->destroy = (*C)->ops->destroy; 11532128a86cSHong Zhang (*C)->ops->destroy = MatDestroy_SeqAIJ_MatMatMultTrans; 11542128a86cSHong Zhang 1155c5929fdfSBarry Smith ierr = PetscOptionsGetBool(((PetscObject)A)->options,NULL,"-matmattransmult_color",&abt->usecoloring,NULL);CHKERRQ(ierr); 115640192850SHong Zhang if (abt->usecoloring) { 1157b9af6bddSHong Zhang /* Create MatTransposeColoring from symbolic C=A*B^T */ 1158b9af6bddSHong Zhang MatTransposeColoring matcoloring; 1159335efc43SPeter Brune MatColoring coloring; 11602128a86cSHong Zhang ISColoring iscoloring; 11612128a86cSHong Zhang Mat Bt_dense,C_dense; 11624d478ae7SHong Zhang Mat_SeqAIJ *c=(Mat_SeqAIJ*)(*C)->data; 11634d478ae7SHong Zhang /* inode causes memory problem, don't know why */ 11644d478ae7SHong Zhang if (c->inode.use) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_USE_INODES is not supported. Use '-mat_no_inode'"); 1165e8354b3eSHong Zhang 1166335efc43SPeter Brune ierr = MatColoringCreate(*C,&coloring);CHKERRQ(ierr); 1167335efc43SPeter Brune ierr = MatColoringSetDistance(coloring,2);CHKERRQ(ierr); 1168335efc43SPeter Brune ierr = MatColoringSetType(coloring,MATCOLORINGSL);CHKERRQ(ierr); 1169335efc43SPeter Brune ierr = MatColoringSetFromOptions(coloring);CHKERRQ(ierr); 1170335efc43SPeter Brune ierr = MatColoringApply(coloring,&iscoloring);CHKERRQ(ierr); 1171335efc43SPeter Brune ierr = MatColoringDestroy(&coloring);CHKERRQ(ierr); 1172b9af6bddSHong Zhang ierr = MatTransposeColoringCreate(*C,iscoloring,&matcoloring);CHKERRQ(ierr); 11732205254eSKarl Rupp 117440192850SHong Zhang abt->matcoloring = matcoloring; 11752205254eSKarl Rupp 11762128a86cSHong Zhang ierr = ISColoringDestroy(&iscoloring);CHKERRQ(ierr); 11772128a86cSHong Zhang 11782128a86cSHong Zhang /* Create Bt_dense and C_dense = A*Bt_dense */ 11792128a86cSHong Zhang ierr = MatCreate(PETSC_COMM_SELF,&Bt_dense);CHKERRQ(ierr); 11802128a86cSHong Zhang ierr = MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors);CHKERRQ(ierr); 11812128a86cSHong Zhang ierr = MatSetType(Bt_dense,MATSEQDENSE);CHKERRQ(ierr); 11820298fd71SBarry Smith ierr = MatSeqDenseSetPreallocation(Bt_dense,NULL);CHKERRQ(ierr); 11832205254eSKarl Rupp 11842128a86cSHong Zhang Bt_dense->assembled = PETSC_TRUE; 118540192850SHong Zhang abt->Bt_den = Bt_dense; 11862128a86cSHong Zhang 11872128a86cSHong Zhang ierr = MatCreate(PETSC_COMM_SELF,&C_dense);CHKERRQ(ierr); 11882128a86cSHong Zhang ierr = MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors);CHKERRQ(ierr); 11892128a86cSHong Zhang ierr = MatSetType(C_dense,MATSEQDENSE);CHKERRQ(ierr); 11900298fd71SBarry Smith ierr = MatSeqDenseSetPreallocation(C_dense,NULL);CHKERRQ(ierr); 11912205254eSKarl Rupp 11922128a86cSHong Zhang Bt_dense->assembled = PETSC_TRUE; 119340192850SHong Zhang abt->ABt_den = C_dense; 1194f94ccd6cSHong Zhang 1195f94ccd6cSHong Zhang #if defined(PETSC_USE_INFO) 11961ce71dffSSatish Balay { 1197f94ccd6cSHong Zhang Mat_SeqAIJ *c = (Mat_SeqAIJ*)(*C)->data; 1198c40ebe3bSHong Zhang ierr = PetscInfo7(*C,"Use coloring of C=A*B^T; B^T: %D %D, Bt_dense: %D,%D; Cnz %D / (cm*ncolors %D) = %g\n",B->cmap->n,B->rmap->n,Bt_dense->rmap->n,Bt_dense->cmap->n,c->nz,A->rmap->n*matcoloring->ncolors,(PetscReal)(c->nz)/(A->rmap->n*matcoloring->ncolors));CHKERRQ(ierr); 11991ce71dffSSatish Balay } 1200f94ccd6cSHong Zhang #endif 12012128a86cSHong Zhang } 1202e99be685SHong Zhang /* clean up */ 1203e99be685SHong Zhang ierr = MatDestroy(&Bt);CHKERRQ(ierr); 1204e99be685SHong Zhang ierr = MatRestoreSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr); 12055df89d91SHong Zhang PetscFunctionReturn(0); 12065df89d91SHong Zhang } 12075df89d91SHong Zhang 12086fc122caSHong Zhang PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C) 12095df89d91SHong Zhang { 12105df89d91SHong Zhang PetscErrorCode ierr; 12115df89d91SHong Zhang Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data; 1212e2cac8caSJed Brown PetscInt *ai =a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow; 121381d82fe4SHong Zhang PetscInt cm =C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol; 12145df89d91SHong Zhang PetscLogDouble flops=0.0; 1215aa1aec99SHong Zhang MatScalar *aa =a->a,*aval,*ba=b->a,*bval,*ca,*cval; 121640192850SHong Zhang Mat_MatMatTransMult *abt = c->abt; 12175df89d91SHong Zhang 12185df89d91SHong Zhang PetscFunctionBegin; 121958ed3ceaSHong Zhang /* clear old values in C */ 122058ed3ceaSHong Zhang if (!c->a) { 1221854ce69bSBarry Smith ierr = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr); 122258ed3ceaSHong Zhang c->a = ca; 122358ed3ceaSHong Zhang c->free_a = PETSC_TRUE; 122458ed3ceaSHong Zhang } else { 122558ed3ceaSHong Zhang ca = c->a; 122658ed3ceaSHong Zhang } 122758ed3ceaSHong Zhang ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr); 122858ed3ceaSHong Zhang 122940192850SHong Zhang if (abt->usecoloring) { 123040192850SHong Zhang MatTransposeColoring matcoloring = abt->matcoloring; 123140192850SHong Zhang Mat Bt_dense,C_dense = abt->ABt_den; 1232c8db22f6SHong Zhang 1233b9af6bddSHong Zhang /* Get Bt_dense by Apply MatTransposeColoring to B */ 123440192850SHong Zhang Bt_dense = abt->Bt_den; 1235b9af6bddSHong Zhang ierr = MatTransColoringApplySpToDen(matcoloring,B,Bt_dense);CHKERRQ(ierr); 1236c8db22f6SHong Zhang 1237c8db22f6SHong Zhang /* C_dense = A*Bt_dense */ 12382128a86cSHong Zhang ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense);CHKERRQ(ierr); 1239c8db22f6SHong Zhang 12402128a86cSHong Zhang /* Recover C from C_dense */ 1241b9af6bddSHong Zhang ierr = MatTransColoringApplyDenToSp(matcoloring,C_dense,C);CHKERRQ(ierr); 1242c8db22f6SHong Zhang PetscFunctionReturn(0); 1243c8db22f6SHong Zhang } 1244c8db22f6SHong Zhang 12451fa1209cSHong Zhang for (i=0; i<cm; i++) { 124681d82fe4SHong Zhang anzi = ai[i+1] - ai[i]; 124781d82fe4SHong Zhang acol = aj + ai[i]; 12486973769bSHong Zhang aval = aa + ai[i]; 12491fa1209cSHong Zhang cnzi = ci[i+1] - ci[i]; 12501fa1209cSHong Zhang ccol = cj + ci[i]; 12516973769bSHong Zhang cval = ca + ci[i]; 12521fa1209cSHong Zhang for (j=0; j<cnzi; j++) { 125381d82fe4SHong Zhang brow = ccol[j]; 125481d82fe4SHong Zhang bnzj = bi[brow+1] - bi[brow]; 125581d82fe4SHong Zhang bcol = bj + bi[brow]; 12566973769bSHong Zhang bval = ba + bi[brow]; 12576973769bSHong Zhang 125881d82fe4SHong Zhang /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */ 125981d82fe4SHong Zhang nexta = 0; nextb = 0; 126081d82fe4SHong Zhang while (nexta<anzi && nextb<bnzj) { 12617b6d5e96SMark Adams while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++; 126281d82fe4SHong Zhang if (nexta == anzi) break; 12637b6d5e96SMark Adams while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++; 126481d82fe4SHong Zhang if (nextb == bnzj) break; 126581d82fe4SHong Zhang if (acol[nexta] == bcol[nextb]) { 12666973769bSHong Zhang cval[j] += aval[nexta]*bval[nextb]; 126781d82fe4SHong Zhang nexta++; nextb++; 126881d82fe4SHong Zhang flops += 2; 12691fa1209cSHong Zhang } 12701fa1209cSHong Zhang } 127181d82fe4SHong Zhang } 127281d82fe4SHong Zhang } 127381d82fe4SHong Zhang ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 127481d82fe4SHong Zhang ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 127581d82fe4SHong Zhang ierr = PetscLogFlops(flops);CHKERRQ(ierr); 12765df89d91SHong Zhang PetscFunctionReturn(0); 12775df89d91SHong Zhang } 12785df89d91SHong Zhang 12796d373c3eSHong Zhang PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(Mat A) 12806d373c3eSHong Zhang { 12816d373c3eSHong Zhang PetscErrorCode ierr; 12826d373c3eSHong Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 12836d373c3eSHong Zhang Mat_MatTransMatMult *atb = a->atb; 12846d373c3eSHong Zhang 12856d373c3eSHong Zhang PetscFunctionBegin; 12866d373c3eSHong Zhang ierr = MatDestroy(&atb->At);CHKERRQ(ierr); 12876d373c3eSHong Zhang ierr = (atb->destroy)(A);CHKERRQ(ierr); 12886d373c3eSHong Zhang ierr = PetscFree(atb);CHKERRQ(ierr); 12896d373c3eSHong Zhang PetscFunctionReturn(0); 12906d373c3eSHong Zhang } 12916d373c3eSHong Zhang 12920adebc6cSBarry Smith PetscErrorCode MatTransposeMatMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C) 12930adebc6cSBarry Smith { 12945df89d91SHong Zhang PetscErrorCode ierr; 12956d373c3eSHong Zhang const char *algTypes[2] = {"matmatmult","outerproduct"}; 12966d373c3eSHong Zhang PetscInt alg=0; /* set default algorithm */ 12976d373c3eSHong Zhang Mat At; 12986d373c3eSHong Zhang Mat_MatTransMatMult *atb; 12996d373c3eSHong Zhang Mat_SeqAIJ *c; 13005df89d91SHong Zhang 13015df89d91SHong Zhang PetscFunctionBegin; 13025df89d91SHong Zhang if (scall == MAT_INITIAL_MATRIX) { 13036d373c3eSHong Zhang ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr); 13046d373c3eSHong Zhang PetscOptionsObject->alreadyprinted = PETSC_FALSE; /* a hack to ensure the option shows in '-help' */ 13056d373c3eSHong Zhang ierr = PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,2,algTypes[0],&alg,NULL);CHKERRQ(ierr); 13066d373c3eSHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 13076d373c3eSHong Zhang 13086d373c3eSHong Zhang switch (alg) { 13096d373c3eSHong Zhang case 1: 131075648e8dSHong Zhang ierr = MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr); 13116d373c3eSHong Zhang break; 13126d373c3eSHong Zhang default: 13136d373c3eSHong Zhang ierr = PetscNew(&atb);CHKERRQ(ierr); 13146d373c3eSHong Zhang ierr = MatTranspose_SeqAIJ(A,MAT_INITIAL_MATRIX,&At);CHKERRQ(ierr); 13156d373c3eSHong Zhang ierr = MatMatMult_SeqAIJ_SeqAIJ(At,B,MAT_INITIAL_MATRIX,fill,C);CHKERRQ(ierr); 13166d373c3eSHong Zhang 1317618cf492SHong Zhang c = (Mat_SeqAIJ*)(*C)->data; 13186d373c3eSHong Zhang c->atb = atb; 13196d373c3eSHong Zhang atb->At = At; 13206d373c3eSHong Zhang atb->destroy = (*C)->ops->destroy; 13216d373c3eSHong Zhang (*C)->ops->destroy = MatDestroy_SeqAIJ_MatTransMatMult; 13226d373c3eSHong Zhang 13236d373c3eSHong Zhang break; 13245df89d91SHong Zhang } 13256d373c3eSHong Zhang } 13266d373c3eSHong Zhang if (alg) { 13276d373c3eSHong Zhang ierr = (*(*C)->ops->mattransposemultnumeric)(A,B,*C);CHKERRQ(ierr); 13286d373c3eSHong Zhang } else if (!alg && scall == MAT_REUSE_MATRIX) { 13296d373c3eSHong Zhang c = (Mat_SeqAIJ*)(*C)->data; 13306d373c3eSHong Zhang atb = c->atb; 13316d373c3eSHong Zhang At = atb->At; 13326d373c3eSHong Zhang ierr = MatTranspose_SeqAIJ(A,MAT_REUSE_MATRIX,&At);CHKERRQ(ierr); 13336d373c3eSHong Zhang ierr = MatMatMult_SeqAIJ_SeqAIJ(At,B,MAT_REUSE_MATRIX,fill,C);CHKERRQ(ierr); 13346d373c3eSHong Zhang } 13355df89d91SHong Zhang PetscFunctionReturn(0); 13365df89d91SHong Zhang } 13375df89d91SHong Zhang 133875648e8dSHong Zhang PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C) 13395df89d91SHong Zhang { 1340bc011b1eSHong Zhang PetscErrorCode ierr; 1341bc011b1eSHong Zhang Mat At; 134238baddfdSBarry Smith PetscInt *ati,*atj; 1343bc011b1eSHong Zhang 1344bc011b1eSHong Zhang PetscFunctionBegin; 1345bc011b1eSHong Zhang /* create symbolic At */ 1346bc011b1eSHong Zhang ierr = MatGetSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr); 13470298fd71SBarry Smith ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,A->cmap->n,A->rmap->n,ati,atj,NULL,&At);CHKERRQ(ierr); 134833d57670SJed Brown ierr = MatSetBlockSizes(At,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr); 134904b858e0SBarry Smith ierr = MatSetType(At,((PetscObject)A)->type_name);CHKERRQ(ierr); 1350bc011b1eSHong Zhang 1351bc011b1eSHong Zhang /* get symbolic C=At*B */ 1352bc011b1eSHong Zhang ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(At,B,fill,C);CHKERRQ(ierr); 1353bc011b1eSHong Zhang 1354bc011b1eSHong Zhang /* clean up */ 13556bf464f9SBarry Smith ierr = MatDestroy(&At);CHKERRQ(ierr); 1356bc011b1eSHong Zhang ierr = MatRestoreSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr); 13576d373c3eSHong Zhang 13586d373c3eSHong Zhang (*C)->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; 1359bc011b1eSHong Zhang PetscFunctionReturn(0); 1360bc011b1eSHong Zhang } 1361bc011b1eSHong Zhang 136275648e8dSHong Zhang PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C) 1363bc011b1eSHong Zhang { 1364bc011b1eSHong Zhang PetscErrorCode ierr; 13650fbc74f4SHong Zhang Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data; 1366d0f46423SBarry Smith PetscInt am =A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb; 1367d0f46423SBarry Smith PetscInt cm =C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k; 1368d13dce4bSSatish Balay PetscLogDouble flops=0.0; 1369aa1aec99SHong Zhang MatScalar *aa =a->a,*ba,*ca,*caj; 1370bc011b1eSHong Zhang 1371bc011b1eSHong Zhang PetscFunctionBegin; 1372aa1aec99SHong Zhang if (!c->a) { 1373854ce69bSBarry Smith ierr = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr); 13742205254eSKarl Rupp 1375aa1aec99SHong Zhang c->a = ca; 1376aa1aec99SHong Zhang c->free_a = PETSC_TRUE; 1377aa1aec99SHong Zhang } else { 1378aa1aec99SHong Zhang ca = c->a; 1379aa1aec99SHong Zhang } 1380bc011b1eSHong Zhang /* clear old values in C */ 1381bc011b1eSHong Zhang ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr); 1382bc011b1eSHong Zhang 1383bc011b1eSHong Zhang /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */ 1384bc011b1eSHong Zhang for (i=0; i<am; i++) { 1385bc011b1eSHong Zhang bj = b->j + bi[i]; 1386bc011b1eSHong Zhang ba = b->a + bi[i]; 1387bc011b1eSHong Zhang bnzi = bi[i+1] - bi[i]; 1388bc011b1eSHong Zhang anzi = ai[i+1] - ai[i]; 1389bc011b1eSHong Zhang for (j=0; j<anzi; j++) { 1390bc011b1eSHong Zhang nextb = 0; 13910fbc74f4SHong Zhang crow = *aj++; 1392bc011b1eSHong Zhang cjj = cj + ci[crow]; 1393bc011b1eSHong Zhang caj = ca + ci[crow]; 1394bc011b1eSHong Zhang /* perform sparse axpy operation. Note cjj includes bj. */ 1395bc011b1eSHong Zhang for (k=0; nextb<bnzi; k++) { 13960fbc74f4SHong Zhang if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */ 13970fbc74f4SHong Zhang caj[k] += (*aa)*(*(ba+nextb)); 1398bc011b1eSHong Zhang nextb++; 1399bc011b1eSHong Zhang } 1400bc011b1eSHong Zhang } 1401bc011b1eSHong Zhang flops += 2*bnzi; 14020fbc74f4SHong Zhang aa++; 1403bc011b1eSHong Zhang } 1404bc011b1eSHong Zhang } 1405bc011b1eSHong Zhang 1406bc011b1eSHong Zhang /* Assemble the final matrix and clean up */ 1407bc011b1eSHong Zhang ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1408bc011b1eSHong Zhang ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1409bc011b1eSHong Zhang ierr = PetscLogFlops(flops);CHKERRQ(ierr); 1410bc011b1eSHong Zhang PetscFunctionReturn(0); 1411bc011b1eSHong Zhang } 14129123193aSHong Zhang 1413150d2497SBarry Smith PETSC_INTERN PetscErrorCode MatMatMult_SeqAIJ_SeqDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C) 14149123193aSHong Zhang { 14159123193aSHong Zhang PetscErrorCode ierr; 14169123193aSHong Zhang 14179123193aSHong Zhang PetscFunctionBegin; 14189123193aSHong Zhang if (scall == MAT_INITIAL_MATRIX) { 14193ff4c91cSHong Zhang ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr); 14209123193aSHong Zhang ierr = MatMatMultSymbolic_SeqAIJ_SeqDense(A,B,fill,C);CHKERRQ(ierr); 14213ff4c91cSHong Zhang ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr); 14229123193aSHong Zhang } 14233ff4c91cSHong Zhang ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr); 14249123193aSHong Zhang ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,B,*C);CHKERRQ(ierr); 14254614e006SHong Zhang ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr); 14269123193aSHong Zhang PetscFunctionReturn(0); 14279123193aSHong Zhang } 1428edd81eecSMatthew Knepley 14299123193aSHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat *C) 14309123193aSHong Zhang { 14319123193aSHong Zhang PetscErrorCode ierr; 14329123193aSHong Zhang 14339123193aSHong Zhang PetscFunctionBegin; 14345a586d82SBarry Smith ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr); 14352205254eSKarl Rupp 1436d73949e8SHong Zhang (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense; 14379123193aSHong Zhang PetscFunctionReturn(0); 14389123193aSHong Zhang } 14399123193aSHong Zhang 14409123193aSHong Zhang PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C) 14419123193aSHong Zhang { 1442f32d5d43SBarry Smith Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 1443612438f1SStefano Zampini Mat_SeqDense *bd = (Mat_SeqDense*)B->data; 14449123193aSHong Zhang PetscErrorCode ierr; 1445daf57211SHong Zhang PetscScalar *c,*b,r1,r2,r3,r4,*c1,*c2,*c3,*c4,aatmp; 1446daf57211SHong Zhang const PetscScalar *aa,*b1,*b2,*b3,*b4; 1447daf57211SHong Zhang const PetscInt *aj; 1448612438f1SStefano Zampini PetscInt cm=C->rmap->n,cn=B->cmap->n,bm=bd->lda,am=A->rmap->n; 1449daf57211SHong Zhang PetscInt am4=4*am,bm4=4*bm,col,i,j,n,ajtmp; 14509123193aSHong Zhang 14519123193aSHong Zhang PetscFunctionBegin; 1452f32d5d43SBarry Smith if (!cm || !cn) PetscFunctionReturn(0); 1453612438f1SStefano Zampini if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n); 1454e32f2f54SBarry Smith if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n); 1455e32f2f54SBarry Smith if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n); 1456612438f1SStefano Zampini b = bd->v; 14578c778c55SBarry Smith ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr); 1458f32d5d43SBarry Smith b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm; 1459730858b9SHong Zhang c1 = c; c2 = c1 + am; c3 = c2 + am; c4 = c3 + am; 1460f32d5d43SBarry Smith for (col=0; col<cn-4; col += 4) { /* over columns of C */ 1461f32d5d43SBarry Smith for (i=0; i<am; i++) { /* over rows of C in those columns */ 1462f32d5d43SBarry Smith r1 = r2 = r3 = r4 = 0.0; 1463f32d5d43SBarry Smith n = a->i[i+1] - a->i[i]; 1464f32d5d43SBarry Smith aj = a->j + a->i[i]; 1465f32d5d43SBarry Smith aa = a->a + a->i[i]; 1466f32d5d43SBarry Smith for (j=0; j<n; j++) { 1467730858b9SHong Zhang aatmp = aa[j]; ajtmp = aj[j]; 1468730858b9SHong Zhang r1 += aatmp*b1[ajtmp]; 1469730858b9SHong Zhang r2 += aatmp*b2[ajtmp]; 1470730858b9SHong Zhang r3 += aatmp*b3[ajtmp]; 1471730858b9SHong Zhang r4 += aatmp*b4[ajtmp]; 14729123193aSHong Zhang } 1473730858b9SHong Zhang c1[i] = r1; 1474730858b9SHong Zhang c2[i] = r2; 1475730858b9SHong Zhang c3[i] = r3; 1476730858b9SHong Zhang c4[i] = r4; 1477f32d5d43SBarry Smith } 1478730858b9SHong Zhang b1 += bm4; b2 += bm4; b3 += bm4; b4 += bm4; 1479730858b9SHong Zhang c1 += am4; c2 += am4; c3 += am4; c4 += am4; 1480f32d5d43SBarry Smith } 1481f32d5d43SBarry Smith for (; col<cn; col++) { /* over extra columns of C */ 1482f32d5d43SBarry Smith for (i=0; i<am; i++) { /* over rows of C in those columns */ 1483f32d5d43SBarry Smith r1 = 0.0; 1484f32d5d43SBarry Smith n = a->i[i+1] - a->i[i]; 1485f32d5d43SBarry Smith aj = a->j + a->i[i]; 1486f32d5d43SBarry Smith aa = a->a + a->i[i]; 1487f32d5d43SBarry Smith for (j=0; j<n; j++) { 1488730858b9SHong Zhang r1 += aa[j]*b1[aj[j]]; 1489f32d5d43SBarry Smith } 1490730858b9SHong Zhang c1[i] = r1; 1491f32d5d43SBarry Smith } 1492f32d5d43SBarry Smith b1 += bm; 1493730858b9SHong Zhang c1 += am; 1494f32d5d43SBarry Smith } 1495dc0b31edSSatish Balay ierr = PetscLogFlops(cn*(2.0*a->nz));CHKERRQ(ierr); 14968c778c55SBarry Smith ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr); 1497f32d5d43SBarry Smith ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1498f32d5d43SBarry Smith ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1499f32d5d43SBarry Smith PetscFunctionReturn(0); 1500f32d5d43SBarry Smith } 1501f32d5d43SBarry Smith 1502f32d5d43SBarry Smith /* 15034324174eSBarry Smith Note very similar to MatMult_SeqAIJ(), should generate both codes from same base 1504f32d5d43SBarry Smith */ 1505f32d5d43SBarry Smith PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C) 1506f32d5d43SBarry Smith { 1507f32d5d43SBarry Smith Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 150890f5ea3eSStefano Zampini Mat_SeqDense *bd = (Mat_SeqDense*)B->data; 1509f32d5d43SBarry Smith PetscErrorCode ierr; 1510dd6ea824SBarry Smith PetscScalar *b,*c,r1,r2,r3,r4,*b1,*b2,*b3,*b4; 1511dd6ea824SBarry Smith MatScalar *aa; 151290f5ea3eSStefano Zampini PetscInt cm = C->rmap->n, cn=B->cmap->n, bm=bd->lda, col, i,j,n,*aj, am = A->rmap->n,*ii,arm; 15134324174eSBarry Smith PetscInt am2 = 2*am, am3 = 3*am, bm4 = 4*bm,colam,*ridx; 1514f32d5d43SBarry Smith 1515f32d5d43SBarry Smith PetscFunctionBegin; 1516f32d5d43SBarry Smith if (!cm || !cn) PetscFunctionReturn(0); 151790f5ea3eSStefano Zampini b = bd->v; 15188c778c55SBarry Smith ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr); 1519f32d5d43SBarry Smith b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm; 15204324174eSBarry Smith 15214324174eSBarry Smith if (a->compressedrow.use) { /* use compressed row format */ 15224324174eSBarry Smith for (col=0; col<cn-4; col += 4) { /* over columns of C */ 15234324174eSBarry Smith colam = col*am; 15244324174eSBarry Smith arm = a->compressedrow.nrows; 15254324174eSBarry Smith ii = a->compressedrow.i; 15264324174eSBarry Smith ridx = a->compressedrow.rindex; 15274324174eSBarry Smith for (i=0; i<arm; i++) { /* over rows of C in those columns */ 15284324174eSBarry Smith r1 = r2 = r3 = r4 = 0.0; 15294324174eSBarry Smith n = ii[i+1] - ii[i]; 15304324174eSBarry Smith aj = a->j + ii[i]; 15314324174eSBarry Smith aa = a->a + ii[i]; 15324324174eSBarry Smith for (j=0; j<n; j++) { 15334324174eSBarry Smith r1 += (*aa)*b1[*aj]; 15344324174eSBarry Smith r2 += (*aa)*b2[*aj]; 15354324174eSBarry Smith r3 += (*aa)*b3[*aj]; 15364324174eSBarry Smith r4 += (*aa++)*b4[*aj++]; 15374324174eSBarry Smith } 15384324174eSBarry Smith c[colam + ridx[i]] += r1; 15394324174eSBarry Smith c[colam + am + ridx[i]] += r2; 15404324174eSBarry Smith c[colam + am2 + ridx[i]] += r3; 15414324174eSBarry Smith c[colam + am3 + ridx[i]] += r4; 15424324174eSBarry Smith } 15434324174eSBarry Smith b1 += bm4; 15444324174eSBarry Smith b2 += bm4; 15454324174eSBarry Smith b3 += bm4; 15464324174eSBarry Smith b4 += bm4; 15474324174eSBarry Smith } 15484324174eSBarry Smith for (; col<cn; col++) { /* over extra columns of C */ 15494324174eSBarry Smith colam = col*am; 15504324174eSBarry Smith arm = a->compressedrow.nrows; 15514324174eSBarry Smith ii = a->compressedrow.i; 15524324174eSBarry Smith ridx = a->compressedrow.rindex; 15534324174eSBarry Smith for (i=0; i<arm; i++) { /* over rows of C in those columns */ 15544324174eSBarry Smith r1 = 0.0; 15554324174eSBarry Smith n = ii[i+1] - ii[i]; 15564324174eSBarry Smith aj = a->j + ii[i]; 15574324174eSBarry Smith aa = a->a + ii[i]; 15584324174eSBarry Smith 15594324174eSBarry Smith for (j=0; j<n; j++) { 15604324174eSBarry Smith r1 += (*aa++)*b1[*aj++]; 15614324174eSBarry Smith } 1562a2ea699eSBarry Smith c[colam + ridx[i]] += r1; 15634324174eSBarry Smith } 15644324174eSBarry Smith b1 += bm; 15654324174eSBarry Smith } 15664324174eSBarry Smith } else { 1567f32d5d43SBarry Smith for (col=0; col<cn-4; col += 4) { /* over columns of C */ 1568f32d5d43SBarry Smith colam = col*am; 1569f32d5d43SBarry Smith for (i=0; i<am; i++) { /* over rows of C in those columns */ 1570f32d5d43SBarry Smith r1 = r2 = r3 = r4 = 0.0; 1571f32d5d43SBarry Smith n = a->i[i+1] - a->i[i]; 1572f32d5d43SBarry Smith aj = a->j + a->i[i]; 1573f32d5d43SBarry Smith aa = a->a + a->i[i]; 1574f32d5d43SBarry Smith for (j=0; j<n; j++) { 1575f32d5d43SBarry Smith r1 += (*aa)*b1[*aj]; 1576f32d5d43SBarry Smith r2 += (*aa)*b2[*aj]; 1577f32d5d43SBarry Smith r3 += (*aa)*b3[*aj]; 1578f32d5d43SBarry Smith r4 += (*aa++)*b4[*aj++]; 1579f32d5d43SBarry Smith } 1580f32d5d43SBarry Smith c[colam + i] += r1; 1581f32d5d43SBarry Smith c[colam + am + i] += r2; 1582f32d5d43SBarry Smith c[colam + am2 + i] += r3; 1583f32d5d43SBarry Smith c[colam + am3 + i] += r4; 1584f32d5d43SBarry Smith } 1585f32d5d43SBarry Smith b1 += bm4; 1586f32d5d43SBarry Smith b2 += bm4; 1587f32d5d43SBarry Smith b3 += bm4; 1588f32d5d43SBarry Smith b4 += bm4; 1589f32d5d43SBarry Smith } 1590f32d5d43SBarry Smith for (; col<cn; col++) { /* over extra columns of C */ 1591a2ea699eSBarry Smith colam = col*am; 1592f32d5d43SBarry Smith for (i=0; i<am; i++) { /* over rows of C in those columns */ 1593f32d5d43SBarry Smith r1 = 0.0; 1594f32d5d43SBarry Smith n = a->i[i+1] - a->i[i]; 1595f32d5d43SBarry Smith aj = a->j + a->i[i]; 1596f32d5d43SBarry Smith aa = a->a + a->i[i]; 1597f32d5d43SBarry Smith 1598f32d5d43SBarry Smith for (j=0; j<n; j++) { 1599f32d5d43SBarry Smith r1 += (*aa++)*b1[*aj++]; 1600f32d5d43SBarry Smith } 1601a2ea699eSBarry Smith c[colam + i] += r1; 1602f32d5d43SBarry Smith } 1603f32d5d43SBarry Smith b1 += bm; 1604f32d5d43SBarry Smith } 16054324174eSBarry Smith } 1606dc0b31edSSatish Balay ierr = PetscLogFlops(cn*2.0*a->nz);CHKERRQ(ierr); 16078c778c55SBarry Smith ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr); 16089123193aSHong Zhang PetscFunctionReturn(0); 16099123193aSHong Zhang } 1610b1683b59SHong Zhang 1611b9af6bddSHong Zhang PetscErrorCode MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense) 1612c8db22f6SHong Zhang { 1613c8db22f6SHong Zhang PetscErrorCode ierr; 16142f5f1f90SHong Zhang Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 16152f5f1f90SHong Zhang Mat_SeqDense *btdense = (Mat_SeqDense*)Btdense->data; 16162f5f1f90SHong Zhang PetscInt *bi = b->i,*bj=b->j; 16172f5f1f90SHong Zhang PetscInt m = Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns; 16182f5f1f90SHong Zhang MatScalar *btval,*btval_den,*ba=b->a; 16192f5f1f90SHong Zhang PetscInt *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors; 1620c8db22f6SHong Zhang 1621c8db22f6SHong Zhang PetscFunctionBegin; 16222f5f1f90SHong Zhang btval_den=btdense->v; 16232f5f1f90SHong Zhang ierr = PetscMemzero(btval_den,(m*n)*sizeof(MatScalar));CHKERRQ(ierr); 16242f5f1f90SHong Zhang for (k=0; k<ncolors; k++) { 16252f5f1f90SHong Zhang ncolumns = coloring->ncolumns[k]; 16262f5f1f90SHong Zhang for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */ 1627d2853540SHong Zhang col = *(columns + colorforcol[k] + l); 16282f5f1f90SHong Zhang btcol = bj + bi[col]; 16292f5f1f90SHong Zhang btval = ba + bi[col]; 16302f5f1f90SHong Zhang anz = bi[col+1] - bi[col]; 1631d2853540SHong Zhang for (j=0; j<anz; j++) { 16322f5f1f90SHong Zhang brow = btcol[j]; 16332f5f1f90SHong Zhang btval_den[brow] = btval[j]; 1634c8db22f6SHong Zhang } 1635c8db22f6SHong Zhang } 16362f5f1f90SHong Zhang btval_den += m; 1637c8db22f6SHong Zhang } 1638c8db22f6SHong Zhang PetscFunctionReturn(0); 1639c8db22f6SHong Zhang } 1640c8db22f6SHong Zhang 1641b9af6bddSHong Zhang PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp) 16428972f759SHong Zhang { 16438972f759SHong Zhang PetscErrorCode ierr; 1644b2d2b04fSHong Zhang Mat_SeqAIJ *csp = (Mat_SeqAIJ*)Csp->data; 1645077f23c2SHong Zhang PetscScalar *ca_den,*ca_den_ptr,*ca=csp->a; 1646f99a636bSHong Zhang PetscInt k,l,m=Cden->rmap->n,ncolors=matcoloring->ncolors; 1647e88777f2SHong Zhang PetscInt brows=matcoloring->brows,*den2sp=matcoloring->den2sp; 1648077f23c2SHong Zhang PetscInt nrows,*row,*idx; 1649077f23c2SHong Zhang PetscInt *rows=matcoloring->rows,*colorforrow=matcoloring->colorforrow; 16508972f759SHong Zhang 16518972f759SHong Zhang PetscFunctionBegin; 1652a3fe58edSHong Zhang ierr = MatDenseGetArray(Cden,&ca_den);CHKERRQ(ierr); 1653f99a636bSHong Zhang 1654077f23c2SHong Zhang if (brows > 0) { 1655077f23c2SHong Zhang PetscInt *lstart,row_end,row_start; 1656980ae229SHong Zhang lstart = matcoloring->lstart; 1657eeb4fd02SHong Zhang ierr = PetscMemzero(lstart,ncolors*sizeof(PetscInt));CHKERRQ(ierr); 1658eeb4fd02SHong Zhang 1659077f23c2SHong Zhang row_end = brows; 1660eeb4fd02SHong Zhang if (row_end > m) row_end = m; 1661077f23c2SHong Zhang for (row_start=0; row_start<m; row_start+=brows) { /* loop over row blocks of Csp */ 1662077f23c2SHong Zhang ca_den_ptr = ca_den; 1663980ae229SHong Zhang for (k=0; k<ncolors; k++) { /* loop over colors (columns of Cden) */ 1664eeb4fd02SHong Zhang nrows = matcoloring->nrows[k]; 1665eeb4fd02SHong Zhang row = rows + colorforrow[k]; 1666eeb4fd02SHong Zhang idx = den2sp + colorforrow[k]; 1667eeb4fd02SHong Zhang for (l=lstart[k]; l<nrows; l++) { 1668eeb4fd02SHong Zhang if (row[l] >= row_end) { 1669eeb4fd02SHong Zhang lstart[k] = l; 1670eeb4fd02SHong Zhang break; 1671eeb4fd02SHong Zhang } else { 1672077f23c2SHong Zhang ca[idx[l]] = ca_den_ptr[row[l]]; 1673eeb4fd02SHong Zhang } 1674eeb4fd02SHong Zhang } 1675077f23c2SHong Zhang ca_den_ptr += m; 1676eeb4fd02SHong Zhang } 1677077f23c2SHong Zhang row_end += brows; 1678eeb4fd02SHong Zhang if (row_end > m) row_end = m; 1679eeb4fd02SHong Zhang } 1680077f23c2SHong Zhang } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */ 1681077f23c2SHong Zhang ca_den_ptr = ca_den; 1682077f23c2SHong Zhang for (k=0; k<ncolors; k++) { 1683077f23c2SHong Zhang nrows = matcoloring->nrows[k]; 1684077f23c2SHong Zhang row = rows + colorforrow[k]; 1685077f23c2SHong Zhang idx = den2sp + colorforrow[k]; 1686077f23c2SHong Zhang for (l=0; l<nrows; l++) { 1687077f23c2SHong Zhang ca[idx[l]] = ca_den_ptr[row[l]]; 1688077f23c2SHong Zhang } 1689077f23c2SHong Zhang ca_den_ptr += m; 1690077f23c2SHong Zhang } 1691f99a636bSHong Zhang } 1692f99a636bSHong Zhang 1693a3fe58edSHong Zhang ierr = MatDenseRestoreArray(Cden,&ca_den);CHKERRQ(ierr); 1694f99a636bSHong Zhang #if defined(PETSC_USE_INFO) 1695077f23c2SHong Zhang if (matcoloring->brows > 0) { 1696f99a636bSHong Zhang ierr = PetscInfo1(Csp,"Loop over %D row blocks for den2sp\n",brows);CHKERRQ(ierr); 1697e88777f2SHong Zhang } else { 1698077f23c2SHong Zhang ierr = PetscInfo(Csp,"Loop over colors/columns of Cden, inefficient for large sparse matrix product \n");CHKERRQ(ierr); 1699e88777f2SHong Zhang } 1700f99a636bSHong Zhang #endif 17018972f759SHong Zhang PetscFunctionReturn(0); 17028972f759SHong Zhang } 17038972f759SHong Zhang 1704b9af6bddSHong Zhang PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c) 1705b1683b59SHong Zhang { 1706b1683b59SHong Zhang PetscErrorCode ierr; 1707e88777f2SHong Zhang PetscInt i,n,nrows,Nbs,j,k,m,ncols,col,cm; 17081a83f524SJed Brown const PetscInt *is,*ci,*cj,*row_idx; 1709b28d80bdSHong Zhang PetscInt nis = iscoloring->n,*rowhit,bs = 1; 1710b1683b59SHong Zhang IS *isa; 1711b28d80bdSHong Zhang Mat_SeqAIJ *csp = (Mat_SeqAIJ*)mat->data; 1712e88777f2SHong Zhang PetscInt *colorforrow,*rows,*rows_i,*idxhit,*spidx,*den2sp,*den2sp_i; 1713e88777f2SHong Zhang PetscInt *colorforcol,*columns,*columns_i,brows; 1714e88777f2SHong Zhang PetscBool flg; 1715b1683b59SHong Zhang 1716b1683b59SHong Zhang PetscFunctionBegin; 1717b1683b59SHong Zhang ierr = ISColoringGetIS(iscoloring,PETSC_IGNORE,&isa);CHKERRQ(ierr); 1718e99be685SHong Zhang 17197ecccc15SHong Zhang /* bs >1 is not being tested yet! */ 1720e88777f2SHong Zhang Nbs = mat->cmap->N/bs; 1721b1683b59SHong Zhang c->M = mat->rmap->N/bs; /* set total rows, columns and local rows */ 1722e88777f2SHong Zhang c->N = Nbs; 1723e88777f2SHong Zhang c->m = c->M; 1724b1683b59SHong Zhang c->rstart = 0; 1725e88777f2SHong Zhang c->brows = 100; 1726b1683b59SHong Zhang 1727b1683b59SHong Zhang c->ncolors = nis; 1728dcca6d9dSJed Brown ierr = PetscMalloc3(nis,&c->ncolumns,nis,&c->nrows,nis+1,&colorforrow);CHKERRQ(ierr); 1729854ce69bSBarry Smith ierr = PetscMalloc1(csp->nz+1,&rows);CHKERRQ(ierr); 1730854ce69bSBarry Smith ierr = PetscMalloc1(csp->nz+1,&den2sp);CHKERRQ(ierr); 1731e88777f2SHong Zhang 1732e88777f2SHong Zhang brows = c->brows; 1733c5929fdfSBarry Smith ierr = PetscOptionsGetInt(NULL,NULL,"-matden2sp_brows",&brows,&flg);CHKERRQ(ierr); 1734e88777f2SHong Zhang if (flg) c->brows = brows; 1735eeb4fd02SHong Zhang if (brows > 0) { 1736854ce69bSBarry Smith ierr = PetscMalloc1(nis+1,&c->lstart);CHKERRQ(ierr); 1737e88777f2SHong Zhang } 17382205254eSKarl Rupp 1739d2853540SHong Zhang colorforrow[0] = 0; 1740d2853540SHong Zhang rows_i = rows; 1741f99a636bSHong Zhang den2sp_i = den2sp; 1742b1683b59SHong Zhang 1743854ce69bSBarry Smith ierr = PetscMalloc1(nis+1,&colorforcol);CHKERRQ(ierr); 1744854ce69bSBarry Smith ierr = PetscMalloc1(Nbs+1,&columns);CHKERRQ(ierr); 17452205254eSKarl Rupp 1746d2853540SHong Zhang colorforcol[0] = 0; 1747d2853540SHong Zhang columns_i = columns; 1748d2853540SHong Zhang 1749077f23c2SHong Zhang /* get column-wise storage of mat */ 1750077f23c2SHong Zhang ierr = MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr); 1751b1683b59SHong Zhang 1752b28d80bdSHong Zhang cm = c->m; 1753854ce69bSBarry Smith ierr = PetscMalloc1(cm+1,&rowhit);CHKERRQ(ierr); 1754854ce69bSBarry Smith ierr = PetscMalloc1(cm+1,&idxhit);CHKERRQ(ierr); 1755f99a636bSHong Zhang for (i=0; i<nis; i++) { /* loop over color */ 1756b1683b59SHong Zhang ierr = ISGetLocalSize(isa[i],&n);CHKERRQ(ierr); 1757b1683b59SHong Zhang ierr = ISGetIndices(isa[i],&is);CHKERRQ(ierr); 17582205254eSKarl Rupp 1759b1683b59SHong Zhang c->ncolumns[i] = n; 1760b1683b59SHong Zhang if (n) { 1761d2853540SHong Zhang ierr = PetscMemcpy(columns_i,is,n*sizeof(PetscInt));CHKERRQ(ierr); 1762b1683b59SHong Zhang } 1763d2853540SHong Zhang colorforcol[i+1] = colorforcol[i] + n; 1764d2853540SHong Zhang columns_i += n; 1765b1683b59SHong Zhang 1766b1683b59SHong Zhang /* fast, crude version requires O(N*N) work */ 1767e8354b3eSHong Zhang ierr = PetscMemzero(rowhit,cm*sizeof(PetscInt));CHKERRQ(ierr); 1768e99be685SHong Zhang 1769b7caf3d6SHong Zhang for (j=0; j<n; j++) { /* loop over columns*/ 1770b1683b59SHong Zhang col = is[j]; 1771d2853540SHong Zhang row_idx = cj + ci[col]; 1772b1683b59SHong Zhang m = ci[col+1] - ci[col]; 1773b7caf3d6SHong Zhang for (k=0; k<m; k++) { /* loop over columns marking them in rowhit */ 1774e99be685SHong Zhang idxhit[*row_idx] = spidx[ci[col] + k]; 1775d2853540SHong Zhang rowhit[*row_idx++] = col + 1; 1776b1683b59SHong Zhang } 1777b1683b59SHong Zhang } 1778b1683b59SHong Zhang /* count the number of hits */ 1779b1683b59SHong Zhang nrows = 0; 1780e8354b3eSHong Zhang for (j=0; j<cm; j++) { 1781b1683b59SHong Zhang if (rowhit[j]) nrows++; 1782b1683b59SHong Zhang } 1783b1683b59SHong Zhang c->nrows[i] = nrows; 1784d2853540SHong Zhang colorforrow[i+1] = colorforrow[i] + nrows; 1785d2853540SHong Zhang 1786b1683b59SHong Zhang nrows = 0; 1787b7caf3d6SHong Zhang for (j=0; j<cm; j++) { /* loop over rows */ 1788b1683b59SHong Zhang if (rowhit[j]) { 1789d2853540SHong Zhang rows_i[nrows] = j; 179012b89a43SHong Zhang den2sp_i[nrows] = idxhit[j]; 1791b1683b59SHong Zhang nrows++; 1792b1683b59SHong Zhang } 1793b1683b59SHong Zhang } 1794e88777f2SHong Zhang den2sp_i += nrows; 1795077f23c2SHong Zhang 1796b1683b59SHong Zhang ierr = ISRestoreIndices(isa[i],&is);CHKERRQ(ierr); 1797f99a636bSHong Zhang rows_i += nrows; 1798b1683b59SHong Zhang } 17990298fd71SBarry Smith ierr = MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr); 1800b28d80bdSHong Zhang ierr = PetscFree(rowhit);CHKERRQ(ierr); 1801b1683b59SHong Zhang ierr = ISColoringRestoreIS(iscoloring,&isa);CHKERRQ(ierr); 1802d2853540SHong Zhang if (csp->nz != colorforrow[nis]) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"csp->nz %d != colorforrow[nis] %d",csp->nz,colorforrow[nis]); 1803b28d80bdSHong Zhang 1804d2853540SHong Zhang c->colorforrow = colorforrow; 1805d2853540SHong Zhang c->rows = rows; 1806f99a636bSHong Zhang c->den2sp = den2sp; 1807d2853540SHong Zhang c->colorforcol = colorforcol; 1808d2853540SHong Zhang c->columns = columns; 18092205254eSKarl Rupp 1810f94ccd6cSHong Zhang ierr = PetscFree(idxhit);CHKERRQ(ierr); 1811b1683b59SHong Zhang PetscFunctionReturn(0); 1812b1683b59SHong Zhang } 1813d013fc79Sandi selinger 181473b67375Sandi selinger /* This algorithm combines the symbolic and numeric phase of matrix-matrix multiplication. */ 1815d013fc79Sandi selinger PetscErrorCode MatMatMult_SeqAIJ_SeqAIJ_Combined(Mat A,Mat B,PetscReal fill,Mat *C) 1816d013fc79Sandi selinger { 1817d013fc79Sandi selinger PetscErrorCode ierr; 1818d013fc79Sandi selinger PetscLogDouble flops=0.0; 1819d013fc79Sandi selinger Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 18202869b61bSandi selinger const PetscInt *ai=a->i,*bi=b->i; 1821d013fc79Sandi selinger PetscInt *ci,*cj,*cj_i; 1822d013fc79Sandi selinger PetscScalar *ca,*ca_i; 18232869b61bSandi selinger PetscInt b_maxmemrow,c_maxmem,a_col; 1824d013fc79Sandi selinger PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 1825d013fc79Sandi selinger PetscInt i,k,ndouble=0; 1826d013fc79Sandi selinger PetscReal afill; 1827d013fc79Sandi selinger PetscScalar *c_row_val_dense; 1828d013fc79Sandi selinger PetscBool *c_row_idx_flags; 1829d013fc79Sandi selinger PetscInt *aj_i=a->j; 1830d013fc79Sandi selinger PetscScalar *aa_i=a->a; 1831d013fc79Sandi selinger 1832d013fc79Sandi selinger PetscFunctionBegin; 18332869b61bSandi selinger 18342869b61bSandi selinger /* Step 1: Determine upper bounds on memory for C and allocate memory */ 18352869b61bSandi selinger /* This should be enough for almost all matrices. If still more memory is needed, it is reallocated later. */ 18362869b61bSandi selinger c_maxmem = 8*(ai[am]+bi[bm]); 18372869b61bSandi selinger b_maxmemrow = PetscMin(bi[bm],bn); 1838d013fc79Sandi selinger ierr = PetscMalloc1(am+1,&ci);CHKERRQ(ierr); 1839d013fc79Sandi selinger ierr = PetscMalloc1(bn,&c_row_val_dense);CHKERRQ(ierr); 1840d013fc79Sandi selinger ierr = PetscMalloc1(bn,&c_row_idx_flags);CHKERRQ(ierr); 1841d013fc79Sandi selinger ierr = PetscMalloc1(c_maxmem,&cj);CHKERRQ(ierr); 1842d013fc79Sandi selinger ierr = PetscMalloc1(c_maxmem,&ca);CHKERRQ(ierr); 1843d013fc79Sandi selinger ca_i = ca; 1844d013fc79Sandi selinger cj_i = cj; 1845d013fc79Sandi selinger ci[0] = 0; 184673b67375Sandi selinger ierr = PetscMemzero(c_row_val_dense,bn*sizeof(PetscScalar));CHKERRQ(ierr); 184773b67375Sandi selinger ierr = PetscMemzero(c_row_idx_flags,bn*sizeof(PetscBool));CHKERRQ(ierr); 1848d013fc79Sandi selinger for (i=0; i<am; i++) { 1849d013fc79Sandi selinger /* Step 2: Initialize the dense row vector for C */ 1850d013fc79Sandi selinger const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 1851d013fc79Sandi selinger PetscInt cnzi = 0; 1852d013fc79Sandi selinger PetscInt *bj_i; 1853d013fc79Sandi selinger PetscScalar *ba_i; 18542869b61bSandi selinger /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem -> allocate more memory 18552869b61bSandi selinger Usually, there is enough memory in the first place, so this is not executed. */ 18562869b61bSandi selinger while (ci[i] + b_maxmemrow > c_maxmem) { 18572869b61bSandi selinger c_maxmem *= 2; 18582869b61bSandi selinger ndouble++; 18592869b61bSandi selinger ierr = PetscRealloc(sizeof(PetscInt)*c_maxmem,&cj); 18602869b61bSandi selinger ierr = PetscRealloc(sizeof(PetscScalar)*c_maxmem,&ca); 18612869b61bSandi selinger } 1862d013fc79Sandi selinger 1863d013fc79Sandi selinger /* Step 3: Do the numerical calculations */ 1864d013fc79Sandi selinger for (a_col=0; a_col<anzi; a_col++) { /* iterate over all non zero values in a row of A */ 1865d013fc79Sandi selinger PetscInt a_col_index = aj_i[a_col]; 1866d013fc79Sandi selinger const PetscInt bnzi = bi[a_col_index+1] - bi[a_col_index]; 1867d013fc79Sandi selinger flops += 2*bnzi; 1868d013fc79Sandi selinger bj_i = b->j + bi[a_col_index]; /* points to the current row in bj */ 1869d013fc79Sandi selinger ba_i = b->a + bi[a_col_index]; /* points to the current row in ba */ 1870d013fc79Sandi selinger for (k=0; k<bnzi; ++k) { /* iterate over all non zeros of this row in B */ 1871d013fc79Sandi selinger if (c_row_idx_flags[bj_i[k]] == PETSC_FALSE) { 18722869b61bSandi selinger cj_i[cnzi++] = bj_i[k]; 1873d013fc79Sandi selinger c_row_idx_flags[bj_i[k]] = PETSC_TRUE; 1874d013fc79Sandi selinger } 1875d013fc79Sandi selinger c_row_val_dense[bj_i[k]] += aa_i[a_col] * ba_i[k]; 1876d013fc79Sandi selinger } 1877d013fc79Sandi selinger } 1878d013fc79Sandi selinger 1879d013fc79Sandi selinger /* Sort array */ 18803353a62bSKarl Rupp ierr = PetscSortInt(cnzi,cj_i);CHKERRQ(ierr); 1881d013fc79Sandi selinger /* Step 4 */ 1882d013fc79Sandi selinger for (k=0; k<cnzi; k++) { 1883d013fc79Sandi selinger ca_i[k] = c_row_val_dense[cj_i[k]]; 1884d013fc79Sandi selinger c_row_val_dense[cj_i[k]] = 0.; 1885d013fc79Sandi selinger c_row_idx_flags[cj_i[k]] = PETSC_FALSE; 1886d013fc79Sandi selinger } 1887d013fc79Sandi selinger /* terminate current row */ 1888d013fc79Sandi selinger aa_i += anzi; 1889d013fc79Sandi selinger aj_i += anzi; 1890d013fc79Sandi selinger ca_i += cnzi; 1891d013fc79Sandi selinger cj_i += cnzi; 1892d013fc79Sandi selinger ci[i+1] = ci[i] + cnzi; 1893d013fc79Sandi selinger flops += cnzi; 1894d013fc79Sandi selinger } 1895d013fc79Sandi selinger 1896d013fc79Sandi selinger /* Step 5 */ 1897d013fc79Sandi selinger /* Create the new matrix */ 1898d013fc79Sandi selinger ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr); 1899d013fc79Sandi selinger ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr); 190002fe1965SBarry Smith ierr = MatSetType(*C,((PetscObject)A)->type_name);CHKERRQ(ierr); 1901d013fc79Sandi selinger 1902d013fc79Sandi selinger /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 1903d013fc79Sandi selinger /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 1904d013fc79Sandi selinger c = (Mat_SeqAIJ*)((*C)->data); 1905d013fc79Sandi selinger c->a = ca; 1906d013fc79Sandi selinger c->free_a = PETSC_TRUE; 1907d013fc79Sandi selinger c->free_ij = PETSC_TRUE; 1908d013fc79Sandi selinger c->nonew = 0; 1909d013fc79Sandi selinger 1910d013fc79Sandi selinger /* set MatInfo */ 1911d013fc79Sandi selinger afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 1912d013fc79Sandi selinger if (afill < 1.0) afill = 1.0; 1913d013fc79Sandi selinger c->maxnz = ci[am]; 1914d013fc79Sandi selinger c->nz = ci[am]; 1915d013fc79Sandi selinger (*C)->info.mallocs = ndouble; 1916d013fc79Sandi selinger (*C)->info.fill_ratio_given = fill; 1917d013fc79Sandi selinger (*C)->info.fill_ratio_needed = afill; 1918d013fc79Sandi selinger 191973b67375Sandi selinger ierr = PetscFree(c_row_val_dense);CHKERRQ(ierr); 192073b67375Sandi selinger ierr = PetscFree(c_row_idx_flags);CHKERRQ(ierr); 1921d013fc79Sandi selinger 1922d013fc79Sandi selinger ierr = MatAssemblyBegin(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1923d013fc79Sandi selinger ierr = MatAssemblyEnd(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1924d013fc79Sandi selinger ierr = PetscLogFlops(flops);CHKERRQ(ierr); 1925d013fc79Sandi selinger PetscFunctionReturn(0); 1926d013fc79Sandi selinger } 1927