1be1d678aSKris Buschelman 2d50806bdSBarry Smith /* 32499ec78SHong Zhang Defines matrix-matrix product routines for pairs of SeqAIJ matrices 4d50806bdSBarry Smith C = A * B 5d50806bdSBarry Smith */ 6d50806bdSBarry Smith 7c6db04a5SJed Brown #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 8c6db04a5SJed Brown #include <../src/mat/utils/freespace.h> 9c6db04a5SJed Brown #include <petscbt.h> 10af0996ceSBarry Smith #include <petsc/private/isimpl.h> 1107475bc1SBarry Smith #include <../src/mat/impls/dense/seq/dense.h> 127bab7c10SHong Zhang 13df97dc6dSFande Kong PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C) 14df97dc6dSFande Kong { 15df97dc6dSFande Kong PetscErrorCode ierr; 16df97dc6dSFande Kong 17df97dc6dSFande Kong PetscFunctionBegin; 18df97dc6dSFande Kong if (C->ops->matmultnumeric) { 19*2c71b3e2SJacob Faibussowitsch PetscCheckFalse(C->ops->matmultnumeric == MatMatMultNumeric_SeqAIJ_SeqAIJ,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Recursive call"); 20df97dc6dSFande Kong ierr = (*C->ops->matmultnumeric)(A,B,C);CHKERRQ(ierr); 21df97dc6dSFande Kong } else { 22df97dc6dSFande Kong ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A,B,C);CHKERRQ(ierr); 23df97dc6dSFande Kong } 24df97dc6dSFande Kong PetscFunctionReturn(0); 25df97dc6dSFande Kong } 26df97dc6dSFande Kong 274222ddf1SHong Zhang /* Modified from MatCreateSeqAIJWithArrays() */ 28e4e71118SStefano Zampini PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt i[],PetscInt j[],PetscScalar a[],MatType mtype,Mat mat) 29df97dc6dSFande Kong { 30df97dc6dSFande Kong PetscErrorCode ierr; 314222ddf1SHong Zhang PetscInt ii; 324222ddf1SHong Zhang Mat_SeqAIJ *aij; 33e4e71118SStefano Zampini PetscBool isseqaij; 345c66b693SKris Buschelman 355c66b693SKris Buschelman PetscFunctionBegin; 36*2c71b3e2SJacob Faibussowitsch PetscCheckFalse(m > 0 && i[0],PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"i (row indices) must start with 0"); 374222ddf1SHong Zhang ierr = MatSetSizes(mat,m,n,m,n);CHKERRQ(ierr); 384222ddf1SHong Zhang 39e4e71118SStefano Zampini if (!mtype) { 40e4e71118SStefano Zampini ierr = PetscObjectBaseTypeCompare((PetscObject)mat,MATSEQAIJ,&isseqaij);CHKERRQ(ierr); 41e4e71118SStefano Zampini if (!isseqaij) { ierr = MatSetType(mat,MATSEQAIJ);CHKERRQ(ierr); } 42e4e71118SStefano Zampini } else { 43e4e71118SStefano Zampini ierr = MatSetType(mat,mtype);CHKERRQ(ierr); 44e4e71118SStefano Zampini } 45f4259b30SLisandro Dalcin ierr = MatSeqAIJSetPreallocation_SeqAIJ(mat,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 464222ddf1SHong Zhang aij = (Mat_SeqAIJ*)(mat)->data; 474222ddf1SHong Zhang ierr = PetscMalloc1(m,&aij->imax);CHKERRQ(ierr); 484222ddf1SHong Zhang ierr = PetscMalloc1(m,&aij->ilen);CHKERRQ(ierr); 494222ddf1SHong Zhang 504222ddf1SHong Zhang aij->i = i; 514222ddf1SHong Zhang aij->j = j; 524222ddf1SHong Zhang aij->a = a; 534222ddf1SHong Zhang aij->singlemalloc = PETSC_FALSE; 544222ddf1SHong Zhang aij->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 554222ddf1SHong Zhang aij->free_a = PETSC_FALSE; 564222ddf1SHong Zhang aij->free_ij = PETSC_FALSE; 574222ddf1SHong Zhang 584222ddf1SHong Zhang for (ii=0; ii<m; ii++) { 594222ddf1SHong Zhang aij->ilen[ii] = aij->imax[ii] = i[ii+1] - i[ii]; 6025023028SHong Zhang } 615c66b693SKris Buschelman PetscFunctionReturn(0); 625c66b693SKris Buschelman } 631c24bd37SHong Zhang 644222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C) 654222ddf1SHong Zhang { 664222ddf1SHong Zhang PetscErrorCode ierr; 674222ddf1SHong Zhang Mat_Product *product = C->product; 684222ddf1SHong Zhang MatProductAlgorithm alg; 694222ddf1SHong Zhang PetscBool flg; 704222ddf1SHong Zhang 714222ddf1SHong Zhang PetscFunctionBegin; 724222ddf1SHong Zhang if (product) { 734222ddf1SHong Zhang alg = product->alg; 744222ddf1SHong Zhang } else { 754222ddf1SHong Zhang alg = "sorted"; 764222ddf1SHong Zhang } 774222ddf1SHong Zhang /* sorted */ 784222ddf1SHong Zhang ierr = PetscStrcmp(alg,"sorted",&flg);CHKERRQ(ierr); 794222ddf1SHong Zhang if (flg) { 804222ddf1SHong Zhang ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A,B,fill,C);CHKERRQ(ierr); 814222ddf1SHong Zhang PetscFunctionReturn(0); 824222ddf1SHong Zhang } 834222ddf1SHong Zhang 844222ddf1SHong Zhang /* scalable */ 854222ddf1SHong Zhang ierr = PetscStrcmp(alg,"scalable",&flg);CHKERRQ(ierr); 864222ddf1SHong Zhang if (flg) { 874222ddf1SHong Zhang ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A,B,fill,C);CHKERRQ(ierr); 884222ddf1SHong Zhang PetscFunctionReturn(0); 894222ddf1SHong Zhang } 904222ddf1SHong Zhang 914222ddf1SHong Zhang /* scalable_fast */ 924222ddf1SHong Zhang ierr = PetscStrcmp(alg,"scalable_fast",&flg);CHKERRQ(ierr); 934222ddf1SHong Zhang if (flg) { 944222ddf1SHong Zhang ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A,B,fill,C);CHKERRQ(ierr); 954222ddf1SHong Zhang PetscFunctionReturn(0); 964222ddf1SHong Zhang } 974222ddf1SHong Zhang 984222ddf1SHong Zhang /* heap */ 994222ddf1SHong Zhang ierr = PetscStrcmp(alg,"heap",&flg);CHKERRQ(ierr); 1004222ddf1SHong Zhang if (flg) { 1014222ddf1SHong Zhang ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A,B,fill,C);CHKERRQ(ierr); 1024222ddf1SHong Zhang PetscFunctionReturn(0); 1034222ddf1SHong Zhang } 1044222ddf1SHong Zhang 1054222ddf1SHong Zhang /* btheap */ 1064222ddf1SHong Zhang ierr = PetscStrcmp(alg,"btheap",&flg);CHKERRQ(ierr); 1074222ddf1SHong Zhang if (flg) { 1084222ddf1SHong Zhang ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A,B,fill,C);CHKERRQ(ierr); 1094222ddf1SHong Zhang PetscFunctionReturn(0); 1104222ddf1SHong Zhang } 1114222ddf1SHong Zhang 1124222ddf1SHong Zhang /* llcondensed */ 1134222ddf1SHong Zhang ierr = PetscStrcmp(alg,"llcondensed",&flg);CHKERRQ(ierr); 1144222ddf1SHong Zhang if (flg) { 1154222ddf1SHong Zhang ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A,B,fill,C);CHKERRQ(ierr); 1164222ddf1SHong Zhang PetscFunctionReturn(0); 1174222ddf1SHong Zhang } 1184222ddf1SHong Zhang 1194222ddf1SHong Zhang /* rowmerge */ 1204222ddf1SHong Zhang ierr = PetscStrcmp(alg,"rowmerge",&flg);CHKERRQ(ierr); 1214222ddf1SHong Zhang if (flg) { 1224222ddf1SHong Zhang ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A,B,fill,C);CHKERRQ(ierr); 1234222ddf1SHong Zhang PetscFunctionReturn(0); 1244222ddf1SHong Zhang } 1254222ddf1SHong Zhang 1264222ddf1SHong Zhang #if defined(PETSC_HAVE_HYPRE) 1274222ddf1SHong Zhang ierr = PetscStrcmp(alg,"hypre",&flg);CHKERRQ(ierr); 1284222ddf1SHong Zhang if (flg) { 1294222ddf1SHong Zhang ierr = MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A,B,fill,C);CHKERRQ(ierr); 1304222ddf1SHong Zhang PetscFunctionReturn(0); 1314222ddf1SHong Zhang } 1324222ddf1SHong Zhang #endif 1334222ddf1SHong Zhang 1344222ddf1SHong Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported"); 1354222ddf1SHong Zhang } 1364222ddf1SHong Zhang 1374222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A,Mat B,PetscReal fill,Mat C) 138b561aa9dSHong Zhang { 139b561aa9dSHong Zhang PetscErrorCode ierr; 140b561aa9dSHong Zhang Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 141971236abSHong Zhang PetscInt *ai=a->i,*bi=b->i,*ci,*cj; 142c1ba5575SJed Brown PetscInt am =A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 143b561aa9dSHong Zhang PetscReal afill; 144eca6b86aSHong Zhang PetscInt i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax; 145eca6b86aSHong Zhang PetscTable ta; 146fb908031SHong Zhang PetscBT lnkbt; 1470298fd71SBarry Smith PetscFreeSpaceList free_space=NULL,current_space=NULL; 148b561aa9dSHong Zhang 149b561aa9dSHong Zhang PetscFunctionBegin; 150bd958071SHong Zhang /* Get ci and cj */ 151bd958071SHong Zhang /*---------------*/ 152fb908031SHong Zhang /* Allocate ci array, arrays for fill computation and */ 153fb908031SHong Zhang /* free space for accumulating nonzero column info */ 154854ce69bSBarry Smith ierr = PetscMalloc1(am+2,&ci);CHKERRQ(ierr); 155fb908031SHong Zhang ci[0] = 0; 156fb908031SHong Zhang 157fb908031SHong Zhang /* create and initialize a linked list */ 158c373ccc6SHong Zhang ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr); 159c373ccc6SHong Zhang MatRowMergeMax_SeqAIJ(b,bm,ta); 160eca6b86aSHong Zhang ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); 161eca6b86aSHong Zhang ierr = PetscTableDestroy(&ta);CHKERRQ(ierr); 162eca6b86aSHong Zhang 163eca6b86aSHong Zhang ierr = PetscLLCondensedCreate(Crmax,bn,&lnk,&lnkbt);CHKERRQ(ierr); 164fb908031SHong Zhang 165fb908031SHong Zhang /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 166f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr); 1672205254eSKarl Rupp 168fb908031SHong Zhang current_space = free_space; 169fb908031SHong Zhang 170fb908031SHong Zhang /* Determine ci and cj */ 171fb908031SHong Zhang for (i=0; i<am; i++) { 172fb908031SHong Zhang anzi = ai[i+1] - ai[i]; 173fb908031SHong Zhang aj = a->j + ai[i]; 174fb908031SHong Zhang for (j=0; j<anzi; j++) { 175fb908031SHong Zhang brow = aj[j]; 176fb908031SHong Zhang bnzj = bi[brow+1] - bi[brow]; 177fb908031SHong Zhang bj = b->j + bi[brow]; 178fb908031SHong Zhang /* add non-zero cols of B into the sorted linked list lnk */ 179fb908031SHong Zhang ierr = PetscLLCondensedAddSorted(bnzj,bj,lnk,lnkbt);CHKERRQ(ierr); 180fb908031SHong Zhang } 1818c78258cSHong Zhang /* add possible missing diagonal entry */ 1828c78258cSHong Zhang if (C->force_diagonals) { 1838c78258cSHong Zhang ierr = PetscLLCondensedAddSorted(1,&i,lnk,lnkbt);CHKERRQ(ierr); 1848c78258cSHong Zhang } 185fb908031SHong Zhang cnzi = lnk[0]; 186fb908031SHong Zhang 187fb908031SHong Zhang /* If free space is not available, make more free space */ 188fb908031SHong Zhang /* Double the amount of total space in the list */ 189fb908031SHong Zhang if (current_space->local_remaining<cnzi) { 190f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),¤t_space);CHKERRQ(ierr); 191fb908031SHong Zhang ndouble++; 192fb908031SHong Zhang } 193fb908031SHong Zhang 194fb908031SHong Zhang /* Copy data into free space, then initialize lnk */ 195fb908031SHong Zhang ierr = PetscLLCondensedClean(bn,cnzi,current_space->array,lnk,lnkbt);CHKERRQ(ierr); 1962205254eSKarl Rupp 197fb908031SHong Zhang current_space->array += cnzi; 198fb908031SHong Zhang current_space->local_used += cnzi; 199fb908031SHong Zhang current_space->local_remaining -= cnzi; 2002205254eSKarl Rupp 201fb908031SHong Zhang ci[i+1] = ci[i] + cnzi; 202fb908031SHong Zhang } 203fb908031SHong Zhang 204fb908031SHong Zhang /* Column indices are in the list of free space */ 205fb908031SHong Zhang /* Allocate space for cj, initialize cj, and */ 206fb908031SHong Zhang /* destroy list of free space and other temporary array(s) */ 207854ce69bSBarry Smith ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr); 208fb908031SHong Zhang ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr); 209fb908031SHong Zhang ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr); 210b561aa9dSHong Zhang 21126be0446SHong Zhang /* put together the new symbolic matrix */ 212e4e71118SStefano Zampini ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr); 2134222ddf1SHong Zhang ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr); 21458c24d83SHong Zhang 21558c24d83SHong Zhang /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 21658c24d83SHong Zhang /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 2174222ddf1SHong Zhang c = (Mat_SeqAIJ*)(C->data); 218aa1aec99SHong Zhang c->free_a = PETSC_FALSE; 219e6b907acSBarry Smith c->free_ij = PETSC_TRUE; 22058c24d83SHong Zhang c->nonew = 0; 2214222ddf1SHong Zhang 2224222ddf1SHong Zhang /* fast, needs non-scalable O(bn) array 'abdense' */ 2234222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 2240b7e3e3dSHong Zhang 2257212da7cSHong Zhang /* set MatInfo */ 2267212da7cSHong Zhang afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 227f2b054eeSHong Zhang if (afill < 1.0) afill = 1.0; 2287212da7cSHong Zhang c->maxnz = ci[am]; 2297212da7cSHong Zhang c->nz = ci[am]; 2304222ddf1SHong Zhang C->info.mallocs = ndouble; 2314222ddf1SHong Zhang C->info.fill_ratio_given = fill; 2324222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 2337212da7cSHong Zhang 2347212da7cSHong Zhang #if defined(PETSC_USE_INFO) 2357212da7cSHong Zhang if (ci[am]) { 2367d3de750SJacob Faibussowitsch ierr = PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 2377d3de750SJacob Faibussowitsch ierr = PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 238f2b054eeSHong Zhang } else { 2394222ddf1SHong Zhang ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr); 240be0fcf8dSHong Zhang } 241f2b054eeSHong Zhang #endif 24258c24d83SHong Zhang PetscFunctionReturn(0); 24358c24d83SHong Zhang } 244d50806bdSBarry Smith 245df97dc6dSFande Kong PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,Mat C) 246d50806bdSBarry Smith { 247dfbe8321SBarry Smith PetscErrorCode ierr; 248d13dce4bSSatish Balay PetscLogDouble flops=0.0; 249d50806bdSBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 250d50806bdSBarry Smith Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 251d50806bdSBarry Smith Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 25238baddfdSBarry Smith PetscInt *ai =a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j; 253c58ca2e3SHong Zhang PetscInt am =A->rmap->n,cm=C->rmap->n; 254519eb980SHong Zhang PetscInt i,j,k,anzi,bnzi,cnzi,brow; 2552e5835c6SStefano Zampini PetscScalar *ca,valtmp; 256aa1aec99SHong Zhang PetscScalar *ab_dense; 2576718818eSStefano Zampini PetscContainer cab_dense; 2582e5835c6SStefano Zampini const PetscScalar *aa,*ba,*baj; 259d50806bdSBarry Smith 260d50806bdSBarry Smith PetscFunctionBegin; 2612e5835c6SStefano Zampini ierr = MatSeqAIJGetArrayRead(A,&aa);CHKERRQ(ierr); 2622e5835c6SStefano Zampini ierr = MatSeqAIJGetArrayRead(B,&ba);CHKERRQ(ierr); 263aa1aec99SHong Zhang if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */ 264854ce69bSBarry Smith ierr = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr); 265aa1aec99SHong Zhang c->a = ca; 266aa1aec99SHong Zhang c->free_a = PETSC_TRUE; 2676718818eSStefano Zampini } else ca = c->a; 2686718818eSStefano Zampini 2696718818eSStefano Zampini /* TODO this should be done in the symbolic phase */ 2706718818eSStefano Zampini /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers 2716718818eSStefano Zampini that is hard to eradicate) */ 2726718818eSStefano Zampini ierr = PetscObjectQuery((PetscObject)C,"__PETSc__ab_dense",(PetscObject*)&cab_dense);CHKERRQ(ierr); 2736718818eSStefano Zampini if (!cab_dense) { 2746718818eSStefano Zampini ierr = PetscMalloc1(B->cmap->N,&ab_dense);CHKERRQ(ierr); 2756718818eSStefano Zampini ierr = PetscContainerCreate(PETSC_COMM_SELF,&cab_dense);CHKERRQ(ierr); 2766718818eSStefano Zampini ierr = PetscContainerSetPointer(cab_dense,ab_dense);CHKERRQ(ierr); 2776718818eSStefano Zampini ierr = PetscContainerSetUserDestroy(cab_dense,PetscContainerUserDestroyDefault);CHKERRQ(ierr); 2786718818eSStefano Zampini ierr = PetscObjectCompose((PetscObject)C,"__PETSc__ab_dense",(PetscObject)cab_dense);CHKERRQ(ierr); 2796718818eSStefano Zampini ierr = PetscObjectDereference((PetscObject)cab_dense);CHKERRQ(ierr); 280d90d86d0SMatthew G. Knepley } 2816718818eSStefano Zampini ierr = PetscContainerGetPointer(cab_dense,(void**)&ab_dense);CHKERRQ(ierr); 2826718818eSStefano Zampini ierr = PetscArrayzero(ab_dense,B->cmap->N);CHKERRQ(ierr); 283aa1aec99SHong Zhang 284c124e916SHong Zhang /* clean old values in C */ 285580bdb30SBarry Smith ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr); 286d50806bdSBarry Smith /* Traverse A row-wise. */ 287d50806bdSBarry Smith /* Build the ith row in C by summing over nonzero columns in A, */ 288d50806bdSBarry Smith /* the rows of B corresponding to nonzeros of A. */ 289d50806bdSBarry Smith for (i=0; i<am; i++) { 290d50806bdSBarry Smith anzi = ai[i+1] - ai[i]; 291d50806bdSBarry Smith for (j=0; j<anzi; j++) { 292519eb980SHong Zhang brow = aj[j]; 293d50806bdSBarry Smith bnzi = bi[brow+1] - bi[brow]; 294d50806bdSBarry Smith bjj = bj + bi[brow]; 295d50806bdSBarry Smith baj = ba + bi[brow]; 296519eb980SHong Zhang /* perform dense axpy */ 29736ec6d2dSHong Zhang valtmp = aa[j]; 298519eb980SHong Zhang for (k=0; k<bnzi; k++) { 29936ec6d2dSHong Zhang ab_dense[bjj[k]] += valtmp*baj[k]; 300519eb980SHong Zhang } 301519eb980SHong Zhang flops += 2*bnzi; 302519eb980SHong Zhang } 303c58ca2e3SHong Zhang aj += anzi; aa += anzi; 304c58ca2e3SHong Zhang 305c58ca2e3SHong Zhang cnzi = ci[i+1] - ci[i]; 306519eb980SHong Zhang for (k=0; k<cnzi; k++) { 307519eb980SHong Zhang ca[k] += ab_dense[cj[k]]; 308519eb980SHong Zhang ab_dense[cj[k]] = 0.0; /* zero ab_dense */ 309519eb980SHong Zhang } 310519eb980SHong Zhang flops += cnzi; 311519eb980SHong Zhang cj += cnzi; ca += cnzi; 312519eb980SHong Zhang } 3132e5835c6SStefano Zampini #if defined(PETSC_HAVE_DEVICE) 3142e5835c6SStefano Zampini if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU; 3152e5835c6SStefano Zampini #endif 316c58ca2e3SHong Zhang ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 317c58ca2e3SHong Zhang ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 318c58ca2e3SHong Zhang ierr = PetscLogFlops(flops);CHKERRQ(ierr); 3192e5835c6SStefano Zampini ierr = MatSeqAIJRestoreArrayRead(A,&aa);CHKERRQ(ierr); 3202e5835c6SStefano Zampini ierr = MatSeqAIJRestoreArrayRead(B,&ba);CHKERRQ(ierr); 321c58ca2e3SHong Zhang PetscFunctionReturn(0); 322c58ca2e3SHong Zhang } 323c58ca2e3SHong Zhang 32425023028SHong Zhang PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C) 325c58ca2e3SHong Zhang { 326c58ca2e3SHong Zhang PetscErrorCode ierr; 327c58ca2e3SHong Zhang PetscLogDouble flops=0.0; 328c58ca2e3SHong Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 329c58ca2e3SHong Zhang Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 330c58ca2e3SHong Zhang Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 331c58ca2e3SHong Zhang PetscInt *ai = a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j; 332c58ca2e3SHong Zhang PetscInt am = A->rmap->N,cm=C->rmap->N; 333c58ca2e3SHong Zhang PetscInt i,j,k,anzi,bnzi,cnzi,brow; 3342e5835c6SStefano Zampini PetscScalar *ca=c->a,valtmp; 3352e5835c6SStefano Zampini const PetscScalar *aa,*ba,*baj; 336c58ca2e3SHong Zhang PetscInt nextb; 337c58ca2e3SHong Zhang 338c58ca2e3SHong Zhang PetscFunctionBegin; 3392e5835c6SStefano Zampini ierr = MatSeqAIJGetArrayRead(A,&aa);CHKERRQ(ierr); 3402e5835c6SStefano Zampini ierr = MatSeqAIJGetArrayRead(B,&ba);CHKERRQ(ierr); 341cf742fccSHong Zhang if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */ 342cf742fccSHong Zhang ierr = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr); 343cf742fccSHong Zhang c->a = ca; 344cf742fccSHong Zhang c->free_a = PETSC_TRUE; 345cf742fccSHong Zhang } 346cf742fccSHong Zhang 347c58ca2e3SHong Zhang /* clean old values in C */ 348580bdb30SBarry Smith ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr); 349c58ca2e3SHong Zhang /* Traverse A row-wise. */ 350c58ca2e3SHong Zhang /* Build the ith row in C by summing over nonzero columns in A, */ 351c58ca2e3SHong Zhang /* the rows of B corresponding to nonzeros of A. */ 352519eb980SHong Zhang for (i=0; i<am; i++) { 353519eb980SHong Zhang anzi = ai[i+1] - ai[i]; 354519eb980SHong Zhang cnzi = ci[i+1] - ci[i]; 355519eb980SHong Zhang for (j=0; j<anzi; j++) { 356519eb980SHong Zhang brow = aj[j]; 357519eb980SHong Zhang bnzi = bi[brow+1] - bi[brow]; 358519eb980SHong Zhang bjj = bj + bi[brow]; 359519eb980SHong Zhang baj = ba + bi[brow]; 360519eb980SHong Zhang /* perform sparse axpy */ 36136ec6d2dSHong Zhang valtmp = aa[j]; 362c124e916SHong Zhang nextb = 0; 363c124e916SHong Zhang for (k=0; nextb<bnzi; k++) { 364c124e916SHong Zhang if (cj[k] == bjj[nextb]) { /* ccol == bcol */ 36536ec6d2dSHong Zhang ca[k] += valtmp*baj[nextb++]; 366c124e916SHong Zhang } 367d50806bdSBarry Smith } 368d50806bdSBarry Smith flops += 2*bnzi; 369d50806bdSBarry Smith } 370519eb980SHong Zhang aj += anzi; aa += anzi; 371519eb980SHong Zhang cj += cnzi; ca += cnzi; 372d50806bdSBarry Smith } 3732e5835c6SStefano Zampini #if defined(PETSC_HAVE_DEVICE) 3742e5835c6SStefano Zampini if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU; 3752e5835c6SStefano Zampini #endif 376716bacf3SKris Buschelman ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 377716bacf3SKris Buschelman ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 378d50806bdSBarry Smith ierr = PetscLogFlops(flops);CHKERRQ(ierr); 3792e5835c6SStefano Zampini ierr = MatSeqAIJRestoreArrayRead(A,&aa);CHKERRQ(ierr); 3802e5835c6SStefano Zampini ierr = MatSeqAIJRestoreArrayRead(B,&ba);CHKERRQ(ierr); 381d50806bdSBarry Smith PetscFunctionReturn(0); 382d50806bdSBarry Smith } 383bc011b1eSHong Zhang 3844222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A,Mat B,PetscReal fill,Mat C) 38525296bd5SBarry Smith { 38625296bd5SBarry Smith PetscErrorCode ierr; 38725296bd5SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 38825296bd5SBarry Smith PetscInt *ai = a->i,*bi=b->i,*ci,*cj; 3893c50cad2SHong Zhang PetscInt am = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 39025296bd5SBarry Smith MatScalar *ca; 39125296bd5SBarry Smith PetscReal afill; 392eca6b86aSHong Zhang PetscInt i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax; 393eca6b86aSHong Zhang PetscTable ta; 3940298fd71SBarry Smith PetscFreeSpaceList free_space=NULL,current_space=NULL; 39525296bd5SBarry Smith 39625296bd5SBarry Smith PetscFunctionBegin; 3973c50cad2SHong Zhang /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */ 3983c50cad2SHong Zhang /*-----------------------------------------------------------------------------------------*/ 3993c50cad2SHong Zhang /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 400854ce69bSBarry Smith ierr = PetscMalloc1(am+2,&ci);CHKERRQ(ierr); 4013c50cad2SHong Zhang ci[0] = 0; 4023c50cad2SHong Zhang 4033c50cad2SHong Zhang /* create and initialize a linked list */ 404c373ccc6SHong Zhang ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr); 405c373ccc6SHong Zhang MatRowMergeMax_SeqAIJ(b,bm,ta); 406eca6b86aSHong Zhang ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); 407eca6b86aSHong Zhang ierr = PetscTableDestroy(&ta);CHKERRQ(ierr); 408eca6b86aSHong Zhang 409eca6b86aSHong Zhang ierr = PetscLLCondensedCreate_fast(Crmax,&lnk);CHKERRQ(ierr); 4103c50cad2SHong Zhang 4113c50cad2SHong Zhang /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 412f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr); 4133c50cad2SHong Zhang current_space = free_space; 4143c50cad2SHong Zhang 4153c50cad2SHong Zhang /* Determine ci and cj */ 4163c50cad2SHong Zhang for (i=0; i<am; i++) { 4173c50cad2SHong Zhang anzi = ai[i+1] - ai[i]; 4183c50cad2SHong Zhang aj = a->j + ai[i]; 4193c50cad2SHong Zhang for (j=0; j<anzi; j++) { 4203c50cad2SHong Zhang brow = aj[j]; 4213c50cad2SHong Zhang bnzj = bi[brow+1] - bi[brow]; 4223c50cad2SHong Zhang bj = b->j + bi[brow]; 4233c50cad2SHong Zhang /* add non-zero cols of B into the sorted linked list lnk */ 4243c50cad2SHong Zhang ierr = PetscLLCondensedAddSorted_fast(bnzj,bj,lnk);CHKERRQ(ierr); 4253c50cad2SHong Zhang } 4268c78258cSHong Zhang /* add possible missing diagonal entry */ 4278c78258cSHong Zhang if (C->force_diagonals) { 4288c78258cSHong Zhang ierr = PetscLLCondensedAddSorted_fast(1,&i,lnk);CHKERRQ(ierr); 4298c78258cSHong Zhang } 4303c50cad2SHong Zhang cnzi = lnk[1]; 4313c50cad2SHong Zhang 4323c50cad2SHong Zhang /* If free space is not available, make more free space */ 4333c50cad2SHong Zhang /* Double the amount of total space in the list */ 4343c50cad2SHong Zhang if (current_space->local_remaining<cnzi) { 435f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),¤t_space);CHKERRQ(ierr); 4363c50cad2SHong Zhang ndouble++; 4373c50cad2SHong Zhang } 4383c50cad2SHong Zhang 4393c50cad2SHong Zhang /* Copy data into free space, then initialize lnk */ 4403c50cad2SHong Zhang ierr = PetscLLCondensedClean_fast(cnzi,current_space->array,lnk);CHKERRQ(ierr); 4412205254eSKarl Rupp 4423c50cad2SHong Zhang current_space->array += cnzi; 4433c50cad2SHong Zhang current_space->local_used += cnzi; 4443c50cad2SHong Zhang current_space->local_remaining -= cnzi; 4452205254eSKarl Rupp 4463c50cad2SHong Zhang ci[i+1] = ci[i] + cnzi; 4473c50cad2SHong Zhang } 4483c50cad2SHong Zhang 4493c50cad2SHong Zhang /* Column indices are in the list of free space */ 4503c50cad2SHong Zhang /* Allocate space for cj, initialize cj, and */ 4513c50cad2SHong Zhang /* destroy list of free space and other temporary array(s) */ 452854ce69bSBarry Smith ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr); 4533c50cad2SHong Zhang ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr); 4543c50cad2SHong Zhang ierr = PetscLLCondensedDestroy_fast(lnk);CHKERRQ(ierr); 45525296bd5SBarry Smith 45625296bd5SBarry Smith /* Allocate space for ca */ 457580bdb30SBarry Smith ierr = PetscCalloc1(ci[am]+1,&ca);CHKERRQ(ierr); 45825296bd5SBarry Smith 45925296bd5SBarry Smith /* put together the new symbolic matrix */ 460e4e71118SStefano Zampini ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C);CHKERRQ(ierr); 4614222ddf1SHong Zhang ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr); 46225296bd5SBarry Smith 46325296bd5SBarry Smith /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 46425296bd5SBarry Smith /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 4654222ddf1SHong Zhang c = (Mat_SeqAIJ*)(C->data); 46625296bd5SBarry Smith c->free_a = PETSC_TRUE; 46725296bd5SBarry Smith c->free_ij = PETSC_TRUE; 46825296bd5SBarry Smith c->nonew = 0; 4692205254eSKarl Rupp 4704222ddf1SHong Zhang /* slower, less memory */ 4714222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; 47225296bd5SBarry Smith 47325296bd5SBarry Smith /* set MatInfo */ 47425296bd5SBarry Smith afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 47525296bd5SBarry Smith if (afill < 1.0) afill = 1.0; 47625296bd5SBarry Smith c->maxnz = ci[am]; 47725296bd5SBarry Smith c->nz = ci[am]; 4784222ddf1SHong Zhang C->info.mallocs = ndouble; 4794222ddf1SHong Zhang C->info.fill_ratio_given = fill; 4804222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 48125296bd5SBarry Smith 48225296bd5SBarry Smith #if defined(PETSC_USE_INFO) 48325296bd5SBarry Smith if (ci[am]) { 4847d3de750SJacob Faibussowitsch ierr = PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 4857d3de750SJacob Faibussowitsch ierr = PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 48625296bd5SBarry Smith } else { 4874222ddf1SHong Zhang ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr); 48825296bd5SBarry Smith } 48925296bd5SBarry Smith #endif 49025296bd5SBarry Smith PetscFunctionReturn(0); 49125296bd5SBarry Smith } 49225296bd5SBarry Smith 4934222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat C) 494e9e4536cSHong Zhang { 495e9e4536cSHong Zhang PetscErrorCode ierr; 496e9e4536cSHong Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 497bf9555e6SHong Zhang PetscInt *ai = a->i,*bi=b->i,*ci,*cj; 49825c41797SHong Zhang PetscInt am = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 499e9e4536cSHong Zhang MatScalar *ca; 500e9e4536cSHong Zhang PetscReal afill; 501eca6b86aSHong Zhang PetscInt i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax; 502eca6b86aSHong Zhang PetscTable ta; 5030298fd71SBarry Smith PetscFreeSpaceList free_space=NULL,current_space=NULL; 504e9e4536cSHong Zhang 505e9e4536cSHong Zhang PetscFunctionBegin; 506bd958071SHong Zhang /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */ 507bd958071SHong Zhang /*---------------------------------------------------------------------------------------------*/ 508bd958071SHong Zhang /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 509854ce69bSBarry Smith ierr = PetscMalloc1(am+2,&ci);CHKERRQ(ierr); 510bd958071SHong Zhang ci[0] = 0; 511bd958071SHong Zhang 512bd958071SHong Zhang /* create and initialize a linked list */ 513c373ccc6SHong Zhang ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr); 514c373ccc6SHong Zhang MatRowMergeMax_SeqAIJ(b,bm,ta); 515eca6b86aSHong Zhang ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); 516eca6b86aSHong Zhang ierr = PetscTableDestroy(&ta);CHKERRQ(ierr); 517eca6b86aSHong Zhang ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr); 518bd958071SHong Zhang 519bd958071SHong Zhang /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 520f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr); 521bd958071SHong Zhang current_space = free_space; 522bd958071SHong Zhang 523bd958071SHong Zhang /* Determine ci and cj */ 524bd958071SHong Zhang for (i=0; i<am; i++) { 525bd958071SHong Zhang anzi = ai[i+1] - ai[i]; 526bd958071SHong Zhang aj = a->j + ai[i]; 527bd958071SHong Zhang for (j=0; j<anzi; j++) { 528bd958071SHong Zhang brow = aj[j]; 529bd958071SHong Zhang bnzj = bi[brow+1] - bi[brow]; 530bd958071SHong Zhang bj = b->j + bi[brow]; 531bd958071SHong Zhang /* add non-zero cols of B into the sorted linked list lnk */ 532bd958071SHong Zhang ierr = PetscLLCondensedAddSorted_Scalable(bnzj,bj,lnk);CHKERRQ(ierr); 533bd958071SHong Zhang } 5348c78258cSHong Zhang /* add possible missing diagonal entry */ 5358c78258cSHong Zhang if (C->force_diagonals) { 5368c78258cSHong Zhang ierr = PetscLLCondensedAddSorted_Scalable(1,&i,lnk);CHKERRQ(ierr); 5378c78258cSHong Zhang } 5388c78258cSHong Zhang 539bd958071SHong Zhang cnzi = lnk[0]; 540bd958071SHong Zhang 541bd958071SHong Zhang /* If free space is not available, make more free space */ 542bd958071SHong Zhang /* Double the amount of total space in the list */ 543bd958071SHong Zhang if (current_space->local_remaining<cnzi) { 544f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),¤t_space);CHKERRQ(ierr); 545bd958071SHong Zhang ndouble++; 546bd958071SHong Zhang } 547bd958071SHong Zhang 548bd958071SHong Zhang /* Copy data into free space, then initialize lnk */ 549bd958071SHong Zhang ierr = PetscLLCondensedClean_Scalable(cnzi,current_space->array,lnk);CHKERRQ(ierr); 5502205254eSKarl Rupp 551bd958071SHong Zhang current_space->array += cnzi; 552bd958071SHong Zhang current_space->local_used += cnzi; 553bd958071SHong Zhang current_space->local_remaining -= cnzi; 5542205254eSKarl Rupp 555bd958071SHong Zhang ci[i+1] = ci[i] + cnzi; 556bd958071SHong Zhang } 557bd958071SHong Zhang 558bd958071SHong Zhang /* Column indices are in the list of free space */ 559bd958071SHong Zhang /* Allocate space for cj, initialize cj, and */ 560bd958071SHong Zhang /* destroy list of free space and other temporary array(s) */ 561854ce69bSBarry Smith ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr); 562bd958071SHong Zhang ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr); 563bd958071SHong Zhang ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr); 564e9e4536cSHong Zhang 565e9e4536cSHong Zhang /* Allocate space for ca */ 566bd958071SHong Zhang /*-----------------------*/ 567580bdb30SBarry Smith ierr = PetscCalloc1(ci[am]+1,&ca);CHKERRQ(ierr); 568e9e4536cSHong Zhang 569e9e4536cSHong Zhang /* put together the new symbolic matrix */ 570e4e71118SStefano Zampini ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C);CHKERRQ(ierr); 5714222ddf1SHong Zhang ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr); 572e9e4536cSHong Zhang 573e9e4536cSHong Zhang /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 574e9e4536cSHong Zhang /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 5754222ddf1SHong Zhang c = (Mat_SeqAIJ*)(C->data); 576e9e4536cSHong Zhang c->free_a = PETSC_TRUE; 577e9e4536cSHong Zhang c->free_ij = PETSC_TRUE; 578e9e4536cSHong Zhang c->nonew = 0; 5792205254eSKarl Rupp 5804222ddf1SHong Zhang /* slower, less memory */ 5814222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; 582e9e4536cSHong Zhang 583e9e4536cSHong Zhang /* set MatInfo */ 584e9e4536cSHong Zhang afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 585e9e4536cSHong Zhang if (afill < 1.0) afill = 1.0; 586e9e4536cSHong Zhang c->maxnz = ci[am]; 587e9e4536cSHong Zhang c->nz = ci[am]; 5884222ddf1SHong Zhang C->info.mallocs = ndouble; 5894222ddf1SHong Zhang C->info.fill_ratio_given = fill; 5904222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 591e9e4536cSHong Zhang 592e9e4536cSHong Zhang #if defined(PETSC_USE_INFO) 593e9e4536cSHong Zhang if (ci[am]) { 5947d3de750SJacob Faibussowitsch ierr = PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 5957d3de750SJacob Faibussowitsch ierr = PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 596e9e4536cSHong Zhang } else { 5974222ddf1SHong Zhang ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr); 598e9e4536cSHong Zhang } 599e9e4536cSHong Zhang #endif 600e9e4536cSHong Zhang PetscFunctionReturn(0); 601e9e4536cSHong Zhang } 602e9e4536cSHong Zhang 6034222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A,Mat B,PetscReal fill,Mat C) 6040ced3a2bSJed Brown { 6050ced3a2bSJed Brown PetscErrorCode ierr; 6060ced3a2bSJed Brown Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 6070ced3a2bSJed Brown const PetscInt *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j; 6080ced3a2bSJed Brown PetscInt *ci,*cj,*bb; 6090ced3a2bSJed Brown PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 6100ced3a2bSJed Brown PetscReal afill; 6110ced3a2bSJed Brown PetscInt i,j,col,ndouble = 0; 6120298fd71SBarry Smith PetscFreeSpaceList free_space=NULL,current_space=NULL; 6130ced3a2bSJed Brown PetscHeap h; 6140ced3a2bSJed Brown 6150ced3a2bSJed Brown PetscFunctionBegin; 616cd093f1dSJed Brown /* Get ci and cj - by merging sorted rows using a heap */ 6170ced3a2bSJed Brown /*---------------------------------------------------------------------------------------------*/ 6180ced3a2bSJed Brown /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 619854ce69bSBarry Smith ierr = PetscMalloc1(am+2,&ci);CHKERRQ(ierr); 6200ced3a2bSJed Brown ci[0] = 0; 6210ced3a2bSJed Brown 6220ced3a2bSJed Brown /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 623f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr); 6240ced3a2bSJed Brown current_space = free_space; 6250ced3a2bSJed Brown 6260ced3a2bSJed Brown ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr); 627785e854fSJed Brown ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr); 6280ced3a2bSJed Brown 6290ced3a2bSJed Brown /* Determine ci and cj */ 6300ced3a2bSJed Brown for (i=0; i<am; i++) { 6310ced3a2bSJed Brown const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 6320ced3a2bSJed Brown const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 6330ced3a2bSJed Brown ci[i+1] = ci[i]; 6340ced3a2bSJed Brown /* Populate the min heap */ 6350ced3a2bSJed Brown for (j=0; j<anzi; j++) { 6360ced3a2bSJed Brown bb[j] = bi[acol[j]]; /* bb points at the start of the row */ 6370ced3a2bSJed Brown if (bb[j] < bi[acol[j]+1]) { /* Add if row is nonempty */ 6380ced3a2bSJed Brown ierr = PetscHeapAdd(h,j,bj[bb[j]++]);CHKERRQ(ierr); 6390ced3a2bSJed Brown } 6400ced3a2bSJed Brown } 6410ced3a2bSJed Brown /* Pick off the min element, adding it to free space */ 6420ced3a2bSJed Brown ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr); 6430ced3a2bSJed Brown while (j >= 0) { 6440ced3a2bSJed Brown if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */ 645f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),¤t_space);CHKERRQ(ierr); 6460ced3a2bSJed Brown ndouble++; 6470ced3a2bSJed Brown } 6480ced3a2bSJed Brown *(current_space->array++) = col; 6490ced3a2bSJed Brown current_space->local_used++; 6500ced3a2bSJed Brown current_space->local_remaining--; 6510ced3a2bSJed Brown ci[i+1]++; 6520ced3a2bSJed Brown 6530ced3a2bSJed Brown /* stash if anything else remains in this row of B */ 6540ced3a2bSJed Brown if (bb[j] < bi[acol[j]+1]) {ierr = PetscHeapStash(h,j,bj[bb[j]++]);CHKERRQ(ierr);} 6550ced3a2bSJed Brown while (1) { /* pop and stash any other rows of B that also had an entry in this column */ 6560ced3a2bSJed Brown PetscInt j2,col2; 6570ced3a2bSJed Brown ierr = PetscHeapPeek(h,&j2,&col2);CHKERRQ(ierr); 6580ced3a2bSJed Brown if (col2 != col) break; 6590ced3a2bSJed Brown ierr = PetscHeapPop(h,&j2,&col2);CHKERRQ(ierr); 6600ced3a2bSJed Brown if (bb[j2] < bi[acol[j2]+1]) {ierr = PetscHeapStash(h,j2,bj[bb[j2]++]);CHKERRQ(ierr);} 6610ced3a2bSJed Brown } 6620ced3a2bSJed Brown /* Put any stashed elements back into the min heap */ 6630ced3a2bSJed Brown ierr = PetscHeapUnstash(h);CHKERRQ(ierr); 6640ced3a2bSJed Brown ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr); 6650ced3a2bSJed Brown } 6660ced3a2bSJed Brown } 6670ced3a2bSJed Brown ierr = PetscFree(bb);CHKERRQ(ierr); 6680ced3a2bSJed Brown ierr = PetscHeapDestroy(&h);CHKERRQ(ierr); 6690ced3a2bSJed Brown 6700ced3a2bSJed Brown /* Column indices are in the list of free space */ 6710ced3a2bSJed Brown /* Allocate space for cj, initialize cj, and */ 6720ced3a2bSJed Brown /* destroy list of free space and other temporary array(s) */ 673785e854fSJed Brown ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr); 6740ced3a2bSJed Brown ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr); 6750ced3a2bSJed Brown 6760ced3a2bSJed Brown /* put together the new symbolic matrix */ 677e4e71118SStefano Zampini ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr); 6784222ddf1SHong Zhang ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr); 6790ced3a2bSJed Brown 6800ced3a2bSJed Brown /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 6810ced3a2bSJed Brown /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 6824222ddf1SHong Zhang c = (Mat_SeqAIJ*)(C->data); 6830ced3a2bSJed Brown c->free_a = PETSC_TRUE; 6840ced3a2bSJed Brown c->free_ij = PETSC_TRUE; 6850ced3a2bSJed Brown c->nonew = 0; 68626fbe8dcSKarl Rupp 6874222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 6880ced3a2bSJed Brown 6890ced3a2bSJed Brown /* set MatInfo */ 6900ced3a2bSJed Brown afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 6910ced3a2bSJed Brown if (afill < 1.0) afill = 1.0; 6920ced3a2bSJed Brown c->maxnz = ci[am]; 6930ced3a2bSJed Brown c->nz = ci[am]; 6944222ddf1SHong Zhang C->info.mallocs = ndouble; 6954222ddf1SHong Zhang C->info.fill_ratio_given = fill; 6964222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 6970ced3a2bSJed Brown 6980ced3a2bSJed Brown #if defined(PETSC_USE_INFO) 6990ced3a2bSJed Brown if (ci[am]) { 7007d3de750SJacob Faibussowitsch ierr = PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 7017d3de750SJacob Faibussowitsch ierr = PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 7020ced3a2bSJed Brown } else { 7034222ddf1SHong Zhang ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr); 7040ced3a2bSJed Brown } 7050ced3a2bSJed Brown #endif 7060ced3a2bSJed Brown PetscFunctionReturn(0); 7070ced3a2bSJed Brown } 708e9e4536cSHong Zhang 7094222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A,Mat B,PetscReal fill,Mat C) 7108a07c6f1SJed Brown { 7118a07c6f1SJed Brown PetscErrorCode ierr; 7128a07c6f1SJed Brown Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 7138a07c6f1SJed Brown const PetscInt *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j; 7148a07c6f1SJed Brown PetscInt *ci,*cj,*bb; 7158a07c6f1SJed Brown PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 7168a07c6f1SJed Brown PetscReal afill; 7178a07c6f1SJed Brown PetscInt i,j,col,ndouble = 0; 7180298fd71SBarry Smith PetscFreeSpaceList free_space=NULL,current_space=NULL; 7198a07c6f1SJed Brown PetscHeap h; 7208a07c6f1SJed Brown PetscBT bt; 7218a07c6f1SJed Brown 7228a07c6f1SJed Brown PetscFunctionBegin; 723cd093f1dSJed Brown /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */ 7248a07c6f1SJed Brown /*---------------------------------------------------------------------------------------------*/ 7258a07c6f1SJed Brown /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 726854ce69bSBarry Smith ierr = PetscMalloc1(am+2,&ci);CHKERRQ(ierr); 7278a07c6f1SJed Brown ci[0] = 0; 7288a07c6f1SJed Brown 7298a07c6f1SJed Brown /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 730f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr); 7312205254eSKarl Rupp 7328a07c6f1SJed Brown current_space = free_space; 7338a07c6f1SJed Brown 7348a07c6f1SJed Brown ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr); 735785e854fSJed Brown ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr); 7368a07c6f1SJed Brown ierr = PetscBTCreate(bn,&bt);CHKERRQ(ierr); 7378a07c6f1SJed Brown 7388a07c6f1SJed Brown /* Determine ci and cj */ 7398a07c6f1SJed Brown for (i=0; i<am; i++) { 7408a07c6f1SJed Brown const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 7418a07c6f1SJed Brown const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 7428a07c6f1SJed Brown const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */ 7438a07c6f1SJed Brown ci[i+1] = ci[i]; 7448a07c6f1SJed Brown /* Populate the min heap */ 7458a07c6f1SJed Brown for (j=0; j<anzi; j++) { 7468a07c6f1SJed Brown PetscInt brow = acol[j]; 7478a07c6f1SJed Brown for (bb[j] = bi[brow]; bb[j] < bi[brow+1]; bb[j]++) { 7488a07c6f1SJed Brown PetscInt bcol = bj[bb[j]]; 7498a07c6f1SJed Brown if (!PetscBTLookupSet(bt,bcol)) { /* new entry */ 7508a07c6f1SJed Brown ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr); 7518a07c6f1SJed Brown bb[j]++; 7528a07c6f1SJed Brown break; 7538a07c6f1SJed Brown } 7548a07c6f1SJed Brown } 7558a07c6f1SJed Brown } 7568a07c6f1SJed Brown /* Pick off the min element, adding it to free space */ 7578a07c6f1SJed Brown ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr); 7588a07c6f1SJed Brown while (j >= 0) { 7598a07c6f1SJed Brown if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */ 7600298fd71SBarry Smith fptr = NULL; /* need PetscBTMemzero */ 761f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),¤t_space);CHKERRQ(ierr); 7628a07c6f1SJed Brown ndouble++; 7638a07c6f1SJed Brown } 7648a07c6f1SJed Brown *(current_space->array++) = col; 7658a07c6f1SJed Brown current_space->local_used++; 7668a07c6f1SJed Brown current_space->local_remaining--; 7678a07c6f1SJed Brown ci[i+1]++; 7688a07c6f1SJed Brown 7698a07c6f1SJed Brown /* stash if anything else remains in this row of B */ 7708a07c6f1SJed Brown for (; bb[j] < bi[acol[j]+1]; bb[j]++) { 7718a07c6f1SJed Brown PetscInt bcol = bj[bb[j]]; 7728a07c6f1SJed Brown if (!PetscBTLookupSet(bt,bcol)) { /* new entry */ 7738a07c6f1SJed Brown ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr); 7748a07c6f1SJed Brown bb[j]++; 7758a07c6f1SJed Brown break; 7768a07c6f1SJed Brown } 7778a07c6f1SJed Brown } 7788a07c6f1SJed Brown ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr); 7798a07c6f1SJed Brown } 7808a07c6f1SJed Brown if (fptr) { /* Clear the bits for this row */ 7818a07c6f1SJed Brown for (; fptr<current_space->array; fptr++) {ierr = PetscBTClear(bt,*fptr);CHKERRQ(ierr);} 7828a07c6f1SJed Brown } else { /* We reallocated so we don't remember (easily) how to clear only the bits we changed */ 7838a07c6f1SJed Brown ierr = PetscBTMemzero(bn,bt);CHKERRQ(ierr); 7848a07c6f1SJed Brown } 7858a07c6f1SJed Brown } 7868a07c6f1SJed Brown ierr = PetscFree(bb);CHKERRQ(ierr); 7878a07c6f1SJed Brown ierr = PetscHeapDestroy(&h);CHKERRQ(ierr); 7888a07c6f1SJed Brown ierr = PetscBTDestroy(&bt);CHKERRQ(ierr); 7898a07c6f1SJed Brown 7908a07c6f1SJed Brown /* Column indices are in the list of free space */ 7918a07c6f1SJed Brown /* Allocate space for cj, initialize cj, and */ 7928a07c6f1SJed Brown /* destroy list of free space and other temporary array(s) */ 793785e854fSJed Brown ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr); 7948a07c6f1SJed Brown ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr); 7958a07c6f1SJed Brown 7968a07c6f1SJed Brown /* put together the new symbolic matrix */ 797e4e71118SStefano Zampini ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr); 7984222ddf1SHong Zhang ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr); 7998a07c6f1SJed Brown 8008a07c6f1SJed Brown /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 8018a07c6f1SJed Brown /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 8024222ddf1SHong Zhang c = (Mat_SeqAIJ*)(C->data); 8038a07c6f1SJed Brown c->free_a = PETSC_TRUE; 8048a07c6f1SJed Brown c->free_ij = PETSC_TRUE; 8058a07c6f1SJed Brown c->nonew = 0; 80626fbe8dcSKarl Rupp 8074222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 8088a07c6f1SJed Brown 8098a07c6f1SJed Brown /* set MatInfo */ 8108a07c6f1SJed Brown afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 8118a07c6f1SJed Brown if (afill < 1.0) afill = 1.0; 8128a07c6f1SJed Brown c->maxnz = ci[am]; 8138a07c6f1SJed Brown c->nz = ci[am]; 8144222ddf1SHong Zhang C->info.mallocs = ndouble; 8154222ddf1SHong Zhang C->info.fill_ratio_given = fill; 8164222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 8178a07c6f1SJed Brown 8188a07c6f1SJed Brown #if defined(PETSC_USE_INFO) 8198a07c6f1SJed Brown if (ci[am]) { 8207d3de750SJacob Faibussowitsch ierr = PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 8217d3de750SJacob Faibussowitsch ierr = PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 8228a07c6f1SJed Brown } else { 8234222ddf1SHong Zhang ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr); 8248a07c6f1SJed Brown } 8258a07c6f1SJed Brown #endif 8268a07c6f1SJed Brown PetscFunctionReturn(0); 8278a07c6f1SJed Brown } 8288a07c6f1SJed Brown 8294222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A,Mat B,PetscReal fill,Mat C) 830d7ed1a4dSandi selinger { 831d7ed1a4dSandi selinger PetscErrorCode ierr; 832d7ed1a4dSandi selinger Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 833d7ed1a4dSandi selinger const PetscInt *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j,*inputi,*inputj,*inputcol,*inputcol_L1; 834d7ed1a4dSandi selinger PetscInt *ci,*cj,*outputj,worki_L1[9],worki_L2[9]; 835d7ed1a4dSandi selinger PetscInt c_maxmem,a_maxrownnz=0,a_rownnz; 836d7ed1a4dSandi selinger const PetscInt workcol[8]={0,1,2,3,4,5,6,7}; 837d7ed1a4dSandi selinger const PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 838d7ed1a4dSandi selinger const PetscInt *brow_ptr[8],*brow_end[8]; 839d7ed1a4dSandi selinger PetscInt window[8]; 840d7ed1a4dSandi selinger PetscInt window_min,old_window_min,ci_nnz,outputi_nnz=0,L1_nrows,L2_nrows; 841d7ed1a4dSandi selinger PetscInt i,k,ndouble=0,L1_rowsleft,rowsleft; 842d7ed1a4dSandi selinger PetscReal afill; 843f83700f2Sandi selinger PetscInt *workj_L1,*workj_L2,*workj_L3; 8447660c4dbSandi selinger PetscInt L1_nnz,L2_nnz; 845d7ed1a4dSandi selinger 846d7ed1a4dSandi selinger /* Step 1: Get upper bound on memory required for allocation. 847d7ed1a4dSandi selinger Because of the way virtual memory works, 848d7ed1a4dSandi selinger only the memory pages that are actually needed will be physically allocated. */ 849d7ed1a4dSandi selinger PetscFunctionBegin; 850d7ed1a4dSandi selinger ierr = PetscMalloc1(am+1,&ci);CHKERRQ(ierr); 851d7ed1a4dSandi selinger for (i=0; i<am; i++) { 852d7ed1a4dSandi selinger const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 853d7ed1a4dSandi selinger const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 854d7ed1a4dSandi selinger a_rownnz = 0; 855d7ed1a4dSandi selinger for (k=0; k<anzi; ++k) { 856d7ed1a4dSandi selinger a_rownnz += bi[acol[k]+1] - bi[acol[k]]; 857d7ed1a4dSandi selinger if (a_rownnz > bn) { 858d7ed1a4dSandi selinger a_rownnz = bn; 859d7ed1a4dSandi selinger break; 860d7ed1a4dSandi selinger } 861d7ed1a4dSandi selinger } 862d7ed1a4dSandi selinger a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz); 863d7ed1a4dSandi selinger } 864d7ed1a4dSandi selinger /* temporary work areas for merging rows */ 865d7ed1a4dSandi selinger ierr = PetscMalloc1(a_maxrownnz*8,&workj_L1);CHKERRQ(ierr); 866f83700f2Sandi selinger ierr = PetscMalloc1(a_maxrownnz*8,&workj_L2);CHKERRQ(ierr); 867f83700f2Sandi selinger ierr = PetscMalloc1(a_maxrownnz,&workj_L3);CHKERRQ(ierr); 868d7ed1a4dSandi selinger 8697660c4dbSandi selinger /* This should be enough for almost all matrices. If not, memory is reallocated later. */ 8707660c4dbSandi selinger c_maxmem = 8*(ai[am]+bi[bm]); 871d7ed1a4dSandi selinger /* Step 2: Populate pattern for C */ 872d7ed1a4dSandi selinger ierr = PetscMalloc1(c_maxmem,&cj);CHKERRQ(ierr); 873d7ed1a4dSandi selinger 874d7ed1a4dSandi selinger ci_nnz = 0; 875d7ed1a4dSandi selinger ci[0] = 0; 876d7ed1a4dSandi selinger worki_L1[0] = 0; 877d7ed1a4dSandi selinger worki_L2[0] = 0; 878d7ed1a4dSandi selinger for (i=0; i<am; i++) { 879d7ed1a4dSandi selinger const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 880d7ed1a4dSandi selinger const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 881d7ed1a4dSandi selinger rowsleft = anzi; 882d7ed1a4dSandi selinger inputcol_L1 = acol; 883d7ed1a4dSandi selinger L2_nnz = 0; 8847660c4dbSandi selinger L2_nrows = 1; /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1 */ 8857660c4dbSandi selinger worki_L2[1] = 0; 88608fe1b3cSKarl Rupp outputi_nnz = 0; 887d7ed1a4dSandi selinger 888d7ed1a4dSandi selinger /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem -> allocate more memory */ 889d7ed1a4dSandi selinger while (ci_nnz+a_maxrownnz > c_maxmem) { 890d7ed1a4dSandi selinger c_maxmem *= 2; 891d7ed1a4dSandi selinger ndouble++; 892d7ed1a4dSandi selinger ierr = PetscRealloc(sizeof(PetscInt)*c_maxmem,&cj);CHKERRQ(ierr); 893d7ed1a4dSandi selinger } 894d7ed1a4dSandi selinger 895d7ed1a4dSandi selinger while (rowsleft) { 896d7ed1a4dSandi selinger L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */ 897d7ed1a4dSandi selinger L1_nrows = 0; 8987660c4dbSandi selinger L1_nnz = 0; 899d7ed1a4dSandi selinger inputcol = inputcol_L1; 900d7ed1a4dSandi selinger inputi = bi; 901d7ed1a4dSandi selinger inputj = bj; 902d7ed1a4dSandi selinger 903d7ed1a4dSandi selinger /* The following macro is used to specialize for small rows in A. 904d7ed1a4dSandi selinger This helps with compiler unrolling, improving performance substantially. 905f83700f2Sandi selinger Input: inputj inputi inputcol bn 906d7ed1a4dSandi selinger Output: outputj outputi_nnz */ 907d7ed1a4dSandi selinger #define MatMatMultSymbolic_RowMergeMacro(ANNZ) \ 908d7ed1a4dSandi selinger window_min = bn; \ 9097660c4dbSandi selinger outputi_nnz = 0; \ 9107660c4dbSandi selinger for (k=0; k<ANNZ; ++k) { \ 911d7ed1a4dSandi selinger brow_ptr[k] = inputj + inputi[inputcol[k]]; \ 912d7ed1a4dSandi selinger brow_end[k] = inputj + inputi[inputcol[k]+1]; \ 913d7ed1a4dSandi selinger window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \ 914d7ed1a4dSandi selinger window_min = PetscMin(window[k], window_min); \ 915d7ed1a4dSandi selinger } \ 916d7ed1a4dSandi selinger while (window_min < bn) { \ 917d7ed1a4dSandi selinger outputj[outputi_nnz++] = window_min; \ 918d7ed1a4dSandi selinger /* advance front and compute new minimum */ \ 919d7ed1a4dSandi selinger old_window_min = window_min; \ 920d7ed1a4dSandi selinger window_min = bn; \ 921d7ed1a4dSandi selinger for (k=0; k<ANNZ; ++k) { \ 922d7ed1a4dSandi selinger if (window[k] == old_window_min) { \ 923d7ed1a4dSandi selinger brow_ptr[k]++; \ 924d7ed1a4dSandi selinger window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \ 925d7ed1a4dSandi selinger } \ 926d7ed1a4dSandi selinger window_min = PetscMin(window[k], window_min); \ 927d7ed1a4dSandi selinger } \ 928d7ed1a4dSandi selinger } 929d7ed1a4dSandi selinger 930d7ed1a4dSandi selinger /************** L E V E L 1 ***************/ 931d7ed1a4dSandi selinger /* Merge up to 8 rows of B to L1 work array*/ 932d7ed1a4dSandi selinger while (L1_rowsleft) { 9337660c4dbSandi selinger outputi_nnz = 0; 9347660c4dbSandi selinger if (anzi > 8) outputj = workj_L1 + L1_nnz; /* Level 1 rowmerge*/ 9357660c4dbSandi selinger else outputj = cj + ci_nnz; /* Merge directly to C */ 9367660c4dbSandi selinger 937d7ed1a4dSandi selinger switch (L1_rowsleft) { 938d7ed1a4dSandi selinger case 1: brow_ptr[0] = inputj + inputi[inputcol[0]]; 939d7ed1a4dSandi selinger brow_end[0] = inputj + inputi[inputcol[0]+1]; 940d7ed1a4dSandi selinger for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 941d7ed1a4dSandi selinger inputcol += L1_rowsleft; 942d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 943d7ed1a4dSandi selinger L1_rowsleft = 0; 944d7ed1a4dSandi selinger break; 945d7ed1a4dSandi selinger case 2: MatMatMultSymbolic_RowMergeMacro(2); 946d7ed1a4dSandi selinger inputcol += L1_rowsleft; 947d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 948d7ed1a4dSandi selinger L1_rowsleft = 0; 949d7ed1a4dSandi selinger break; 950d7ed1a4dSandi selinger case 3: MatMatMultSymbolic_RowMergeMacro(3); 951d7ed1a4dSandi selinger inputcol += L1_rowsleft; 952d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 953d7ed1a4dSandi selinger L1_rowsleft = 0; 954d7ed1a4dSandi selinger break; 955d7ed1a4dSandi selinger case 4: MatMatMultSymbolic_RowMergeMacro(4); 956d7ed1a4dSandi selinger inputcol += L1_rowsleft; 957d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 958d7ed1a4dSandi selinger L1_rowsleft = 0; 959d7ed1a4dSandi selinger break; 960d7ed1a4dSandi selinger case 5: MatMatMultSymbolic_RowMergeMacro(5); 961d7ed1a4dSandi selinger inputcol += L1_rowsleft; 962d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 963d7ed1a4dSandi selinger L1_rowsleft = 0; 964d7ed1a4dSandi selinger break; 965d7ed1a4dSandi selinger case 6: MatMatMultSymbolic_RowMergeMacro(6); 966d7ed1a4dSandi selinger inputcol += L1_rowsleft; 967d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 968d7ed1a4dSandi selinger L1_rowsleft = 0; 969d7ed1a4dSandi selinger break; 970d7ed1a4dSandi selinger case 7: MatMatMultSymbolic_RowMergeMacro(7); 971d7ed1a4dSandi selinger inputcol += L1_rowsleft; 972d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 973d7ed1a4dSandi selinger L1_rowsleft = 0; 974d7ed1a4dSandi selinger break; 975d7ed1a4dSandi selinger default: MatMatMultSymbolic_RowMergeMacro(8); 976d7ed1a4dSandi selinger inputcol += 8; 977d7ed1a4dSandi selinger rowsleft -= 8; 978d7ed1a4dSandi selinger L1_rowsleft -= 8; 979d7ed1a4dSandi selinger break; 980d7ed1a4dSandi selinger } 981d7ed1a4dSandi selinger inputcol_L1 = inputcol; 9827660c4dbSandi selinger L1_nnz += outputi_nnz; 9837660c4dbSandi selinger worki_L1[++L1_nrows] = L1_nnz; 984d7ed1a4dSandi selinger } 985d7ed1a4dSandi selinger 986d7ed1a4dSandi selinger /********************** L E V E L 2 ************************/ 987d7ed1a4dSandi selinger /* Merge from L1 work array to either C or to L2 work array */ 988d7ed1a4dSandi selinger if (anzi > 8) { 989d7ed1a4dSandi selinger inputi = worki_L1; 990d7ed1a4dSandi selinger inputj = workj_L1; 991d7ed1a4dSandi selinger inputcol = workcol; 992d7ed1a4dSandi selinger outputi_nnz = 0; 993d7ed1a4dSandi selinger 994d7ed1a4dSandi selinger if (anzi <= 64) outputj = cj + ci_nnz; /* Merge from L1 work array to C */ 995d7ed1a4dSandi selinger else outputj = workj_L2 + L2_nnz; /* Merge from L1 work array to L2 work array */ 996d7ed1a4dSandi selinger 997d7ed1a4dSandi selinger switch (L1_nrows) { 998d7ed1a4dSandi selinger case 1: brow_ptr[0] = inputj + inputi[inputcol[0]]; 999d7ed1a4dSandi selinger brow_end[0] = inputj + inputi[inputcol[0]+1]; 1000d7ed1a4dSandi selinger for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 1001d7ed1a4dSandi selinger break; 1002d7ed1a4dSandi selinger case 2: MatMatMultSymbolic_RowMergeMacro(2); break; 1003d7ed1a4dSandi selinger case 3: MatMatMultSymbolic_RowMergeMacro(3); break; 1004d7ed1a4dSandi selinger case 4: MatMatMultSymbolic_RowMergeMacro(4); break; 1005d7ed1a4dSandi selinger case 5: MatMatMultSymbolic_RowMergeMacro(5); break; 1006d7ed1a4dSandi selinger case 6: MatMatMultSymbolic_RowMergeMacro(6); break; 1007d7ed1a4dSandi selinger case 7: MatMatMultSymbolic_RowMergeMacro(7); break; 1008d7ed1a4dSandi selinger case 8: MatMatMultSymbolic_RowMergeMacro(8); break; 1009d7ed1a4dSandi selinger default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L1 work array!"); 1010d7ed1a4dSandi selinger } 1011d7ed1a4dSandi selinger L2_nnz += outputi_nnz; 1012d7ed1a4dSandi selinger worki_L2[++L2_nrows] = L2_nnz; 1013d7ed1a4dSandi selinger 10147660c4dbSandi selinger /************************ L E V E L 3 **********************/ 10157660c4dbSandi selinger /* Merge from L2 work array to either C or to L2 work array */ 1016d7ed1a4dSandi selinger if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) { 1017d7ed1a4dSandi selinger inputi = worki_L2; 1018d7ed1a4dSandi selinger inputj = workj_L2; 1019d7ed1a4dSandi selinger inputcol = workcol; 1020d7ed1a4dSandi selinger outputi_nnz = 0; 1021f83700f2Sandi selinger if (rowsleft) outputj = workj_L3; 1022d7ed1a4dSandi selinger else outputj = cj + ci_nnz; 1023d7ed1a4dSandi selinger switch (L2_nrows) { 1024d7ed1a4dSandi selinger case 1: brow_ptr[0] = inputj + inputi[inputcol[0]]; 1025d7ed1a4dSandi selinger brow_end[0] = inputj + inputi[inputcol[0]+1]; 1026d7ed1a4dSandi selinger for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 1027d7ed1a4dSandi selinger break; 1028d7ed1a4dSandi selinger case 2: MatMatMultSymbolic_RowMergeMacro(2); break; 1029d7ed1a4dSandi selinger case 3: MatMatMultSymbolic_RowMergeMacro(3); break; 1030d7ed1a4dSandi selinger case 4: MatMatMultSymbolic_RowMergeMacro(4); break; 1031d7ed1a4dSandi selinger case 5: MatMatMultSymbolic_RowMergeMacro(5); break; 1032d7ed1a4dSandi selinger case 6: MatMatMultSymbolic_RowMergeMacro(6); break; 1033d7ed1a4dSandi selinger case 7: MatMatMultSymbolic_RowMergeMacro(7); break; 1034d7ed1a4dSandi selinger case 8: MatMatMultSymbolic_RowMergeMacro(8); break; 1035d7ed1a4dSandi selinger default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L2 work array!"); 1036d7ed1a4dSandi selinger } 1037d7ed1a4dSandi selinger L2_nrows = 1; 10387660c4dbSandi selinger L2_nnz = outputi_nnz; 10397660c4dbSandi selinger worki_L2[1] = outputi_nnz; 10407660c4dbSandi selinger /* Copy to workj_L2 */ 1041d7ed1a4dSandi selinger if (rowsleft) { 10427660c4dbSandi selinger for (k=0; k<outputi_nnz; ++k) workj_L2[k] = outputj[k]; 1043d7ed1a4dSandi selinger } 1044d7ed1a4dSandi selinger } 1045d7ed1a4dSandi selinger } 1046d7ed1a4dSandi selinger } /* while (rowsleft) */ 1047d7ed1a4dSandi selinger #undef MatMatMultSymbolic_RowMergeMacro 1048d7ed1a4dSandi selinger 1049d7ed1a4dSandi selinger /* terminate current row */ 1050d7ed1a4dSandi selinger ci_nnz += outputi_nnz; 1051d7ed1a4dSandi selinger ci[i+1] = ci_nnz; 1052d7ed1a4dSandi selinger } 1053d7ed1a4dSandi selinger 1054d7ed1a4dSandi selinger /* Step 3: Create the new symbolic matrix */ 1055e4e71118SStefano Zampini ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr); 10564222ddf1SHong Zhang ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr); 1057d7ed1a4dSandi selinger 1058d7ed1a4dSandi selinger /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 1059d7ed1a4dSandi selinger /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 10604222ddf1SHong Zhang c = (Mat_SeqAIJ*)(C->data); 1061d7ed1a4dSandi selinger c->free_a = PETSC_TRUE; 1062d7ed1a4dSandi selinger c->free_ij = PETSC_TRUE; 1063d7ed1a4dSandi selinger c->nonew = 0; 1064d7ed1a4dSandi selinger 10654222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 1066d7ed1a4dSandi selinger 1067d7ed1a4dSandi selinger /* set MatInfo */ 1068d7ed1a4dSandi selinger afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 1069d7ed1a4dSandi selinger if (afill < 1.0) afill = 1.0; 1070d7ed1a4dSandi selinger c->maxnz = ci[am]; 1071d7ed1a4dSandi selinger c->nz = ci[am]; 10724222ddf1SHong Zhang C->info.mallocs = ndouble; 10734222ddf1SHong Zhang C->info.fill_ratio_given = fill; 10744222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 1075d7ed1a4dSandi selinger 1076d7ed1a4dSandi selinger #if defined(PETSC_USE_INFO) 1077d7ed1a4dSandi selinger if (ci[am]) { 10787d3de750SJacob Faibussowitsch ierr = PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 10797d3de750SJacob Faibussowitsch ierr = PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 1080d7ed1a4dSandi selinger } else { 10814222ddf1SHong Zhang ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr); 1082d7ed1a4dSandi selinger } 1083d7ed1a4dSandi selinger #endif 1084d7ed1a4dSandi selinger 1085d7ed1a4dSandi selinger /* Step 4: Free temporary work areas */ 1086d7ed1a4dSandi selinger ierr = PetscFree(workj_L1);CHKERRQ(ierr); 1087d7ed1a4dSandi selinger ierr = PetscFree(workj_L2);CHKERRQ(ierr); 1088f83700f2Sandi selinger ierr = PetscFree(workj_L3);CHKERRQ(ierr); 1089d7ed1a4dSandi selinger PetscFunctionReturn(0); 1090d7ed1a4dSandi selinger } 1091d7ed1a4dSandi selinger 1092cd093f1dSJed Brown /* concatenate unique entries and then sort */ 10934222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,PetscReal fill,Mat C) 1094cd093f1dSJed Brown { 1095cd093f1dSJed Brown PetscErrorCode ierr; 1096cd093f1dSJed Brown Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 1097cd093f1dSJed Brown const PetscInt *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j; 10988c78258cSHong Zhang PetscInt *ci,*cj,bcol; 1099cd093f1dSJed Brown PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 1100cd093f1dSJed Brown PetscReal afill; 1101cd093f1dSJed Brown PetscInt i,j,ndouble = 0; 1102cd093f1dSJed Brown PetscSegBuffer seg,segrow; 1103cd093f1dSJed Brown char *seen; 1104cd093f1dSJed Brown 1105cd093f1dSJed Brown PetscFunctionBegin; 1106854ce69bSBarry Smith ierr = PetscMalloc1(am+1,&ci);CHKERRQ(ierr); 1107cd093f1dSJed Brown ci[0] = 0; 1108cd093f1dSJed Brown 1109cd093f1dSJed Brown /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 1110cd093f1dSJed Brown ierr = PetscSegBufferCreate(sizeof(PetscInt),(PetscInt)(fill*(ai[am]+bi[bm])),&seg);CHKERRQ(ierr); 1111cd093f1dSJed Brown ierr = PetscSegBufferCreate(sizeof(PetscInt),100,&segrow);CHKERRQ(ierr); 1112580bdb30SBarry Smith ierr = PetscCalloc1(bn,&seen);CHKERRQ(ierr); 1113cd093f1dSJed Brown 1114cd093f1dSJed Brown /* Determine ci and cj */ 1115cd093f1dSJed Brown for (i=0; i<am; i++) { 1116cd093f1dSJed Brown const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 1117cd093f1dSJed Brown const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 1118cd093f1dSJed Brown PetscInt packlen = 0,*PETSC_RESTRICT crow; 11198c78258cSHong Zhang 1120cd093f1dSJed Brown /* Pack segrow */ 1121cd093f1dSJed Brown for (j=0; j<anzi; j++) { 1122cd093f1dSJed Brown PetscInt brow = acol[j],bjstart = bi[brow],bjend = bi[brow+1],k; 1123cd093f1dSJed Brown for (k=bjstart; k<bjend; k++) { 11248c78258cSHong Zhang bcol = bj[k]; 1125cd093f1dSJed Brown if (!seen[bcol]) { /* new entry */ 1126cd093f1dSJed Brown PetscInt *PETSC_RESTRICT slot; 1127cd093f1dSJed Brown ierr = PetscSegBufferGetInts(segrow,1,&slot);CHKERRQ(ierr); 1128cd093f1dSJed Brown *slot = bcol; 1129cd093f1dSJed Brown seen[bcol] = 1; 1130cd093f1dSJed Brown packlen++; 1131cd093f1dSJed Brown } 1132cd093f1dSJed Brown } 1133cd093f1dSJed Brown } 11348c78258cSHong Zhang 11358c78258cSHong Zhang /* Check i-th diagonal entry */ 11368c78258cSHong Zhang if (C->force_diagonals && !seen[i]) { 11378c78258cSHong Zhang PetscInt *PETSC_RESTRICT slot; 11388c78258cSHong Zhang ierr = PetscSegBufferGetInts(segrow,1,&slot);CHKERRQ(ierr); 11398c78258cSHong Zhang *slot = i; 11408c78258cSHong Zhang seen[i] = 1; 11418c78258cSHong Zhang packlen++; 11428c78258cSHong Zhang } 11438c78258cSHong Zhang 1144cd093f1dSJed Brown ierr = PetscSegBufferGetInts(seg,packlen,&crow);CHKERRQ(ierr); 1145cd093f1dSJed Brown ierr = PetscSegBufferExtractTo(segrow,crow);CHKERRQ(ierr); 1146cd093f1dSJed Brown ierr = PetscSortInt(packlen,crow);CHKERRQ(ierr); 1147cd093f1dSJed Brown ci[i+1] = ci[i] + packlen; 1148cd093f1dSJed Brown for (j=0; j<packlen; j++) seen[crow[j]] = 0; 1149cd093f1dSJed Brown } 1150cd093f1dSJed Brown ierr = PetscSegBufferDestroy(&segrow);CHKERRQ(ierr); 1151cd093f1dSJed Brown ierr = PetscFree(seen);CHKERRQ(ierr); 1152cd093f1dSJed Brown 1153cd093f1dSJed Brown /* Column indices are in the segmented buffer */ 1154cd093f1dSJed Brown ierr = PetscSegBufferExtractAlloc(seg,&cj);CHKERRQ(ierr); 1155cd093f1dSJed Brown ierr = PetscSegBufferDestroy(&seg);CHKERRQ(ierr); 1156cd093f1dSJed Brown 1157cd093f1dSJed Brown /* put together the new symbolic matrix */ 1158e4e71118SStefano Zampini ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr); 11594222ddf1SHong Zhang ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr); 1160cd093f1dSJed Brown 1161cd093f1dSJed Brown /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 1162cd093f1dSJed Brown /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 11634222ddf1SHong Zhang c = (Mat_SeqAIJ*)(C->data); 1164cd093f1dSJed Brown c->free_a = PETSC_TRUE; 1165cd093f1dSJed Brown c->free_ij = PETSC_TRUE; 1166cd093f1dSJed Brown c->nonew = 0; 1167cd093f1dSJed Brown 11684222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 1169cd093f1dSJed Brown 1170cd093f1dSJed Brown /* set MatInfo */ 11712a09556fSStefano Zampini afill = (PetscReal)ci[am]/PetscMax(ai[am]+bi[bm],1) + 1.e-5; 1172cd093f1dSJed Brown if (afill < 1.0) afill = 1.0; 1173cd093f1dSJed Brown c->maxnz = ci[am]; 1174cd093f1dSJed Brown c->nz = ci[am]; 11754222ddf1SHong Zhang C->info.mallocs = ndouble; 11764222ddf1SHong Zhang C->info.fill_ratio_given = fill; 11774222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 1178cd093f1dSJed Brown 1179cd093f1dSJed Brown #if defined(PETSC_USE_INFO) 1180cd093f1dSJed Brown if (ci[am]) { 11817d3de750SJacob Faibussowitsch ierr = PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 11827d3de750SJacob Faibussowitsch ierr = PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 1183cd093f1dSJed Brown } else { 11844222ddf1SHong Zhang ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr); 1185cd093f1dSJed Brown } 1186cd093f1dSJed Brown #endif 1187cd093f1dSJed Brown PetscFunctionReturn(0); 1188cd093f1dSJed Brown } 1189cd093f1dSJed Brown 11906718818eSStefano Zampini PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(void *data) 11912128a86cSHong Zhang { 11922128a86cSHong Zhang PetscErrorCode ierr; 11936718818eSStefano Zampini Mat_MatMatTransMult *abt=(Mat_MatMatTransMult *)data; 11942128a86cSHong Zhang 11952128a86cSHong Zhang PetscFunctionBegin; 119640192850SHong Zhang ierr = MatTransposeColoringDestroy(&abt->matcoloring);CHKERRQ(ierr); 119740192850SHong Zhang ierr = MatDestroy(&abt->Bt_den);CHKERRQ(ierr); 119840192850SHong Zhang ierr = MatDestroy(&abt->ABt_den);CHKERRQ(ierr); 119940192850SHong Zhang ierr = PetscFree(abt);CHKERRQ(ierr); 12002128a86cSHong Zhang PetscFunctionReturn(0); 12012128a86cSHong Zhang } 12022128a86cSHong Zhang 12034222ddf1SHong Zhang PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C) 1204bc011b1eSHong Zhang { 12055df89d91SHong Zhang PetscErrorCode ierr; 120681d82fe4SHong Zhang Mat Bt; 120781d82fe4SHong Zhang PetscInt *bti,*btj; 120840192850SHong Zhang Mat_MatMatTransMult *abt; 12094222ddf1SHong Zhang Mat_Product *product = C->product; 12106718818eSStefano Zampini char *alg; 1211d2853540SHong Zhang 121281d82fe4SHong Zhang PetscFunctionBegin; 1213*2c71b3e2SJacob Faibussowitsch PetscCheckFalse(!product,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct"); 1214*2c71b3e2SJacob Faibussowitsch PetscCheckFalse(product->data,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Extra product struct not empty"); 12156718818eSStefano Zampini 121681d82fe4SHong Zhang /* create symbolic Bt */ 121781d82fe4SHong Zhang ierr = MatGetSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr); 12180298fd71SBarry Smith ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,B->cmap->n,B->rmap->n,bti,btj,NULL,&Bt);CHKERRQ(ierr); 121933d57670SJed Brown ierr = MatSetBlockSizes(Bt,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr); 122004b858e0SBarry Smith ierr = MatSetType(Bt,((PetscObject)A)->type_name);CHKERRQ(ierr); 122181d82fe4SHong Zhang 122281d82fe4SHong Zhang /* get symbolic C=A*Bt */ 12236718818eSStefano Zampini ierr = PetscStrallocpy(product->alg,&alg);CHKERRQ(ierr); 12244222ddf1SHong Zhang ierr = MatProductSetAlgorithm(C,"sorted");CHKERRQ(ierr); /* set algorithm for C = A*Bt */ 122581d82fe4SHong Zhang ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C);CHKERRQ(ierr); 12264222ddf1SHong Zhang ierr = MatProductSetAlgorithm(C,alg);CHKERRQ(ierr); /* resume original algorithm for ABt product */ 12276718818eSStefano Zampini ierr = PetscFree(alg);CHKERRQ(ierr); 122881d82fe4SHong Zhang 1229a5b23f4aSJose E. Roman /* create a supporting struct for reuse intermediate dense matrices with matcoloring */ 1230b00a9115SJed Brown ierr = PetscNew(&abt);CHKERRQ(ierr); 12312128a86cSHong Zhang 12326718818eSStefano Zampini product->data = abt; 12336718818eSStefano Zampini product->destroy = MatDestroy_SeqAIJ_MatMatMultTrans; 12346718818eSStefano Zampini 12354222ddf1SHong Zhang C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ; 12362128a86cSHong Zhang 12374222ddf1SHong Zhang abt->usecoloring = PETSC_FALSE; 12384222ddf1SHong Zhang ierr = PetscStrcmp(product->alg,"color",&abt->usecoloring);CHKERRQ(ierr); 123940192850SHong Zhang if (abt->usecoloring) { 1240b9af6bddSHong Zhang /* Create MatTransposeColoring from symbolic C=A*B^T */ 1241b9af6bddSHong Zhang MatTransposeColoring matcoloring; 1242335efc43SPeter Brune MatColoring coloring; 12432128a86cSHong Zhang ISColoring iscoloring; 12442128a86cSHong Zhang Mat Bt_dense,C_dense; 1245e8354b3eSHong Zhang 12464222ddf1SHong Zhang /* inode causes memory problem */ 12474222ddf1SHong Zhang ierr = MatSetOption(C,MAT_USE_INODES,PETSC_FALSE);CHKERRQ(ierr); 12484222ddf1SHong Zhang 12494222ddf1SHong Zhang ierr = MatColoringCreate(C,&coloring);CHKERRQ(ierr); 1250335efc43SPeter Brune ierr = MatColoringSetDistance(coloring,2);CHKERRQ(ierr); 1251335efc43SPeter Brune ierr = MatColoringSetType(coloring,MATCOLORINGSL);CHKERRQ(ierr); 1252335efc43SPeter Brune ierr = MatColoringSetFromOptions(coloring);CHKERRQ(ierr); 1253335efc43SPeter Brune ierr = MatColoringApply(coloring,&iscoloring);CHKERRQ(ierr); 1254335efc43SPeter Brune ierr = MatColoringDestroy(&coloring);CHKERRQ(ierr); 12554222ddf1SHong Zhang ierr = MatTransposeColoringCreate(C,iscoloring,&matcoloring);CHKERRQ(ierr); 12562205254eSKarl Rupp 125740192850SHong Zhang abt->matcoloring = matcoloring; 12582205254eSKarl Rupp 12592128a86cSHong Zhang ierr = ISColoringDestroy(&iscoloring);CHKERRQ(ierr); 12602128a86cSHong Zhang 12612128a86cSHong Zhang /* Create Bt_dense and C_dense = A*Bt_dense */ 12622128a86cSHong Zhang ierr = MatCreate(PETSC_COMM_SELF,&Bt_dense);CHKERRQ(ierr); 12632128a86cSHong Zhang ierr = MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors);CHKERRQ(ierr); 12642128a86cSHong Zhang ierr = MatSetType(Bt_dense,MATSEQDENSE);CHKERRQ(ierr); 12650298fd71SBarry Smith ierr = MatSeqDenseSetPreallocation(Bt_dense,NULL);CHKERRQ(ierr); 12662205254eSKarl Rupp 12672128a86cSHong Zhang Bt_dense->assembled = PETSC_TRUE; 126840192850SHong Zhang abt->Bt_den = Bt_dense; 12692128a86cSHong Zhang 12702128a86cSHong Zhang ierr = MatCreate(PETSC_COMM_SELF,&C_dense);CHKERRQ(ierr); 12712128a86cSHong Zhang ierr = MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors);CHKERRQ(ierr); 12722128a86cSHong Zhang ierr = MatSetType(C_dense,MATSEQDENSE);CHKERRQ(ierr); 12730298fd71SBarry Smith ierr = MatSeqDenseSetPreallocation(C_dense,NULL);CHKERRQ(ierr); 12742205254eSKarl Rupp 12752128a86cSHong Zhang Bt_dense->assembled = PETSC_TRUE; 127640192850SHong Zhang abt->ABt_den = C_dense; 1277f94ccd6cSHong Zhang 1278f94ccd6cSHong Zhang #if defined(PETSC_USE_INFO) 12791ce71dffSSatish Balay { 12804222ddf1SHong Zhang Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 12817d3de750SJacob Faibussowitsch ierr = PetscInfo(C,"Use coloring of C=A*B^T; B^T: %" PetscInt_FMT " %" PetscInt_FMT ", Bt_dense: %" PetscInt_FMT ",%" PetscInt_FMT "; Cnz %" PetscInt_FMT " / (cm*ncolors %" PetscInt_FMT ") = %g\n",B->cmap->n,B->rmap->n,Bt_dense->rmap->n,Bt_dense->cmap->n,c->nz,A->rmap->n*matcoloring->ncolors,(double)(((PetscReal)(c->nz))/((PetscReal)(A->rmap->n*matcoloring->ncolors))));CHKERRQ(ierr); 12821ce71dffSSatish Balay } 1283f94ccd6cSHong Zhang #endif 12842128a86cSHong Zhang } 1285e99be685SHong Zhang /* clean up */ 1286e99be685SHong Zhang ierr = MatDestroy(&Bt);CHKERRQ(ierr); 1287e99be685SHong Zhang ierr = MatRestoreSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr); 12885df89d91SHong Zhang PetscFunctionReturn(0); 12895df89d91SHong Zhang } 12905df89d91SHong Zhang 12916fc122caSHong Zhang PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C) 12925df89d91SHong Zhang { 12935df89d91SHong Zhang PetscErrorCode ierr; 12945df89d91SHong Zhang Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data; 1295e2cac8caSJed Brown PetscInt *ai =a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow; 129681d82fe4SHong Zhang PetscInt cm =C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol; 12975df89d91SHong Zhang PetscLogDouble flops=0.0; 1298aa1aec99SHong Zhang MatScalar *aa =a->a,*aval,*ba=b->a,*bval,*ca,*cval; 12996718818eSStefano Zampini Mat_MatMatTransMult *abt; 13006718818eSStefano Zampini Mat_Product *product = C->product; 13015df89d91SHong Zhang 13025df89d91SHong Zhang PetscFunctionBegin; 1303*2c71b3e2SJacob Faibussowitsch PetscCheckFalse(!product,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct"); 13046718818eSStefano Zampini abt = (Mat_MatMatTransMult *)product->data; 1305*2c71b3e2SJacob Faibussowitsch PetscCheckFalse(!abt,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct"); 130658ed3ceaSHong Zhang /* clear old values in C */ 130758ed3ceaSHong Zhang if (!c->a) { 1308580bdb30SBarry Smith ierr = PetscCalloc1(ci[cm]+1,&ca);CHKERRQ(ierr); 130958ed3ceaSHong Zhang c->a = ca; 131058ed3ceaSHong Zhang c->free_a = PETSC_TRUE; 131158ed3ceaSHong Zhang } else { 131258ed3ceaSHong Zhang ca = c->a; 1313580bdb30SBarry Smith ierr = PetscArrayzero(ca,ci[cm]+1);CHKERRQ(ierr); 131458ed3ceaSHong Zhang } 131558ed3ceaSHong Zhang 131640192850SHong Zhang if (abt->usecoloring) { 131740192850SHong Zhang MatTransposeColoring matcoloring = abt->matcoloring; 131840192850SHong Zhang Mat Bt_dense,C_dense = abt->ABt_den; 1319c8db22f6SHong Zhang 1320b9af6bddSHong Zhang /* Get Bt_dense by Apply MatTransposeColoring to B */ 132140192850SHong Zhang Bt_dense = abt->Bt_den; 1322b9af6bddSHong Zhang ierr = MatTransColoringApplySpToDen(matcoloring,B,Bt_dense);CHKERRQ(ierr); 1323c8db22f6SHong Zhang 1324c8db22f6SHong Zhang /* C_dense = A*Bt_dense */ 13252128a86cSHong Zhang ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense);CHKERRQ(ierr); 1326c8db22f6SHong Zhang 13272128a86cSHong Zhang /* Recover C from C_dense */ 1328b9af6bddSHong Zhang ierr = MatTransColoringApplyDenToSp(matcoloring,C_dense,C);CHKERRQ(ierr); 1329c8db22f6SHong Zhang PetscFunctionReturn(0); 1330c8db22f6SHong Zhang } 1331c8db22f6SHong Zhang 13321fa1209cSHong Zhang for (i=0; i<cm; i++) { 133381d82fe4SHong Zhang anzi = ai[i+1] - ai[i]; 133481d82fe4SHong Zhang acol = aj + ai[i]; 13356973769bSHong Zhang aval = aa + ai[i]; 13361fa1209cSHong Zhang cnzi = ci[i+1] - ci[i]; 13371fa1209cSHong Zhang ccol = cj + ci[i]; 13386973769bSHong Zhang cval = ca + ci[i]; 13391fa1209cSHong Zhang for (j=0; j<cnzi; j++) { 134081d82fe4SHong Zhang brow = ccol[j]; 134181d82fe4SHong Zhang bnzj = bi[brow+1] - bi[brow]; 134281d82fe4SHong Zhang bcol = bj + bi[brow]; 13436973769bSHong Zhang bval = ba + bi[brow]; 13446973769bSHong Zhang 134581d82fe4SHong Zhang /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */ 134681d82fe4SHong Zhang nexta = 0; nextb = 0; 134781d82fe4SHong Zhang while (nexta<anzi && nextb<bnzj) { 13487b6d5e96SMark Adams while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++; 134981d82fe4SHong Zhang if (nexta == anzi) break; 13507b6d5e96SMark Adams while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++; 135181d82fe4SHong Zhang if (nextb == bnzj) break; 135281d82fe4SHong Zhang if (acol[nexta] == bcol[nextb]) { 13536973769bSHong Zhang cval[j] += aval[nexta]*bval[nextb]; 135481d82fe4SHong Zhang nexta++; nextb++; 135581d82fe4SHong Zhang flops += 2; 13561fa1209cSHong Zhang } 13571fa1209cSHong Zhang } 135881d82fe4SHong Zhang } 135981d82fe4SHong Zhang } 136081d82fe4SHong Zhang ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 136181d82fe4SHong Zhang ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 136281d82fe4SHong Zhang ierr = PetscLogFlops(flops);CHKERRQ(ierr); 13635df89d91SHong Zhang PetscFunctionReturn(0); 13645df89d91SHong Zhang } 13655df89d91SHong Zhang 13666718818eSStefano Zampini PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(void *data) 13676d373c3eSHong Zhang { 13686d373c3eSHong Zhang PetscErrorCode ierr; 13696718818eSStefano Zampini Mat_MatTransMatMult *atb = (Mat_MatTransMatMult*)data; 13706d373c3eSHong Zhang 13716d373c3eSHong Zhang PetscFunctionBegin; 13726d373c3eSHong Zhang ierr = MatDestroy(&atb->At);CHKERRQ(ierr); 13736718818eSStefano Zampini if (atb->destroy) { 13746718818eSStefano Zampini ierr = (*atb->destroy)(atb->data);CHKERRQ(ierr); 13756473ade5SStefano Zampini } 13766d373c3eSHong Zhang ierr = PetscFree(atb);CHKERRQ(ierr); 13776d373c3eSHong Zhang PetscFunctionReturn(0); 13786d373c3eSHong Zhang } 13796d373c3eSHong Zhang 13804222ddf1SHong Zhang PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C) 13815df89d91SHong Zhang { 1382bc011b1eSHong Zhang PetscErrorCode ierr; 1383089a957eSStefano Zampini Mat At = NULL; 138438baddfdSBarry Smith PetscInt *ati,*atj; 13854222ddf1SHong Zhang Mat_Product *product = C->product; 1386089a957eSStefano Zampini PetscBool flg,def,square; 1387bc011b1eSHong Zhang 1388bc011b1eSHong Zhang PetscFunctionBegin; 1389089a957eSStefano Zampini MatCheckProduct(C,4); 1390089a957eSStefano Zampini square = (PetscBool)(A == B && A->symmetric && A->symmetric_set); 13914222ddf1SHong Zhang /* outerproduct */ 1392089a957eSStefano Zampini ierr = PetscStrcmp(product->alg,"outerproduct",&flg);CHKERRQ(ierr); 13934222ddf1SHong Zhang if (flg) { 1394bc011b1eSHong Zhang /* create symbolic At */ 1395089a957eSStefano Zampini if (!square) { 1396bc011b1eSHong Zhang ierr = MatGetSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr); 13970298fd71SBarry Smith ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,A->cmap->n,A->rmap->n,ati,atj,NULL,&At);CHKERRQ(ierr); 139833d57670SJed Brown ierr = MatSetBlockSizes(At,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr); 139904b858e0SBarry Smith ierr = MatSetType(At,((PetscObject)A)->type_name);CHKERRQ(ierr); 1400089a957eSStefano Zampini } 1401bc011b1eSHong Zhang /* get symbolic C=At*B */ 14027a3c3d58SStefano Zampini ierr = MatProductSetAlgorithm(C,"sorted");CHKERRQ(ierr); 1403089a957eSStefano Zampini ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At,B,fill,C);CHKERRQ(ierr); 1404bc011b1eSHong Zhang 1405bc011b1eSHong Zhang /* clean up */ 1406089a957eSStefano Zampini if (!square) { 14076bf464f9SBarry Smith ierr = MatDestroy(&At);CHKERRQ(ierr); 1408bc011b1eSHong Zhang ierr = MatRestoreSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr); 1409089a957eSStefano Zampini } 14106d373c3eSHong Zhang 14114222ddf1SHong Zhang C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */ 14127a3c3d58SStefano Zampini ierr = MatProductSetAlgorithm(C,"outerproduct");CHKERRQ(ierr); 14134222ddf1SHong Zhang PetscFunctionReturn(0); 14144222ddf1SHong Zhang } 14154222ddf1SHong Zhang 14164222ddf1SHong Zhang /* matmatmult */ 1417089a957eSStefano Zampini ierr = PetscStrcmp(product->alg,"default",&def);CHKERRQ(ierr); 1418089a957eSStefano Zampini ierr = PetscStrcmp(product->alg,"at*b",&flg);CHKERRQ(ierr); 1419089a957eSStefano Zampini if (flg || def) { 14204222ddf1SHong Zhang Mat_MatTransMatMult *atb; 14214222ddf1SHong Zhang 1422*2c71b3e2SJacob Faibussowitsch PetscCheckFalse(product->data,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Extra product struct not empty"); 14234222ddf1SHong Zhang ierr = PetscNew(&atb);CHKERRQ(ierr); 1424089a957eSStefano Zampini if (!square) { 14254222ddf1SHong Zhang ierr = MatTranspose_SeqAIJ(A,MAT_INITIAL_MATRIX,&At);CHKERRQ(ierr); 1426089a957eSStefano Zampini } 14277a3c3d58SStefano Zampini ierr = MatProductSetAlgorithm(C,"sorted");CHKERRQ(ierr); 1428089a957eSStefano Zampini ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At,B,fill,C);CHKERRQ(ierr); 14296718818eSStefano Zampini ierr = MatProductSetAlgorithm(C,"at*b");CHKERRQ(ierr); 14306718818eSStefano Zampini product->data = atb; 14316718818eSStefano Zampini product->destroy = MatDestroy_SeqAIJ_MatTransMatMult; 14324222ddf1SHong Zhang atb->At = At; 14334222ddf1SHong Zhang atb->updateAt = PETSC_FALSE; /* because At is computed here */ 14344222ddf1SHong Zhang 14354222ddf1SHong Zhang C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */ 14364222ddf1SHong Zhang PetscFunctionReturn(0); 14374222ddf1SHong Zhang } 14384222ddf1SHong Zhang 14394222ddf1SHong Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported"); 1440bc011b1eSHong Zhang } 1441bc011b1eSHong Zhang 144275648e8dSHong Zhang PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C) 1443bc011b1eSHong Zhang { 1444bc011b1eSHong Zhang PetscErrorCode ierr; 14450fbc74f4SHong Zhang Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data; 1446d0f46423SBarry Smith PetscInt am=A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb; 1447d0f46423SBarry Smith PetscInt cm=C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k; 1448d13dce4bSSatish Balay PetscLogDouble flops=0.0; 1449aa1aec99SHong Zhang MatScalar *aa=a->a,*ba,*ca,*caj; 1450bc011b1eSHong Zhang 1451bc011b1eSHong Zhang PetscFunctionBegin; 1452aa1aec99SHong Zhang if (!c->a) { 1453580bdb30SBarry Smith ierr = PetscCalloc1(ci[cm]+1,&ca);CHKERRQ(ierr); 14542205254eSKarl Rupp 1455aa1aec99SHong Zhang c->a = ca; 1456aa1aec99SHong Zhang c->free_a = PETSC_TRUE; 1457aa1aec99SHong Zhang } else { 1458aa1aec99SHong Zhang ca = c->a; 1459580bdb30SBarry Smith ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr); 1460aa1aec99SHong Zhang } 1461bc011b1eSHong Zhang 1462bc011b1eSHong Zhang /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */ 1463bc011b1eSHong Zhang for (i=0; i<am; i++) { 1464bc011b1eSHong Zhang bj = b->j + bi[i]; 1465bc011b1eSHong Zhang ba = b->a + bi[i]; 1466bc011b1eSHong Zhang bnzi = bi[i+1] - bi[i]; 1467bc011b1eSHong Zhang anzi = ai[i+1] - ai[i]; 1468bc011b1eSHong Zhang for (j=0; j<anzi; j++) { 1469bc011b1eSHong Zhang nextb = 0; 14700fbc74f4SHong Zhang crow = *aj++; 1471bc011b1eSHong Zhang cjj = cj + ci[crow]; 1472bc011b1eSHong Zhang caj = ca + ci[crow]; 1473bc011b1eSHong Zhang /* perform sparse axpy operation. Note cjj includes bj. */ 1474bc011b1eSHong Zhang for (k=0; nextb<bnzi; k++) { 14750fbc74f4SHong Zhang if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */ 14760fbc74f4SHong Zhang caj[k] += (*aa)*(*(ba+nextb)); 1477bc011b1eSHong Zhang nextb++; 1478bc011b1eSHong Zhang } 1479bc011b1eSHong Zhang } 1480bc011b1eSHong Zhang flops += 2*bnzi; 14810fbc74f4SHong Zhang aa++; 1482bc011b1eSHong Zhang } 1483bc011b1eSHong Zhang } 1484bc011b1eSHong Zhang 1485bc011b1eSHong Zhang /* Assemble the final matrix and clean up */ 1486bc011b1eSHong Zhang ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1487bc011b1eSHong Zhang ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1488bc011b1eSHong Zhang ierr = PetscLogFlops(flops);CHKERRQ(ierr); 1489bc011b1eSHong Zhang PetscFunctionReturn(0); 1490bc011b1eSHong Zhang } 14919123193aSHong Zhang 14924222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C) 14939123193aSHong Zhang { 14949123193aSHong Zhang PetscErrorCode ierr; 14959123193aSHong Zhang 14969123193aSHong Zhang PetscFunctionBegin; 14975a586d82SBarry Smith ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr); 14984222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense; 14999123193aSHong Zhang PetscFunctionReturn(0); 15009123193aSHong Zhang } 15019123193aSHong Zhang 150293aa15f2SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C,const PetscBool add) 15039123193aSHong Zhang { 1504f32d5d43SBarry Smith Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 1505612438f1SStefano Zampini Mat_SeqDense *bd=(Mat_SeqDense*)B->data; 15061ca9667aSStefano Zampini Mat_SeqDense *cd=(Mat_SeqDense*)C->data; 15079123193aSHong Zhang PetscErrorCode ierr; 15081ca9667aSStefano Zampini PetscScalar *c,r1,r2,r3,r4,*c1,*c2,*c3,*c4; 1509a4af7ca8SStefano Zampini const PetscScalar *aa,*b,*b1,*b2,*b3,*b4,*av; 1510daf57211SHong Zhang const PetscInt *aj; 1511612438f1SStefano Zampini PetscInt cm=C->rmap->n,cn=B->cmap->n,bm=bd->lda,am=A->rmap->n; 15121ca9667aSStefano Zampini PetscInt clda=cd->lda; 15131ca9667aSStefano Zampini PetscInt am4=4*clda,bm4=4*bm,col,i,j,n; 15149123193aSHong Zhang 15159123193aSHong Zhang PetscFunctionBegin; 1516f32d5d43SBarry Smith if (!cm || !cn) PetscFunctionReturn(0); 1517a4af7ca8SStefano Zampini ierr = MatSeqAIJGetArrayRead(A,&av);CHKERRQ(ierr); 151893aa15f2SStefano Zampini if (add) { 15198c778c55SBarry Smith ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr); 152093aa15f2SStefano Zampini } else { 152193aa15f2SStefano Zampini ierr = MatDenseGetArrayWrite(C,&c);CHKERRQ(ierr); 152293aa15f2SStefano Zampini } 1523a4af7ca8SStefano Zampini ierr = MatDenseGetArrayRead(B,&b);CHKERRQ(ierr); 1524f32d5d43SBarry Smith b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm; 15251ca9667aSStefano Zampini c1 = c; c2 = c1 + clda; c3 = c2 + clda; c4 = c3 + clda; 15261ca9667aSStefano Zampini for (col=0; col<(cn/4)*4; col += 4) { /* over columns of C */ 15271ca9667aSStefano Zampini for (i=0; i<am; i++) { /* over rows of A in those columns */ 1528f32d5d43SBarry Smith r1 = r2 = r3 = r4 = 0.0; 1529f32d5d43SBarry Smith n = a->i[i+1] - a->i[i]; 1530f32d5d43SBarry Smith aj = a->j + a->i[i]; 1531a4af7ca8SStefano Zampini aa = av + a->i[i]; 1532f32d5d43SBarry Smith for (j=0; j<n; j++) { 15331ca9667aSStefano Zampini const PetscScalar aatmp = aa[j]; 15341ca9667aSStefano Zampini const PetscInt ajtmp = aj[j]; 1535730858b9SHong Zhang r1 += aatmp*b1[ajtmp]; 1536730858b9SHong Zhang r2 += aatmp*b2[ajtmp]; 1537730858b9SHong Zhang r3 += aatmp*b3[ajtmp]; 1538730858b9SHong Zhang r4 += aatmp*b4[ajtmp]; 15399123193aSHong Zhang } 154093aa15f2SStefano Zampini if (add) { 154187753ddeSHong Zhang c1[i] += r1; 154287753ddeSHong Zhang c2[i] += r2; 154387753ddeSHong Zhang c3[i] += r3; 154487753ddeSHong Zhang c4[i] += r4; 154593aa15f2SStefano Zampini } else { 154693aa15f2SStefano Zampini c1[i] = r1; 154793aa15f2SStefano Zampini c2[i] = r2; 154893aa15f2SStefano Zampini c3[i] = r3; 154993aa15f2SStefano Zampini c4[i] = r4; 155093aa15f2SStefano Zampini } 1551f32d5d43SBarry Smith } 1552730858b9SHong Zhang b1 += bm4; b2 += bm4; b3 += bm4; b4 += bm4; 1553730858b9SHong Zhang c1 += am4; c2 += am4; c3 += am4; c4 += am4; 1554f32d5d43SBarry Smith } 155593aa15f2SStefano Zampini /* process remaining columns */ 155693aa15f2SStefano Zampini if (col != cn) { 155793aa15f2SStefano Zampini PetscInt rc = cn-col; 155893aa15f2SStefano Zampini 155993aa15f2SStefano Zampini if (rc == 1) { 156093aa15f2SStefano Zampini for (i=0; i<am; i++) { 1561f32d5d43SBarry Smith r1 = 0.0; 1562f32d5d43SBarry Smith n = a->i[i+1] - a->i[i]; 1563f32d5d43SBarry Smith aj = a->j + a->i[i]; 1564a4af7ca8SStefano Zampini aa = av + a->i[i]; 156593aa15f2SStefano Zampini for (j=0; j<n; j++) r1 += aa[j]*b1[aj[j]]; 156693aa15f2SStefano Zampini if (add) c1[i] += r1; 156793aa15f2SStefano Zampini else c1[i] = r1; 156893aa15f2SStefano Zampini } 156993aa15f2SStefano Zampini } else if (rc == 2) { 157093aa15f2SStefano Zampini for (i=0; i<am; i++) { 157193aa15f2SStefano Zampini r1 = r2 = 0.0; 157293aa15f2SStefano Zampini n = a->i[i+1] - a->i[i]; 157393aa15f2SStefano Zampini aj = a->j + a->i[i]; 157493aa15f2SStefano Zampini aa = av + a->i[i]; 1575f32d5d43SBarry Smith for (j=0; j<n; j++) { 157693aa15f2SStefano Zampini const PetscScalar aatmp = aa[j]; 157793aa15f2SStefano Zampini const PetscInt ajtmp = aj[j]; 157893aa15f2SStefano Zampini r1 += aatmp*b1[ajtmp]; 157993aa15f2SStefano Zampini r2 += aatmp*b2[ajtmp]; 1580f32d5d43SBarry Smith } 158193aa15f2SStefano Zampini if (add) { 158287753ddeSHong Zhang c1[i] += r1; 158393aa15f2SStefano Zampini c2[i] += r2; 158493aa15f2SStefano Zampini } else { 158593aa15f2SStefano Zampini c1[i] = r1; 158693aa15f2SStefano Zampini c2[i] = r2; 1587f32d5d43SBarry Smith } 158893aa15f2SStefano Zampini } 158993aa15f2SStefano Zampini } else { 159093aa15f2SStefano Zampini for (i=0; i<am; i++) { 159193aa15f2SStefano Zampini r1 = r2 = r3 = 0.0; 159293aa15f2SStefano Zampini n = a->i[i+1] - a->i[i]; 159393aa15f2SStefano Zampini aj = a->j + a->i[i]; 159493aa15f2SStefano Zampini aa = av + a->i[i]; 159593aa15f2SStefano Zampini for (j=0; j<n; j++) { 159693aa15f2SStefano Zampini const PetscScalar aatmp = aa[j]; 159793aa15f2SStefano Zampini const PetscInt ajtmp = aj[j]; 159893aa15f2SStefano Zampini r1 += aatmp*b1[ajtmp]; 159993aa15f2SStefano Zampini r2 += aatmp*b2[ajtmp]; 160093aa15f2SStefano Zampini r3 += aatmp*b3[ajtmp]; 160193aa15f2SStefano Zampini } 160293aa15f2SStefano Zampini if (add) { 160393aa15f2SStefano Zampini c1[i] += r1; 160493aa15f2SStefano Zampini c2[i] += r2; 160593aa15f2SStefano Zampini c3[i] += r3; 160693aa15f2SStefano Zampini } else { 160793aa15f2SStefano Zampini c1[i] = r1; 160893aa15f2SStefano Zampini c2[i] = r2; 160993aa15f2SStefano Zampini c3[i] = r3; 161093aa15f2SStefano Zampini } 161193aa15f2SStefano Zampini } 161293aa15f2SStefano Zampini } 1613f32d5d43SBarry Smith } 1614dc0b31edSSatish Balay ierr = PetscLogFlops(cn*(2.0*a->nz));CHKERRQ(ierr); 161593aa15f2SStefano Zampini if (add) { 16168c778c55SBarry Smith ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr); 161793aa15f2SStefano Zampini } else { 161893aa15f2SStefano Zampini ierr = MatDenseRestoreArrayWrite(C,&c);CHKERRQ(ierr); 161993aa15f2SStefano Zampini } 1620a4af7ca8SStefano Zampini ierr = MatDenseRestoreArrayRead(B,&b);CHKERRQ(ierr); 1621a4af7ca8SStefano Zampini ierr = MatSeqAIJRestoreArrayRead(A,&av);CHKERRQ(ierr); 1622f32d5d43SBarry Smith PetscFunctionReturn(0); 1623f32d5d43SBarry Smith } 1624f32d5d43SBarry Smith 162587753ddeSHong Zhang PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C) 1626f32d5d43SBarry Smith { 1627f32d5d43SBarry Smith PetscErrorCode ierr; 1628f32d5d43SBarry Smith 1629f32d5d43SBarry Smith PetscFunctionBegin; 1630*2c71b3e2SJacob Faibussowitsch PetscCheckFalse(B->rmap->n != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT,A->cmap->n,B->rmap->n); 1631*2c71b3e2SJacob Faibussowitsch PetscCheckFalse(A->rmap->n != C->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT,C->rmap->n,A->rmap->n); 1632*2c71b3e2SJacob Faibussowitsch PetscCheckFalse(B->cmap->n != C->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT,B->cmap->n,C->cmap->n); 16334324174eSBarry Smith 163493aa15f2SStefano Zampini ierr = MatMatMultNumericAdd_SeqAIJ_SeqDense(A,B,C,PETSC_FALSE);CHKERRQ(ierr); 16359123193aSHong Zhang PetscFunctionReturn(0); 16369123193aSHong Zhang } 1637b1683b59SHong Zhang 16384222ddf1SHong Zhang /* ------------------------------------------------------- */ 16394222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C) 16404222ddf1SHong Zhang { 16414222ddf1SHong Zhang PetscFunctionBegin; 16424222ddf1SHong Zhang C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense; 16434222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_AB; 16444222ddf1SHong Zhang PetscFunctionReturn(0); 16454222ddf1SHong Zhang } 16464222ddf1SHong Zhang 16476718818eSStefano Zampini PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat,Mat,PetscReal,Mat); 16486718818eSStefano Zampini 16494222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C) 16504222ddf1SHong Zhang { 16514222ddf1SHong Zhang PetscFunctionBegin; 16526718818eSStefano Zampini C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense; 16534222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_AtB; 16546718818eSStefano Zampini PetscFunctionReturn(0); 16556718818eSStefano Zampini } 16566718818eSStefano Zampini 16576718818eSStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C) 16586718818eSStefano Zampini { 16596718818eSStefano Zampini PetscFunctionBegin; 16606718818eSStefano Zampini C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense; 16616718818eSStefano Zampini C->ops->productsymbolic = MatProductSymbolic_ABt; 16624222ddf1SHong Zhang PetscFunctionReturn(0); 16634222ddf1SHong Zhang } 16644222ddf1SHong Zhang 16654222ddf1SHong Zhang PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C) 16664222ddf1SHong Zhang { 16674222ddf1SHong Zhang PetscErrorCode ierr; 16684222ddf1SHong Zhang Mat_Product *product = C->product; 16694222ddf1SHong Zhang 16704222ddf1SHong Zhang PetscFunctionBegin; 16714222ddf1SHong Zhang switch (product->type) { 16724222ddf1SHong Zhang case MATPRODUCT_AB: 16734222ddf1SHong Zhang ierr = MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C);CHKERRQ(ierr); 16744222ddf1SHong Zhang break; 16754222ddf1SHong Zhang case MATPRODUCT_AtB: 16764222ddf1SHong Zhang ierr = MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C);CHKERRQ(ierr); 16774222ddf1SHong Zhang break; 16786718818eSStefano Zampini case MATPRODUCT_ABt: 16796718818eSStefano Zampini ierr = MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C);CHKERRQ(ierr); 16804222ddf1SHong Zhang break; 16814222ddf1SHong Zhang default: 16826718818eSStefano Zampini break; 16834222ddf1SHong Zhang } 16844222ddf1SHong Zhang PetscFunctionReturn(0); 16854222ddf1SHong Zhang } 16864222ddf1SHong Zhang /* ------------------------------------------------------- */ 16874222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C) 16884222ddf1SHong Zhang { 16894222ddf1SHong Zhang PetscErrorCode ierr; 16904222ddf1SHong Zhang Mat_Product *product = C->product; 16914222ddf1SHong Zhang Mat A = product->A; 16924222ddf1SHong Zhang PetscBool baij; 16934222ddf1SHong Zhang 16944222ddf1SHong Zhang PetscFunctionBegin; 16954222ddf1SHong Zhang ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQBAIJ,&baij);CHKERRQ(ierr); 16964222ddf1SHong Zhang if (!baij) { /* A is seqsbaij */ 16974222ddf1SHong Zhang PetscBool sbaij; 16984222ddf1SHong Zhang ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQSBAIJ,&sbaij);CHKERRQ(ierr); 1699*2c71b3e2SJacob Faibussowitsch PetscCheckFalse(!sbaij,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"Mat must be either seqbaij or seqsbaij format"); 17004222ddf1SHong Zhang 17014222ddf1SHong Zhang C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense; 17024222ddf1SHong Zhang } else { /* A is seqbaij */ 17034222ddf1SHong Zhang C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense; 17044222ddf1SHong Zhang } 17054222ddf1SHong Zhang 17064222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_AB; 17074222ddf1SHong Zhang PetscFunctionReturn(0); 17084222ddf1SHong Zhang } 17094222ddf1SHong Zhang 17104222ddf1SHong Zhang PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C) 17114222ddf1SHong Zhang { 17124222ddf1SHong Zhang PetscErrorCode ierr; 17134222ddf1SHong Zhang Mat_Product *product = C->product; 17144222ddf1SHong Zhang 17154222ddf1SHong Zhang PetscFunctionBegin; 17166718818eSStefano Zampini MatCheckProduct(C,1); 1717*2c71b3e2SJacob Faibussowitsch PetscCheckFalse(!product->A,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A"); 17186718818eSStefano Zampini if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric)) { 17194222ddf1SHong Zhang ierr = MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C);CHKERRQ(ierr); 17206718818eSStefano Zampini } 17214222ddf1SHong Zhang PetscFunctionReturn(0); 17224222ddf1SHong Zhang } 17236718818eSStefano Zampini 17244222ddf1SHong Zhang /* ------------------------------------------------------- */ 17254222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C) 17264222ddf1SHong Zhang { 17274222ddf1SHong Zhang PetscFunctionBegin; 17284222ddf1SHong Zhang C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ; 17294222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_AB; 17304222ddf1SHong Zhang PetscFunctionReturn(0); 17314222ddf1SHong Zhang } 17324222ddf1SHong Zhang 17334222ddf1SHong Zhang PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C) 17344222ddf1SHong Zhang { 17354222ddf1SHong Zhang PetscErrorCode ierr; 17364222ddf1SHong Zhang Mat_Product *product = C->product; 17374222ddf1SHong Zhang 17384222ddf1SHong Zhang PetscFunctionBegin; 17394222ddf1SHong Zhang if (product->type == MATPRODUCT_AB) { 17404222ddf1SHong Zhang ierr = MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C);CHKERRQ(ierr); 17416718818eSStefano Zampini } 17424222ddf1SHong Zhang PetscFunctionReturn(0); 17434222ddf1SHong Zhang } 17444222ddf1SHong Zhang /* ------------------------------------------------------- */ 17454222ddf1SHong Zhang 1746b9af6bddSHong Zhang PetscErrorCode MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense) 1747c8db22f6SHong Zhang { 1748c8db22f6SHong Zhang PetscErrorCode ierr; 17492f5f1f90SHong Zhang Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 17502f5f1f90SHong Zhang Mat_SeqDense *btdense = (Mat_SeqDense*)Btdense->data; 17512f5f1f90SHong Zhang PetscInt *bi = b->i,*bj=b->j; 17522f5f1f90SHong Zhang PetscInt m = Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns; 17532f5f1f90SHong Zhang MatScalar *btval,*btval_den,*ba=b->a; 17542f5f1f90SHong Zhang PetscInt *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors; 1755c8db22f6SHong Zhang 1756c8db22f6SHong Zhang PetscFunctionBegin; 17572f5f1f90SHong Zhang btval_den=btdense->v; 1758580bdb30SBarry Smith ierr = PetscArrayzero(btval_den,m*n);CHKERRQ(ierr); 17592f5f1f90SHong Zhang for (k=0; k<ncolors; k++) { 17602f5f1f90SHong Zhang ncolumns = coloring->ncolumns[k]; 17612f5f1f90SHong Zhang for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */ 1762d2853540SHong Zhang col = *(columns + colorforcol[k] + l); 17632f5f1f90SHong Zhang btcol = bj + bi[col]; 17642f5f1f90SHong Zhang btval = ba + bi[col]; 17652f5f1f90SHong Zhang anz = bi[col+1] - bi[col]; 1766d2853540SHong Zhang for (j=0; j<anz; j++) { 17672f5f1f90SHong Zhang brow = btcol[j]; 17682f5f1f90SHong Zhang btval_den[brow] = btval[j]; 1769c8db22f6SHong Zhang } 1770c8db22f6SHong Zhang } 17712f5f1f90SHong Zhang btval_den += m; 1772c8db22f6SHong Zhang } 1773c8db22f6SHong Zhang PetscFunctionReturn(0); 1774c8db22f6SHong Zhang } 1775c8db22f6SHong Zhang 1776b9af6bddSHong Zhang PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp) 17778972f759SHong Zhang { 17788972f759SHong Zhang PetscErrorCode ierr; 1779b2d2b04fSHong Zhang Mat_SeqAIJ *csp = (Mat_SeqAIJ*)Csp->data; 17801683a169SBarry Smith const PetscScalar *ca_den,*ca_den_ptr; 17811683a169SBarry Smith PetscScalar *ca=csp->a; 1782f99a636bSHong Zhang PetscInt k,l,m=Cden->rmap->n,ncolors=matcoloring->ncolors; 1783e88777f2SHong Zhang PetscInt brows=matcoloring->brows,*den2sp=matcoloring->den2sp; 1784077f23c2SHong Zhang PetscInt nrows,*row,*idx; 1785077f23c2SHong Zhang PetscInt *rows=matcoloring->rows,*colorforrow=matcoloring->colorforrow; 17868972f759SHong Zhang 17878972f759SHong Zhang PetscFunctionBegin; 17881683a169SBarry Smith ierr = MatDenseGetArrayRead(Cden,&ca_den);CHKERRQ(ierr); 1789f99a636bSHong Zhang 1790077f23c2SHong Zhang if (brows > 0) { 1791077f23c2SHong Zhang PetscInt *lstart,row_end,row_start; 1792980ae229SHong Zhang lstart = matcoloring->lstart; 1793580bdb30SBarry Smith ierr = PetscArrayzero(lstart,ncolors);CHKERRQ(ierr); 1794eeb4fd02SHong Zhang 1795077f23c2SHong Zhang row_end = brows; 1796eeb4fd02SHong Zhang if (row_end > m) row_end = m; 1797077f23c2SHong Zhang for (row_start=0; row_start<m; row_start+=brows) { /* loop over row blocks of Csp */ 1798077f23c2SHong Zhang ca_den_ptr = ca_den; 1799980ae229SHong Zhang for (k=0; k<ncolors; k++) { /* loop over colors (columns of Cden) */ 1800eeb4fd02SHong Zhang nrows = matcoloring->nrows[k]; 1801eeb4fd02SHong Zhang row = rows + colorforrow[k]; 1802eeb4fd02SHong Zhang idx = den2sp + colorforrow[k]; 1803eeb4fd02SHong Zhang for (l=lstart[k]; l<nrows; l++) { 1804eeb4fd02SHong Zhang if (row[l] >= row_end) { 1805eeb4fd02SHong Zhang lstart[k] = l; 1806eeb4fd02SHong Zhang break; 1807eeb4fd02SHong Zhang } else { 1808077f23c2SHong Zhang ca[idx[l]] = ca_den_ptr[row[l]]; 1809eeb4fd02SHong Zhang } 1810eeb4fd02SHong Zhang } 1811077f23c2SHong Zhang ca_den_ptr += m; 1812eeb4fd02SHong Zhang } 1813077f23c2SHong Zhang row_end += brows; 1814eeb4fd02SHong Zhang if (row_end > m) row_end = m; 1815eeb4fd02SHong Zhang } 1816077f23c2SHong Zhang } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */ 1817077f23c2SHong Zhang ca_den_ptr = ca_den; 1818077f23c2SHong Zhang for (k=0; k<ncolors; k++) { 1819077f23c2SHong Zhang nrows = matcoloring->nrows[k]; 1820077f23c2SHong Zhang row = rows + colorforrow[k]; 1821077f23c2SHong Zhang idx = den2sp + colorforrow[k]; 1822077f23c2SHong Zhang for (l=0; l<nrows; l++) { 1823077f23c2SHong Zhang ca[idx[l]] = ca_den_ptr[row[l]]; 1824077f23c2SHong Zhang } 1825077f23c2SHong Zhang ca_den_ptr += m; 1826077f23c2SHong Zhang } 1827f99a636bSHong Zhang } 1828f99a636bSHong Zhang 18291683a169SBarry Smith ierr = MatDenseRestoreArrayRead(Cden,&ca_den);CHKERRQ(ierr); 1830f99a636bSHong Zhang #if defined(PETSC_USE_INFO) 1831077f23c2SHong Zhang if (matcoloring->brows > 0) { 18327d3de750SJacob Faibussowitsch ierr = PetscInfo(Csp,"Loop over %" PetscInt_FMT " row blocks for den2sp\n",brows);CHKERRQ(ierr); 1833e88777f2SHong Zhang } else { 1834077f23c2SHong Zhang ierr = PetscInfo(Csp,"Loop over colors/columns of Cden, inefficient for large sparse matrix product \n");CHKERRQ(ierr); 1835e88777f2SHong Zhang } 1836f99a636bSHong Zhang #endif 18378972f759SHong Zhang PetscFunctionReturn(0); 18388972f759SHong Zhang } 18398972f759SHong Zhang 1840b9af6bddSHong Zhang PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c) 1841b1683b59SHong Zhang { 1842b1683b59SHong Zhang PetscErrorCode ierr; 1843e88777f2SHong Zhang PetscInt i,n,nrows,Nbs,j,k,m,ncols,col,cm; 18441a83f524SJed Brown const PetscInt *is,*ci,*cj,*row_idx; 1845b28d80bdSHong Zhang PetscInt nis = iscoloring->n,*rowhit,bs = 1; 1846b1683b59SHong Zhang IS *isa; 1847b28d80bdSHong Zhang Mat_SeqAIJ *csp = (Mat_SeqAIJ*)mat->data; 1848e88777f2SHong Zhang PetscInt *colorforrow,*rows,*rows_i,*idxhit,*spidx,*den2sp,*den2sp_i; 1849e88777f2SHong Zhang PetscInt *colorforcol,*columns,*columns_i,brows; 1850e88777f2SHong Zhang PetscBool flg; 1851b1683b59SHong Zhang 1852b1683b59SHong Zhang PetscFunctionBegin; 1853071fcb05SBarry Smith ierr = ISColoringGetIS(iscoloring,PETSC_USE_POINTER,PETSC_IGNORE,&isa);CHKERRQ(ierr); 1854e99be685SHong Zhang 18557ecccc15SHong Zhang /* bs >1 is not being tested yet! */ 1856e88777f2SHong Zhang Nbs = mat->cmap->N/bs; 1857b1683b59SHong Zhang c->M = mat->rmap->N/bs; /* set total rows, columns and local rows */ 1858e88777f2SHong Zhang c->N = Nbs; 1859e88777f2SHong Zhang c->m = c->M; 1860b1683b59SHong Zhang c->rstart = 0; 1861e88777f2SHong Zhang c->brows = 100; 1862b1683b59SHong Zhang 1863b1683b59SHong Zhang c->ncolors = nis; 1864dcca6d9dSJed Brown ierr = PetscMalloc3(nis,&c->ncolumns,nis,&c->nrows,nis+1,&colorforrow);CHKERRQ(ierr); 1865854ce69bSBarry Smith ierr = PetscMalloc1(csp->nz+1,&rows);CHKERRQ(ierr); 1866854ce69bSBarry Smith ierr = PetscMalloc1(csp->nz+1,&den2sp);CHKERRQ(ierr); 1867e88777f2SHong Zhang 1868e88777f2SHong Zhang brows = c->brows; 1869c5929fdfSBarry Smith ierr = PetscOptionsGetInt(NULL,NULL,"-matden2sp_brows",&brows,&flg);CHKERRQ(ierr); 1870e88777f2SHong Zhang if (flg) c->brows = brows; 1871eeb4fd02SHong Zhang if (brows > 0) { 1872854ce69bSBarry Smith ierr = PetscMalloc1(nis+1,&c->lstart);CHKERRQ(ierr); 1873e88777f2SHong Zhang } 18742205254eSKarl Rupp 1875d2853540SHong Zhang colorforrow[0] = 0; 1876d2853540SHong Zhang rows_i = rows; 1877f99a636bSHong Zhang den2sp_i = den2sp; 1878b1683b59SHong Zhang 1879854ce69bSBarry Smith ierr = PetscMalloc1(nis+1,&colorforcol);CHKERRQ(ierr); 1880854ce69bSBarry Smith ierr = PetscMalloc1(Nbs+1,&columns);CHKERRQ(ierr); 18812205254eSKarl Rupp 1882d2853540SHong Zhang colorforcol[0] = 0; 1883d2853540SHong Zhang columns_i = columns; 1884d2853540SHong Zhang 1885077f23c2SHong Zhang /* get column-wise storage of mat */ 1886077f23c2SHong Zhang ierr = MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr); 1887b1683b59SHong Zhang 1888b28d80bdSHong Zhang cm = c->m; 1889854ce69bSBarry Smith ierr = PetscMalloc1(cm+1,&rowhit);CHKERRQ(ierr); 1890854ce69bSBarry Smith ierr = PetscMalloc1(cm+1,&idxhit);CHKERRQ(ierr); 1891f99a636bSHong Zhang for (i=0; i<nis; i++) { /* loop over color */ 1892b1683b59SHong Zhang ierr = ISGetLocalSize(isa[i],&n);CHKERRQ(ierr); 1893b1683b59SHong Zhang ierr = ISGetIndices(isa[i],&is);CHKERRQ(ierr); 18942205254eSKarl Rupp 1895b1683b59SHong Zhang c->ncolumns[i] = n; 1896b1683b59SHong Zhang if (n) { 1897580bdb30SBarry Smith ierr = PetscArraycpy(columns_i,is,n);CHKERRQ(ierr); 1898b1683b59SHong Zhang } 1899d2853540SHong Zhang colorforcol[i+1] = colorforcol[i] + n; 1900d2853540SHong Zhang columns_i += n; 1901b1683b59SHong Zhang 1902b1683b59SHong Zhang /* fast, crude version requires O(N*N) work */ 1903580bdb30SBarry Smith ierr = PetscArrayzero(rowhit,cm);CHKERRQ(ierr); 1904e99be685SHong Zhang 1905b7caf3d6SHong Zhang for (j=0; j<n; j++) { /* loop over columns*/ 1906b1683b59SHong Zhang col = is[j]; 1907d2853540SHong Zhang row_idx = cj + ci[col]; 1908b1683b59SHong Zhang m = ci[col+1] - ci[col]; 1909b7caf3d6SHong Zhang for (k=0; k<m; k++) { /* loop over columns marking them in rowhit */ 1910e99be685SHong Zhang idxhit[*row_idx] = spidx[ci[col] + k]; 1911d2853540SHong Zhang rowhit[*row_idx++] = col + 1; 1912b1683b59SHong Zhang } 1913b1683b59SHong Zhang } 1914b1683b59SHong Zhang /* count the number of hits */ 1915b1683b59SHong Zhang nrows = 0; 1916e8354b3eSHong Zhang for (j=0; j<cm; j++) { 1917b1683b59SHong Zhang if (rowhit[j]) nrows++; 1918b1683b59SHong Zhang } 1919b1683b59SHong Zhang c->nrows[i] = nrows; 1920d2853540SHong Zhang colorforrow[i+1] = colorforrow[i] + nrows; 1921d2853540SHong Zhang 1922b1683b59SHong Zhang nrows = 0; 1923b7caf3d6SHong Zhang for (j=0; j<cm; j++) { /* loop over rows */ 1924b1683b59SHong Zhang if (rowhit[j]) { 1925d2853540SHong Zhang rows_i[nrows] = j; 192612b89a43SHong Zhang den2sp_i[nrows] = idxhit[j]; 1927b1683b59SHong Zhang nrows++; 1928b1683b59SHong Zhang } 1929b1683b59SHong Zhang } 1930e88777f2SHong Zhang den2sp_i += nrows; 1931077f23c2SHong Zhang 1932b1683b59SHong Zhang ierr = ISRestoreIndices(isa[i],&is);CHKERRQ(ierr); 1933f99a636bSHong Zhang rows_i += nrows; 1934b1683b59SHong Zhang } 19350298fd71SBarry Smith ierr = MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr); 1936b28d80bdSHong Zhang ierr = PetscFree(rowhit);CHKERRQ(ierr); 1937071fcb05SBarry Smith ierr = ISColoringRestoreIS(iscoloring,PETSC_USE_POINTER,&isa);CHKERRQ(ierr); 1938*2c71b3e2SJacob Faibussowitsch PetscCheck(csp->nz == colorforrow[nis],PETSC_COMM_SELF,PETSC_ERR_PLIB,"csp->nz %" PetscInt_FMT " != colorforrow[nis] %" PetscInt_FMT,csp->nz,colorforrow[nis]); 1939b28d80bdSHong Zhang 1940d2853540SHong Zhang c->colorforrow = colorforrow; 1941d2853540SHong Zhang c->rows = rows; 1942f99a636bSHong Zhang c->den2sp = den2sp; 1943d2853540SHong Zhang c->colorforcol = colorforcol; 1944d2853540SHong Zhang c->columns = columns; 19452205254eSKarl Rupp 1946f94ccd6cSHong Zhang ierr = PetscFree(idxhit);CHKERRQ(ierr); 1947b1683b59SHong Zhang PetscFunctionReturn(0); 1948b1683b59SHong Zhang } 1949d013fc79Sandi selinger 19504222ddf1SHong Zhang /* --------------------------------------------------------------- */ 19514222ddf1SHong Zhang static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C) 1952df97dc6dSFande Kong { 19534222ddf1SHong Zhang PetscErrorCode ierr; 19544222ddf1SHong Zhang Mat_Product *product = C->product; 19554222ddf1SHong Zhang Mat A=product->A,B=product->B; 19564222ddf1SHong Zhang 1957df97dc6dSFande Kong PetscFunctionBegin; 19584222ddf1SHong Zhang if (C->ops->mattransposemultnumeric) { 19594222ddf1SHong Zhang /* Alg: "outerproduct" */ 19606718818eSStefano Zampini ierr = (*C->ops->mattransposemultnumeric)(A,B,C);CHKERRQ(ierr); 19614222ddf1SHong Zhang } else { 19624222ddf1SHong Zhang /* Alg: "matmatmult" -- C = At*B */ 19636718818eSStefano Zampini Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)product->data; 19646718818eSStefano Zampini Mat At; 19654222ddf1SHong Zhang 1966*2c71b3e2SJacob Faibussowitsch PetscCheckFalse(!atb,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct"); 19676718818eSStefano Zampini At = atb->At; 1968089a957eSStefano Zampini if (atb->updateAt && At) { /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ() */ 19694222ddf1SHong Zhang ierr = MatTranspose_SeqAIJ(A,MAT_REUSE_MATRIX,&At);CHKERRQ(ierr); 19704222ddf1SHong Zhang } 1971089a957eSStefano Zampini ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ(At ? At : A,B,C);CHKERRQ(ierr); 19724222ddf1SHong Zhang atb->updateAt = PETSC_TRUE; 19734222ddf1SHong Zhang } 1974df97dc6dSFande Kong PetscFunctionReturn(0); 1975df97dc6dSFande Kong } 1976df97dc6dSFande Kong 19774222ddf1SHong Zhang static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C) 1978d013fc79Sandi selinger { 1979d013fc79Sandi selinger PetscErrorCode ierr; 19804222ddf1SHong Zhang Mat_Product *product = C->product; 19814222ddf1SHong Zhang Mat A=product->A,B=product->B; 19824222ddf1SHong Zhang PetscReal fill=product->fill; 1983d013fc79Sandi selinger 1984d013fc79Sandi selinger PetscFunctionBegin; 19854222ddf1SHong Zhang ierr = MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr); 19862869b61bSandi selinger 19874222ddf1SHong Zhang C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ; 19884222ddf1SHong Zhang PetscFunctionReturn(0); 19892869b61bSandi selinger } 1990d013fc79Sandi selinger 19914222ddf1SHong Zhang /* --------------------------------------------------------------- */ 19924222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C) 19934222ddf1SHong Zhang { 19944222ddf1SHong Zhang PetscErrorCode ierr; 19954222ddf1SHong Zhang Mat_Product *product = C->product; 19964222ddf1SHong Zhang PetscInt alg = 0; /* default algorithm */ 19974222ddf1SHong Zhang PetscBool flg = PETSC_FALSE; 19984222ddf1SHong Zhang #if !defined(PETSC_HAVE_HYPRE) 19994222ddf1SHong Zhang const char *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"}; 20004222ddf1SHong Zhang PetscInt nalg = 7; 20014222ddf1SHong Zhang #else 20024222ddf1SHong Zhang const char *algTypes[8] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge","hypre"}; 20034222ddf1SHong Zhang PetscInt nalg = 8; 20044222ddf1SHong Zhang #endif 20054222ddf1SHong Zhang 20064222ddf1SHong Zhang PetscFunctionBegin; 20074222ddf1SHong Zhang /* Set default algorithm */ 20084222ddf1SHong Zhang ierr = PetscStrcmp(C->product->alg,"default",&flg);CHKERRQ(ierr); 20094222ddf1SHong Zhang if (flg) { 20104222ddf1SHong Zhang ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 2011d013fc79Sandi selinger } 2012d013fc79Sandi selinger 20134222ddf1SHong Zhang /* Get runtime option */ 20144222ddf1SHong Zhang if (product->api_user) { 20154222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMult","Mat");CHKERRQ(ierr); 20164222ddf1SHong Zhang ierr = PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr); 20174222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 20184222ddf1SHong Zhang } else { 20194222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr); 20203e662e0bSHong Zhang ierr = PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_AB",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr); 20214222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 2022d013fc79Sandi selinger } 20234222ddf1SHong Zhang if (flg) { 20244222ddf1SHong Zhang ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 2025d013fc79Sandi selinger } 2026d013fc79Sandi selinger 20274222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_AB; 20284222ddf1SHong Zhang C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ; 20294222ddf1SHong Zhang PetscFunctionReturn(0); 20304222ddf1SHong Zhang } 2031d013fc79Sandi selinger 20324222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C) 20334222ddf1SHong Zhang { 20344222ddf1SHong Zhang PetscErrorCode ierr; 20354222ddf1SHong Zhang Mat_Product *product = C->product; 20364222ddf1SHong Zhang PetscInt alg = 0; /* default algorithm */ 20374222ddf1SHong Zhang PetscBool flg = PETSC_FALSE; 2038089a957eSStefano Zampini const char *algTypes[3] = {"default","at*b","outerproduct"}; 2039089a957eSStefano Zampini PetscInt nalg = 3; 2040d013fc79Sandi selinger 20414222ddf1SHong Zhang PetscFunctionBegin; 20424222ddf1SHong Zhang /* Get runtime option */ 20434222ddf1SHong Zhang if (product->api_user) { 20444222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr); 20454222ddf1SHong Zhang ierr = PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr); 20464222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 20474222ddf1SHong Zhang } else { 20484222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr); 20493e662e0bSHong Zhang ierr = PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_AtB",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr); 20504222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 20514222ddf1SHong Zhang } 20524222ddf1SHong Zhang if (flg) { 20534222ddf1SHong Zhang ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 20544222ddf1SHong Zhang } 2055d013fc79Sandi selinger 20564222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ; 20574222ddf1SHong Zhang PetscFunctionReturn(0); 20584222ddf1SHong Zhang } 20594222ddf1SHong Zhang 20604222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C) 20614222ddf1SHong Zhang { 20624222ddf1SHong Zhang PetscErrorCode ierr; 20634222ddf1SHong Zhang Mat_Product *product = C->product; 20644222ddf1SHong Zhang PetscInt alg = 0; /* default algorithm */ 20654222ddf1SHong Zhang PetscBool flg = PETSC_FALSE; 20664222ddf1SHong Zhang const char *algTypes[2] = {"default","color"}; 20674222ddf1SHong Zhang PetscInt nalg = 2; 20684222ddf1SHong Zhang 20694222ddf1SHong Zhang PetscFunctionBegin; 20704222ddf1SHong Zhang /* Set default algorithm */ 20714222ddf1SHong Zhang ierr = PetscStrcmp(C->product->alg,"default",&flg);CHKERRQ(ierr); 20724222ddf1SHong Zhang if (!flg) { 20734222ddf1SHong Zhang alg = 1; 20744222ddf1SHong Zhang ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 20754222ddf1SHong Zhang } 20764222ddf1SHong Zhang 20774222ddf1SHong Zhang /* Get runtime option */ 20784222ddf1SHong Zhang if (product->api_user) { 20794222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatTransposeMult","Mat");CHKERRQ(ierr); 20804222ddf1SHong Zhang ierr = PetscOptionsEList("-matmattransmult_via","Algorithmic approach","MatMatTransposeMult",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr); 20814222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 20824222ddf1SHong Zhang } else { 20834222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABt","Mat");CHKERRQ(ierr); 20843e662e0bSHong Zhang ierr = PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_ABt",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr); 20854222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 20864222ddf1SHong Zhang } 20874222ddf1SHong Zhang if (flg) { 20884222ddf1SHong Zhang ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 20894222ddf1SHong Zhang } 20904222ddf1SHong Zhang 20914222ddf1SHong Zhang C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ; 20924222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_ABt; 20934222ddf1SHong Zhang PetscFunctionReturn(0); 20944222ddf1SHong Zhang } 20954222ddf1SHong Zhang 20964222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C) 20974222ddf1SHong Zhang { 20984222ddf1SHong Zhang PetscErrorCode ierr; 20994222ddf1SHong Zhang Mat_Product *product = C->product; 21004222ddf1SHong Zhang PetscBool flg = PETSC_FALSE; 21014222ddf1SHong Zhang PetscInt alg = 0; /* default algorithm -- alg=1 should be default!!! */ 21024222ddf1SHong Zhang #if !defined(PETSC_HAVE_HYPRE) 21034222ddf1SHong Zhang const char *algTypes[2] = {"scalable","rap"}; 21044222ddf1SHong Zhang PetscInt nalg = 2; 21054222ddf1SHong Zhang #else 21064222ddf1SHong Zhang const char *algTypes[3] = {"scalable","rap","hypre"}; 21074222ddf1SHong Zhang PetscInt nalg = 3; 21084222ddf1SHong Zhang #endif 21094222ddf1SHong Zhang 21104222ddf1SHong Zhang PetscFunctionBegin; 21114222ddf1SHong Zhang /* Set default algorithm */ 21124222ddf1SHong Zhang ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr); 21134222ddf1SHong Zhang if (flg) { 21144222ddf1SHong Zhang ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 21154222ddf1SHong Zhang } 21164222ddf1SHong Zhang 21174222ddf1SHong Zhang /* Get runtime option */ 21184222ddf1SHong Zhang if (product->api_user) { 21194222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatPtAP","Mat");CHKERRQ(ierr); 21204222ddf1SHong Zhang ierr = PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr); 21214222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 21224222ddf1SHong Zhang } else { 21234222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr); 21243e662e0bSHong Zhang ierr = PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_PtAP",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr); 21254222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 21264222ddf1SHong Zhang } 21274222ddf1SHong Zhang if (flg) { 21284222ddf1SHong Zhang ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 21294222ddf1SHong Zhang } 21304222ddf1SHong Zhang 21314222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ; 21324222ddf1SHong Zhang PetscFunctionReturn(0); 21334222ddf1SHong Zhang } 21344222ddf1SHong Zhang 21354222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C) 21364222ddf1SHong Zhang { 21374222ddf1SHong Zhang PetscErrorCode ierr; 21384222ddf1SHong Zhang Mat_Product *product = C->product; 21394222ddf1SHong Zhang PetscBool flg = PETSC_FALSE; 21404222ddf1SHong Zhang PetscInt alg = 0; /* default algorithm */ 21414222ddf1SHong Zhang const char *algTypes[3] = {"r*a*rt","r*art","coloring_rart"}; 21424222ddf1SHong Zhang PetscInt nalg = 3; 21434222ddf1SHong Zhang 21444222ddf1SHong Zhang PetscFunctionBegin; 21454222ddf1SHong Zhang /* Set default algorithm */ 21464222ddf1SHong Zhang ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr); 21474222ddf1SHong Zhang if (flg) { 21484222ddf1SHong Zhang ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 21494222ddf1SHong Zhang } 21504222ddf1SHong Zhang 21514222ddf1SHong Zhang /* Get runtime option */ 21524222ddf1SHong Zhang if (product->api_user) { 21534222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatRARt","Mat");CHKERRQ(ierr); 21544222ddf1SHong Zhang ierr = PetscOptionsEList("-matrart_via","Algorithmic approach","MatRARt",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr); 21554222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 21564222ddf1SHong Zhang } else { 21574222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr); 21583e662e0bSHong Zhang ierr = PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_RARt",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr); 21594222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 21604222ddf1SHong Zhang } 21614222ddf1SHong Zhang if (flg) { 21624222ddf1SHong Zhang ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 21634222ddf1SHong Zhang } 21644222ddf1SHong Zhang 21654222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ; 21664222ddf1SHong Zhang PetscFunctionReturn(0); 21674222ddf1SHong Zhang } 21684222ddf1SHong Zhang 21694222ddf1SHong Zhang /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */ 21704222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C) 21714222ddf1SHong Zhang { 21724222ddf1SHong Zhang PetscErrorCode ierr; 21734222ddf1SHong Zhang Mat_Product *product = C->product; 21744222ddf1SHong Zhang PetscInt alg = 0; /* default algorithm */ 21754222ddf1SHong Zhang PetscBool flg = PETSC_FALSE; 21764222ddf1SHong Zhang const char *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"}; 21774222ddf1SHong Zhang PetscInt nalg = 7; 21784222ddf1SHong Zhang 21794222ddf1SHong Zhang PetscFunctionBegin; 21804222ddf1SHong Zhang /* Set default algorithm */ 21814222ddf1SHong Zhang ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr); 21824222ddf1SHong Zhang if (flg) { 21834222ddf1SHong Zhang ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 21844222ddf1SHong Zhang } 21854222ddf1SHong Zhang 21864222ddf1SHong Zhang /* Get runtime option */ 21874222ddf1SHong Zhang if (product->api_user) { 21884222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr); 21894222ddf1SHong Zhang ierr = PetscOptionsEList("-matmatmatmult_via","Algorithmic approach","MatMatMatMult",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr); 21904222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 21914222ddf1SHong Zhang } else { 21924222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr); 21933e662e0bSHong Zhang ierr = PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_ABC",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr); 21944222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 21954222ddf1SHong Zhang } 21964222ddf1SHong Zhang if (flg) { 21974222ddf1SHong Zhang ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 21984222ddf1SHong Zhang } 21994222ddf1SHong Zhang 22004222ddf1SHong Zhang C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ; 22014222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_ABC; 22024222ddf1SHong Zhang PetscFunctionReturn(0); 22034222ddf1SHong Zhang } 22044222ddf1SHong Zhang 22054222ddf1SHong Zhang PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C) 22064222ddf1SHong Zhang { 22074222ddf1SHong Zhang PetscErrorCode ierr; 22084222ddf1SHong Zhang Mat_Product *product = C->product; 22094222ddf1SHong Zhang 22104222ddf1SHong Zhang PetscFunctionBegin; 22114222ddf1SHong Zhang switch (product->type) { 22124222ddf1SHong Zhang case MATPRODUCT_AB: 22134222ddf1SHong Zhang ierr = MatProductSetFromOptions_SeqAIJ_AB(C);CHKERRQ(ierr); 22144222ddf1SHong Zhang break; 22154222ddf1SHong Zhang case MATPRODUCT_AtB: 22164222ddf1SHong Zhang ierr = MatProductSetFromOptions_SeqAIJ_AtB(C);CHKERRQ(ierr); 22174222ddf1SHong Zhang break; 22184222ddf1SHong Zhang case MATPRODUCT_ABt: 22194222ddf1SHong Zhang ierr = MatProductSetFromOptions_SeqAIJ_ABt(C);CHKERRQ(ierr); 22204222ddf1SHong Zhang break; 22214222ddf1SHong Zhang case MATPRODUCT_PtAP: 22224222ddf1SHong Zhang ierr = MatProductSetFromOptions_SeqAIJ_PtAP(C);CHKERRQ(ierr); 22234222ddf1SHong Zhang break; 22244222ddf1SHong Zhang case MATPRODUCT_RARt: 22254222ddf1SHong Zhang ierr = MatProductSetFromOptions_SeqAIJ_RARt(C);CHKERRQ(ierr); 22264222ddf1SHong Zhang break; 22274222ddf1SHong Zhang case MATPRODUCT_ABC: 22284222ddf1SHong Zhang ierr = MatProductSetFromOptions_SeqAIJ_ABC(C);CHKERRQ(ierr); 22294222ddf1SHong Zhang break; 22306718818eSStefano Zampini default: 22316718818eSStefano Zampini break; 22324222ddf1SHong Zhang } 2233d013fc79Sandi selinger PetscFunctionReturn(0); 2234d013fc79Sandi selinger } 2235