1be1d678aSKris Buschelman 2d50806bdSBarry Smith /* 32499ec78SHong Zhang Defines matrix-matrix product routines for pairs of SeqAIJ matrices 4d50806bdSBarry Smith C = A * B 5d50806bdSBarry Smith */ 6d50806bdSBarry Smith 7c6db04a5SJed Brown #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 8c6db04a5SJed Brown #include <../src/mat/utils/freespace.h> 9c6db04a5SJed Brown #include <petscbt.h> 10af0996ceSBarry Smith #include <petsc/private/isimpl.h> 1107475bc1SBarry Smith #include <../src/mat/impls/dense/seq/dense.h> 127bab7c10SHong Zhang 13df97dc6dSFande Kong PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C) 14df97dc6dSFande Kong { 15df97dc6dSFande Kong PetscErrorCode ierr; 16df97dc6dSFande Kong 17df97dc6dSFande Kong PetscFunctionBegin; 18df97dc6dSFande Kong if (C->ops->matmultnumeric) { 196718818eSStefano Zampini if (C->ops->matmultnumeric == MatMatMultNumeric_SeqAIJ_SeqAIJ) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Recursive call"); 20df97dc6dSFande Kong ierr = (*C->ops->matmultnumeric)(A,B,C);CHKERRQ(ierr); 21df97dc6dSFande Kong } else { 22df97dc6dSFande Kong ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A,B,C);CHKERRQ(ierr); 23df97dc6dSFande Kong } 24df97dc6dSFande Kong PetscFunctionReturn(0); 25df97dc6dSFande Kong } 26df97dc6dSFande Kong 274222ddf1SHong Zhang /* Modified from MatCreateSeqAIJWithArrays() */ 28e4e71118SStefano Zampini PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt i[],PetscInt j[],PetscScalar a[],MatType mtype,Mat mat) 29df97dc6dSFande Kong { 30df97dc6dSFande Kong PetscErrorCode ierr; 314222ddf1SHong Zhang PetscInt ii; 324222ddf1SHong Zhang Mat_SeqAIJ *aij; 33e4e71118SStefano Zampini PetscBool isseqaij; 345c66b693SKris Buschelman 355c66b693SKris Buschelman PetscFunctionBegin; 364222ddf1SHong Zhang if (m > 0 && i[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"i (row indices) must start with 0"); 374222ddf1SHong Zhang ierr = MatSetSizes(mat,m,n,m,n);CHKERRQ(ierr); 384222ddf1SHong Zhang 39e4e71118SStefano Zampini if (!mtype) { 40e4e71118SStefano Zampini ierr = PetscObjectBaseTypeCompare((PetscObject)mat,MATSEQAIJ,&isseqaij);CHKERRQ(ierr); 41e4e71118SStefano Zampini if (!isseqaij) { ierr = MatSetType(mat,MATSEQAIJ);CHKERRQ(ierr); } 42e4e71118SStefano Zampini } else { 43e4e71118SStefano Zampini ierr = MatSetType(mat,mtype);CHKERRQ(ierr); 44e4e71118SStefano Zampini } 45f4259b30SLisandro Dalcin ierr = MatSeqAIJSetPreallocation_SeqAIJ(mat,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 464222ddf1SHong Zhang aij = (Mat_SeqAIJ*)(mat)->data; 474222ddf1SHong Zhang ierr = PetscMalloc1(m,&aij->imax);CHKERRQ(ierr); 484222ddf1SHong Zhang ierr = PetscMalloc1(m,&aij->ilen);CHKERRQ(ierr); 494222ddf1SHong Zhang 504222ddf1SHong Zhang aij->i = i; 514222ddf1SHong Zhang aij->j = j; 524222ddf1SHong Zhang aij->a = a; 534222ddf1SHong Zhang aij->singlemalloc = PETSC_FALSE; 544222ddf1SHong Zhang aij->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 554222ddf1SHong Zhang aij->free_a = PETSC_FALSE; 564222ddf1SHong Zhang aij->free_ij = PETSC_FALSE; 574222ddf1SHong Zhang 584222ddf1SHong Zhang for (ii=0; ii<m; ii++) { 594222ddf1SHong Zhang aij->ilen[ii] = aij->imax[ii] = i[ii+1] - i[ii]; 6025023028SHong Zhang } 614222ddf1SHong Zhang 625c66b693SKris Buschelman PetscFunctionReturn(0); 635c66b693SKris Buschelman } 641c24bd37SHong Zhang 654222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C) 664222ddf1SHong Zhang { 674222ddf1SHong Zhang PetscErrorCode ierr; 684222ddf1SHong Zhang Mat_Product *product = C->product; 694222ddf1SHong Zhang MatProductAlgorithm alg; 704222ddf1SHong Zhang PetscBool flg; 714222ddf1SHong Zhang 724222ddf1SHong Zhang PetscFunctionBegin; 734222ddf1SHong Zhang if (product) { 744222ddf1SHong Zhang alg = product->alg; 754222ddf1SHong Zhang } else { 764222ddf1SHong Zhang alg = "sorted"; 774222ddf1SHong Zhang } 784222ddf1SHong Zhang /* sorted */ 794222ddf1SHong Zhang ierr = PetscStrcmp(alg,"sorted",&flg);CHKERRQ(ierr); 804222ddf1SHong Zhang if (flg) { 814222ddf1SHong Zhang ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A,B,fill,C);CHKERRQ(ierr); 824222ddf1SHong Zhang PetscFunctionReturn(0); 834222ddf1SHong Zhang } 844222ddf1SHong Zhang 854222ddf1SHong Zhang /* scalable */ 864222ddf1SHong Zhang ierr = PetscStrcmp(alg,"scalable",&flg);CHKERRQ(ierr); 874222ddf1SHong Zhang if (flg) { 884222ddf1SHong Zhang ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A,B,fill,C);CHKERRQ(ierr); 894222ddf1SHong Zhang PetscFunctionReturn(0); 904222ddf1SHong Zhang } 914222ddf1SHong Zhang 924222ddf1SHong Zhang /* scalable_fast */ 934222ddf1SHong Zhang ierr = PetscStrcmp(alg,"scalable_fast",&flg);CHKERRQ(ierr); 944222ddf1SHong Zhang if (flg) { 954222ddf1SHong Zhang ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A,B,fill,C);CHKERRQ(ierr); 964222ddf1SHong Zhang PetscFunctionReturn(0); 974222ddf1SHong Zhang } 984222ddf1SHong Zhang 994222ddf1SHong Zhang /* heap */ 1004222ddf1SHong Zhang ierr = PetscStrcmp(alg,"heap",&flg);CHKERRQ(ierr); 1014222ddf1SHong Zhang if (flg) { 1024222ddf1SHong Zhang ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A,B,fill,C);CHKERRQ(ierr); 1034222ddf1SHong Zhang PetscFunctionReturn(0); 1044222ddf1SHong Zhang } 1054222ddf1SHong Zhang 1064222ddf1SHong Zhang /* btheap */ 1074222ddf1SHong Zhang ierr = PetscStrcmp(alg,"btheap",&flg);CHKERRQ(ierr); 1084222ddf1SHong Zhang if (flg) { 1094222ddf1SHong Zhang ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A,B,fill,C);CHKERRQ(ierr); 1104222ddf1SHong Zhang PetscFunctionReturn(0); 1114222ddf1SHong Zhang } 1124222ddf1SHong Zhang 1134222ddf1SHong Zhang /* llcondensed */ 1144222ddf1SHong Zhang ierr = PetscStrcmp(alg,"llcondensed",&flg);CHKERRQ(ierr); 1154222ddf1SHong Zhang if (flg) { 1164222ddf1SHong Zhang ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A,B,fill,C);CHKERRQ(ierr); 1174222ddf1SHong Zhang PetscFunctionReturn(0); 1184222ddf1SHong Zhang } 1194222ddf1SHong Zhang 1204222ddf1SHong Zhang /* rowmerge */ 1214222ddf1SHong Zhang ierr = PetscStrcmp(alg,"rowmerge",&flg);CHKERRQ(ierr); 1224222ddf1SHong Zhang if (flg) { 1234222ddf1SHong Zhang ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A,B,fill,C);CHKERRQ(ierr); 1244222ddf1SHong Zhang PetscFunctionReturn(0); 1254222ddf1SHong Zhang } 1264222ddf1SHong Zhang 1274222ddf1SHong Zhang #if defined(PETSC_HAVE_HYPRE) 1284222ddf1SHong Zhang ierr = PetscStrcmp(alg,"hypre",&flg);CHKERRQ(ierr); 1294222ddf1SHong Zhang if (flg) { 1304222ddf1SHong Zhang ierr = MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A,B,fill,C);CHKERRQ(ierr); 1314222ddf1SHong Zhang PetscFunctionReturn(0); 1324222ddf1SHong Zhang } 1334222ddf1SHong Zhang #endif 1344222ddf1SHong Zhang 1354222ddf1SHong Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported"); 1364222ddf1SHong Zhang } 1374222ddf1SHong Zhang 1384222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A,Mat B,PetscReal fill,Mat C) 139b561aa9dSHong Zhang { 140b561aa9dSHong Zhang PetscErrorCode ierr; 141b561aa9dSHong Zhang Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 142971236abSHong Zhang PetscInt *ai=a->i,*bi=b->i,*ci,*cj; 143c1ba5575SJed Brown PetscInt am =A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 144b561aa9dSHong Zhang PetscReal afill; 145eca6b86aSHong Zhang PetscInt i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax; 146eca6b86aSHong Zhang PetscTable ta; 147fb908031SHong Zhang PetscBT lnkbt; 1480298fd71SBarry Smith PetscFreeSpaceList free_space=NULL,current_space=NULL; 149b561aa9dSHong Zhang 150b561aa9dSHong Zhang PetscFunctionBegin; 151bd958071SHong Zhang /* Get ci and cj */ 152bd958071SHong Zhang /*---------------*/ 153fb908031SHong Zhang /* Allocate ci array, arrays for fill computation and */ 154fb908031SHong Zhang /* free space for accumulating nonzero column info */ 155854ce69bSBarry Smith ierr = PetscMalloc1(am+2,&ci);CHKERRQ(ierr); 156fb908031SHong Zhang ci[0] = 0; 157fb908031SHong Zhang 158fb908031SHong Zhang /* create and initialize a linked list */ 159c373ccc6SHong Zhang ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr); 160c373ccc6SHong Zhang MatRowMergeMax_SeqAIJ(b,bm,ta); 161eca6b86aSHong Zhang ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); 162eca6b86aSHong Zhang ierr = PetscTableDestroy(&ta);CHKERRQ(ierr); 163eca6b86aSHong Zhang 164eca6b86aSHong Zhang ierr = PetscLLCondensedCreate(Crmax,bn,&lnk,&lnkbt);CHKERRQ(ierr); 165fb908031SHong Zhang 166fb908031SHong Zhang /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 167f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr); 1682205254eSKarl Rupp 169fb908031SHong Zhang current_space = free_space; 170fb908031SHong Zhang 171fb908031SHong Zhang /* Determine ci and cj */ 172fb908031SHong Zhang for (i=0; i<am; i++) { 173fb908031SHong Zhang anzi = ai[i+1] - ai[i]; 174fb908031SHong Zhang aj = a->j + ai[i]; 175fb908031SHong Zhang for (j=0; j<anzi; j++) { 176fb908031SHong Zhang brow = aj[j]; 177fb908031SHong Zhang bnzj = bi[brow+1] - bi[brow]; 178fb908031SHong Zhang bj = b->j + bi[brow]; 179fb908031SHong Zhang /* add non-zero cols of B into the sorted linked list lnk */ 180fb908031SHong Zhang ierr = PetscLLCondensedAddSorted(bnzj,bj,lnk,lnkbt);CHKERRQ(ierr); 181fb908031SHong Zhang } 1828c78258cSHong Zhang /* add possible missing diagonal entry */ 1838c78258cSHong Zhang if (C->force_diagonals) { 1848c78258cSHong Zhang ierr = PetscLLCondensedAddSorted(1,&i,lnk,lnkbt);CHKERRQ(ierr); 1858c78258cSHong Zhang } 186fb908031SHong Zhang cnzi = lnk[0]; 187fb908031SHong Zhang 188fb908031SHong Zhang /* If free space is not available, make more free space */ 189fb908031SHong Zhang /* Double the amount of total space in the list */ 190fb908031SHong Zhang if (current_space->local_remaining<cnzi) { 191f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),¤t_space);CHKERRQ(ierr); 192fb908031SHong Zhang ndouble++; 193fb908031SHong Zhang } 194fb908031SHong Zhang 195fb908031SHong Zhang /* Copy data into free space, then initialize lnk */ 196fb908031SHong Zhang ierr = PetscLLCondensedClean(bn,cnzi,current_space->array,lnk,lnkbt);CHKERRQ(ierr); 1972205254eSKarl Rupp 198fb908031SHong Zhang current_space->array += cnzi; 199fb908031SHong Zhang current_space->local_used += cnzi; 200fb908031SHong Zhang current_space->local_remaining -= cnzi; 2012205254eSKarl Rupp 202fb908031SHong Zhang ci[i+1] = ci[i] + cnzi; 203fb908031SHong Zhang } 204fb908031SHong Zhang 205fb908031SHong Zhang /* Column indices are in the list of free space */ 206fb908031SHong Zhang /* Allocate space for cj, initialize cj, and */ 207fb908031SHong Zhang /* destroy list of free space and other temporary array(s) */ 208854ce69bSBarry Smith ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr); 209fb908031SHong Zhang ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr); 210fb908031SHong Zhang ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr); 211b561aa9dSHong Zhang 21226be0446SHong Zhang /* put together the new symbolic matrix */ 213e4e71118SStefano Zampini ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr); 2144222ddf1SHong Zhang ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr); 21558c24d83SHong Zhang 21658c24d83SHong Zhang /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 21758c24d83SHong Zhang /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 2184222ddf1SHong Zhang c = (Mat_SeqAIJ*)(C->data); 219aa1aec99SHong Zhang c->free_a = PETSC_FALSE; 220e6b907acSBarry Smith c->free_ij = PETSC_TRUE; 22158c24d83SHong Zhang c->nonew = 0; 2224222ddf1SHong Zhang 2234222ddf1SHong Zhang /* fast, needs non-scalable O(bn) array 'abdense' */ 2244222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 2250b7e3e3dSHong Zhang 2267212da7cSHong Zhang /* set MatInfo */ 2277212da7cSHong Zhang afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 228f2b054eeSHong Zhang if (afill < 1.0) afill = 1.0; 2297212da7cSHong Zhang c->maxnz = ci[am]; 2307212da7cSHong Zhang c->nz = ci[am]; 2314222ddf1SHong Zhang C->info.mallocs = ndouble; 2324222ddf1SHong Zhang C->info.fill_ratio_given = fill; 2334222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 2347212da7cSHong Zhang 2357212da7cSHong Zhang #if defined(PETSC_USE_INFO) 2367212da7cSHong Zhang if (ci[am]) { 2374222ddf1SHong Zhang ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 2384222ddf1SHong Zhang ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 239f2b054eeSHong Zhang } else { 2404222ddf1SHong Zhang ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr); 241be0fcf8dSHong Zhang } 242f2b054eeSHong Zhang #endif 24358c24d83SHong Zhang PetscFunctionReturn(0); 24458c24d83SHong Zhang } 245d50806bdSBarry Smith 246df97dc6dSFande Kong PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,Mat C) 247d50806bdSBarry Smith { 248dfbe8321SBarry Smith PetscErrorCode ierr; 249d13dce4bSSatish Balay PetscLogDouble flops=0.0; 250d50806bdSBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 251d50806bdSBarry Smith Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 252d50806bdSBarry Smith Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 25338baddfdSBarry Smith PetscInt *ai =a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j; 254c58ca2e3SHong Zhang PetscInt am =A->rmap->n,cm=C->rmap->n; 255519eb980SHong Zhang PetscInt i,j,k,anzi,bnzi,cnzi,brow; 2562e5835c6SStefano Zampini PetscScalar *ca,valtmp; 257aa1aec99SHong Zhang PetscScalar *ab_dense; 2586718818eSStefano Zampini PetscContainer cab_dense; 2592e5835c6SStefano Zampini const PetscScalar *aa,*ba,*baj; 260d50806bdSBarry Smith 261d50806bdSBarry Smith PetscFunctionBegin; 2622e5835c6SStefano Zampini ierr = MatSeqAIJGetArrayRead(A,&aa);CHKERRQ(ierr); 2632e5835c6SStefano Zampini ierr = MatSeqAIJGetArrayRead(B,&ba);CHKERRQ(ierr); 264aa1aec99SHong Zhang if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */ 265854ce69bSBarry Smith ierr = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr); 266aa1aec99SHong Zhang c->a = ca; 267aa1aec99SHong Zhang c->free_a = PETSC_TRUE; 2686718818eSStefano Zampini } else ca = c->a; 2696718818eSStefano Zampini 2706718818eSStefano Zampini /* TODO this should be done in the symbolic phase */ 2716718818eSStefano Zampini /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers 2726718818eSStefano Zampini that is hard to eradicate) */ 2736718818eSStefano Zampini ierr = PetscObjectQuery((PetscObject)C,"__PETSc__ab_dense",(PetscObject*)&cab_dense);CHKERRQ(ierr); 2746718818eSStefano Zampini if (!cab_dense) { 2756718818eSStefano Zampini ierr = PetscMalloc1(B->cmap->N,&ab_dense);CHKERRQ(ierr); 2766718818eSStefano Zampini ierr = PetscContainerCreate(PETSC_COMM_SELF,&cab_dense);CHKERRQ(ierr); 2776718818eSStefano Zampini ierr = PetscContainerSetPointer(cab_dense,ab_dense);CHKERRQ(ierr); 2786718818eSStefano Zampini ierr = PetscContainerSetUserDestroy(cab_dense,PetscContainerUserDestroyDefault);CHKERRQ(ierr); 2796718818eSStefano Zampini ierr = PetscObjectCompose((PetscObject)C,"__PETSc__ab_dense",(PetscObject)cab_dense);CHKERRQ(ierr); 2806718818eSStefano Zampini ierr = PetscObjectDereference((PetscObject)cab_dense);CHKERRQ(ierr); 281d90d86d0SMatthew G. Knepley } 2826718818eSStefano Zampini ierr = PetscContainerGetPointer(cab_dense,(void**)&ab_dense);CHKERRQ(ierr); 2836718818eSStefano Zampini ierr = PetscArrayzero(ab_dense,B->cmap->N);CHKERRQ(ierr); 284aa1aec99SHong Zhang 285c124e916SHong Zhang /* clean old values in C */ 286580bdb30SBarry Smith ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr); 287d50806bdSBarry Smith /* Traverse A row-wise. */ 288d50806bdSBarry Smith /* Build the ith row in C by summing over nonzero columns in A, */ 289d50806bdSBarry Smith /* the rows of B corresponding to nonzeros of A. */ 290d50806bdSBarry Smith for (i=0; i<am; i++) { 291d50806bdSBarry Smith anzi = ai[i+1] - ai[i]; 292d50806bdSBarry Smith for (j=0; j<anzi; j++) { 293519eb980SHong Zhang brow = aj[j]; 294d50806bdSBarry Smith bnzi = bi[brow+1] - bi[brow]; 295d50806bdSBarry Smith bjj = bj + bi[brow]; 296d50806bdSBarry Smith baj = ba + bi[brow]; 297519eb980SHong Zhang /* perform dense axpy */ 29836ec6d2dSHong Zhang valtmp = aa[j]; 299519eb980SHong Zhang for (k=0; k<bnzi; k++) { 30036ec6d2dSHong Zhang ab_dense[bjj[k]] += valtmp*baj[k]; 301519eb980SHong Zhang } 302519eb980SHong Zhang flops += 2*bnzi; 303519eb980SHong Zhang } 304c58ca2e3SHong Zhang aj += anzi; aa += anzi; 305c58ca2e3SHong Zhang 306c58ca2e3SHong Zhang cnzi = ci[i+1] - ci[i]; 307519eb980SHong Zhang for (k=0; k<cnzi; k++) { 308519eb980SHong Zhang ca[k] += ab_dense[cj[k]]; 309519eb980SHong Zhang ab_dense[cj[k]] = 0.0; /* zero ab_dense */ 310519eb980SHong Zhang } 311519eb980SHong Zhang flops += cnzi; 312519eb980SHong Zhang cj += cnzi; ca += cnzi; 313519eb980SHong Zhang } 3142e5835c6SStefano Zampini #if defined(PETSC_HAVE_DEVICE) 3152e5835c6SStefano Zampini if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU; 3162e5835c6SStefano Zampini #endif 317c58ca2e3SHong Zhang ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 318c58ca2e3SHong Zhang ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 319c58ca2e3SHong Zhang ierr = PetscLogFlops(flops);CHKERRQ(ierr); 3202e5835c6SStefano Zampini ierr = MatSeqAIJRestoreArrayRead(A,&aa);CHKERRQ(ierr); 3212e5835c6SStefano Zampini ierr = MatSeqAIJRestoreArrayRead(B,&ba);CHKERRQ(ierr); 322c58ca2e3SHong Zhang PetscFunctionReturn(0); 323c58ca2e3SHong Zhang } 324c58ca2e3SHong Zhang 32525023028SHong Zhang PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C) 326c58ca2e3SHong Zhang { 327c58ca2e3SHong Zhang PetscErrorCode ierr; 328c58ca2e3SHong Zhang PetscLogDouble flops=0.0; 329c58ca2e3SHong Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 330c58ca2e3SHong Zhang Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 331c58ca2e3SHong Zhang Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 332c58ca2e3SHong Zhang PetscInt *ai = a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j; 333c58ca2e3SHong Zhang PetscInt am = A->rmap->N,cm=C->rmap->N; 334c58ca2e3SHong Zhang PetscInt i,j,k,anzi,bnzi,cnzi,brow; 3352e5835c6SStefano Zampini PetscScalar *ca=c->a,valtmp; 3362e5835c6SStefano Zampini const PetscScalar *aa,*ba,*baj; 337c58ca2e3SHong Zhang PetscInt nextb; 338c58ca2e3SHong Zhang 339c58ca2e3SHong Zhang PetscFunctionBegin; 3402e5835c6SStefano Zampini ierr = MatSeqAIJGetArrayRead(A,&aa);CHKERRQ(ierr); 3412e5835c6SStefano Zampini ierr = MatSeqAIJGetArrayRead(B,&ba);CHKERRQ(ierr); 342cf742fccSHong Zhang if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */ 343cf742fccSHong Zhang ierr = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr); 344cf742fccSHong Zhang c->a = ca; 345cf742fccSHong Zhang c->free_a = PETSC_TRUE; 346cf742fccSHong Zhang } 347cf742fccSHong Zhang 348c58ca2e3SHong Zhang /* clean old values in C */ 349580bdb30SBarry Smith ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr); 350c58ca2e3SHong Zhang /* Traverse A row-wise. */ 351c58ca2e3SHong Zhang /* Build the ith row in C by summing over nonzero columns in A, */ 352c58ca2e3SHong Zhang /* the rows of B corresponding to nonzeros of A. */ 353519eb980SHong Zhang for (i=0; i<am; i++) { 354519eb980SHong Zhang anzi = ai[i+1] - ai[i]; 355519eb980SHong Zhang cnzi = ci[i+1] - ci[i]; 356519eb980SHong Zhang for (j=0; j<anzi; j++) { 357519eb980SHong Zhang brow = aj[j]; 358519eb980SHong Zhang bnzi = bi[brow+1] - bi[brow]; 359519eb980SHong Zhang bjj = bj + bi[brow]; 360519eb980SHong Zhang baj = ba + bi[brow]; 361519eb980SHong Zhang /* perform sparse axpy */ 36236ec6d2dSHong Zhang valtmp = aa[j]; 363c124e916SHong Zhang nextb = 0; 364c124e916SHong Zhang for (k=0; nextb<bnzi; k++) { 365c124e916SHong Zhang if (cj[k] == bjj[nextb]) { /* ccol == bcol */ 36636ec6d2dSHong Zhang ca[k] += valtmp*baj[nextb++]; 367c124e916SHong Zhang } 368d50806bdSBarry Smith } 369d50806bdSBarry Smith flops += 2*bnzi; 370d50806bdSBarry Smith } 371519eb980SHong Zhang aj += anzi; aa += anzi; 372519eb980SHong Zhang cj += cnzi; ca += cnzi; 373d50806bdSBarry Smith } 3742e5835c6SStefano Zampini #if defined(PETSC_HAVE_DEVICE) 3752e5835c6SStefano Zampini if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU; 3762e5835c6SStefano Zampini #endif 377716bacf3SKris Buschelman ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 378716bacf3SKris Buschelman ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 379d50806bdSBarry Smith ierr = PetscLogFlops(flops);CHKERRQ(ierr); 3802e5835c6SStefano Zampini ierr = MatSeqAIJRestoreArrayRead(A,&aa);CHKERRQ(ierr); 3812e5835c6SStefano Zampini ierr = MatSeqAIJRestoreArrayRead(B,&ba);CHKERRQ(ierr); 382d50806bdSBarry Smith PetscFunctionReturn(0); 383d50806bdSBarry Smith } 384bc011b1eSHong Zhang 3854222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A,Mat B,PetscReal fill,Mat C) 38625296bd5SBarry Smith { 38725296bd5SBarry Smith PetscErrorCode ierr; 38825296bd5SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 38925296bd5SBarry Smith PetscInt *ai = a->i,*bi=b->i,*ci,*cj; 3903c50cad2SHong Zhang PetscInt am = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 39125296bd5SBarry Smith MatScalar *ca; 39225296bd5SBarry Smith PetscReal afill; 393eca6b86aSHong Zhang PetscInt i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax; 394eca6b86aSHong Zhang PetscTable ta; 3950298fd71SBarry Smith PetscFreeSpaceList free_space=NULL,current_space=NULL; 39625296bd5SBarry Smith 39725296bd5SBarry Smith PetscFunctionBegin; 3983c50cad2SHong Zhang /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */ 3993c50cad2SHong Zhang /*-----------------------------------------------------------------------------------------*/ 4003c50cad2SHong Zhang /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 401854ce69bSBarry Smith ierr = PetscMalloc1(am+2,&ci);CHKERRQ(ierr); 4023c50cad2SHong Zhang ci[0] = 0; 4033c50cad2SHong Zhang 4043c50cad2SHong Zhang /* create and initialize a linked list */ 405c373ccc6SHong Zhang ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr); 406c373ccc6SHong Zhang MatRowMergeMax_SeqAIJ(b,bm,ta); 407eca6b86aSHong Zhang ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); 408eca6b86aSHong Zhang ierr = PetscTableDestroy(&ta);CHKERRQ(ierr); 409eca6b86aSHong Zhang 410eca6b86aSHong Zhang ierr = PetscLLCondensedCreate_fast(Crmax,&lnk);CHKERRQ(ierr); 4113c50cad2SHong Zhang 4123c50cad2SHong Zhang /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 413f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr); 4143c50cad2SHong Zhang current_space = free_space; 4153c50cad2SHong Zhang 4163c50cad2SHong Zhang /* Determine ci and cj */ 4173c50cad2SHong Zhang for (i=0; i<am; i++) { 4183c50cad2SHong Zhang anzi = ai[i+1] - ai[i]; 4193c50cad2SHong Zhang aj = a->j + ai[i]; 4203c50cad2SHong Zhang for (j=0; j<anzi; j++) { 4213c50cad2SHong Zhang brow = aj[j]; 4223c50cad2SHong Zhang bnzj = bi[brow+1] - bi[brow]; 4233c50cad2SHong Zhang bj = b->j + bi[brow]; 4243c50cad2SHong Zhang /* add non-zero cols of B into the sorted linked list lnk */ 4253c50cad2SHong Zhang ierr = PetscLLCondensedAddSorted_fast(bnzj,bj,lnk);CHKERRQ(ierr); 4263c50cad2SHong Zhang } 4278c78258cSHong Zhang /* add possible missing diagonal entry */ 4288c78258cSHong Zhang if (C->force_diagonals) { 4298c78258cSHong Zhang ierr = PetscLLCondensedAddSorted_fast(1,&i,lnk);CHKERRQ(ierr); 4308c78258cSHong Zhang } 4313c50cad2SHong Zhang cnzi = lnk[1]; 4323c50cad2SHong Zhang 4333c50cad2SHong Zhang /* If free space is not available, make more free space */ 4343c50cad2SHong Zhang /* Double the amount of total space in the list */ 4353c50cad2SHong Zhang if (current_space->local_remaining<cnzi) { 436f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),¤t_space);CHKERRQ(ierr); 4373c50cad2SHong Zhang ndouble++; 4383c50cad2SHong Zhang } 4393c50cad2SHong Zhang 4403c50cad2SHong Zhang /* Copy data into free space, then initialize lnk */ 4413c50cad2SHong Zhang ierr = PetscLLCondensedClean_fast(cnzi,current_space->array,lnk);CHKERRQ(ierr); 4422205254eSKarl Rupp 4433c50cad2SHong Zhang current_space->array += cnzi; 4443c50cad2SHong Zhang current_space->local_used += cnzi; 4453c50cad2SHong Zhang current_space->local_remaining -= cnzi; 4462205254eSKarl Rupp 4473c50cad2SHong Zhang ci[i+1] = ci[i] + cnzi; 4483c50cad2SHong Zhang } 4493c50cad2SHong Zhang 4503c50cad2SHong Zhang /* Column indices are in the list of free space */ 4513c50cad2SHong Zhang /* Allocate space for cj, initialize cj, and */ 4523c50cad2SHong Zhang /* destroy list of free space and other temporary array(s) */ 453854ce69bSBarry Smith ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr); 4543c50cad2SHong Zhang ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr); 4553c50cad2SHong Zhang ierr = PetscLLCondensedDestroy_fast(lnk);CHKERRQ(ierr); 45625296bd5SBarry Smith 45725296bd5SBarry Smith /* Allocate space for ca */ 458580bdb30SBarry Smith ierr = PetscCalloc1(ci[am]+1,&ca);CHKERRQ(ierr); 45925296bd5SBarry Smith 46025296bd5SBarry Smith /* put together the new symbolic matrix */ 461e4e71118SStefano Zampini ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C);CHKERRQ(ierr); 4624222ddf1SHong Zhang ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr); 46325296bd5SBarry Smith 46425296bd5SBarry Smith /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 46525296bd5SBarry Smith /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 4664222ddf1SHong Zhang c = (Mat_SeqAIJ*)(C->data); 46725296bd5SBarry Smith c->free_a = PETSC_TRUE; 46825296bd5SBarry Smith c->free_ij = PETSC_TRUE; 46925296bd5SBarry Smith c->nonew = 0; 4702205254eSKarl Rupp 4714222ddf1SHong Zhang /* slower, less memory */ 4724222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; 47325296bd5SBarry Smith 47425296bd5SBarry Smith /* set MatInfo */ 47525296bd5SBarry Smith afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 47625296bd5SBarry Smith if (afill < 1.0) afill = 1.0; 47725296bd5SBarry Smith c->maxnz = ci[am]; 47825296bd5SBarry Smith c->nz = ci[am]; 4794222ddf1SHong Zhang C->info.mallocs = ndouble; 4804222ddf1SHong Zhang C->info.fill_ratio_given = fill; 4814222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 48225296bd5SBarry Smith 48325296bd5SBarry Smith #if defined(PETSC_USE_INFO) 48425296bd5SBarry Smith if (ci[am]) { 4854222ddf1SHong Zhang ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 4864222ddf1SHong Zhang ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 48725296bd5SBarry Smith } else { 4884222ddf1SHong Zhang ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr); 48925296bd5SBarry Smith } 49025296bd5SBarry Smith #endif 49125296bd5SBarry Smith PetscFunctionReturn(0); 49225296bd5SBarry Smith } 49325296bd5SBarry Smith 4944222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat C) 495e9e4536cSHong Zhang { 496e9e4536cSHong Zhang PetscErrorCode ierr; 497e9e4536cSHong Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 498bf9555e6SHong Zhang PetscInt *ai = a->i,*bi=b->i,*ci,*cj; 49925c41797SHong Zhang PetscInt am = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 500e9e4536cSHong Zhang MatScalar *ca; 501e9e4536cSHong Zhang PetscReal afill; 502eca6b86aSHong Zhang PetscInt i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax; 503eca6b86aSHong Zhang PetscTable ta; 5040298fd71SBarry Smith PetscFreeSpaceList free_space=NULL,current_space=NULL; 505e9e4536cSHong Zhang 506e9e4536cSHong Zhang PetscFunctionBegin; 507bd958071SHong Zhang /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */ 508bd958071SHong Zhang /*---------------------------------------------------------------------------------------------*/ 509bd958071SHong Zhang /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 510854ce69bSBarry Smith ierr = PetscMalloc1(am+2,&ci);CHKERRQ(ierr); 511bd958071SHong Zhang ci[0] = 0; 512bd958071SHong Zhang 513bd958071SHong Zhang /* create and initialize a linked list */ 514c373ccc6SHong Zhang ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr); 515c373ccc6SHong Zhang MatRowMergeMax_SeqAIJ(b,bm,ta); 516eca6b86aSHong Zhang ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); 517eca6b86aSHong Zhang ierr = PetscTableDestroy(&ta);CHKERRQ(ierr); 518eca6b86aSHong Zhang ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr); 519bd958071SHong Zhang 520bd958071SHong Zhang /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 521f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr); 522bd958071SHong Zhang current_space = free_space; 523bd958071SHong Zhang 524bd958071SHong Zhang /* Determine ci and cj */ 525bd958071SHong Zhang for (i=0; i<am; i++) { 526bd958071SHong Zhang anzi = ai[i+1] - ai[i]; 527bd958071SHong Zhang aj = a->j + ai[i]; 528bd958071SHong Zhang for (j=0; j<anzi; j++) { 529bd958071SHong Zhang brow = aj[j]; 530bd958071SHong Zhang bnzj = bi[brow+1] - bi[brow]; 531bd958071SHong Zhang bj = b->j + bi[brow]; 532bd958071SHong Zhang /* add non-zero cols of B into the sorted linked list lnk */ 533bd958071SHong Zhang ierr = PetscLLCondensedAddSorted_Scalable(bnzj,bj,lnk);CHKERRQ(ierr); 534bd958071SHong Zhang } 5358c78258cSHong Zhang /* add possible missing diagonal entry */ 5368c78258cSHong Zhang if (C->force_diagonals) { 5378c78258cSHong Zhang ierr = PetscLLCondensedAddSorted_Scalable(1,&i,lnk);CHKERRQ(ierr); 5388c78258cSHong Zhang } 5398c78258cSHong Zhang 540bd958071SHong Zhang cnzi = lnk[0]; 541bd958071SHong Zhang 542bd958071SHong Zhang /* If free space is not available, make more free space */ 543bd958071SHong Zhang /* Double the amount of total space in the list */ 544bd958071SHong Zhang if (current_space->local_remaining<cnzi) { 545f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),¤t_space);CHKERRQ(ierr); 546bd958071SHong Zhang ndouble++; 547bd958071SHong Zhang } 548bd958071SHong Zhang 549bd958071SHong Zhang /* Copy data into free space, then initialize lnk */ 550bd958071SHong Zhang ierr = PetscLLCondensedClean_Scalable(cnzi,current_space->array,lnk);CHKERRQ(ierr); 5512205254eSKarl Rupp 552bd958071SHong Zhang current_space->array += cnzi; 553bd958071SHong Zhang current_space->local_used += cnzi; 554bd958071SHong Zhang current_space->local_remaining -= cnzi; 5552205254eSKarl Rupp 556bd958071SHong Zhang ci[i+1] = ci[i] + cnzi; 557bd958071SHong Zhang } 558bd958071SHong Zhang 559bd958071SHong Zhang /* Column indices are in the list of free space */ 560bd958071SHong Zhang /* Allocate space for cj, initialize cj, and */ 561bd958071SHong Zhang /* destroy list of free space and other temporary array(s) */ 562854ce69bSBarry Smith ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr); 563bd958071SHong Zhang ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr); 564bd958071SHong Zhang ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr); 565e9e4536cSHong Zhang 566e9e4536cSHong Zhang /* Allocate space for ca */ 567bd958071SHong Zhang /*-----------------------*/ 568580bdb30SBarry Smith ierr = PetscCalloc1(ci[am]+1,&ca);CHKERRQ(ierr); 569e9e4536cSHong Zhang 570e9e4536cSHong Zhang /* put together the new symbolic matrix */ 571e4e71118SStefano Zampini ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C);CHKERRQ(ierr); 5724222ddf1SHong Zhang ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr); 573e9e4536cSHong Zhang 574e9e4536cSHong Zhang /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 575e9e4536cSHong Zhang /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 5764222ddf1SHong Zhang c = (Mat_SeqAIJ*)(C->data); 577e9e4536cSHong Zhang c->free_a = PETSC_TRUE; 578e9e4536cSHong Zhang c->free_ij = PETSC_TRUE; 579e9e4536cSHong Zhang c->nonew = 0; 5802205254eSKarl Rupp 5814222ddf1SHong Zhang /* slower, less memory */ 5824222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; 583e9e4536cSHong Zhang 584e9e4536cSHong Zhang /* set MatInfo */ 585e9e4536cSHong Zhang afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 586e9e4536cSHong Zhang if (afill < 1.0) afill = 1.0; 587e9e4536cSHong Zhang c->maxnz = ci[am]; 588e9e4536cSHong Zhang c->nz = ci[am]; 5894222ddf1SHong Zhang C->info.mallocs = ndouble; 5904222ddf1SHong Zhang C->info.fill_ratio_given = fill; 5914222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 592e9e4536cSHong Zhang 593e9e4536cSHong Zhang #if defined(PETSC_USE_INFO) 594e9e4536cSHong Zhang if (ci[am]) { 5954222ddf1SHong Zhang ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 5964222ddf1SHong Zhang ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 597e9e4536cSHong Zhang } else { 5984222ddf1SHong Zhang ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr); 599e9e4536cSHong Zhang } 600e9e4536cSHong Zhang #endif 601e9e4536cSHong Zhang PetscFunctionReturn(0); 602e9e4536cSHong Zhang } 603e9e4536cSHong Zhang 6044222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A,Mat B,PetscReal fill,Mat C) 6050ced3a2bSJed Brown { 6060ced3a2bSJed Brown PetscErrorCode ierr; 6070ced3a2bSJed Brown Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 6080ced3a2bSJed Brown const PetscInt *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j; 6090ced3a2bSJed Brown PetscInt *ci,*cj,*bb; 6100ced3a2bSJed Brown PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 6110ced3a2bSJed Brown PetscReal afill; 6120ced3a2bSJed Brown PetscInt i,j,col,ndouble = 0; 6130298fd71SBarry Smith PetscFreeSpaceList free_space=NULL,current_space=NULL; 6140ced3a2bSJed Brown PetscHeap h; 6150ced3a2bSJed Brown 6160ced3a2bSJed Brown PetscFunctionBegin; 617cd093f1dSJed Brown /* Get ci and cj - by merging sorted rows using a heap */ 6180ced3a2bSJed Brown /*---------------------------------------------------------------------------------------------*/ 6190ced3a2bSJed Brown /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 620854ce69bSBarry Smith ierr = PetscMalloc1(am+2,&ci);CHKERRQ(ierr); 6210ced3a2bSJed Brown ci[0] = 0; 6220ced3a2bSJed Brown 6230ced3a2bSJed Brown /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 624f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr); 6250ced3a2bSJed Brown current_space = free_space; 6260ced3a2bSJed Brown 6270ced3a2bSJed Brown ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr); 628785e854fSJed Brown ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr); 6290ced3a2bSJed Brown 6300ced3a2bSJed Brown /* Determine ci and cj */ 6310ced3a2bSJed Brown for (i=0; i<am; i++) { 6320ced3a2bSJed Brown const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 6330ced3a2bSJed Brown const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 6340ced3a2bSJed Brown ci[i+1] = ci[i]; 6350ced3a2bSJed Brown /* Populate the min heap */ 6360ced3a2bSJed Brown for (j=0; j<anzi; j++) { 6370ced3a2bSJed Brown bb[j] = bi[acol[j]]; /* bb points at the start of the row */ 6380ced3a2bSJed Brown if (bb[j] < bi[acol[j]+1]) { /* Add if row is nonempty */ 6390ced3a2bSJed Brown ierr = PetscHeapAdd(h,j,bj[bb[j]++]);CHKERRQ(ierr); 6400ced3a2bSJed Brown } 6410ced3a2bSJed Brown } 6420ced3a2bSJed Brown /* Pick off the min element, adding it to free space */ 6430ced3a2bSJed Brown ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr); 6440ced3a2bSJed Brown while (j >= 0) { 6450ced3a2bSJed Brown if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */ 646f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),¤t_space);CHKERRQ(ierr); 6470ced3a2bSJed Brown ndouble++; 6480ced3a2bSJed Brown } 6490ced3a2bSJed Brown *(current_space->array++) = col; 6500ced3a2bSJed Brown current_space->local_used++; 6510ced3a2bSJed Brown current_space->local_remaining--; 6520ced3a2bSJed Brown ci[i+1]++; 6530ced3a2bSJed Brown 6540ced3a2bSJed Brown /* stash if anything else remains in this row of B */ 6550ced3a2bSJed Brown if (bb[j] < bi[acol[j]+1]) {ierr = PetscHeapStash(h,j,bj[bb[j]++]);CHKERRQ(ierr);} 6560ced3a2bSJed Brown while (1) { /* pop and stash any other rows of B that also had an entry in this column */ 6570ced3a2bSJed Brown PetscInt j2,col2; 6580ced3a2bSJed Brown ierr = PetscHeapPeek(h,&j2,&col2);CHKERRQ(ierr); 6590ced3a2bSJed Brown if (col2 != col) break; 6600ced3a2bSJed Brown ierr = PetscHeapPop(h,&j2,&col2);CHKERRQ(ierr); 6610ced3a2bSJed Brown if (bb[j2] < bi[acol[j2]+1]) {ierr = PetscHeapStash(h,j2,bj[bb[j2]++]);CHKERRQ(ierr);} 6620ced3a2bSJed Brown } 6630ced3a2bSJed Brown /* Put any stashed elements back into the min heap */ 6640ced3a2bSJed Brown ierr = PetscHeapUnstash(h);CHKERRQ(ierr); 6650ced3a2bSJed Brown ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr); 6660ced3a2bSJed Brown } 6670ced3a2bSJed Brown } 6680ced3a2bSJed Brown ierr = PetscFree(bb);CHKERRQ(ierr); 6690ced3a2bSJed Brown ierr = PetscHeapDestroy(&h);CHKERRQ(ierr); 6700ced3a2bSJed Brown 6710ced3a2bSJed Brown /* Column indices are in the list of free space */ 6720ced3a2bSJed Brown /* Allocate space for cj, initialize cj, and */ 6730ced3a2bSJed Brown /* destroy list of free space and other temporary array(s) */ 674785e854fSJed Brown ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr); 6750ced3a2bSJed Brown ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr); 6760ced3a2bSJed Brown 6770ced3a2bSJed Brown /* put together the new symbolic matrix */ 678e4e71118SStefano Zampini ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr); 6794222ddf1SHong Zhang ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr); 6800ced3a2bSJed Brown 6810ced3a2bSJed Brown /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 6820ced3a2bSJed Brown /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 6834222ddf1SHong Zhang c = (Mat_SeqAIJ*)(C->data); 6840ced3a2bSJed Brown c->free_a = PETSC_TRUE; 6850ced3a2bSJed Brown c->free_ij = PETSC_TRUE; 6860ced3a2bSJed Brown c->nonew = 0; 68726fbe8dcSKarl Rupp 6884222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 6890ced3a2bSJed Brown 6900ced3a2bSJed Brown /* set MatInfo */ 6910ced3a2bSJed Brown afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 6920ced3a2bSJed Brown if (afill < 1.0) afill = 1.0; 6930ced3a2bSJed Brown c->maxnz = ci[am]; 6940ced3a2bSJed Brown c->nz = ci[am]; 6954222ddf1SHong Zhang C->info.mallocs = ndouble; 6964222ddf1SHong Zhang C->info.fill_ratio_given = fill; 6974222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 6980ced3a2bSJed Brown 6990ced3a2bSJed Brown #if defined(PETSC_USE_INFO) 7000ced3a2bSJed Brown if (ci[am]) { 7014222ddf1SHong Zhang ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 7024222ddf1SHong Zhang ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 7030ced3a2bSJed Brown } else { 7044222ddf1SHong Zhang ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr); 7050ced3a2bSJed Brown } 7060ced3a2bSJed Brown #endif 7070ced3a2bSJed Brown PetscFunctionReturn(0); 7080ced3a2bSJed Brown } 709e9e4536cSHong Zhang 7104222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A,Mat B,PetscReal fill,Mat C) 7118a07c6f1SJed Brown { 7128a07c6f1SJed Brown PetscErrorCode ierr; 7138a07c6f1SJed Brown Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 7148a07c6f1SJed Brown const PetscInt *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j; 7158a07c6f1SJed Brown PetscInt *ci,*cj,*bb; 7168a07c6f1SJed Brown PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 7178a07c6f1SJed Brown PetscReal afill; 7188a07c6f1SJed Brown PetscInt i,j,col,ndouble = 0; 7190298fd71SBarry Smith PetscFreeSpaceList free_space=NULL,current_space=NULL; 7208a07c6f1SJed Brown PetscHeap h; 7218a07c6f1SJed Brown PetscBT bt; 7228a07c6f1SJed Brown 7238a07c6f1SJed Brown PetscFunctionBegin; 724cd093f1dSJed Brown /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */ 7258a07c6f1SJed Brown /*---------------------------------------------------------------------------------------------*/ 7268a07c6f1SJed Brown /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 727854ce69bSBarry Smith ierr = PetscMalloc1(am+2,&ci);CHKERRQ(ierr); 7288a07c6f1SJed Brown ci[0] = 0; 7298a07c6f1SJed Brown 7308a07c6f1SJed Brown /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 731f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr); 7322205254eSKarl Rupp 7338a07c6f1SJed Brown current_space = free_space; 7348a07c6f1SJed Brown 7358a07c6f1SJed Brown ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr); 736785e854fSJed Brown ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr); 7378a07c6f1SJed Brown ierr = PetscBTCreate(bn,&bt);CHKERRQ(ierr); 7388a07c6f1SJed Brown 7398a07c6f1SJed Brown /* Determine ci and cj */ 7408a07c6f1SJed Brown for (i=0; i<am; i++) { 7418a07c6f1SJed Brown const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 7428a07c6f1SJed Brown const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 7438a07c6f1SJed Brown const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */ 7448a07c6f1SJed Brown ci[i+1] = ci[i]; 7458a07c6f1SJed Brown /* Populate the min heap */ 7468a07c6f1SJed Brown for (j=0; j<anzi; j++) { 7478a07c6f1SJed Brown PetscInt brow = acol[j]; 7488a07c6f1SJed Brown for (bb[j] = bi[brow]; bb[j] < bi[brow+1]; bb[j]++) { 7498a07c6f1SJed Brown PetscInt bcol = bj[bb[j]]; 7508a07c6f1SJed Brown if (!PetscBTLookupSet(bt,bcol)) { /* new entry */ 7518a07c6f1SJed Brown ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr); 7528a07c6f1SJed Brown bb[j]++; 7538a07c6f1SJed Brown break; 7548a07c6f1SJed Brown } 7558a07c6f1SJed Brown } 7568a07c6f1SJed Brown } 7578a07c6f1SJed Brown /* Pick off the min element, adding it to free space */ 7588a07c6f1SJed Brown ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr); 7598a07c6f1SJed Brown while (j >= 0) { 7608a07c6f1SJed Brown if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */ 7610298fd71SBarry Smith fptr = NULL; /* need PetscBTMemzero */ 762f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),¤t_space);CHKERRQ(ierr); 7638a07c6f1SJed Brown ndouble++; 7648a07c6f1SJed Brown } 7658a07c6f1SJed Brown *(current_space->array++) = col; 7668a07c6f1SJed Brown current_space->local_used++; 7678a07c6f1SJed Brown current_space->local_remaining--; 7688a07c6f1SJed Brown ci[i+1]++; 7698a07c6f1SJed Brown 7708a07c6f1SJed Brown /* stash if anything else remains in this row of B */ 7718a07c6f1SJed Brown for (; bb[j] < bi[acol[j]+1]; bb[j]++) { 7728a07c6f1SJed Brown PetscInt bcol = bj[bb[j]]; 7738a07c6f1SJed Brown if (!PetscBTLookupSet(bt,bcol)) { /* new entry */ 7748a07c6f1SJed Brown ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr); 7758a07c6f1SJed Brown bb[j]++; 7768a07c6f1SJed Brown break; 7778a07c6f1SJed Brown } 7788a07c6f1SJed Brown } 7798a07c6f1SJed Brown ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr); 7808a07c6f1SJed Brown } 7818a07c6f1SJed Brown if (fptr) { /* Clear the bits for this row */ 7828a07c6f1SJed Brown for (; fptr<current_space->array; fptr++) {ierr = PetscBTClear(bt,*fptr);CHKERRQ(ierr);} 7838a07c6f1SJed Brown } else { /* We reallocated so we don't remember (easily) how to clear only the bits we changed */ 7848a07c6f1SJed Brown ierr = PetscBTMemzero(bn,bt);CHKERRQ(ierr); 7858a07c6f1SJed Brown } 7868a07c6f1SJed Brown } 7878a07c6f1SJed Brown ierr = PetscFree(bb);CHKERRQ(ierr); 7888a07c6f1SJed Brown ierr = PetscHeapDestroy(&h);CHKERRQ(ierr); 7898a07c6f1SJed Brown ierr = PetscBTDestroy(&bt);CHKERRQ(ierr); 7908a07c6f1SJed Brown 7918a07c6f1SJed Brown /* Column indices are in the list of free space */ 7928a07c6f1SJed Brown /* Allocate space for cj, initialize cj, and */ 7938a07c6f1SJed Brown /* destroy list of free space and other temporary array(s) */ 794785e854fSJed Brown ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr); 7958a07c6f1SJed Brown ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr); 7968a07c6f1SJed Brown 7978a07c6f1SJed Brown /* put together the new symbolic matrix */ 798e4e71118SStefano Zampini ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr); 7994222ddf1SHong Zhang ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr); 8008a07c6f1SJed Brown 8018a07c6f1SJed Brown /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 8028a07c6f1SJed Brown /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 8034222ddf1SHong Zhang c = (Mat_SeqAIJ*)(C->data); 8048a07c6f1SJed Brown c->free_a = PETSC_TRUE; 8058a07c6f1SJed Brown c->free_ij = PETSC_TRUE; 8068a07c6f1SJed Brown c->nonew = 0; 80726fbe8dcSKarl Rupp 8084222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 8098a07c6f1SJed Brown 8108a07c6f1SJed Brown /* set MatInfo */ 8118a07c6f1SJed Brown afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 8128a07c6f1SJed Brown if (afill < 1.0) afill = 1.0; 8138a07c6f1SJed Brown c->maxnz = ci[am]; 8148a07c6f1SJed Brown c->nz = ci[am]; 8154222ddf1SHong Zhang C->info.mallocs = ndouble; 8164222ddf1SHong Zhang C->info.fill_ratio_given = fill; 8174222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 8188a07c6f1SJed Brown 8198a07c6f1SJed Brown #if defined(PETSC_USE_INFO) 8208a07c6f1SJed Brown if (ci[am]) { 8214222ddf1SHong Zhang ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 8224222ddf1SHong Zhang ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 8238a07c6f1SJed Brown } else { 8244222ddf1SHong Zhang ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr); 8258a07c6f1SJed Brown } 8268a07c6f1SJed Brown #endif 8278a07c6f1SJed Brown PetscFunctionReturn(0); 8288a07c6f1SJed Brown } 8298a07c6f1SJed Brown 8304222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A,Mat B,PetscReal fill,Mat C) 831d7ed1a4dSandi selinger { 832d7ed1a4dSandi selinger PetscErrorCode ierr; 833d7ed1a4dSandi selinger Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 834d7ed1a4dSandi selinger const PetscInt *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j,*inputi,*inputj,*inputcol,*inputcol_L1; 835d7ed1a4dSandi selinger PetscInt *ci,*cj,*outputj,worki_L1[9],worki_L2[9]; 836d7ed1a4dSandi selinger PetscInt c_maxmem,a_maxrownnz=0,a_rownnz; 837d7ed1a4dSandi selinger const PetscInt workcol[8]={0,1,2,3,4,5,6,7}; 838d7ed1a4dSandi selinger const PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 839d7ed1a4dSandi selinger const PetscInt *brow_ptr[8],*brow_end[8]; 840d7ed1a4dSandi selinger PetscInt window[8]; 841d7ed1a4dSandi selinger PetscInt window_min,old_window_min,ci_nnz,outputi_nnz=0,L1_nrows,L2_nrows; 842d7ed1a4dSandi selinger PetscInt i,k,ndouble=0,L1_rowsleft,rowsleft; 843d7ed1a4dSandi selinger PetscReal afill; 844f83700f2Sandi selinger PetscInt *workj_L1,*workj_L2,*workj_L3; 8457660c4dbSandi selinger PetscInt L1_nnz,L2_nnz; 846d7ed1a4dSandi selinger 847d7ed1a4dSandi selinger /* Step 1: Get upper bound on memory required for allocation. 848d7ed1a4dSandi selinger Because of the way virtual memory works, 849d7ed1a4dSandi selinger only the memory pages that are actually needed will be physically allocated. */ 850d7ed1a4dSandi selinger PetscFunctionBegin; 851d7ed1a4dSandi selinger ierr = PetscMalloc1(am+1,&ci);CHKERRQ(ierr); 852d7ed1a4dSandi selinger for (i=0; i<am; i++) { 853d7ed1a4dSandi selinger const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 854d7ed1a4dSandi selinger const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 855d7ed1a4dSandi selinger a_rownnz = 0; 856d7ed1a4dSandi selinger for (k=0; k<anzi; ++k) { 857d7ed1a4dSandi selinger a_rownnz += bi[acol[k]+1] - bi[acol[k]]; 858d7ed1a4dSandi selinger if (a_rownnz > bn) { 859d7ed1a4dSandi selinger a_rownnz = bn; 860d7ed1a4dSandi selinger break; 861d7ed1a4dSandi selinger } 862d7ed1a4dSandi selinger } 863d7ed1a4dSandi selinger a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz); 864d7ed1a4dSandi selinger } 865d7ed1a4dSandi selinger /* temporary work areas for merging rows */ 866d7ed1a4dSandi selinger ierr = PetscMalloc1(a_maxrownnz*8,&workj_L1);CHKERRQ(ierr); 867f83700f2Sandi selinger ierr = PetscMalloc1(a_maxrownnz*8,&workj_L2);CHKERRQ(ierr); 868f83700f2Sandi selinger ierr = PetscMalloc1(a_maxrownnz,&workj_L3);CHKERRQ(ierr); 869d7ed1a4dSandi selinger 8707660c4dbSandi selinger /* This should be enough for almost all matrices. If not, memory is reallocated later. */ 8717660c4dbSandi selinger c_maxmem = 8*(ai[am]+bi[bm]); 872d7ed1a4dSandi selinger /* Step 2: Populate pattern for C */ 873d7ed1a4dSandi selinger ierr = PetscMalloc1(c_maxmem,&cj);CHKERRQ(ierr); 874d7ed1a4dSandi selinger 875d7ed1a4dSandi selinger ci_nnz = 0; 876d7ed1a4dSandi selinger ci[0] = 0; 877d7ed1a4dSandi selinger worki_L1[0] = 0; 878d7ed1a4dSandi selinger worki_L2[0] = 0; 879d7ed1a4dSandi selinger for (i=0; i<am; i++) { 880d7ed1a4dSandi selinger const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 881d7ed1a4dSandi selinger const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 882d7ed1a4dSandi selinger rowsleft = anzi; 883d7ed1a4dSandi selinger inputcol_L1 = acol; 884d7ed1a4dSandi selinger L2_nnz = 0; 8857660c4dbSandi selinger L2_nrows = 1; /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1 */ 8867660c4dbSandi selinger worki_L2[1] = 0; 88708fe1b3cSKarl Rupp outputi_nnz = 0; 888d7ed1a4dSandi selinger 889d7ed1a4dSandi selinger /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem -> allocate more memory */ 890d7ed1a4dSandi selinger while (ci_nnz+a_maxrownnz > c_maxmem) { 891d7ed1a4dSandi selinger c_maxmem *= 2; 892d7ed1a4dSandi selinger ndouble++; 893d7ed1a4dSandi selinger ierr = PetscRealloc(sizeof(PetscInt)*c_maxmem,&cj);CHKERRQ(ierr); 894d7ed1a4dSandi selinger } 895d7ed1a4dSandi selinger 896d7ed1a4dSandi selinger while (rowsleft) { 897d7ed1a4dSandi selinger L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */ 898d7ed1a4dSandi selinger L1_nrows = 0; 8997660c4dbSandi selinger L1_nnz = 0; 900d7ed1a4dSandi selinger inputcol = inputcol_L1; 901d7ed1a4dSandi selinger inputi = bi; 902d7ed1a4dSandi selinger inputj = bj; 903d7ed1a4dSandi selinger 904d7ed1a4dSandi selinger /* The following macro is used to specialize for small rows in A. 905d7ed1a4dSandi selinger This helps with compiler unrolling, improving performance substantially. 906f83700f2Sandi selinger Input: inputj inputi inputcol bn 907d7ed1a4dSandi selinger Output: outputj outputi_nnz */ 908d7ed1a4dSandi selinger #define MatMatMultSymbolic_RowMergeMacro(ANNZ) \ 909d7ed1a4dSandi selinger window_min = bn; \ 9107660c4dbSandi selinger outputi_nnz = 0; \ 9117660c4dbSandi selinger for (k=0; k<ANNZ; ++k) { \ 912d7ed1a4dSandi selinger brow_ptr[k] = inputj + inputi[inputcol[k]]; \ 913d7ed1a4dSandi selinger brow_end[k] = inputj + inputi[inputcol[k]+1]; \ 914d7ed1a4dSandi selinger window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \ 915d7ed1a4dSandi selinger window_min = PetscMin(window[k], window_min); \ 916d7ed1a4dSandi selinger } \ 917d7ed1a4dSandi selinger while (window_min < bn) { \ 918d7ed1a4dSandi selinger outputj[outputi_nnz++] = window_min; \ 919d7ed1a4dSandi selinger /* advance front and compute new minimum */ \ 920d7ed1a4dSandi selinger old_window_min = window_min; \ 921d7ed1a4dSandi selinger window_min = bn; \ 922d7ed1a4dSandi selinger for (k=0; k<ANNZ; ++k) { \ 923d7ed1a4dSandi selinger if (window[k] == old_window_min) { \ 924d7ed1a4dSandi selinger brow_ptr[k]++; \ 925d7ed1a4dSandi selinger window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \ 926d7ed1a4dSandi selinger } \ 927d7ed1a4dSandi selinger window_min = PetscMin(window[k], window_min); \ 928d7ed1a4dSandi selinger } \ 929d7ed1a4dSandi selinger } 930d7ed1a4dSandi selinger 931d7ed1a4dSandi selinger /************** L E V E L 1 ***************/ 932d7ed1a4dSandi selinger /* Merge up to 8 rows of B to L1 work array*/ 933d7ed1a4dSandi selinger while (L1_rowsleft) { 9347660c4dbSandi selinger outputi_nnz = 0; 9357660c4dbSandi selinger if (anzi > 8) outputj = workj_L1 + L1_nnz; /* Level 1 rowmerge*/ 9367660c4dbSandi selinger else outputj = cj + ci_nnz; /* Merge directly to C */ 9377660c4dbSandi selinger 938d7ed1a4dSandi selinger switch (L1_rowsleft) { 939d7ed1a4dSandi selinger case 1: brow_ptr[0] = inputj + inputi[inputcol[0]]; 940d7ed1a4dSandi selinger brow_end[0] = inputj + inputi[inputcol[0]+1]; 941d7ed1a4dSandi selinger for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 942d7ed1a4dSandi selinger inputcol += L1_rowsleft; 943d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 944d7ed1a4dSandi selinger L1_rowsleft = 0; 945d7ed1a4dSandi selinger break; 946d7ed1a4dSandi selinger case 2: MatMatMultSymbolic_RowMergeMacro(2); 947d7ed1a4dSandi selinger inputcol += L1_rowsleft; 948d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 949d7ed1a4dSandi selinger L1_rowsleft = 0; 950d7ed1a4dSandi selinger break; 951d7ed1a4dSandi selinger case 3: MatMatMultSymbolic_RowMergeMacro(3); 952d7ed1a4dSandi selinger inputcol += L1_rowsleft; 953d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 954d7ed1a4dSandi selinger L1_rowsleft = 0; 955d7ed1a4dSandi selinger break; 956d7ed1a4dSandi selinger case 4: MatMatMultSymbolic_RowMergeMacro(4); 957d7ed1a4dSandi selinger inputcol += L1_rowsleft; 958d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 959d7ed1a4dSandi selinger L1_rowsleft = 0; 960d7ed1a4dSandi selinger break; 961d7ed1a4dSandi selinger case 5: MatMatMultSymbolic_RowMergeMacro(5); 962d7ed1a4dSandi selinger inputcol += L1_rowsleft; 963d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 964d7ed1a4dSandi selinger L1_rowsleft = 0; 965d7ed1a4dSandi selinger break; 966d7ed1a4dSandi selinger case 6: MatMatMultSymbolic_RowMergeMacro(6); 967d7ed1a4dSandi selinger inputcol += L1_rowsleft; 968d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 969d7ed1a4dSandi selinger L1_rowsleft = 0; 970d7ed1a4dSandi selinger break; 971d7ed1a4dSandi selinger case 7: MatMatMultSymbolic_RowMergeMacro(7); 972d7ed1a4dSandi selinger inputcol += L1_rowsleft; 973d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 974d7ed1a4dSandi selinger L1_rowsleft = 0; 975d7ed1a4dSandi selinger break; 976d7ed1a4dSandi selinger default: MatMatMultSymbolic_RowMergeMacro(8); 977d7ed1a4dSandi selinger inputcol += 8; 978d7ed1a4dSandi selinger rowsleft -= 8; 979d7ed1a4dSandi selinger L1_rowsleft -= 8; 980d7ed1a4dSandi selinger break; 981d7ed1a4dSandi selinger } 982d7ed1a4dSandi selinger inputcol_L1 = inputcol; 9837660c4dbSandi selinger L1_nnz += outputi_nnz; 9847660c4dbSandi selinger worki_L1[++L1_nrows] = L1_nnz; 985d7ed1a4dSandi selinger } 986d7ed1a4dSandi selinger 987d7ed1a4dSandi selinger /********************** L E V E L 2 ************************/ 988d7ed1a4dSandi selinger /* Merge from L1 work array to either C or to L2 work array */ 989d7ed1a4dSandi selinger if (anzi > 8) { 990d7ed1a4dSandi selinger inputi = worki_L1; 991d7ed1a4dSandi selinger inputj = workj_L1; 992d7ed1a4dSandi selinger inputcol = workcol; 993d7ed1a4dSandi selinger outputi_nnz = 0; 994d7ed1a4dSandi selinger 995d7ed1a4dSandi selinger if (anzi <= 64) outputj = cj + ci_nnz; /* Merge from L1 work array to C */ 996d7ed1a4dSandi selinger else outputj = workj_L2 + L2_nnz; /* Merge from L1 work array to L2 work array */ 997d7ed1a4dSandi selinger 998d7ed1a4dSandi selinger switch (L1_nrows) { 999d7ed1a4dSandi selinger case 1: brow_ptr[0] = inputj + inputi[inputcol[0]]; 1000d7ed1a4dSandi selinger brow_end[0] = inputj + inputi[inputcol[0]+1]; 1001d7ed1a4dSandi selinger for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 1002d7ed1a4dSandi selinger break; 1003d7ed1a4dSandi selinger case 2: MatMatMultSymbolic_RowMergeMacro(2); break; 1004d7ed1a4dSandi selinger case 3: MatMatMultSymbolic_RowMergeMacro(3); break; 1005d7ed1a4dSandi selinger case 4: MatMatMultSymbolic_RowMergeMacro(4); break; 1006d7ed1a4dSandi selinger case 5: MatMatMultSymbolic_RowMergeMacro(5); break; 1007d7ed1a4dSandi selinger case 6: MatMatMultSymbolic_RowMergeMacro(6); break; 1008d7ed1a4dSandi selinger case 7: MatMatMultSymbolic_RowMergeMacro(7); break; 1009d7ed1a4dSandi selinger case 8: MatMatMultSymbolic_RowMergeMacro(8); break; 1010d7ed1a4dSandi selinger default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L1 work array!"); 1011d7ed1a4dSandi selinger } 1012d7ed1a4dSandi selinger L2_nnz += outputi_nnz; 1013d7ed1a4dSandi selinger worki_L2[++L2_nrows] = L2_nnz; 1014d7ed1a4dSandi selinger 10157660c4dbSandi selinger /************************ L E V E L 3 **********************/ 10167660c4dbSandi selinger /* Merge from L2 work array to either C or to L2 work array */ 1017d7ed1a4dSandi selinger if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) { 1018d7ed1a4dSandi selinger inputi = worki_L2; 1019d7ed1a4dSandi selinger inputj = workj_L2; 1020d7ed1a4dSandi selinger inputcol = workcol; 1021d7ed1a4dSandi selinger outputi_nnz = 0; 1022f83700f2Sandi selinger if (rowsleft) outputj = workj_L3; 1023d7ed1a4dSandi selinger else outputj = cj + ci_nnz; 1024d7ed1a4dSandi selinger switch (L2_nrows) { 1025d7ed1a4dSandi selinger case 1: brow_ptr[0] = inputj + inputi[inputcol[0]]; 1026d7ed1a4dSandi selinger brow_end[0] = inputj + inputi[inputcol[0]+1]; 1027d7ed1a4dSandi selinger for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 1028d7ed1a4dSandi selinger break; 1029d7ed1a4dSandi selinger case 2: MatMatMultSymbolic_RowMergeMacro(2); break; 1030d7ed1a4dSandi selinger case 3: MatMatMultSymbolic_RowMergeMacro(3); break; 1031d7ed1a4dSandi selinger case 4: MatMatMultSymbolic_RowMergeMacro(4); break; 1032d7ed1a4dSandi selinger case 5: MatMatMultSymbolic_RowMergeMacro(5); break; 1033d7ed1a4dSandi selinger case 6: MatMatMultSymbolic_RowMergeMacro(6); break; 1034d7ed1a4dSandi selinger case 7: MatMatMultSymbolic_RowMergeMacro(7); break; 1035d7ed1a4dSandi selinger case 8: MatMatMultSymbolic_RowMergeMacro(8); break; 1036d7ed1a4dSandi selinger default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L2 work array!"); 1037d7ed1a4dSandi selinger } 1038d7ed1a4dSandi selinger L2_nrows = 1; 10397660c4dbSandi selinger L2_nnz = outputi_nnz; 10407660c4dbSandi selinger worki_L2[1] = outputi_nnz; 10417660c4dbSandi selinger /* Copy to workj_L2 */ 1042d7ed1a4dSandi selinger if (rowsleft) { 10437660c4dbSandi selinger for (k=0; k<outputi_nnz; ++k) workj_L2[k] = outputj[k]; 1044d7ed1a4dSandi selinger } 1045d7ed1a4dSandi selinger } 1046d7ed1a4dSandi selinger } 1047d7ed1a4dSandi selinger } /* while (rowsleft) */ 1048d7ed1a4dSandi selinger #undef MatMatMultSymbolic_RowMergeMacro 1049d7ed1a4dSandi selinger 1050d7ed1a4dSandi selinger /* terminate current row */ 1051d7ed1a4dSandi selinger ci_nnz += outputi_nnz; 1052d7ed1a4dSandi selinger ci[i+1] = ci_nnz; 1053d7ed1a4dSandi selinger } 1054d7ed1a4dSandi selinger 1055d7ed1a4dSandi selinger /* Step 3: Create the new symbolic matrix */ 1056e4e71118SStefano Zampini ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr); 10574222ddf1SHong Zhang ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr); 1058d7ed1a4dSandi selinger 1059d7ed1a4dSandi selinger /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 1060d7ed1a4dSandi selinger /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 10614222ddf1SHong Zhang c = (Mat_SeqAIJ*)(C->data); 1062d7ed1a4dSandi selinger c->free_a = PETSC_TRUE; 1063d7ed1a4dSandi selinger c->free_ij = PETSC_TRUE; 1064d7ed1a4dSandi selinger c->nonew = 0; 1065d7ed1a4dSandi selinger 10664222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 1067d7ed1a4dSandi selinger 1068d7ed1a4dSandi selinger /* set MatInfo */ 1069d7ed1a4dSandi selinger afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 1070d7ed1a4dSandi selinger if (afill < 1.0) afill = 1.0; 1071d7ed1a4dSandi selinger c->maxnz = ci[am]; 1072d7ed1a4dSandi selinger c->nz = ci[am]; 10734222ddf1SHong Zhang C->info.mallocs = ndouble; 10744222ddf1SHong Zhang C->info.fill_ratio_given = fill; 10754222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 1076d7ed1a4dSandi selinger 1077d7ed1a4dSandi selinger #if defined(PETSC_USE_INFO) 1078d7ed1a4dSandi selinger if (ci[am]) { 10794222ddf1SHong Zhang ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 10804222ddf1SHong Zhang ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 1081d7ed1a4dSandi selinger } else { 10824222ddf1SHong Zhang ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr); 1083d7ed1a4dSandi selinger } 1084d7ed1a4dSandi selinger #endif 1085d7ed1a4dSandi selinger 1086d7ed1a4dSandi selinger /* Step 4: Free temporary work areas */ 1087d7ed1a4dSandi selinger ierr = PetscFree(workj_L1);CHKERRQ(ierr); 1088d7ed1a4dSandi selinger ierr = PetscFree(workj_L2);CHKERRQ(ierr); 1089f83700f2Sandi selinger ierr = PetscFree(workj_L3);CHKERRQ(ierr); 1090d7ed1a4dSandi selinger PetscFunctionReturn(0); 1091d7ed1a4dSandi selinger } 1092d7ed1a4dSandi selinger 1093cd093f1dSJed Brown /* concatenate unique entries and then sort */ 10944222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,PetscReal fill,Mat C) 1095cd093f1dSJed Brown { 1096cd093f1dSJed Brown PetscErrorCode ierr; 1097cd093f1dSJed Brown Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 1098cd093f1dSJed Brown const PetscInt *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j; 10998c78258cSHong Zhang PetscInt *ci,*cj,bcol; 1100cd093f1dSJed Brown PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 1101cd093f1dSJed Brown PetscReal afill; 1102cd093f1dSJed Brown PetscInt i,j,ndouble = 0; 1103cd093f1dSJed Brown PetscSegBuffer seg,segrow; 1104cd093f1dSJed Brown char *seen; 1105cd093f1dSJed Brown 1106cd093f1dSJed Brown PetscFunctionBegin; 1107854ce69bSBarry Smith ierr = PetscMalloc1(am+1,&ci);CHKERRQ(ierr); 1108cd093f1dSJed Brown ci[0] = 0; 1109cd093f1dSJed Brown 1110cd093f1dSJed Brown /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 1111cd093f1dSJed Brown ierr = PetscSegBufferCreate(sizeof(PetscInt),(PetscInt)(fill*(ai[am]+bi[bm])),&seg);CHKERRQ(ierr); 1112cd093f1dSJed Brown ierr = PetscSegBufferCreate(sizeof(PetscInt),100,&segrow);CHKERRQ(ierr); 1113580bdb30SBarry Smith ierr = PetscCalloc1(bn,&seen);CHKERRQ(ierr); 1114cd093f1dSJed Brown 1115cd093f1dSJed Brown /* Determine ci and cj */ 1116cd093f1dSJed Brown for (i=0; i<am; i++) { 1117cd093f1dSJed Brown const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 1118cd093f1dSJed Brown const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 1119cd093f1dSJed Brown PetscInt packlen = 0,*PETSC_RESTRICT crow; 11208c78258cSHong Zhang 1121cd093f1dSJed Brown /* Pack segrow */ 1122cd093f1dSJed Brown for (j=0; j<anzi; j++) { 1123cd093f1dSJed Brown PetscInt brow = acol[j],bjstart = bi[brow],bjend = bi[brow+1],k; 1124cd093f1dSJed Brown for (k=bjstart; k<bjend; k++) { 11258c78258cSHong Zhang bcol = bj[k]; 1126cd093f1dSJed Brown if (!seen[bcol]) { /* new entry */ 1127cd093f1dSJed Brown PetscInt *PETSC_RESTRICT slot; 1128cd093f1dSJed Brown ierr = PetscSegBufferGetInts(segrow,1,&slot);CHKERRQ(ierr); 1129cd093f1dSJed Brown *slot = bcol; 1130cd093f1dSJed Brown seen[bcol] = 1; 1131cd093f1dSJed Brown packlen++; 1132cd093f1dSJed Brown } 1133cd093f1dSJed Brown } 1134cd093f1dSJed Brown } 11358c78258cSHong Zhang 11368c78258cSHong Zhang /* Check i-th diagonal entry */ 11378c78258cSHong Zhang if (C->force_diagonals && !seen[i]) { 11388c78258cSHong Zhang PetscInt *PETSC_RESTRICT slot; 11398c78258cSHong Zhang ierr = PetscSegBufferGetInts(segrow,1,&slot);CHKERRQ(ierr); 11408c78258cSHong Zhang *slot = i; 11418c78258cSHong Zhang seen[i] = 1; 11428c78258cSHong Zhang packlen++; 11438c78258cSHong Zhang } 11448c78258cSHong Zhang 1145cd093f1dSJed Brown ierr = PetscSegBufferGetInts(seg,packlen,&crow);CHKERRQ(ierr); 1146cd093f1dSJed Brown ierr = PetscSegBufferExtractTo(segrow,crow);CHKERRQ(ierr); 1147cd093f1dSJed Brown ierr = PetscSortInt(packlen,crow);CHKERRQ(ierr); 1148cd093f1dSJed Brown ci[i+1] = ci[i] + packlen; 1149cd093f1dSJed Brown for (j=0; j<packlen; j++) seen[crow[j]] = 0; 1150cd093f1dSJed Brown } 1151cd093f1dSJed Brown ierr = PetscSegBufferDestroy(&segrow);CHKERRQ(ierr); 1152cd093f1dSJed Brown ierr = PetscFree(seen);CHKERRQ(ierr); 1153cd093f1dSJed Brown 1154cd093f1dSJed Brown /* Column indices are in the segmented buffer */ 1155cd093f1dSJed Brown ierr = PetscSegBufferExtractAlloc(seg,&cj);CHKERRQ(ierr); 1156cd093f1dSJed Brown ierr = PetscSegBufferDestroy(&seg);CHKERRQ(ierr); 1157cd093f1dSJed Brown 1158cd093f1dSJed Brown /* put together the new symbolic matrix */ 1159e4e71118SStefano Zampini ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr); 11604222ddf1SHong Zhang ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr); 1161cd093f1dSJed Brown 1162cd093f1dSJed Brown /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 1163cd093f1dSJed Brown /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 11644222ddf1SHong Zhang c = (Mat_SeqAIJ*)(C->data); 1165cd093f1dSJed Brown c->free_a = PETSC_TRUE; 1166cd093f1dSJed Brown c->free_ij = PETSC_TRUE; 1167cd093f1dSJed Brown c->nonew = 0; 1168cd093f1dSJed Brown 11694222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 1170cd093f1dSJed Brown 1171cd093f1dSJed Brown /* set MatInfo */ 11722a09556fSStefano Zampini afill = (PetscReal)ci[am]/PetscMax(ai[am]+bi[bm],1) + 1.e-5; 1173cd093f1dSJed Brown if (afill < 1.0) afill = 1.0; 1174cd093f1dSJed Brown c->maxnz = ci[am]; 1175cd093f1dSJed Brown c->nz = ci[am]; 11764222ddf1SHong Zhang C->info.mallocs = ndouble; 11774222ddf1SHong Zhang C->info.fill_ratio_given = fill; 11784222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 1179cd093f1dSJed Brown 1180cd093f1dSJed Brown #if defined(PETSC_USE_INFO) 1181cd093f1dSJed Brown if (ci[am]) { 11824222ddf1SHong Zhang ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 11834222ddf1SHong Zhang ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 1184cd093f1dSJed Brown } else { 11854222ddf1SHong Zhang ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr); 1186cd093f1dSJed Brown } 1187cd093f1dSJed Brown #endif 1188cd093f1dSJed Brown PetscFunctionReturn(0); 1189cd093f1dSJed Brown } 1190cd093f1dSJed Brown 11916718818eSStefano Zampini PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(void *data) 11922128a86cSHong Zhang { 11932128a86cSHong Zhang PetscErrorCode ierr; 11946718818eSStefano Zampini Mat_MatMatTransMult *abt=(Mat_MatMatTransMult *)data; 11952128a86cSHong Zhang 11962128a86cSHong Zhang PetscFunctionBegin; 119740192850SHong Zhang ierr = MatTransposeColoringDestroy(&abt->matcoloring);CHKERRQ(ierr); 119840192850SHong Zhang ierr = MatDestroy(&abt->Bt_den);CHKERRQ(ierr); 119940192850SHong Zhang ierr = MatDestroy(&abt->ABt_den);CHKERRQ(ierr); 120040192850SHong Zhang ierr = PetscFree(abt);CHKERRQ(ierr); 12012128a86cSHong Zhang PetscFunctionReturn(0); 12022128a86cSHong Zhang } 12032128a86cSHong Zhang 12044222ddf1SHong Zhang PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C) 1205bc011b1eSHong Zhang { 12065df89d91SHong Zhang PetscErrorCode ierr; 120781d82fe4SHong Zhang Mat Bt; 120881d82fe4SHong Zhang PetscInt *bti,*btj; 120940192850SHong Zhang Mat_MatMatTransMult *abt; 12104222ddf1SHong Zhang Mat_Product *product = C->product; 12116718818eSStefano Zampini char *alg; 1212d2853540SHong Zhang 121381d82fe4SHong Zhang PetscFunctionBegin; 12146718818eSStefano Zampini if (!product) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct"); 12156718818eSStefano Zampini if (product->data) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Extra product struct not empty"); 12166718818eSStefano Zampini 121781d82fe4SHong Zhang /* create symbolic Bt */ 121881d82fe4SHong Zhang ierr = MatGetSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr); 12190298fd71SBarry Smith ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,B->cmap->n,B->rmap->n,bti,btj,NULL,&Bt);CHKERRQ(ierr); 122033d57670SJed Brown ierr = MatSetBlockSizes(Bt,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr); 122104b858e0SBarry Smith ierr = MatSetType(Bt,((PetscObject)A)->type_name);CHKERRQ(ierr); 122281d82fe4SHong Zhang 122381d82fe4SHong Zhang /* get symbolic C=A*Bt */ 12246718818eSStefano Zampini ierr = PetscStrallocpy(product->alg,&alg);CHKERRQ(ierr); 12254222ddf1SHong Zhang ierr = MatProductSetAlgorithm(C,"sorted");CHKERRQ(ierr); /* set algorithm for C = A*Bt */ 122681d82fe4SHong Zhang ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C);CHKERRQ(ierr); 12274222ddf1SHong Zhang ierr = MatProductSetAlgorithm(C,alg);CHKERRQ(ierr); /* resume original algorithm for ABt product */ 12286718818eSStefano Zampini ierr = PetscFree(alg);CHKERRQ(ierr); 122981d82fe4SHong Zhang 1230*a5b23f4aSJose E. Roman /* create a supporting struct for reuse intermediate dense matrices with matcoloring */ 1231b00a9115SJed Brown ierr = PetscNew(&abt);CHKERRQ(ierr); 12322128a86cSHong Zhang 12336718818eSStefano Zampini product->data = abt; 12346718818eSStefano Zampini product->destroy = MatDestroy_SeqAIJ_MatMatMultTrans; 12356718818eSStefano Zampini 12364222ddf1SHong Zhang C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ; 12372128a86cSHong Zhang 12384222ddf1SHong Zhang abt->usecoloring = PETSC_FALSE; 12394222ddf1SHong Zhang ierr = PetscStrcmp(product->alg,"color",&abt->usecoloring);CHKERRQ(ierr); 124040192850SHong Zhang if (abt->usecoloring) { 1241b9af6bddSHong Zhang /* Create MatTransposeColoring from symbolic C=A*B^T */ 1242b9af6bddSHong Zhang MatTransposeColoring matcoloring; 1243335efc43SPeter Brune MatColoring coloring; 12442128a86cSHong Zhang ISColoring iscoloring; 12452128a86cSHong Zhang Mat Bt_dense,C_dense; 1246e8354b3eSHong Zhang 12474222ddf1SHong Zhang /* inode causes memory problem */ 12484222ddf1SHong Zhang ierr = MatSetOption(C,MAT_USE_INODES,PETSC_FALSE);CHKERRQ(ierr); 12494222ddf1SHong Zhang 12504222ddf1SHong Zhang ierr = MatColoringCreate(C,&coloring);CHKERRQ(ierr); 1251335efc43SPeter Brune ierr = MatColoringSetDistance(coloring,2);CHKERRQ(ierr); 1252335efc43SPeter Brune ierr = MatColoringSetType(coloring,MATCOLORINGSL);CHKERRQ(ierr); 1253335efc43SPeter Brune ierr = MatColoringSetFromOptions(coloring);CHKERRQ(ierr); 1254335efc43SPeter Brune ierr = MatColoringApply(coloring,&iscoloring);CHKERRQ(ierr); 1255335efc43SPeter Brune ierr = MatColoringDestroy(&coloring);CHKERRQ(ierr); 12564222ddf1SHong Zhang ierr = MatTransposeColoringCreate(C,iscoloring,&matcoloring);CHKERRQ(ierr); 12572205254eSKarl Rupp 125840192850SHong Zhang abt->matcoloring = matcoloring; 12592205254eSKarl Rupp 12602128a86cSHong Zhang ierr = ISColoringDestroy(&iscoloring);CHKERRQ(ierr); 12612128a86cSHong Zhang 12622128a86cSHong Zhang /* Create Bt_dense and C_dense = A*Bt_dense */ 12632128a86cSHong Zhang ierr = MatCreate(PETSC_COMM_SELF,&Bt_dense);CHKERRQ(ierr); 12642128a86cSHong Zhang ierr = MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors);CHKERRQ(ierr); 12652128a86cSHong Zhang ierr = MatSetType(Bt_dense,MATSEQDENSE);CHKERRQ(ierr); 12660298fd71SBarry Smith ierr = MatSeqDenseSetPreallocation(Bt_dense,NULL);CHKERRQ(ierr); 12672205254eSKarl Rupp 12682128a86cSHong Zhang Bt_dense->assembled = PETSC_TRUE; 126940192850SHong Zhang abt->Bt_den = Bt_dense; 12702128a86cSHong Zhang 12712128a86cSHong Zhang ierr = MatCreate(PETSC_COMM_SELF,&C_dense);CHKERRQ(ierr); 12722128a86cSHong Zhang ierr = MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors);CHKERRQ(ierr); 12732128a86cSHong Zhang ierr = MatSetType(C_dense,MATSEQDENSE);CHKERRQ(ierr); 12740298fd71SBarry Smith ierr = MatSeqDenseSetPreallocation(C_dense,NULL);CHKERRQ(ierr); 12752205254eSKarl Rupp 12762128a86cSHong Zhang Bt_dense->assembled = PETSC_TRUE; 127740192850SHong Zhang abt->ABt_den = C_dense; 1278f94ccd6cSHong Zhang 1279f94ccd6cSHong Zhang #if defined(PETSC_USE_INFO) 12801ce71dffSSatish Balay { 12814222ddf1SHong Zhang Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 12824222ddf1SHong Zhang ierr = PetscInfo7(C,"Use coloring of C=A*B^T; B^T: %D %D, Bt_dense: %D,%D; Cnz %D / (cm*ncolors %D) = %g\n",B->cmap->n,B->rmap->n,Bt_dense->rmap->n,Bt_dense->cmap->n,c->nz,A->rmap->n*matcoloring->ncolors,(PetscReal)(c->nz)/(A->rmap->n*matcoloring->ncolors));CHKERRQ(ierr); 12831ce71dffSSatish Balay } 1284f94ccd6cSHong Zhang #endif 12852128a86cSHong Zhang } 1286e99be685SHong Zhang /* clean up */ 1287e99be685SHong Zhang ierr = MatDestroy(&Bt);CHKERRQ(ierr); 1288e99be685SHong Zhang ierr = MatRestoreSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr); 12895df89d91SHong Zhang PetscFunctionReturn(0); 12905df89d91SHong Zhang } 12915df89d91SHong Zhang 12926fc122caSHong Zhang PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C) 12935df89d91SHong Zhang { 12945df89d91SHong Zhang PetscErrorCode ierr; 12955df89d91SHong Zhang Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data; 1296e2cac8caSJed Brown PetscInt *ai =a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow; 129781d82fe4SHong Zhang PetscInt cm =C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol; 12985df89d91SHong Zhang PetscLogDouble flops=0.0; 1299aa1aec99SHong Zhang MatScalar *aa =a->a,*aval,*ba=b->a,*bval,*ca,*cval; 13006718818eSStefano Zampini Mat_MatMatTransMult *abt; 13016718818eSStefano Zampini Mat_Product *product = C->product; 13025df89d91SHong Zhang 13035df89d91SHong Zhang PetscFunctionBegin; 13046718818eSStefano Zampini if (!product) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct"); 13056718818eSStefano Zampini abt = (Mat_MatMatTransMult *)product->data; 13066718818eSStefano Zampini if (!abt) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct"); 130758ed3ceaSHong Zhang /* clear old values in C */ 130858ed3ceaSHong Zhang if (!c->a) { 1309580bdb30SBarry Smith ierr = PetscCalloc1(ci[cm]+1,&ca);CHKERRQ(ierr); 131058ed3ceaSHong Zhang c->a = ca; 131158ed3ceaSHong Zhang c->free_a = PETSC_TRUE; 131258ed3ceaSHong Zhang } else { 131358ed3ceaSHong Zhang ca = c->a; 1314580bdb30SBarry Smith ierr = PetscArrayzero(ca,ci[cm]+1);CHKERRQ(ierr); 131558ed3ceaSHong Zhang } 131658ed3ceaSHong Zhang 131740192850SHong Zhang if (abt->usecoloring) { 131840192850SHong Zhang MatTransposeColoring matcoloring = abt->matcoloring; 131940192850SHong Zhang Mat Bt_dense,C_dense = abt->ABt_den; 1320c8db22f6SHong Zhang 1321b9af6bddSHong Zhang /* Get Bt_dense by Apply MatTransposeColoring to B */ 132240192850SHong Zhang Bt_dense = abt->Bt_den; 1323b9af6bddSHong Zhang ierr = MatTransColoringApplySpToDen(matcoloring,B,Bt_dense);CHKERRQ(ierr); 1324c8db22f6SHong Zhang 1325c8db22f6SHong Zhang /* C_dense = A*Bt_dense */ 13262128a86cSHong Zhang ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense);CHKERRQ(ierr); 1327c8db22f6SHong Zhang 13282128a86cSHong Zhang /* Recover C from C_dense */ 1329b9af6bddSHong Zhang ierr = MatTransColoringApplyDenToSp(matcoloring,C_dense,C);CHKERRQ(ierr); 1330c8db22f6SHong Zhang PetscFunctionReturn(0); 1331c8db22f6SHong Zhang } 1332c8db22f6SHong Zhang 13331fa1209cSHong Zhang for (i=0; i<cm; i++) { 133481d82fe4SHong Zhang anzi = ai[i+1] - ai[i]; 133581d82fe4SHong Zhang acol = aj + ai[i]; 13366973769bSHong Zhang aval = aa + ai[i]; 13371fa1209cSHong Zhang cnzi = ci[i+1] - ci[i]; 13381fa1209cSHong Zhang ccol = cj + ci[i]; 13396973769bSHong Zhang cval = ca + ci[i]; 13401fa1209cSHong Zhang for (j=0; j<cnzi; j++) { 134181d82fe4SHong Zhang brow = ccol[j]; 134281d82fe4SHong Zhang bnzj = bi[brow+1] - bi[brow]; 134381d82fe4SHong Zhang bcol = bj + bi[brow]; 13446973769bSHong Zhang bval = ba + bi[brow]; 13456973769bSHong Zhang 134681d82fe4SHong Zhang /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */ 134781d82fe4SHong Zhang nexta = 0; nextb = 0; 134881d82fe4SHong Zhang while (nexta<anzi && nextb<bnzj) { 13497b6d5e96SMark Adams while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++; 135081d82fe4SHong Zhang if (nexta == anzi) break; 13517b6d5e96SMark Adams while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++; 135281d82fe4SHong Zhang if (nextb == bnzj) break; 135381d82fe4SHong Zhang if (acol[nexta] == bcol[nextb]) { 13546973769bSHong Zhang cval[j] += aval[nexta]*bval[nextb]; 135581d82fe4SHong Zhang nexta++; nextb++; 135681d82fe4SHong Zhang flops += 2; 13571fa1209cSHong Zhang } 13581fa1209cSHong Zhang } 135981d82fe4SHong Zhang } 136081d82fe4SHong Zhang } 136181d82fe4SHong Zhang ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 136281d82fe4SHong Zhang ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 136381d82fe4SHong Zhang ierr = PetscLogFlops(flops);CHKERRQ(ierr); 13645df89d91SHong Zhang PetscFunctionReturn(0); 13655df89d91SHong Zhang } 13665df89d91SHong Zhang 13676718818eSStefano Zampini PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(void *data) 13686d373c3eSHong Zhang { 13696d373c3eSHong Zhang PetscErrorCode ierr; 13706718818eSStefano Zampini Mat_MatTransMatMult *atb = (Mat_MatTransMatMult*)data; 13716d373c3eSHong Zhang 13726d373c3eSHong Zhang PetscFunctionBegin; 13736d373c3eSHong Zhang ierr = MatDestroy(&atb->At);CHKERRQ(ierr); 13746718818eSStefano Zampini if (atb->destroy) { 13756718818eSStefano Zampini ierr = (*atb->destroy)(atb->data);CHKERRQ(ierr); 13766473ade5SStefano Zampini } 13776d373c3eSHong Zhang ierr = PetscFree(atb);CHKERRQ(ierr); 13786d373c3eSHong Zhang PetscFunctionReturn(0); 13796d373c3eSHong Zhang } 13806d373c3eSHong Zhang 13814222ddf1SHong Zhang PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C) 13825df89d91SHong Zhang { 1383bc011b1eSHong Zhang PetscErrorCode ierr; 1384089a957eSStefano Zampini Mat At = NULL; 138538baddfdSBarry Smith PetscInt *ati,*atj; 13864222ddf1SHong Zhang Mat_Product *product = C->product; 1387089a957eSStefano Zampini PetscBool flg,def,square; 1388bc011b1eSHong Zhang 1389bc011b1eSHong Zhang PetscFunctionBegin; 1390089a957eSStefano Zampini MatCheckProduct(C,4); 1391089a957eSStefano Zampini square = (PetscBool)(A == B && A->symmetric && A->symmetric_set); 13924222ddf1SHong Zhang /* outerproduct */ 1393089a957eSStefano Zampini ierr = PetscStrcmp(product->alg,"outerproduct",&flg);CHKERRQ(ierr); 13944222ddf1SHong Zhang if (flg) { 1395bc011b1eSHong Zhang /* create symbolic At */ 1396089a957eSStefano Zampini if (!square) { 1397bc011b1eSHong Zhang ierr = MatGetSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr); 13980298fd71SBarry Smith ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,A->cmap->n,A->rmap->n,ati,atj,NULL,&At);CHKERRQ(ierr); 139933d57670SJed Brown ierr = MatSetBlockSizes(At,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr); 140004b858e0SBarry Smith ierr = MatSetType(At,((PetscObject)A)->type_name);CHKERRQ(ierr); 1401089a957eSStefano Zampini } 1402bc011b1eSHong Zhang /* get symbolic C=At*B */ 14037a3c3d58SStefano Zampini ierr = MatProductSetAlgorithm(C,"sorted");CHKERRQ(ierr); 1404089a957eSStefano Zampini ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At,B,fill,C);CHKERRQ(ierr); 1405bc011b1eSHong Zhang 1406bc011b1eSHong Zhang /* clean up */ 1407089a957eSStefano Zampini if (!square) { 14086bf464f9SBarry Smith ierr = MatDestroy(&At);CHKERRQ(ierr); 1409bc011b1eSHong Zhang ierr = MatRestoreSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr); 1410089a957eSStefano Zampini } 14116d373c3eSHong Zhang 14124222ddf1SHong Zhang C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */ 14137a3c3d58SStefano Zampini ierr = MatProductSetAlgorithm(C,"outerproduct");CHKERRQ(ierr); 14144222ddf1SHong Zhang PetscFunctionReturn(0); 14154222ddf1SHong Zhang } 14164222ddf1SHong Zhang 14174222ddf1SHong Zhang /* matmatmult */ 1418089a957eSStefano Zampini ierr = PetscStrcmp(product->alg,"default",&def);CHKERRQ(ierr); 1419089a957eSStefano Zampini ierr = PetscStrcmp(product->alg,"at*b",&flg);CHKERRQ(ierr); 1420089a957eSStefano Zampini if (flg || def) { 14214222ddf1SHong Zhang Mat_MatTransMatMult *atb; 14224222ddf1SHong Zhang 14236718818eSStefano Zampini if (product->data) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Extra product struct not empty"); 14244222ddf1SHong Zhang ierr = PetscNew(&atb);CHKERRQ(ierr); 1425089a957eSStefano Zampini if (!square) { 14264222ddf1SHong Zhang ierr = MatTranspose_SeqAIJ(A,MAT_INITIAL_MATRIX,&At);CHKERRQ(ierr); 1427089a957eSStefano Zampini } 14287a3c3d58SStefano Zampini ierr = MatProductSetAlgorithm(C,"sorted");CHKERRQ(ierr); 1429089a957eSStefano Zampini ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At,B,fill,C);CHKERRQ(ierr); 14306718818eSStefano Zampini ierr = MatProductSetAlgorithm(C,"at*b");CHKERRQ(ierr); 14316718818eSStefano Zampini product->data = atb; 14326718818eSStefano Zampini product->destroy = MatDestroy_SeqAIJ_MatTransMatMult; 14334222ddf1SHong Zhang atb->At = At; 14344222ddf1SHong Zhang atb->updateAt = PETSC_FALSE; /* because At is computed here */ 14354222ddf1SHong Zhang 14364222ddf1SHong Zhang C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */ 14374222ddf1SHong Zhang PetscFunctionReturn(0); 14384222ddf1SHong Zhang } 14394222ddf1SHong Zhang 14404222ddf1SHong Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported"); 1441bc011b1eSHong Zhang } 1442bc011b1eSHong Zhang 144375648e8dSHong Zhang PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C) 1444bc011b1eSHong Zhang { 1445bc011b1eSHong Zhang PetscErrorCode ierr; 14460fbc74f4SHong Zhang Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data; 1447d0f46423SBarry Smith PetscInt am=A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb; 1448d0f46423SBarry Smith PetscInt cm=C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k; 1449d13dce4bSSatish Balay PetscLogDouble flops=0.0; 1450aa1aec99SHong Zhang MatScalar *aa=a->a,*ba,*ca,*caj; 1451bc011b1eSHong Zhang 1452bc011b1eSHong Zhang PetscFunctionBegin; 1453aa1aec99SHong Zhang if (!c->a) { 1454580bdb30SBarry Smith ierr = PetscCalloc1(ci[cm]+1,&ca);CHKERRQ(ierr); 14552205254eSKarl Rupp 1456aa1aec99SHong Zhang c->a = ca; 1457aa1aec99SHong Zhang c->free_a = PETSC_TRUE; 1458aa1aec99SHong Zhang } else { 1459aa1aec99SHong Zhang ca = c->a; 1460580bdb30SBarry Smith ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr); 1461aa1aec99SHong Zhang } 1462bc011b1eSHong Zhang 1463bc011b1eSHong Zhang /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */ 1464bc011b1eSHong Zhang for (i=0; i<am; i++) { 1465bc011b1eSHong Zhang bj = b->j + bi[i]; 1466bc011b1eSHong Zhang ba = b->a + bi[i]; 1467bc011b1eSHong Zhang bnzi = bi[i+1] - bi[i]; 1468bc011b1eSHong Zhang anzi = ai[i+1] - ai[i]; 1469bc011b1eSHong Zhang for (j=0; j<anzi; j++) { 1470bc011b1eSHong Zhang nextb = 0; 14710fbc74f4SHong Zhang crow = *aj++; 1472bc011b1eSHong Zhang cjj = cj + ci[crow]; 1473bc011b1eSHong Zhang caj = ca + ci[crow]; 1474bc011b1eSHong Zhang /* perform sparse axpy operation. Note cjj includes bj. */ 1475bc011b1eSHong Zhang for (k=0; nextb<bnzi; k++) { 14760fbc74f4SHong Zhang if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */ 14770fbc74f4SHong Zhang caj[k] += (*aa)*(*(ba+nextb)); 1478bc011b1eSHong Zhang nextb++; 1479bc011b1eSHong Zhang } 1480bc011b1eSHong Zhang } 1481bc011b1eSHong Zhang flops += 2*bnzi; 14820fbc74f4SHong Zhang aa++; 1483bc011b1eSHong Zhang } 1484bc011b1eSHong Zhang } 1485bc011b1eSHong Zhang 1486bc011b1eSHong Zhang /* Assemble the final matrix and clean up */ 1487bc011b1eSHong Zhang ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1488bc011b1eSHong Zhang ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1489bc011b1eSHong Zhang ierr = PetscLogFlops(flops);CHKERRQ(ierr); 1490bc011b1eSHong Zhang PetscFunctionReturn(0); 1491bc011b1eSHong Zhang } 14929123193aSHong Zhang 14934222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C) 14949123193aSHong Zhang { 14959123193aSHong Zhang PetscErrorCode ierr; 14969123193aSHong Zhang 14979123193aSHong Zhang PetscFunctionBegin; 14985a586d82SBarry Smith ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr); 14994222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense; 15009123193aSHong Zhang PetscFunctionReturn(0); 15019123193aSHong Zhang } 15029123193aSHong Zhang 150393aa15f2SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C,const PetscBool add) 15049123193aSHong Zhang { 1505f32d5d43SBarry Smith Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 1506612438f1SStefano Zampini Mat_SeqDense *bd=(Mat_SeqDense*)B->data; 15071ca9667aSStefano Zampini Mat_SeqDense *cd=(Mat_SeqDense*)C->data; 15089123193aSHong Zhang PetscErrorCode ierr; 15091ca9667aSStefano Zampini PetscScalar *c,r1,r2,r3,r4,*c1,*c2,*c3,*c4; 1510a4af7ca8SStefano Zampini const PetscScalar *aa,*b,*b1,*b2,*b3,*b4,*av; 1511daf57211SHong Zhang const PetscInt *aj; 1512612438f1SStefano Zampini PetscInt cm=C->rmap->n,cn=B->cmap->n,bm=bd->lda,am=A->rmap->n; 15131ca9667aSStefano Zampini PetscInt clda=cd->lda; 15141ca9667aSStefano Zampini PetscInt am4=4*clda,bm4=4*bm,col,i,j,n; 15159123193aSHong Zhang 15169123193aSHong Zhang PetscFunctionBegin; 1517f32d5d43SBarry Smith if (!cm || !cn) PetscFunctionReturn(0); 1518a4af7ca8SStefano Zampini ierr = MatSeqAIJGetArrayRead(A,&av);CHKERRQ(ierr); 151993aa15f2SStefano Zampini if (add) { 15208c778c55SBarry Smith ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr); 152193aa15f2SStefano Zampini } else { 152293aa15f2SStefano Zampini ierr = MatDenseGetArrayWrite(C,&c);CHKERRQ(ierr); 152393aa15f2SStefano Zampini } 1524a4af7ca8SStefano Zampini ierr = MatDenseGetArrayRead(B,&b);CHKERRQ(ierr); 1525f32d5d43SBarry Smith b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm; 15261ca9667aSStefano Zampini c1 = c; c2 = c1 + clda; c3 = c2 + clda; c4 = c3 + clda; 15271ca9667aSStefano Zampini for (col=0; col<(cn/4)*4; col += 4) { /* over columns of C */ 15281ca9667aSStefano Zampini for (i=0; i<am; i++) { /* over rows of A in those columns */ 1529f32d5d43SBarry Smith r1 = r2 = r3 = r4 = 0.0; 1530f32d5d43SBarry Smith n = a->i[i+1] - a->i[i]; 1531f32d5d43SBarry Smith aj = a->j + a->i[i]; 1532a4af7ca8SStefano Zampini aa = av + a->i[i]; 1533f32d5d43SBarry Smith for (j=0; j<n; j++) { 15341ca9667aSStefano Zampini const PetscScalar aatmp = aa[j]; 15351ca9667aSStefano Zampini const PetscInt ajtmp = aj[j]; 1536730858b9SHong Zhang r1 += aatmp*b1[ajtmp]; 1537730858b9SHong Zhang r2 += aatmp*b2[ajtmp]; 1538730858b9SHong Zhang r3 += aatmp*b3[ajtmp]; 1539730858b9SHong Zhang r4 += aatmp*b4[ajtmp]; 15409123193aSHong Zhang } 154193aa15f2SStefano Zampini if (add) { 154287753ddeSHong Zhang c1[i] += r1; 154387753ddeSHong Zhang c2[i] += r2; 154487753ddeSHong Zhang c3[i] += r3; 154587753ddeSHong Zhang c4[i] += r4; 154693aa15f2SStefano Zampini } else { 154793aa15f2SStefano Zampini c1[i] = r1; 154893aa15f2SStefano Zampini c2[i] = r2; 154993aa15f2SStefano Zampini c3[i] = r3; 155093aa15f2SStefano Zampini c4[i] = r4; 155193aa15f2SStefano Zampini } 1552f32d5d43SBarry Smith } 1553730858b9SHong Zhang b1 += bm4; b2 += bm4; b3 += bm4; b4 += bm4; 1554730858b9SHong Zhang c1 += am4; c2 += am4; c3 += am4; c4 += am4; 1555f32d5d43SBarry Smith } 155693aa15f2SStefano Zampini /* process remaining columns */ 155793aa15f2SStefano Zampini if (col != cn) { 155893aa15f2SStefano Zampini PetscInt rc = cn-col; 155993aa15f2SStefano Zampini 156093aa15f2SStefano Zampini if (rc == 1) { 156193aa15f2SStefano Zampini for (i=0; i<am; i++) { 1562f32d5d43SBarry Smith r1 = 0.0; 1563f32d5d43SBarry Smith n = a->i[i+1] - a->i[i]; 1564f32d5d43SBarry Smith aj = a->j + a->i[i]; 1565a4af7ca8SStefano Zampini aa = av + a->i[i]; 156693aa15f2SStefano Zampini for (j=0; j<n; j++) r1 += aa[j]*b1[aj[j]]; 156793aa15f2SStefano Zampini if (add) c1[i] += r1; 156893aa15f2SStefano Zampini else c1[i] = r1; 156993aa15f2SStefano Zampini } 157093aa15f2SStefano Zampini } else if (rc == 2) { 157193aa15f2SStefano Zampini for (i=0; i<am; i++) { 157293aa15f2SStefano Zampini r1 = r2 = 0.0; 157393aa15f2SStefano Zampini n = a->i[i+1] - a->i[i]; 157493aa15f2SStefano Zampini aj = a->j + a->i[i]; 157593aa15f2SStefano Zampini aa = av + a->i[i]; 1576f32d5d43SBarry Smith for (j=0; j<n; j++) { 157793aa15f2SStefano Zampini const PetscScalar aatmp = aa[j]; 157893aa15f2SStefano Zampini const PetscInt ajtmp = aj[j]; 157993aa15f2SStefano Zampini r1 += aatmp*b1[ajtmp]; 158093aa15f2SStefano Zampini r2 += aatmp*b2[ajtmp]; 1581f32d5d43SBarry Smith } 158293aa15f2SStefano Zampini if (add) { 158387753ddeSHong Zhang c1[i] += r1; 158493aa15f2SStefano Zampini c2[i] += r2; 158593aa15f2SStefano Zampini } else { 158693aa15f2SStefano Zampini c1[i] = r1; 158793aa15f2SStefano Zampini c2[i] = r2; 1588f32d5d43SBarry Smith } 158993aa15f2SStefano Zampini } 159093aa15f2SStefano Zampini } else { 159193aa15f2SStefano Zampini for (i=0; i<am; i++) { 159293aa15f2SStefano Zampini r1 = r2 = r3 = 0.0; 159393aa15f2SStefano Zampini n = a->i[i+1] - a->i[i]; 159493aa15f2SStefano Zampini aj = a->j + a->i[i]; 159593aa15f2SStefano Zampini aa = av + a->i[i]; 159693aa15f2SStefano Zampini for (j=0; j<n; j++) { 159793aa15f2SStefano Zampini const PetscScalar aatmp = aa[j]; 159893aa15f2SStefano Zampini const PetscInt ajtmp = aj[j]; 159993aa15f2SStefano Zampini r1 += aatmp*b1[ajtmp]; 160093aa15f2SStefano Zampini r2 += aatmp*b2[ajtmp]; 160193aa15f2SStefano Zampini r3 += aatmp*b3[ajtmp]; 160293aa15f2SStefano Zampini } 160393aa15f2SStefano Zampini if (add) { 160493aa15f2SStefano Zampini c1[i] += r1; 160593aa15f2SStefano Zampini c2[i] += r2; 160693aa15f2SStefano Zampini c3[i] += r3; 160793aa15f2SStefano Zampini } else { 160893aa15f2SStefano Zampini c1[i] = r1; 160993aa15f2SStefano Zampini c2[i] = r2; 161093aa15f2SStefano Zampini c3[i] = r3; 161193aa15f2SStefano Zampini } 161293aa15f2SStefano Zampini } 161393aa15f2SStefano Zampini } 1614f32d5d43SBarry Smith } 1615dc0b31edSSatish Balay ierr = PetscLogFlops(cn*(2.0*a->nz));CHKERRQ(ierr); 161693aa15f2SStefano Zampini if (add) { 16178c778c55SBarry Smith ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr); 161893aa15f2SStefano Zampini } else { 161993aa15f2SStefano Zampini ierr = MatDenseRestoreArrayWrite(C,&c);CHKERRQ(ierr); 162093aa15f2SStefano Zampini } 1621a4af7ca8SStefano Zampini ierr = MatDenseRestoreArrayRead(B,&b);CHKERRQ(ierr); 1622a4af7ca8SStefano Zampini ierr = MatSeqAIJRestoreArrayRead(A,&av);CHKERRQ(ierr); 1623f32d5d43SBarry Smith PetscFunctionReturn(0); 1624f32d5d43SBarry Smith } 1625f32d5d43SBarry Smith 162687753ddeSHong Zhang PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C) 1627f32d5d43SBarry Smith { 1628f32d5d43SBarry Smith PetscErrorCode ierr; 1629f32d5d43SBarry Smith 1630f32d5d43SBarry Smith PetscFunctionBegin; 163187753ddeSHong Zhang if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n); 163287753ddeSHong Zhang if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n); 163387753ddeSHong Zhang if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n); 16344324174eSBarry Smith 163593aa15f2SStefano Zampini ierr = MatMatMultNumericAdd_SeqAIJ_SeqDense(A,B,C,PETSC_FALSE);CHKERRQ(ierr); 16369123193aSHong Zhang PetscFunctionReturn(0); 16379123193aSHong Zhang } 1638b1683b59SHong Zhang 16394222ddf1SHong Zhang /* ------------------------------------------------------- */ 16404222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C) 16414222ddf1SHong Zhang { 16424222ddf1SHong Zhang PetscFunctionBegin; 16434222ddf1SHong Zhang C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense; 16444222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_AB; 16454222ddf1SHong Zhang PetscFunctionReturn(0); 16464222ddf1SHong Zhang } 16474222ddf1SHong Zhang 16486718818eSStefano Zampini PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat,Mat,PetscReal,Mat); 16496718818eSStefano Zampini 16504222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C) 16514222ddf1SHong Zhang { 16524222ddf1SHong Zhang PetscFunctionBegin; 16536718818eSStefano Zampini C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense; 16544222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_AtB; 16556718818eSStefano Zampini PetscFunctionReturn(0); 16566718818eSStefano Zampini } 16576718818eSStefano Zampini 16586718818eSStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C) 16596718818eSStefano Zampini { 16606718818eSStefano Zampini PetscFunctionBegin; 16616718818eSStefano Zampini C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense; 16626718818eSStefano Zampini C->ops->productsymbolic = MatProductSymbolic_ABt; 16634222ddf1SHong Zhang PetscFunctionReturn(0); 16644222ddf1SHong Zhang } 16654222ddf1SHong Zhang 16664222ddf1SHong Zhang PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C) 16674222ddf1SHong Zhang { 16684222ddf1SHong Zhang PetscErrorCode ierr; 16694222ddf1SHong Zhang Mat_Product *product = C->product; 16704222ddf1SHong Zhang 16714222ddf1SHong Zhang PetscFunctionBegin; 16724222ddf1SHong Zhang switch (product->type) { 16734222ddf1SHong Zhang case MATPRODUCT_AB: 16744222ddf1SHong Zhang ierr = MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C);CHKERRQ(ierr); 16754222ddf1SHong Zhang break; 16764222ddf1SHong Zhang case MATPRODUCT_AtB: 16774222ddf1SHong Zhang ierr = MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C);CHKERRQ(ierr); 16784222ddf1SHong Zhang break; 16796718818eSStefano Zampini case MATPRODUCT_ABt: 16806718818eSStefano Zampini ierr = MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C);CHKERRQ(ierr); 16814222ddf1SHong Zhang break; 16824222ddf1SHong Zhang default: 16836718818eSStefano Zampini break; 16844222ddf1SHong Zhang } 16854222ddf1SHong Zhang PetscFunctionReturn(0); 16864222ddf1SHong Zhang } 16874222ddf1SHong Zhang /* ------------------------------------------------------- */ 16884222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C) 16894222ddf1SHong Zhang { 16904222ddf1SHong Zhang PetscErrorCode ierr; 16914222ddf1SHong Zhang Mat_Product *product = C->product; 16924222ddf1SHong Zhang Mat A = product->A; 16934222ddf1SHong Zhang PetscBool baij; 16944222ddf1SHong Zhang 16954222ddf1SHong Zhang PetscFunctionBegin; 16964222ddf1SHong Zhang ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQBAIJ,&baij);CHKERRQ(ierr); 16974222ddf1SHong Zhang if (!baij) { /* A is seqsbaij */ 16984222ddf1SHong Zhang PetscBool sbaij; 16994222ddf1SHong Zhang ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQSBAIJ,&sbaij);CHKERRQ(ierr); 17004222ddf1SHong Zhang if (!sbaij) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"Mat must be either seqbaij or seqsbaij format"); 17014222ddf1SHong Zhang 17024222ddf1SHong Zhang C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense; 17034222ddf1SHong Zhang } else { /* A is seqbaij */ 17044222ddf1SHong Zhang C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense; 17054222ddf1SHong Zhang } 17064222ddf1SHong Zhang 17074222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_AB; 17084222ddf1SHong Zhang PetscFunctionReturn(0); 17094222ddf1SHong Zhang } 17104222ddf1SHong Zhang 17114222ddf1SHong Zhang PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C) 17124222ddf1SHong Zhang { 17134222ddf1SHong Zhang PetscErrorCode ierr; 17144222ddf1SHong Zhang Mat_Product *product = C->product; 17154222ddf1SHong Zhang 17164222ddf1SHong Zhang PetscFunctionBegin; 17176718818eSStefano Zampini MatCheckProduct(C,1); 17186718818eSStefano Zampini if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A"); 17196718818eSStefano Zampini if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric)) { 17204222ddf1SHong Zhang ierr = MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C);CHKERRQ(ierr); 17216718818eSStefano Zampini } 17224222ddf1SHong Zhang PetscFunctionReturn(0); 17234222ddf1SHong Zhang } 17246718818eSStefano Zampini 17254222ddf1SHong Zhang /* ------------------------------------------------------- */ 17264222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C) 17274222ddf1SHong Zhang { 17284222ddf1SHong Zhang PetscFunctionBegin; 17294222ddf1SHong Zhang C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ; 17304222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_AB; 17314222ddf1SHong Zhang PetscFunctionReturn(0); 17324222ddf1SHong Zhang } 17334222ddf1SHong Zhang 17344222ddf1SHong Zhang PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C) 17354222ddf1SHong Zhang { 17364222ddf1SHong Zhang PetscErrorCode ierr; 17374222ddf1SHong Zhang Mat_Product *product = C->product; 17384222ddf1SHong Zhang 17394222ddf1SHong Zhang PetscFunctionBegin; 17404222ddf1SHong Zhang if (product->type == MATPRODUCT_AB) { 17414222ddf1SHong Zhang ierr = MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C);CHKERRQ(ierr); 17426718818eSStefano Zampini } 17434222ddf1SHong Zhang PetscFunctionReturn(0); 17444222ddf1SHong Zhang } 17454222ddf1SHong Zhang /* ------------------------------------------------------- */ 17464222ddf1SHong Zhang 1747b9af6bddSHong Zhang PetscErrorCode MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense) 1748c8db22f6SHong Zhang { 1749c8db22f6SHong Zhang PetscErrorCode ierr; 17502f5f1f90SHong Zhang Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 17512f5f1f90SHong Zhang Mat_SeqDense *btdense = (Mat_SeqDense*)Btdense->data; 17522f5f1f90SHong Zhang PetscInt *bi = b->i,*bj=b->j; 17532f5f1f90SHong Zhang PetscInt m = Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns; 17542f5f1f90SHong Zhang MatScalar *btval,*btval_den,*ba=b->a; 17552f5f1f90SHong Zhang PetscInt *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors; 1756c8db22f6SHong Zhang 1757c8db22f6SHong Zhang PetscFunctionBegin; 17582f5f1f90SHong Zhang btval_den=btdense->v; 1759580bdb30SBarry Smith ierr = PetscArrayzero(btval_den,m*n);CHKERRQ(ierr); 17602f5f1f90SHong Zhang for (k=0; k<ncolors; k++) { 17612f5f1f90SHong Zhang ncolumns = coloring->ncolumns[k]; 17622f5f1f90SHong Zhang for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */ 1763d2853540SHong Zhang col = *(columns + colorforcol[k] + l); 17642f5f1f90SHong Zhang btcol = bj + bi[col]; 17652f5f1f90SHong Zhang btval = ba + bi[col]; 17662f5f1f90SHong Zhang anz = bi[col+1] - bi[col]; 1767d2853540SHong Zhang for (j=0; j<anz; j++) { 17682f5f1f90SHong Zhang brow = btcol[j]; 17692f5f1f90SHong Zhang btval_den[brow] = btval[j]; 1770c8db22f6SHong Zhang } 1771c8db22f6SHong Zhang } 17722f5f1f90SHong Zhang btval_den += m; 1773c8db22f6SHong Zhang } 1774c8db22f6SHong Zhang PetscFunctionReturn(0); 1775c8db22f6SHong Zhang } 1776c8db22f6SHong Zhang 1777b9af6bddSHong Zhang PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp) 17788972f759SHong Zhang { 17798972f759SHong Zhang PetscErrorCode ierr; 1780b2d2b04fSHong Zhang Mat_SeqAIJ *csp = (Mat_SeqAIJ*)Csp->data; 17811683a169SBarry Smith const PetscScalar *ca_den,*ca_den_ptr; 17821683a169SBarry Smith PetscScalar *ca=csp->a; 1783f99a636bSHong Zhang PetscInt k,l,m=Cden->rmap->n,ncolors=matcoloring->ncolors; 1784e88777f2SHong Zhang PetscInt brows=matcoloring->brows,*den2sp=matcoloring->den2sp; 1785077f23c2SHong Zhang PetscInt nrows,*row,*idx; 1786077f23c2SHong Zhang PetscInt *rows=matcoloring->rows,*colorforrow=matcoloring->colorforrow; 17878972f759SHong Zhang 17888972f759SHong Zhang PetscFunctionBegin; 17891683a169SBarry Smith ierr = MatDenseGetArrayRead(Cden,&ca_den);CHKERRQ(ierr); 1790f99a636bSHong Zhang 1791077f23c2SHong Zhang if (brows > 0) { 1792077f23c2SHong Zhang PetscInt *lstart,row_end,row_start; 1793980ae229SHong Zhang lstart = matcoloring->lstart; 1794580bdb30SBarry Smith ierr = PetscArrayzero(lstart,ncolors);CHKERRQ(ierr); 1795eeb4fd02SHong Zhang 1796077f23c2SHong Zhang row_end = brows; 1797eeb4fd02SHong Zhang if (row_end > m) row_end = m; 1798077f23c2SHong Zhang for (row_start=0; row_start<m; row_start+=brows) { /* loop over row blocks of Csp */ 1799077f23c2SHong Zhang ca_den_ptr = ca_den; 1800980ae229SHong Zhang for (k=0; k<ncolors; k++) { /* loop over colors (columns of Cden) */ 1801eeb4fd02SHong Zhang nrows = matcoloring->nrows[k]; 1802eeb4fd02SHong Zhang row = rows + colorforrow[k]; 1803eeb4fd02SHong Zhang idx = den2sp + colorforrow[k]; 1804eeb4fd02SHong Zhang for (l=lstart[k]; l<nrows; l++) { 1805eeb4fd02SHong Zhang if (row[l] >= row_end) { 1806eeb4fd02SHong Zhang lstart[k] = l; 1807eeb4fd02SHong Zhang break; 1808eeb4fd02SHong Zhang } else { 1809077f23c2SHong Zhang ca[idx[l]] = ca_den_ptr[row[l]]; 1810eeb4fd02SHong Zhang } 1811eeb4fd02SHong Zhang } 1812077f23c2SHong Zhang ca_den_ptr += m; 1813eeb4fd02SHong Zhang } 1814077f23c2SHong Zhang row_end += brows; 1815eeb4fd02SHong Zhang if (row_end > m) row_end = m; 1816eeb4fd02SHong Zhang } 1817077f23c2SHong Zhang } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */ 1818077f23c2SHong Zhang ca_den_ptr = ca_den; 1819077f23c2SHong Zhang for (k=0; k<ncolors; k++) { 1820077f23c2SHong Zhang nrows = matcoloring->nrows[k]; 1821077f23c2SHong Zhang row = rows + colorforrow[k]; 1822077f23c2SHong Zhang idx = den2sp + colorforrow[k]; 1823077f23c2SHong Zhang for (l=0; l<nrows; l++) { 1824077f23c2SHong Zhang ca[idx[l]] = ca_den_ptr[row[l]]; 1825077f23c2SHong Zhang } 1826077f23c2SHong Zhang ca_den_ptr += m; 1827077f23c2SHong Zhang } 1828f99a636bSHong Zhang } 1829f99a636bSHong Zhang 18301683a169SBarry Smith ierr = MatDenseRestoreArrayRead(Cden,&ca_den);CHKERRQ(ierr); 1831f99a636bSHong Zhang #if defined(PETSC_USE_INFO) 1832077f23c2SHong Zhang if (matcoloring->brows > 0) { 1833f99a636bSHong Zhang ierr = PetscInfo1(Csp,"Loop over %D row blocks for den2sp\n",brows);CHKERRQ(ierr); 1834e88777f2SHong Zhang } else { 1835077f23c2SHong Zhang ierr = PetscInfo(Csp,"Loop over colors/columns of Cden, inefficient for large sparse matrix product \n");CHKERRQ(ierr); 1836e88777f2SHong Zhang } 1837f99a636bSHong Zhang #endif 18388972f759SHong Zhang PetscFunctionReturn(0); 18398972f759SHong Zhang } 18408972f759SHong Zhang 1841b9af6bddSHong Zhang PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c) 1842b1683b59SHong Zhang { 1843b1683b59SHong Zhang PetscErrorCode ierr; 1844e88777f2SHong Zhang PetscInt i,n,nrows,Nbs,j,k,m,ncols,col,cm; 18451a83f524SJed Brown const PetscInt *is,*ci,*cj,*row_idx; 1846b28d80bdSHong Zhang PetscInt nis = iscoloring->n,*rowhit,bs = 1; 1847b1683b59SHong Zhang IS *isa; 1848b28d80bdSHong Zhang Mat_SeqAIJ *csp = (Mat_SeqAIJ*)mat->data; 1849e88777f2SHong Zhang PetscInt *colorforrow,*rows,*rows_i,*idxhit,*spidx,*den2sp,*den2sp_i; 1850e88777f2SHong Zhang PetscInt *colorforcol,*columns,*columns_i,brows; 1851e88777f2SHong Zhang PetscBool flg; 1852b1683b59SHong Zhang 1853b1683b59SHong Zhang PetscFunctionBegin; 1854071fcb05SBarry Smith ierr = ISColoringGetIS(iscoloring,PETSC_USE_POINTER,PETSC_IGNORE,&isa);CHKERRQ(ierr); 1855e99be685SHong Zhang 18567ecccc15SHong Zhang /* bs >1 is not being tested yet! */ 1857e88777f2SHong Zhang Nbs = mat->cmap->N/bs; 1858b1683b59SHong Zhang c->M = mat->rmap->N/bs; /* set total rows, columns and local rows */ 1859e88777f2SHong Zhang c->N = Nbs; 1860e88777f2SHong Zhang c->m = c->M; 1861b1683b59SHong Zhang c->rstart = 0; 1862e88777f2SHong Zhang c->brows = 100; 1863b1683b59SHong Zhang 1864b1683b59SHong Zhang c->ncolors = nis; 1865dcca6d9dSJed Brown ierr = PetscMalloc3(nis,&c->ncolumns,nis,&c->nrows,nis+1,&colorforrow);CHKERRQ(ierr); 1866854ce69bSBarry Smith ierr = PetscMalloc1(csp->nz+1,&rows);CHKERRQ(ierr); 1867854ce69bSBarry Smith ierr = PetscMalloc1(csp->nz+1,&den2sp);CHKERRQ(ierr); 1868e88777f2SHong Zhang 1869e88777f2SHong Zhang brows = c->brows; 1870c5929fdfSBarry Smith ierr = PetscOptionsGetInt(NULL,NULL,"-matden2sp_brows",&brows,&flg);CHKERRQ(ierr); 1871e88777f2SHong Zhang if (flg) c->brows = brows; 1872eeb4fd02SHong Zhang if (brows > 0) { 1873854ce69bSBarry Smith ierr = PetscMalloc1(nis+1,&c->lstart);CHKERRQ(ierr); 1874e88777f2SHong Zhang } 18752205254eSKarl Rupp 1876d2853540SHong Zhang colorforrow[0] = 0; 1877d2853540SHong Zhang rows_i = rows; 1878f99a636bSHong Zhang den2sp_i = den2sp; 1879b1683b59SHong Zhang 1880854ce69bSBarry Smith ierr = PetscMalloc1(nis+1,&colorforcol);CHKERRQ(ierr); 1881854ce69bSBarry Smith ierr = PetscMalloc1(Nbs+1,&columns);CHKERRQ(ierr); 18822205254eSKarl Rupp 1883d2853540SHong Zhang colorforcol[0] = 0; 1884d2853540SHong Zhang columns_i = columns; 1885d2853540SHong Zhang 1886077f23c2SHong Zhang /* get column-wise storage of mat */ 1887077f23c2SHong Zhang ierr = MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr); 1888b1683b59SHong Zhang 1889b28d80bdSHong Zhang cm = c->m; 1890854ce69bSBarry Smith ierr = PetscMalloc1(cm+1,&rowhit);CHKERRQ(ierr); 1891854ce69bSBarry Smith ierr = PetscMalloc1(cm+1,&idxhit);CHKERRQ(ierr); 1892f99a636bSHong Zhang for (i=0; i<nis; i++) { /* loop over color */ 1893b1683b59SHong Zhang ierr = ISGetLocalSize(isa[i],&n);CHKERRQ(ierr); 1894b1683b59SHong Zhang ierr = ISGetIndices(isa[i],&is);CHKERRQ(ierr); 18952205254eSKarl Rupp 1896b1683b59SHong Zhang c->ncolumns[i] = n; 1897b1683b59SHong Zhang if (n) { 1898580bdb30SBarry Smith ierr = PetscArraycpy(columns_i,is,n);CHKERRQ(ierr); 1899b1683b59SHong Zhang } 1900d2853540SHong Zhang colorforcol[i+1] = colorforcol[i] + n; 1901d2853540SHong Zhang columns_i += n; 1902b1683b59SHong Zhang 1903b1683b59SHong Zhang /* fast, crude version requires O(N*N) work */ 1904580bdb30SBarry Smith ierr = PetscArrayzero(rowhit,cm);CHKERRQ(ierr); 1905e99be685SHong Zhang 1906b7caf3d6SHong Zhang for (j=0; j<n; j++) { /* loop over columns*/ 1907b1683b59SHong Zhang col = is[j]; 1908d2853540SHong Zhang row_idx = cj + ci[col]; 1909b1683b59SHong Zhang m = ci[col+1] - ci[col]; 1910b7caf3d6SHong Zhang for (k=0; k<m; k++) { /* loop over columns marking them in rowhit */ 1911e99be685SHong Zhang idxhit[*row_idx] = spidx[ci[col] + k]; 1912d2853540SHong Zhang rowhit[*row_idx++] = col + 1; 1913b1683b59SHong Zhang } 1914b1683b59SHong Zhang } 1915b1683b59SHong Zhang /* count the number of hits */ 1916b1683b59SHong Zhang nrows = 0; 1917e8354b3eSHong Zhang for (j=0; j<cm; j++) { 1918b1683b59SHong Zhang if (rowhit[j]) nrows++; 1919b1683b59SHong Zhang } 1920b1683b59SHong Zhang c->nrows[i] = nrows; 1921d2853540SHong Zhang colorforrow[i+1] = colorforrow[i] + nrows; 1922d2853540SHong Zhang 1923b1683b59SHong Zhang nrows = 0; 1924b7caf3d6SHong Zhang for (j=0; j<cm; j++) { /* loop over rows */ 1925b1683b59SHong Zhang if (rowhit[j]) { 1926d2853540SHong Zhang rows_i[nrows] = j; 192712b89a43SHong Zhang den2sp_i[nrows] = idxhit[j]; 1928b1683b59SHong Zhang nrows++; 1929b1683b59SHong Zhang } 1930b1683b59SHong Zhang } 1931e88777f2SHong Zhang den2sp_i += nrows; 1932077f23c2SHong Zhang 1933b1683b59SHong Zhang ierr = ISRestoreIndices(isa[i],&is);CHKERRQ(ierr); 1934f99a636bSHong Zhang rows_i += nrows; 1935b1683b59SHong Zhang } 19360298fd71SBarry Smith ierr = MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr); 1937b28d80bdSHong Zhang ierr = PetscFree(rowhit);CHKERRQ(ierr); 1938071fcb05SBarry Smith ierr = ISColoringRestoreIS(iscoloring,PETSC_USE_POINTER,&isa);CHKERRQ(ierr); 1939d2853540SHong Zhang if (csp->nz != colorforrow[nis]) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"csp->nz %d != colorforrow[nis] %d",csp->nz,colorforrow[nis]); 1940b28d80bdSHong Zhang 1941d2853540SHong Zhang c->colorforrow = colorforrow; 1942d2853540SHong Zhang c->rows = rows; 1943f99a636bSHong Zhang c->den2sp = den2sp; 1944d2853540SHong Zhang c->colorforcol = colorforcol; 1945d2853540SHong Zhang c->columns = columns; 19462205254eSKarl Rupp 1947f94ccd6cSHong Zhang ierr = PetscFree(idxhit);CHKERRQ(ierr); 1948b1683b59SHong Zhang PetscFunctionReturn(0); 1949b1683b59SHong Zhang } 1950d013fc79Sandi selinger 19514222ddf1SHong Zhang /* --------------------------------------------------------------- */ 19524222ddf1SHong Zhang static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C) 1953df97dc6dSFande Kong { 19544222ddf1SHong Zhang PetscErrorCode ierr; 19554222ddf1SHong Zhang Mat_Product *product = C->product; 19564222ddf1SHong Zhang Mat A=product->A,B=product->B; 19574222ddf1SHong Zhang 1958df97dc6dSFande Kong PetscFunctionBegin; 19594222ddf1SHong Zhang if (C->ops->mattransposemultnumeric) { 19604222ddf1SHong Zhang /* Alg: "outerproduct" */ 19616718818eSStefano Zampini ierr = (*C->ops->mattransposemultnumeric)(A,B,C);CHKERRQ(ierr); 19624222ddf1SHong Zhang } else { 19634222ddf1SHong Zhang /* Alg: "matmatmult" -- C = At*B */ 19646718818eSStefano Zampini Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)product->data; 19656718818eSStefano Zampini Mat At; 19664222ddf1SHong Zhang 19676718818eSStefano Zampini if (!atb) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct"); 19686718818eSStefano Zampini At = atb->At; 1969089a957eSStefano Zampini if (atb->updateAt && At) { /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ() */ 19704222ddf1SHong Zhang ierr = MatTranspose_SeqAIJ(A,MAT_REUSE_MATRIX,&At);CHKERRQ(ierr); 19714222ddf1SHong Zhang } 1972089a957eSStefano Zampini ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ(At ? At : A,B,C);CHKERRQ(ierr); 19734222ddf1SHong Zhang atb->updateAt = PETSC_TRUE; 19744222ddf1SHong Zhang } 1975df97dc6dSFande Kong PetscFunctionReturn(0); 1976df97dc6dSFande Kong } 1977df97dc6dSFande Kong 19784222ddf1SHong Zhang static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C) 1979d013fc79Sandi selinger { 1980d013fc79Sandi selinger PetscErrorCode ierr; 19814222ddf1SHong Zhang Mat_Product *product = C->product; 19824222ddf1SHong Zhang Mat A=product->A,B=product->B; 19834222ddf1SHong Zhang PetscReal fill=product->fill; 1984d013fc79Sandi selinger 1985d013fc79Sandi selinger PetscFunctionBegin; 19864222ddf1SHong Zhang ierr = MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr); 19872869b61bSandi selinger 19884222ddf1SHong Zhang C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ; 19894222ddf1SHong Zhang PetscFunctionReturn(0); 19902869b61bSandi selinger } 1991d013fc79Sandi selinger 19924222ddf1SHong Zhang /* --------------------------------------------------------------- */ 19934222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C) 19944222ddf1SHong Zhang { 19954222ddf1SHong Zhang PetscErrorCode ierr; 19964222ddf1SHong Zhang Mat_Product *product = C->product; 19974222ddf1SHong Zhang PetscInt alg = 0; /* default algorithm */ 19984222ddf1SHong Zhang PetscBool flg = PETSC_FALSE; 19994222ddf1SHong Zhang #if !defined(PETSC_HAVE_HYPRE) 20004222ddf1SHong Zhang const char *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"}; 20014222ddf1SHong Zhang PetscInt nalg = 7; 20024222ddf1SHong Zhang #else 20034222ddf1SHong Zhang const char *algTypes[8] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge","hypre"}; 20044222ddf1SHong Zhang PetscInt nalg = 8; 20054222ddf1SHong Zhang #endif 20064222ddf1SHong Zhang 20074222ddf1SHong Zhang PetscFunctionBegin; 20084222ddf1SHong Zhang /* Set default algorithm */ 20094222ddf1SHong Zhang ierr = PetscStrcmp(C->product->alg,"default",&flg);CHKERRQ(ierr); 20104222ddf1SHong Zhang if (flg) { 20114222ddf1SHong Zhang ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 2012d013fc79Sandi selinger } 2013d013fc79Sandi selinger 20144222ddf1SHong Zhang /* Get runtime option */ 20154222ddf1SHong Zhang if (product->api_user) { 20164222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMult","Mat");CHKERRQ(ierr); 20174222ddf1SHong Zhang ierr = PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr); 20184222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 20194222ddf1SHong Zhang } else { 20204222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr); 20214222ddf1SHong Zhang ierr = PetscOptionsEList("-matproduct_ab_via","Algorithmic approach","MatProduct_AB",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr); 20224222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 2023d013fc79Sandi selinger } 20244222ddf1SHong Zhang if (flg) { 20254222ddf1SHong Zhang ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 2026d013fc79Sandi selinger } 2027d013fc79Sandi selinger 20284222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_AB; 20294222ddf1SHong Zhang C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ; 20304222ddf1SHong Zhang PetscFunctionReturn(0); 20314222ddf1SHong Zhang } 2032d013fc79Sandi selinger 20334222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C) 20344222ddf1SHong Zhang { 20354222ddf1SHong Zhang PetscErrorCode ierr; 20364222ddf1SHong Zhang Mat_Product *product = C->product; 20374222ddf1SHong Zhang PetscInt alg = 0; /* default algorithm */ 20384222ddf1SHong Zhang PetscBool flg = PETSC_FALSE; 2039089a957eSStefano Zampini const char *algTypes[3] = {"default","at*b","outerproduct"}; 2040089a957eSStefano Zampini PetscInt nalg = 3; 2041d013fc79Sandi selinger 20424222ddf1SHong Zhang PetscFunctionBegin; 20434222ddf1SHong Zhang /* Get runtime option */ 20444222ddf1SHong Zhang if (product->api_user) { 20454222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr); 20464222ddf1SHong Zhang ierr = PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr); 20474222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 20484222ddf1SHong Zhang } else { 20494222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr); 20504222ddf1SHong Zhang ierr = PetscOptionsEList("-matproduct_atb_via","Algorithmic approach","MatProduct_AtB",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr); 20514222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 20524222ddf1SHong Zhang } 20534222ddf1SHong Zhang if (flg) { 20544222ddf1SHong Zhang ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 20554222ddf1SHong Zhang } 2056d013fc79Sandi selinger 20574222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ; 20584222ddf1SHong Zhang PetscFunctionReturn(0); 20594222ddf1SHong Zhang } 20604222ddf1SHong Zhang 20614222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C) 20624222ddf1SHong Zhang { 20634222ddf1SHong Zhang PetscErrorCode ierr; 20644222ddf1SHong Zhang Mat_Product *product = C->product; 20654222ddf1SHong Zhang PetscInt alg = 0; /* default algorithm */ 20664222ddf1SHong Zhang PetscBool flg = PETSC_FALSE; 20674222ddf1SHong Zhang const char *algTypes[2] = {"default","color"}; 20684222ddf1SHong Zhang PetscInt nalg = 2; 20694222ddf1SHong Zhang 20704222ddf1SHong Zhang PetscFunctionBegin; 20714222ddf1SHong Zhang /* Set default algorithm */ 20724222ddf1SHong Zhang ierr = PetscStrcmp(C->product->alg,"default",&flg);CHKERRQ(ierr); 20734222ddf1SHong Zhang if (!flg) { 20744222ddf1SHong Zhang alg = 1; 20754222ddf1SHong Zhang ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 20764222ddf1SHong Zhang } 20774222ddf1SHong Zhang 20784222ddf1SHong Zhang /* Get runtime option */ 20794222ddf1SHong Zhang if (product->api_user) { 20804222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatTransposeMult","Mat");CHKERRQ(ierr); 20814222ddf1SHong Zhang ierr = PetscOptionsEList("-matmattransmult_via","Algorithmic approach","MatMatTransposeMult",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr); 20824222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 20834222ddf1SHong Zhang } else { 20844222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABt","Mat");CHKERRQ(ierr); 20854222ddf1SHong Zhang ierr = PetscOptionsEList("-matproduct_abt_via","Algorithmic approach","MatProduct_ABt",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr); 20864222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 20874222ddf1SHong Zhang } 20884222ddf1SHong Zhang if (flg) { 20894222ddf1SHong Zhang ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 20904222ddf1SHong Zhang } 20914222ddf1SHong Zhang 20924222ddf1SHong Zhang C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ; 20934222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_ABt; 20944222ddf1SHong Zhang PetscFunctionReturn(0); 20954222ddf1SHong Zhang } 20964222ddf1SHong Zhang 20974222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C) 20984222ddf1SHong Zhang { 20994222ddf1SHong Zhang PetscErrorCode ierr; 21004222ddf1SHong Zhang Mat_Product *product = C->product; 21014222ddf1SHong Zhang PetscBool flg = PETSC_FALSE; 21024222ddf1SHong Zhang PetscInt alg = 0; /* default algorithm -- alg=1 should be default!!! */ 21034222ddf1SHong Zhang #if !defined(PETSC_HAVE_HYPRE) 21044222ddf1SHong Zhang const char *algTypes[2] = {"scalable","rap"}; 21054222ddf1SHong Zhang PetscInt nalg = 2; 21064222ddf1SHong Zhang #else 21074222ddf1SHong Zhang const char *algTypes[3] = {"scalable","rap","hypre"}; 21084222ddf1SHong Zhang PetscInt nalg = 3; 21094222ddf1SHong Zhang #endif 21104222ddf1SHong Zhang 21114222ddf1SHong Zhang PetscFunctionBegin; 21124222ddf1SHong Zhang /* Set default algorithm */ 21134222ddf1SHong Zhang ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr); 21144222ddf1SHong Zhang if (flg) { 21154222ddf1SHong Zhang ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 21164222ddf1SHong Zhang } 21174222ddf1SHong Zhang 21184222ddf1SHong Zhang /* Get runtime option */ 21194222ddf1SHong Zhang if (product->api_user) { 21204222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatPtAP","Mat");CHKERRQ(ierr); 21214222ddf1SHong Zhang ierr = PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr); 21224222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 21234222ddf1SHong Zhang } else { 21244222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr); 21254222ddf1SHong Zhang ierr = PetscOptionsEList("-matproduct_ptap_via","Algorithmic approach","MatProduct_PtAP",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr); 21264222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 21274222ddf1SHong Zhang } 21284222ddf1SHong Zhang if (flg) { 21294222ddf1SHong Zhang ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 21304222ddf1SHong Zhang } 21314222ddf1SHong Zhang 21324222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ; 21334222ddf1SHong Zhang PetscFunctionReturn(0); 21344222ddf1SHong Zhang } 21354222ddf1SHong Zhang 21364222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C) 21374222ddf1SHong Zhang { 21384222ddf1SHong Zhang PetscErrorCode ierr; 21394222ddf1SHong Zhang Mat_Product *product = C->product; 21404222ddf1SHong Zhang PetscBool flg = PETSC_FALSE; 21414222ddf1SHong Zhang PetscInt alg = 0; /* default algorithm */ 21424222ddf1SHong Zhang const char *algTypes[3] = {"r*a*rt","r*art","coloring_rart"}; 21434222ddf1SHong Zhang PetscInt nalg = 3; 21444222ddf1SHong Zhang 21454222ddf1SHong Zhang PetscFunctionBegin; 21464222ddf1SHong Zhang /* Set default algorithm */ 21474222ddf1SHong Zhang ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr); 21484222ddf1SHong Zhang if (flg) { 21494222ddf1SHong Zhang ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 21504222ddf1SHong Zhang } 21514222ddf1SHong Zhang 21524222ddf1SHong Zhang /* Get runtime option */ 21534222ddf1SHong Zhang if (product->api_user) { 21544222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatRARt","Mat");CHKERRQ(ierr); 21554222ddf1SHong Zhang ierr = PetscOptionsEList("-matrart_via","Algorithmic approach","MatRARt",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr); 21564222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 21574222ddf1SHong Zhang } else { 21584222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr); 21594222ddf1SHong Zhang ierr = PetscOptionsEList("-matproduct_rart_via","Algorithmic approach","MatProduct_RARt",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr); 21604222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 21614222ddf1SHong Zhang } 21624222ddf1SHong Zhang if (flg) { 21634222ddf1SHong Zhang ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 21644222ddf1SHong Zhang } 21654222ddf1SHong Zhang 21664222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ; 21674222ddf1SHong Zhang PetscFunctionReturn(0); 21684222ddf1SHong Zhang } 21694222ddf1SHong Zhang 21704222ddf1SHong Zhang /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */ 21714222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C) 21724222ddf1SHong Zhang { 21734222ddf1SHong Zhang PetscErrorCode ierr; 21744222ddf1SHong Zhang Mat_Product *product = C->product; 21754222ddf1SHong Zhang PetscInt alg = 0; /* default algorithm */ 21764222ddf1SHong Zhang PetscBool flg = PETSC_FALSE; 21774222ddf1SHong Zhang const char *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"}; 21784222ddf1SHong Zhang PetscInt nalg = 7; 21794222ddf1SHong Zhang 21804222ddf1SHong Zhang PetscFunctionBegin; 21814222ddf1SHong Zhang /* Set default algorithm */ 21824222ddf1SHong Zhang ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr); 21834222ddf1SHong Zhang if (flg) { 21844222ddf1SHong Zhang ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 21854222ddf1SHong Zhang } 21864222ddf1SHong Zhang 21874222ddf1SHong Zhang /* Get runtime option */ 21884222ddf1SHong Zhang if (product->api_user) { 21894222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr); 21904222ddf1SHong Zhang ierr = PetscOptionsEList("-matmatmatmult_via","Algorithmic approach","MatMatMatMult",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr); 21914222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 21924222ddf1SHong Zhang } else { 21934222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr); 21944222ddf1SHong Zhang ierr = PetscOptionsEList("-matproduct_abc_via","Algorithmic approach","MatProduct_ABC",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr); 21954222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 21964222ddf1SHong Zhang } 21974222ddf1SHong Zhang if (flg) { 21984222ddf1SHong Zhang ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 21994222ddf1SHong Zhang } 22004222ddf1SHong Zhang 22014222ddf1SHong Zhang C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ; 22024222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_ABC; 22034222ddf1SHong Zhang PetscFunctionReturn(0); 22044222ddf1SHong Zhang } 22054222ddf1SHong Zhang 22064222ddf1SHong Zhang PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C) 22074222ddf1SHong Zhang { 22084222ddf1SHong Zhang PetscErrorCode ierr; 22094222ddf1SHong Zhang Mat_Product *product = C->product; 22104222ddf1SHong Zhang 22114222ddf1SHong Zhang PetscFunctionBegin; 22124222ddf1SHong Zhang switch (product->type) { 22134222ddf1SHong Zhang case MATPRODUCT_AB: 22144222ddf1SHong Zhang ierr = MatProductSetFromOptions_SeqAIJ_AB(C);CHKERRQ(ierr); 22154222ddf1SHong Zhang break; 22164222ddf1SHong Zhang case MATPRODUCT_AtB: 22174222ddf1SHong Zhang ierr = MatProductSetFromOptions_SeqAIJ_AtB(C);CHKERRQ(ierr); 22184222ddf1SHong Zhang break; 22194222ddf1SHong Zhang case MATPRODUCT_ABt: 22204222ddf1SHong Zhang ierr = MatProductSetFromOptions_SeqAIJ_ABt(C);CHKERRQ(ierr); 22214222ddf1SHong Zhang break; 22224222ddf1SHong Zhang case MATPRODUCT_PtAP: 22234222ddf1SHong Zhang ierr = MatProductSetFromOptions_SeqAIJ_PtAP(C);CHKERRQ(ierr); 22244222ddf1SHong Zhang break; 22254222ddf1SHong Zhang case MATPRODUCT_RARt: 22264222ddf1SHong Zhang ierr = MatProductSetFromOptions_SeqAIJ_RARt(C);CHKERRQ(ierr); 22274222ddf1SHong Zhang break; 22284222ddf1SHong Zhang case MATPRODUCT_ABC: 22294222ddf1SHong Zhang ierr = MatProductSetFromOptions_SeqAIJ_ABC(C);CHKERRQ(ierr); 22304222ddf1SHong Zhang break; 22316718818eSStefano Zampini default: 22326718818eSStefano Zampini break; 22334222ddf1SHong Zhang } 2234d013fc79Sandi selinger PetscFunctionReturn(0); 2235d013fc79Sandi selinger } 2236