1be1d678aSKris Buschelman 2d50806bdSBarry Smith /* 32499ec78SHong Zhang Defines matrix-matrix product routines for pairs of SeqAIJ matrices 4d50806bdSBarry Smith C = A * B 5d50806bdSBarry Smith */ 6d50806bdSBarry Smith 7c6db04a5SJed Brown #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 8c6db04a5SJed Brown #include <../src/mat/utils/freespace.h> 9c6db04a5SJed Brown #include <petscbt.h> 10af0996ceSBarry Smith #include <petsc/private/isimpl.h> 1107475bc1SBarry Smith #include <../src/mat/impls/dense/seq/dense.h> 127bab7c10SHong Zhang 13df97dc6dSFande Kong PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C) 14df97dc6dSFande Kong { 15df97dc6dSFande Kong PetscFunctionBegin; 16df97dc6dSFande Kong if (C->ops->matmultnumeric) { 172c71b3e2SJacob Faibussowitsch PetscCheckFalse(C->ops->matmultnumeric == MatMatMultNumeric_SeqAIJ_SeqAIJ,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Recursive call"); 185f80ce2aSJacob Faibussowitsch CHKERRQ((*C->ops->matmultnumeric)(A,B,C)); 19df97dc6dSFande Kong } else { 205f80ce2aSJacob Faibussowitsch CHKERRQ(MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A,B,C)); 21df97dc6dSFande Kong } 22df97dc6dSFande Kong PetscFunctionReturn(0); 23df97dc6dSFande Kong } 24df97dc6dSFande Kong 254222ddf1SHong Zhang /* Modified from MatCreateSeqAIJWithArrays() */ 26e4e71118SStefano Zampini PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt i[],PetscInt j[],PetscScalar a[],MatType mtype,Mat mat) 27df97dc6dSFande Kong { 284222ddf1SHong Zhang PetscInt ii; 294222ddf1SHong Zhang Mat_SeqAIJ *aij; 30cbc6b225SStefano Zampini PetscBool isseqaij, osingle, ofree_a, ofree_ij; 315c66b693SKris Buschelman 325c66b693SKris Buschelman PetscFunctionBegin; 332c71b3e2SJacob Faibussowitsch PetscCheckFalse(m > 0 && i[0],PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"i (row indices) must start with 0"); 345f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetSizes(mat,m,n,m,n)); 354222ddf1SHong Zhang 36e4e71118SStefano Zampini if (!mtype) { 375f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectBaseTypeCompare((PetscObject)mat,MATSEQAIJ,&isseqaij)); 385f80ce2aSJacob Faibussowitsch if (!isseqaij) CHKERRQ(MatSetType(mat,MATSEQAIJ)); 39e4e71118SStefano Zampini } else { 405f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetType(mat,mtype)); 41e4e71118SStefano Zampini } 42cbc6b225SStefano Zampini 434222ddf1SHong Zhang aij = (Mat_SeqAIJ*)(mat)->data; 44cbc6b225SStefano Zampini osingle = aij->singlemalloc; 45cbc6b225SStefano Zampini ofree_a = aij->free_a; 46cbc6b225SStefano Zampini ofree_ij = aij->free_ij; 47cbc6b225SStefano Zampini /* changes the free flags */ 485f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJSetPreallocation_SeqAIJ(mat,MAT_SKIP_ALLOCATION,NULL)); 49cbc6b225SStefano Zampini 505f80ce2aSJacob Faibussowitsch CHKERRQ(PetscFree(aij->ilen)); 515f80ce2aSJacob Faibussowitsch CHKERRQ(PetscFree(aij->imax)); 525f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(m,&aij->imax)); 535f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(m,&aij->ilen)); 54cbc6b225SStefano Zampini for (ii=0,aij->nonzerorowcnt=0,aij->rmax = 0; ii<m; ii++) { 55cbc6b225SStefano Zampini const PetscInt rnz = i[ii+1] - i[ii]; 56cbc6b225SStefano Zampini aij->nonzerorowcnt += !!rnz; 57cbc6b225SStefano Zampini aij->rmax = PetscMax(aij->rmax,rnz); 58cbc6b225SStefano Zampini aij->ilen[ii] = aij->imax[ii] = i[ii+1] - i[ii]; 59cbc6b225SStefano Zampini } 60cbc6b225SStefano Zampini aij->maxnz = i[m]; 61cbc6b225SStefano Zampini aij->nz = i[m]; 624222ddf1SHong Zhang 63cbc6b225SStefano Zampini if (osingle) { 645f80ce2aSJacob Faibussowitsch CHKERRQ(PetscFree3(aij->a,aij->j,aij->i)); 65cbc6b225SStefano Zampini } else { 665f80ce2aSJacob Faibussowitsch if (ofree_a) CHKERRQ(PetscFree(aij->a)); 675f80ce2aSJacob Faibussowitsch if (ofree_ij) CHKERRQ(PetscFree(aij->j)); 685f80ce2aSJacob Faibussowitsch if (ofree_ij) CHKERRQ(PetscFree(aij->i)); 69cbc6b225SStefano Zampini } 704222ddf1SHong Zhang aij->i = i; 714222ddf1SHong Zhang aij->j = j; 724222ddf1SHong Zhang aij->a = a; 734222ddf1SHong Zhang aij->nonew = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */ 74cbc6b225SStefano Zampini /* default to not retain ownership */ 75cbc6b225SStefano Zampini aij->singlemalloc = PETSC_FALSE; 764222ddf1SHong Zhang aij->free_a = PETSC_FALSE; 774222ddf1SHong Zhang aij->free_ij = PETSC_FALSE; 785f80ce2aSJacob Faibussowitsch CHKERRQ(MatCheckCompressedRow(mat,aij->nonzerorowcnt,&aij->compressedrow,aij->i,m,0.6)); 795c66b693SKris Buschelman PetscFunctionReturn(0); 805c66b693SKris Buschelman } 811c24bd37SHong Zhang 824222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C) 834222ddf1SHong Zhang { 844222ddf1SHong Zhang Mat_Product *product = C->product; 854222ddf1SHong Zhang MatProductAlgorithm alg; 864222ddf1SHong Zhang PetscBool flg; 874222ddf1SHong Zhang 884222ddf1SHong Zhang PetscFunctionBegin; 894222ddf1SHong Zhang if (product) { 904222ddf1SHong Zhang alg = product->alg; 914222ddf1SHong Zhang } else { 924222ddf1SHong Zhang alg = "sorted"; 934222ddf1SHong Zhang } 944222ddf1SHong Zhang /* sorted */ 955f80ce2aSJacob Faibussowitsch CHKERRQ(PetscStrcmp(alg,"sorted",&flg)); 964222ddf1SHong Zhang if (flg) { 975f80ce2aSJacob Faibussowitsch CHKERRQ(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A,B,fill,C)); 984222ddf1SHong Zhang PetscFunctionReturn(0); 994222ddf1SHong Zhang } 1004222ddf1SHong Zhang 1014222ddf1SHong Zhang /* scalable */ 1025f80ce2aSJacob Faibussowitsch CHKERRQ(PetscStrcmp(alg,"scalable",&flg)); 1034222ddf1SHong Zhang if (flg) { 1045f80ce2aSJacob Faibussowitsch CHKERRQ(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A,B,fill,C)); 1054222ddf1SHong Zhang PetscFunctionReturn(0); 1064222ddf1SHong Zhang } 1074222ddf1SHong Zhang 1084222ddf1SHong Zhang /* scalable_fast */ 1095f80ce2aSJacob Faibussowitsch CHKERRQ(PetscStrcmp(alg,"scalable_fast",&flg)); 1104222ddf1SHong Zhang if (flg) { 1115f80ce2aSJacob Faibussowitsch CHKERRQ(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A,B,fill,C)); 1124222ddf1SHong Zhang PetscFunctionReturn(0); 1134222ddf1SHong Zhang } 1144222ddf1SHong Zhang 1154222ddf1SHong Zhang /* heap */ 1165f80ce2aSJacob Faibussowitsch CHKERRQ(PetscStrcmp(alg,"heap",&flg)); 1174222ddf1SHong Zhang if (flg) { 1185f80ce2aSJacob Faibussowitsch CHKERRQ(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A,B,fill,C)); 1194222ddf1SHong Zhang PetscFunctionReturn(0); 1204222ddf1SHong Zhang } 1214222ddf1SHong Zhang 1224222ddf1SHong Zhang /* btheap */ 1235f80ce2aSJacob Faibussowitsch CHKERRQ(PetscStrcmp(alg,"btheap",&flg)); 1244222ddf1SHong Zhang if (flg) { 1255f80ce2aSJacob Faibussowitsch CHKERRQ(MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A,B,fill,C)); 1264222ddf1SHong Zhang PetscFunctionReturn(0); 1274222ddf1SHong Zhang } 1284222ddf1SHong Zhang 1294222ddf1SHong Zhang /* llcondensed */ 1305f80ce2aSJacob Faibussowitsch CHKERRQ(PetscStrcmp(alg,"llcondensed",&flg)); 1314222ddf1SHong Zhang if (flg) { 1325f80ce2aSJacob Faibussowitsch CHKERRQ(MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A,B,fill,C)); 1334222ddf1SHong Zhang PetscFunctionReturn(0); 1344222ddf1SHong Zhang } 1354222ddf1SHong Zhang 1364222ddf1SHong Zhang /* rowmerge */ 1375f80ce2aSJacob Faibussowitsch CHKERRQ(PetscStrcmp(alg,"rowmerge",&flg)); 1384222ddf1SHong Zhang if (flg) { 1395f80ce2aSJacob Faibussowitsch CHKERRQ(MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A,B,fill,C)); 1404222ddf1SHong Zhang PetscFunctionReturn(0); 1414222ddf1SHong Zhang } 1424222ddf1SHong Zhang 1434222ddf1SHong Zhang #if defined(PETSC_HAVE_HYPRE) 1445f80ce2aSJacob Faibussowitsch CHKERRQ(PetscStrcmp(alg,"hypre",&flg)); 1454222ddf1SHong Zhang if (flg) { 1465f80ce2aSJacob Faibussowitsch CHKERRQ(MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A,B,fill,C)); 1474222ddf1SHong Zhang PetscFunctionReturn(0); 1484222ddf1SHong Zhang } 1494222ddf1SHong Zhang #endif 1504222ddf1SHong Zhang 1514222ddf1SHong Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported"); 1524222ddf1SHong Zhang } 1534222ddf1SHong Zhang 1544222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A,Mat B,PetscReal fill,Mat C) 155b561aa9dSHong Zhang { 156b561aa9dSHong Zhang Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 157971236abSHong Zhang PetscInt *ai=a->i,*bi=b->i,*ci,*cj; 158c1ba5575SJed Brown PetscInt am =A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 159b561aa9dSHong Zhang PetscReal afill; 160eca6b86aSHong Zhang PetscInt i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax; 161eca6b86aSHong Zhang PetscTable ta; 162fb908031SHong Zhang PetscBT lnkbt; 1630298fd71SBarry Smith PetscFreeSpaceList free_space=NULL,current_space=NULL; 164b561aa9dSHong Zhang 165b561aa9dSHong Zhang PetscFunctionBegin; 166bd958071SHong Zhang /* Get ci and cj */ 167bd958071SHong Zhang /*---------------*/ 168fb908031SHong Zhang /* Allocate ci array, arrays for fill computation and */ 169fb908031SHong Zhang /* free space for accumulating nonzero column info */ 1705f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(am+2,&ci)); 171fb908031SHong Zhang ci[0] = 0; 172fb908031SHong Zhang 173fb908031SHong Zhang /* create and initialize a linked list */ 1745f80ce2aSJacob Faibussowitsch CHKERRQ(PetscTableCreate(bn,bn,&ta)); 175c373ccc6SHong Zhang MatRowMergeMax_SeqAIJ(b,bm,ta); 1765f80ce2aSJacob Faibussowitsch CHKERRQ(PetscTableGetCount(ta,&Crmax)); 1775f80ce2aSJacob Faibussowitsch CHKERRQ(PetscTableDestroy(&ta)); 178eca6b86aSHong Zhang 1795f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLLCondensedCreate(Crmax,bn,&lnk,&lnkbt)); 180fb908031SHong Zhang 181fb908031SHong Zhang /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 1825f80ce2aSJacob Faibussowitsch CHKERRQ(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space)); 1832205254eSKarl Rupp 184fb908031SHong Zhang current_space = free_space; 185fb908031SHong Zhang 186fb908031SHong Zhang /* Determine ci and cj */ 187fb908031SHong Zhang for (i=0; i<am; i++) { 188fb908031SHong Zhang anzi = ai[i+1] - ai[i]; 189fb908031SHong Zhang aj = a->j + ai[i]; 190fb908031SHong Zhang for (j=0; j<anzi; j++) { 191fb908031SHong Zhang brow = aj[j]; 192fb908031SHong Zhang bnzj = bi[brow+1] - bi[brow]; 193fb908031SHong Zhang bj = b->j + bi[brow]; 194fb908031SHong Zhang /* add non-zero cols of B into the sorted linked list lnk */ 1955f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLLCondensedAddSorted(bnzj,bj,lnk,lnkbt)); 196fb908031SHong Zhang } 1978c78258cSHong Zhang /* add possible missing diagonal entry */ 1988c78258cSHong Zhang if (C->force_diagonals) { 1995f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLLCondensedAddSorted(1,&i,lnk,lnkbt)); 2008c78258cSHong Zhang } 201fb908031SHong Zhang cnzi = lnk[0]; 202fb908031SHong Zhang 203fb908031SHong Zhang /* If free space is not available, make more free space */ 204fb908031SHong Zhang /* Double the amount of total space in the list */ 205fb908031SHong Zhang if (current_space->local_remaining<cnzi) { 2065f80ce2aSJacob Faibussowitsch CHKERRQ(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),¤t_space)); 207fb908031SHong Zhang ndouble++; 208fb908031SHong Zhang } 209fb908031SHong Zhang 210fb908031SHong Zhang /* Copy data into free space, then initialize lnk */ 2115f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLLCondensedClean(bn,cnzi,current_space->array,lnk,lnkbt)); 2122205254eSKarl Rupp 213fb908031SHong Zhang current_space->array += cnzi; 214fb908031SHong Zhang current_space->local_used += cnzi; 215fb908031SHong Zhang current_space->local_remaining -= cnzi; 2162205254eSKarl Rupp 217fb908031SHong Zhang ci[i+1] = ci[i] + cnzi; 218fb908031SHong Zhang } 219fb908031SHong Zhang 220fb908031SHong Zhang /* Column indices are in the list of free space */ 221fb908031SHong Zhang /* Allocate space for cj, initialize cj, and */ 222fb908031SHong Zhang /* destroy list of free space and other temporary array(s) */ 2235f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(ci[am]+1,&cj)); 2245f80ce2aSJacob Faibussowitsch CHKERRQ(PetscFreeSpaceContiguous(&free_space,cj)); 2255f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLLCondensedDestroy(lnk,lnkbt)); 226b561aa9dSHong Zhang 22726be0446SHong Zhang /* put together the new symbolic matrix */ 2285f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C)); 2295f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetBlockSizesFromMats(C,A,B)); 23058c24d83SHong Zhang 23158c24d83SHong Zhang /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 23258c24d83SHong Zhang /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 2334222ddf1SHong Zhang c = (Mat_SeqAIJ*)(C->data); 234aa1aec99SHong Zhang c->free_a = PETSC_FALSE; 235e6b907acSBarry Smith c->free_ij = PETSC_TRUE; 23658c24d83SHong Zhang c->nonew = 0; 2374222ddf1SHong Zhang 2384222ddf1SHong Zhang /* fast, needs non-scalable O(bn) array 'abdense' */ 2394222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 2400b7e3e3dSHong Zhang 2417212da7cSHong Zhang /* set MatInfo */ 2427212da7cSHong Zhang afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 243f2b054eeSHong Zhang if (afill < 1.0) afill = 1.0; 2444222ddf1SHong Zhang C->info.mallocs = ndouble; 2454222ddf1SHong Zhang C->info.fill_ratio_given = fill; 2464222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 2477212da7cSHong Zhang 2487212da7cSHong Zhang #if defined(PETSC_USE_INFO) 2497212da7cSHong Zhang if (ci[am]) { 2505f80ce2aSJacob Faibussowitsch CHKERRQ(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill)); 2515f80ce2aSJacob Faibussowitsch CHKERRQ(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill)); 252f2b054eeSHong Zhang } else { 2535f80ce2aSJacob Faibussowitsch CHKERRQ(PetscInfo(C,"Empty matrix product\n")); 254be0fcf8dSHong Zhang } 255f2b054eeSHong Zhang #endif 25658c24d83SHong Zhang PetscFunctionReturn(0); 25758c24d83SHong Zhang } 258d50806bdSBarry Smith 259df97dc6dSFande Kong PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,Mat C) 260d50806bdSBarry Smith { 261d13dce4bSSatish Balay PetscLogDouble flops=0.0; 262d50806bdSBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 263d50806bdSBarry Smith Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 264d50806bdSBarry Smith Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 26538baddfdSBarry Smith PetscInt *ai =a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j; 266c58ca2e3SHong Zhang PetscInt am =A->rmap->n,cm=C->rmap->n; 267519eb980SHong Zhang PetscInt i,j,k,anzi,bnzi,cnzi,brow; 2682e5835c6SStefano Zampini PetscScalar *ca,valtmp; 269aa1aec99SHong Zhang PetscScalar *ab_dense; 2706718818eSStefano Zampini PetscContainer cab_dense; 2712e5835c6SStefano Zampini const PetscScalar *aa,*ba,*baj; 272d50806bdSBarry Smith 273d50806bdSBarry Smith PetscFunctionBegin; 2745f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJGetArrayRead(A,&aa)); 2755f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJGetArrayRead(B,&ba)); 276aa1aec99SHong Zhang if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */ 2775f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(ci[cm]+1,&ca)); 278aa1aec99SHong Zhang c->a = ca; 279aa1aec99SHong Zhang c->free_a = PETSC_TRUE; 2806718818eSStefano Zampini } else ca = c->a; 2816718818eSStefano Zampini 2826718818eSStefano Zampini /* TODO this should be done in the symbolic phase */ 2836718818eSStefano Zampini /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers 2846718818eSStefano Zampini that is hard to eradicate) */ 2855f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectQuery((PetscObject)C,"__PETSc__ab_dense",(PetscObject*)&cab_dense)); 2866718818eSStefano Zampini if (!cab_dense) { 2875f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(B->cmap->N,&ab_dense)); 2885f80ce2aSJacob Faibussowitsch CHKERRQ(PetscContainerCreate(PETSC_COMM_SELF,&cab_dense)); 2895f80ce2aSJacob Faibussowitsch CHKERRQ(PetscContainerSetPointer(cab_dense,ab_dense)); 2905f80ce2aSJacob Faibussowitsch CHKERRQ(PetscContainerSetUserDestroy(cab_dense,PetscContainerUserDestroyDefault)); 2915f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectCompose((PetscObject)C,"__PETSc__ab_dense",(PetscObject)cab_dense)); 2925f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectDereference((PetscObject)cab_dense)); 293d90d86d0SMatthew G. Knepley } 2945f80ce2aSJacob Faibussowitsch CHKERRQ(PetscContainerGetPointer(cab_dense,(void**)&ab_dense)); 2955f80ce2aSJacob Faibussowitsch CHKERRQ(PetscArrayzero(ab_dense,B->cmap->N)); 296aa1aec99SHong Zhang 297c124e916SHong Zhang /* clean old values in C */ 2985f80ce2aSJacob Faibussowitsch CHKERRQ(PetscArrayzero(ca,ci[cm])); 299d50806bdSBarry Smith /* Traverse A row-wise. */ 300d50806bdSBarry Smith /* Build the ith row in C by summing over nonzero columns in A, */ 301d50806bdSBarry Smith /* the rows of B corresponding to nonzeros of A. */ 302d50806bdSBarry Smith for (i=0; i<am; i++) { 303d50806bdSBarry Smith anzi = ai[i+1] - ai[i]; 304d50806bdSBarry Smith for (j=0; j<anzi; j++) { 305519eb980SHong Zhang brow = aj[j]; 306d50806bdSBarry Smith bnzi = bi[brow+1] - bi[brow]; 307d50806bdSBarry Smith bjj = bj + bi[brow]; 308d50806bdSBarry Smith baj = ba + bi[brow]; 309519eb980SHong Zhang /* perform dense axpy */ 31036ec6d2dSHong Zhang valtmp = aa[j]; 311519eb980SHong Zhang for (k=0; k<bnzi; k++) { 31236ec6d2dSHong Zhang ab_dense[bjj[k]] += valtmp*baj[k]; 313519eb980SHong Zhang } 314519eb980SHong Zhang flops += 2*bnzi; 315519eb980SHong Zhang } 316c58ca2e3SHong Zhang aj += anzi; aa += anzi; 317c58ca2e3SHong Zhang 318c58ca2e3SHong Zhang cnzi = ci[i+1] - ci[i]; 319519eb980SHong Zhang for (k=0; k<cnzi; k++) { 320519eb980SHong Zhang ca[k] += ab_dense[cj[k]]; 321519eb980SHong Zhang ab_dense[cj[k]] = 0.0; /* zero ab_dense */ 322519eb980SHong Zhang } 323519eb980SHong Zhang flops += cnzi; 324519eb980SHong Zhang cj += cnzi; ca += cnzi; 325519eb980SHong Zhang } 3262e5835c6SStefano Zampini #if defined(PETSC_HAVE_DEVICE) 3272e5835c6SStefano Zampini if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU; 3282e5835c6SStefano Zampini #endif 3295f80ce2aSJacob Faibussowitsch CHKERRQ(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY)); 3305f80ce2aSJacob Faibussowitsch CHKERRQ(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY)); 3315f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogFlops(flops)); 3325f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJRestoreArrayRead(A,&aa)); 3335f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJRestoreArrayRead(B,&ba)); 334c58ca2e3SHong Zhang PetscFunctionReturn(0); 335c58ca2e3SHong Zhang } 336c58ca2e3SHong Zhang 33725023028SHong Zhang PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C) 338c58ca2e3SHong Zhang { 339c58ca2e3SHong Zhang PetscLogDouble flops=0.0; 340c58ca2e3SHong Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 341c58ca2e3SHong Zhang Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 342c58ca2e3SHong Zhang Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 343c58ca2e3SHong Zhang PetscInt *ai = a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j; 344c58ca2e3SHong Zhang PetscInt am = A->rmap->N,cm=C->rmap->N; 345c58ca2e3SHong Zhang PetscInt i,j,k,anzi,bnzi,cnzi,brow; 3462e5835c6SStefano Zampini PetscScalar *ca=c->a,valtmp; 3472e5835c6SStefano Zampini const PetscScalar *aa,*ba,*baj; 348c58ca2e3SHong Zhang PetscInt nextb; 349c58ca2e3SHong Zhang 350c58ca2e3SHong Zhang PetscFunctionBegin; 3515f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJGetArrayRead(A,&aa)); 3525f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJGetArrayRead(B,&ba)); 353cf742fccSHong Zhang if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */ 3545f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(ci[cm]+1,&ca)); 355cf742fccSHong Zhang c->a = ca; 356cf742fccSHong Zhang c->free_a = PETSC_TRUE; 357cf742fccSHong Zhang } 358cf742fccSHong Zhang 359c58ca2e3SHong Zhang /* clean old values in C */ 3605f80ce2aSJacob Faibussowitsch CHKERRQ(PetscArrayzero(ca,ci[cm])); 361c58ca2e3SHong Zhang /* Traverse A row-wise. */ 362c58ca2e3SHong Zhang /* Build the ith row in C by summing over nonzero columns in A, */ 363c58ca2e3SHong Zhang /* the rows of B corresponding to nonzeros of A. */ 364519eb980SHong Zhang for (i=0; i<am; i++) { 365519eb980SHong Zhang anzi = ai[i+1] - ai[i]; 366519eb980SHong Zhang cnzi = ci[i+1] - ci[i]; 367519eb980SHong Zhang for (j=0; j<anzi; j++) { 368519eb980SHong Zhang brow = aj[j]; 369519eb980SHong Zhang bnzi = bi[brow+1] - bi[brow]; 370519eb980SHong Zhang bjj = bj + bi[brow]; 371519eb980SHong Zhang baj = ba + bi[brow]; 372519eb980SHong Zhang /* perform sparse axpy */ 37336ec6d2dSHong Zhang valtmp = aa[j]; 374c124e916SHong Zhang nextb = 0; 375c124e916SHong Zhang for (k=0; nextb<bnzi; k++) { 376c124e916SHong Zhang if (cj[k] == bjj[nextb]) { /* ccol == bcol */ 37736ec6d2dSHong Zhang ca[k] += valtmp*baj[nextb++]; 378c124e916SHong Zhang } 379d50806bdSBarry Smith } 380d50806bdSBarry Smith flops += 2*bnzi; 381d50806bdSBarry Smith } 382519eb980SHong Zhang aj += anzi; aa += anzi; 383519eb980SHong Zhang cj += cnzi; ca += cnzi; 384d50806bdSBarry Smith } 3852e5835c6SStefano Zampini #if defined(PETSC_HAVE_DEVICE) 3862e5835c6SStefano Zampini if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU; 3872e5835c6SStefano Zampini #endif 3885f80ce2aSJacob Faibussowitsch CHKERRQ(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY)); 3895f80ce2aSJacob Faibussowitsch CHKERRQ(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY)); 3905f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogFlops(flops)); 3915f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJRestoreArrayRead(A,&aa)); 3925f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJRestoreArrayRead(B,&ba)); 393d50806bdSBarry Smith PetscFunctionReturn(0); 394d50806bdSBarry Smith } 395bc011b1eSHong Zhang 3964222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A,Mat B,PetscReal fill,Mat C) 39725296bd5SBarry Smith { 39825296bd5SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 39925296bd5SBarry Smith PetscInt *ai = a->i,*bi=b->i,*ci,*cj; 4003c50cad2SHong Zhang PetscInt am = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 40125296bd5SBarry Smith MatScalar *ca; 40225296bd5SBarry Smith PetscReal afill; 403eca6b86aSHong Zhang PetscInt i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax; 404eca6b86aSHong Zhang PetscTable ta; 4050298fd71SBarry Smith PetscFreeSpaceList free_space=NULL,current_space=NULL; 40625296bd5SBarry Smith 40725296bd5SBarry Smith PetscFunctionBegin; 4083c50cad2SHong Zhang /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */ 4093c50cad2SHong Zhang /*-----------------------------------------------------------------------------------------*/ 4103c50cad2SHong Zhang /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 4115f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(am+2,&ci)); 4123c50cad2SHong Zhang ci[0] = 0; 4133c50cad2SHong Zhang 4143c50cad2SHong Zhang /* create and initialize a linked list */ 4155f80ce2aSJacob Faibussowitsch CHKERRQ(PetscTableCreate(bn,bn,&ta)); 416c373ccc6SHong Zhang MatRowMergeMax_SeqAIJ(b,bm,ta); 4175f80ce2aSJacob Faibussowitsch CHKERRQ(PetscTableGetCount(ta,&Crmax)); 4185f80ce2aSJacob Faibussowitsch CHKERRQ(PetscTableDestroy(&ta)); 419eca6b86aSHong Zhang 4205f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLLCondensedCreate_fast(Crmax,&lnk)); 4213c50cad2SHong Zhang 4223c50cad2SHong Zhang /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 4235f80ce2aSJacob Faibussowitsch CHKERRQ(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space)); 4243c50cad2SHong Zhang current_space = free_space; 4253c50cad2SHong Zhang 4263c50cad2SHong Zhang /* Determine ci and cj */ 4273c50cad2SHong Zhang for (i=0; i<am; i++) { 4283c50cad2SHong Zhang anzi = ai[i+1] - ai[i]; 4293c50cad2SHong Zhang aj = a->j + ai[i]; 4303c50cad2SHong Zhang for (j=0; j<anzi; j++) { 4313c50cad2SHong Zhang brow = aj[j]; 4323c50cad2SHong Zhang bnzj = bi[brow+1] - bi[brow]; 4333c50cad2SHong Zhang bj = b->j + bi[brow]; 4343c50cad2SHong Zhang /* add non-zero cols of B into the sorted linked list lnk */ 4355f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLLCondensedAddSorted_fast(bnzj,bj,lnk)); 4363c50cad2SHong Zhang } 4378c78258cSHong Zhang /* add possible missing diagonal entry */ 4388c78258cSHong Zhang if (C->force_diagonals) { 4395f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLLCondensedAddSorted_fast(1,&i,lnk)); 4408c78258cSHong Zhang } 4413c50cad2SHong Zhang cnzi = lnk[1]; 4423c50cad2SHong Zhang 4433c50cad2SHong Zhang /* If free space is not available, make more free space */ 4443c50cad2SHong Zhang /* Double the amount of total space in the list */ 4453c50cad2SHong Zhang if (current_space->local_remaining<cnzi) { 4465f80ce2aSJacob Faibussowitsch CHKERRQ(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),¤t_space)); 4473c50cad2SHong Zhang ndouble++; 4483c50cad2SHong Zhang } 4493c50cad2SHong Zhang 4503c50cad2SHong Zhang /* Copy data into free space, then initialize lnk */ 4515f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLLCondensedClean_fast(cnzi,current_space->array,lnk)); 4522205254eSKarl Rupp 4533c50cad2SHong Zhang current_space->array += cnzi; 4543c50cad2SHong Zhang current_space->local_used += cnzi; 4553c50cad2SHong Zhang current_space->local_remaining -= cnzi; 4562205254eSKarl Rupp 4573c50cad2SHong Zhang ci[i+1] = ci[i] + cnzi; 4583c50cad2SHong Zhang } 4593c50cad2SHong Zhang 4603c50cad2SHong Zhang /* Column indices are in the list of free space */ 4613c50cad2SHong Zhang /* Allocate space for cj, initialize cj, and */ 4623c50cad2SHong Zhang /* destroy list of free space and other temporary array(s) */ 4635f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(ci[am]+1,&cj)); 4645f80ce2aSJacob Faibussowitsch CHKERRQ(PetscFreeSpaceContiguous(&free_space,cj)); 4655f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLLCondensedDestroy_fast(lnk)); 46625296bd5SBarry Smith 46725296bd5SBarry Smith /* Allocate space for ca */ 4685f80ce2aSJacob Faibussowitsch CHKERRQ(PetscCalloc1(ci[am]+1,&ca)); 46925296bd5SBarry Smith 47025296bd5SBarry Smith /* put together the new symbolic matrix */ 4715f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C)); 4725f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetBlockSizesFromMats(C,A,B)); 47325296bd5SBarry Smith 47425296bd5SBarry Smith /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 47525296bd5SBarry Smith /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 4764222ddf1SHong Zhang c = (Mat_SeqAIJ*)(C->data); 47725296bd5SBarry Smith c->free_a = PETSC_TRUE; 47825296bd5SBarry Smith c->free_ij = PETSC_TRUE; 47925296bd5SBarry Smith c->nonew = 0; 4802205254eSKarl Rupp 4814222ddf1SHong Zhang /* slower, less memory */ 4824222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; 48325296bd5SBarry Smith 48425296bd5SBarry Smith /* set MatInfo */ 48525296bd5SBarry Smith afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 48625296bd5SBarry Smith if (afill < 1.0) afill = 1.0; 4874222ddf1SHong Zhang C->info.mallocs = ndouble; 4884222ddf1SHong Zhang C->info.fill_ratio_given = fill; 4894222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 49025296bd5SBarry Smith 49125296bd5SBarry Smith #if defined(PETSC_USE_INFO) 49225296bd5SBarry Smith if (ci[am]) { 4935f80ce2aSJacob Faibussowitsch CHKERRQ(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill)); 4945f80ce2aSJacob Faibussowitsch CHKERRQ(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill)); 49525296bd5SBarry Smith } else { 4965f80ce2aSJacob Faibussowitsch CHKERRQ(PetscInfo(C,"Empty matrix product\n")); 49725296bd5SBarry Smith } 49825296bd5SBarry Smith #endif 49925296bd5SBarry Smith PetscFunctionReturn(0); 50025296bd5SBarry Smith } 50125296bd5SBarry Smith 5024222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat C) 503e9e4536cSHong Zhang { 504e9e4536cSHong Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 505bf9555e6SHong Zhang PetscInt *ai = a->i,*bi=b->i,*ci,*cj; 50625c41797SHong Zhang PetscInt am = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 507e9e4536cSHong Zhang MatScalar *ca; 508e9e4536cSHong Zhang PetscReal afill; 509eca6b86aSHong Zhang PetscInt i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax; 510eca6b86aSHong Zhang PetscTable ta; 5110298fd71SBarry Smith PetscFreeSpaceList free_space=NULL,current_space=NULL; 512e9e4536cSHong Zhang 513e9e4536cSHong Zhang PetscFunctionBegin; 514bd958071SHong Zhang /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */ 515bd958071SHong Zhang /*---------------------------------------------------------------------------------------------*/ 516bd958071SHong Zhang /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 5175f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(am+2,&ci)); 518bd958071SHong Zhang ci[0] = 0; 519bd958071SHong Zhang 520bd958071SHong Zhang /* create and initialize a linked list */ 5215f80ce2aSJacob Faibussowitsch CHKERRQ(PetscTableCreate(bn,bn,&ta)); 522c373ccc6SHong Zhang MatRowMergeMax_SeqAIJ(b,bm,ta); 5235f80ce2aSJacob Faibussowitsch CHKERRQ(PetscTableGetCount(ta,&Crmax)); 5245f80ce2aSJacob Faibussowitsch CHKERRQ(PetscTableDestroy(&ta)); 5255f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLLCondensedCreate_Scalable(Crmax,&lnk)); 526bd958071SHong Zhang 527bd958071SHong Zhang /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 5285f80ce2aSJacob Faibussowitsch CHKERRQ(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space)); 529bd958071SHong Zhang current_space = free_space; 530bd958071SHong Zhang 531bd958071SHong Zhang /* Determine ci and cj */ 532bd958071SHong Zhang for (i=0; i<am; i++) { 533bd958071SHong Zhang anzi = ai[i+1] - ai[i]; 534bd958071SHong Zhang aj = a->j + ai[i]; 535bd958071SHong Zhang for (j=0; j<anzi; j++) { 536bd958071SHong Zhang brow = aj[j]; 537bd958071SHong Zhang bnzj = bi[brow+1] - bi[brow]; 538bd958071SHong Zhang bj = b->j + bi[brow]; 539bd958071SHong Zhang /* add non-zero cols of B into the sorted linked list lnk */ 5405f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLLCondensedAddSorted_Scalable(bnzj,bj,lnk)); 541bd958071SHong Zhang } 5428c78258cSHong Zhang /* add possible missing diagonal entry */ 5438c78258cSHong Zhang if (C->force_diagonals) { 5445f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLLCondensedAddSorted_Scalable(1,&i,lnk)); 5458c78258cSHong Zhang } 5468c78258cSHong Zhang 547bd958071SHong Zhang cnzi = lnk[0]; 548bd958071SHong Zhang 549bd958071SHong Zhang /* If free space is not available, make more free space */ 550bd958071SHong Zhang /* Double the amount of total space in the list */ 551bd958071SHong Zhang if (current_space->local_remaining<cnzi) { 5525f80ce2aSJacob Faibussowitsch CHKERRQ(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),¤t_space)); 553bd958071SHong Zhang ndouble++; 554bd958071SHong Zhang } 555bd958071SHong Zhang 556bd958071SHong Zhang /* Copy data into free space, then initialize lnk */ 5575f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLLCondensedClean_Scalable(cnzi,current_space->array,lnk)); 5582205254eSKarl Rupp 559bd958071SHong Zhang current_space->array += cnzi; 560bd958071SHong Zhang current_space->local_used += cnzi; 561bd958071SHong Zhang current_space->local_remaining -= cnzi; 5622205254eSKarl Rupp 563bd958071SHong Zhang ci[i+1] = ci[i] + cnzi; 564bd958071SHong Zhang } 565bd958071SHong Zhang 566bd958071SHong Zhang /* Column indices are in the list of free space */ 567bd958071SHong Zhang /* Allocate space for cj, initialize cj, and */ 568bd958071SHong Zhang /* destroy list of free space and other temporary array(s) */ 5695f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(ci[am]+1,&cj)); 5705f80ce2aSJacob Faibussowitsch CHKERRQ(PetscFreeSpaceContiguous(&free_space,cj)); 5715f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLLCondensedDestroy_Scalable(lnk)); 572e9e4536cSHong Zhang 573e9e4536cSHong Zhang /* Allocate space for ca */ 574bd958071SHong Zhang /*-----------------------*/ 5755f80ce2aSJacob Faibussowitsch CHKERRQ(PetscCalloc1(ci[am]+1,&ca)); 576e9e4536cSHong Zhang 577e9e4536cSHong Zhang /* put together the new symbolic matrix */ 5785f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C)); 5795f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetBlockSizesFromMats(C,A,B)); 580e9e4536cSHong Zhang 581e9e4536cSHong Zhang /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 582e9e4536cSHong Zhang /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 5834222ddf1SHong Zhang c = (Mat_SeqAIJ*)(C->data); 584e9e4536cSHong Zhang c->free_a = PETSC_TRUE; 585e9e4536cSHong Zhang c->free_ij = PETSC_TRUE; 586e9e4536cSHong Zhang c->nonew = 0; 5872205254eSKarl Rupp 5884222ddf1SHong Zhang /* slower, less memory */ 5894222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; 590e9e4536cSHong Zhang 591e9e4536cSHong Zhang /* set MatInfo */ 592e9e4536cSHong Zhang afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 593e9e4536cSHong Zhang if (afill < 1.0) afill = 1.0; 5944222ddf1SHong Zhang C->info.mallocs = ndouble; 5954222ddf1SHong Zhang C->info.fill_ratio_given = fill; 5964222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 597e9e4536cSHong Zhang 598e9e4536cSHong Zhang #if defined(PETSC_USE_INFO) 599e9e4536cSHong Zhang if (ci[am]) { 6005f80ce2aSJacob Faibussowitsch CHKERRQ(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill)); 6015f80ce2aSJacob Faibussowitsch CHKERRQ(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill)); 602e9e4536cSHong Zhang } else { 6035f80ce2aSJacob Faibussowitsch CHKERRQ(PetscInfo(C,"Empty matrix product\n")); 604e9e4536cSHong Zhang } 605e9e4536cSHong Zhang #endif 606e9e4536cSHong Zhang PetscFunctionReturn(0); 607e9e4536cSHong Zhang } 608e9e4536cSHong Zhang 6094222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A,Mat B,PetscReal fill,Mat C) 6100ced3a2bSJed Brown { 6110ced3a2bSJed Brown Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 6120ced3a2bSJed Brown const PetscInt *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j; 6130ced3a2bSJed Brown PetscInt *ci,*cj,*bb; 6140ced3a2bSJed Brown PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 6150ced3a2bSJed Brown PetscReal afill; 6160ced3a2bSJed Brown PetscInt i,j,col,ndouble = 0; 6170298fd71SBarry Smith PetscFreeSpaceList free_space=NULL,current_space=NULL; 6180ced3a2bSJed Brown PetscHeap h; 6190ced3a2bSJed Brown 6200ced3a2bSJed Brown PetscFunctionBegin; 621cd093f1dSJed Brown /* Get ci and cj - by merging sorted rows using a heap */ 6220ced3a2bSJed Brown /*---------------------------------------------------------------------------------------------*/ 6230ced3a2bSJed Brown /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 6245f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(am+2,&ci)); 6250ced3a2bSJed Brown ci[0] = 0; 6260ced3a2bSJed Brown 6270ced3a2bSJed Brown /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 6285f80ce2aSJacob Faibussowitsch CHKERRQ(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space)); 6290ced3a2bSJed Brown current_space = free_space; 6300ced3a2bSJed Brown 6315f80ce2aSJacob Faibussowitsch CHKERRQ(PetscHeapCreate(a->rmax,&h)); 6325f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(a->rmax,&bb)); 6330ced3a2bSJed Brown 6340ced3a2bSJed Brown /* Determine ci and cj */ 6350ced3a2bSJed Brown for (i=0; i<am; i++) { 6360ced3a2bSJed Brown const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 6370ced3a2bSJed Brown const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 6380ced3a2bSJed Brown ci[i+1] = ci[i]; 6390ced3a2bSJed Brown /* Populate the min heap */ 6400ced3a2bSJed Brown for (j=0; j<anzi; j++) { 6410ced3a2bSJed Brown bb[j] = bi[acol[j]]; /* bb points at the start of the row */ 6420ced3a2bSJed Brown if (bb[j] < bi[acol[j]+1]) { /* Add if row is nonempty */ 6435f80ce2aSJacob Faibussowitsch CHKERRQ(PetscHeapAdd(h,j,bj[bb[j]++])); 6440ced3a2bSJed Brown } 6450ced3a2bSJed Brown } 6460ced3a2bSJed Brown /* Pick off the min element, adding it to free space */ 6475f80ce2aSJacob Faibussowitsch CHKERRQ(PetscHeapPop(h,&j,&col)); 6480ced3a2bSJed Brown while (j >= 0) { 6490ced3a2bSJed Brown if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */ 6505f80ce2aSJacob Faibussowitsch CHKERRQ(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),¤t_space)); 6510ced3a2bSJed Brown ndouble++; 6520ced3a2bSJed Brown } 6530ced3a2bSJed Brown *(current_space->array++) = col; 6540ced3a2bSJed Brown current_space->local_used++; 6550ced3a2bSJed Brown current_space->local_remaining--; 6560ced3a2bSJed Brown ci[i+1]++; 6570ced3a2bSJed Brown 6580ced3a2bSJed Brown /* stash if anything else remains in this row of B */ 6595f80ce2aSJacob Faibussowitsch if (bb[j] < bi[acol[j]+1]) CHKERRQ(PetscHeapStash(h,j,bj[bb[j]++])); 6600ced3a2bSJed Brown while (1) { /* pop and stash any other rows of B that also had an entry in this column */ 6610ced3a2bSJed Brown PetscInt j2,col2; 6625f80ce2aSJacob Faibussowitsch CHKERRQ(PetscHeapPeek(h,&j2,&col2)); 6630ced3a2bSJed Brown if (col2 != col) break; 6645f80ce2aSJacob Faibussowitsch CHKERRQ(PetscHeapPop(h,&j2,&col2)); 6655f80ce2aSJacob Faibussowitsch if (bb[j2] < bi[acol[j2]+1]) CHKERRQ(PetscHeapStash(h,j2,bj[bb[j2]++])); 6660ced3a2bSJed Brown } 6670ced3a2bSJed Brown /* Put any stashed elements back into the min heap */ 6685f80ce2aSJacob Faibussowitsch CHKERRQ(PetscHeapUnstash(h)); 6695f80ce2aSJacob Faibussowitsch CHKERRQ(PetscHeapPop(h,&j,&col)); 6700ced3a2bSJed Brown } 6710ced3a2bSJed Brown } 6725f80ce2aSJacob Faibussowitsch CHKERRQ(PetscFree(bb)); 6735f80ce2aSJacob Faibussowitsch CHKERRQ(PetscHeapDestroy(&h)); 6740ced3a2bSJed Brown 6750ced3a2bSJed Brown /* Column indices are in the list of free space */ 6760ced3a2bSJed Brown /* Allocate space for cj, initialize cj, and */ 6770ced3a2bSJed Brown /* destroy list of free space and other temporary array(s) */ 6785f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(ci[am],&cj)); 6795f80ce2aSJacob Faibussowitsch CHKERRQ(PetscFreeSpaceContiguous(&free_space,cj)); 6800ced3a2bSJed Brown 6810ced3a2bSJed Brown /* put together the new symbolic matrix */ 6825f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C)); 6835f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetBlockSizesFromMats(C,A,B)); 6840ced3a2bSJed Brown 6850ced3a2bSJed Brown /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 6860ced3a2bSJed Brown /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 6874222ddf1SHong Zhang c = (Mat_SeqAIJ*)(C->data); 6880ced3a2bSJed Brown c->free_a = PETSC_TRUE; 6890ced3a2bSJed Brown c->free_ij = PETSC_TRUE; 6900ced3a2bSJed Brown c->nonew = 0; 69126fbe8dcSKarl Rupp 6924222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 6930ced3a2bSJed Brown 6940ced3a2bSJed Brown /* set MatInfo */ 6950ced3a2bSJed Brown afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 6960ced3a2bSJed Brown if (afill < 1.0) afill = 1.0; 6974222ddf1SHong Zhang C->info.mallocs = ndouble; 6984222ddf1SHong Zhang C->info.fill_ratio_given = fill; 6994222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 7000ced3a2bSJed Brown 7010ced3a2bSJed Brown #if defined(PETSC_USE_INFO) 7020ced3a2bSJed Brown if (ci[am]) { 7035f80ce2aSJacob Faibussowitsch CHKERRQ(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill)); 7045f80ce2aSJacob Faibussowitsch CHKERRQ(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill)); 7050ced3a2bSJed Brown } else { 7065f80ce2aSJacob Faibussowitsch CHKERRQ(PetscInfo(C,"Empty matrix product\n")); 7070ced3a2bSJed Brown } 7080ced3a2bSJed Brown #endif 7090ced3a2bSJed Brown PetscFunctionReturn(0); 7100ced3a2bSJed Brown } 711e9e4536cSHong Zhang 7124222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A,Mat B,PetscReal fill,Mat C) 7138a07c6f1SJed Brown { 7148a07c6f1SJed Brown Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 7158a07c6f1SJed Brown const PetscInt *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j; 7168a07c6f1SJed Brown PetscInt *ci,*cj,*bb; 7178a07c6f1SJed Brown PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 7188a07c6f1SJed Brown PetscReal afill; 7198a07c6f1SJed Brown PetscInt i,j,col,ndouble = 0; 7200298fd71SBarry Smith PetscFreeSpaceList free_space=NULL,current_space=NULL; 7218a07c6f1SJed Brown PetscHeap h; 7228a07c6f1SJed Brown PetscBT bt; 7238a07c6f1SJed Brown 7248a07c6f1SJed Brown PetscFunctionBegin; 725cd093f1dSJed Brown /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */ 7268a07c6f1SJed Brown /*---------------------------------------------------------------------------------------------*/ 7278a07c6f1SJed Brown /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 7285f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(am+2,&ci)); 7298a07c6f1SJed Brown ci[0] = 0; 7308a07c6f1SJed Brown 7318a07c6f1SJed Brown /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 7325f80ce2aSJacob Faibussowitsch CHKERRQ(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space)); 7332205254eSKarl Rupp 7348a07c6f1SJed Brown current_space = free_space; 7358a07c6f1SJed Brown 7365f80ce2aSJacob Faibussowitsch CHKERRQ(PetscHeapCreate(a->rmax,&h)); 7375f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(a->rmax,&bb)); 7385f80ce2aSJacob Faibussowitsch CHKERRQ(PetscBTCreate(bn,&bt)); 7398a07c6f1SJed Brown 7408a07c6f1SJed Brown /* Determine ci and cj */ 7418a07c6f1SJed Brown for (i=0; i<am; i++) { 7428a07c6f1SJed Brown const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 7438a07c6f1SJed Brown const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 7448a07c6f1SJed Brown const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */ 7458a07c6f1SJed Brown ci[i+1] = ci[i]; 7468a07c6f1SJed Brown /* Populate the min heap */ 7478a07c6f1SJed Brown for (j=0; j<anzi; j++) { 7488a07c6f1SJed Brown PetscInt brow = acol[j]; 7498a07c6f1SJed Brown for (bb[j] = bi[brow]; bb[j] < bi[brow+1]; bb[j]++) { 7508a07c6f1SJed Brown PetscInt bcol = bj[bb[j]]; 7518a07c6f1SJed Brown if (!PetscBTLookupSet(bt,bcol)) { /* new entry */ 7525f80ce2aSJacob Faibussowitsch CHKERRQ(PetscHeapAdd(h,j,bcol)); 7538a07c6f1SJed Brown bb[j]++; 7548a07c6f1SJed Brown break; 7558a07c6f1SJed Brown } 7568a07c6f1SJed Brown } 7578a07c6f1SJed Brown } 7588a07c6f1SJed Brown /* Pick off the min element, adding it to free space */ 7595f80ce2aSJacob Faibussowitsch CHKERRQ(PetscHeapPop(h,&j,&col)); 7608a07c6f1SJed Brown while (j >= 0) { 7618a07c6f1SJed Brown if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */ 7620298fd71SBarry Smith fptr = NULL; /* need PetscBTMemzero */ 7635f80ce2aSJacob Faibussowitsch CHKERRQ(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),¤t_space)); 7648a07c6f1SJed Brown ndouble++; 7658a07c6f1SJed Brown } 7668a07c6f1SJed Brown *(current_space->array++) = col; 7678a07c6f1SJed Brown current_space->local_used++; 7688a07c6f1SJed Brown current_space->local_remaining--; 7698a07c6f1SJed Brown ci[i+1]++; 7708a07c6f1SJed Brown 7718a07c6f1SJed Brown /* stash if anything else remains in this row of B */ 7728a07c6f1SJed Brown for (; bb[j] < bi[acol[j]+1]; bb[j]++) { 7738a07c6f1SJed Brown PetscInt bcol = bj[bb[j]]; 7748a07c6f1SJed Brown if (!PetscBTLookupSet(bt,bcol)) { /* new entry */ 7755f80ce2aSJacob Faibussowitsch CHKERRQ(PetscHeapAdd(h,j,bcol)); 7768a07c6f1SJed Brown bb[j]++; 7778a07c6f1SJed Brown break; 7788a07c6f1SJed Brown } 7798a07c6f1SJed Brown } 7805f80ce2aSJacob Faibussowitsch CHKERRQ(PetscHeapPop(h,&j,&col)); 7818a07c6f1SJed Brown } 7828a07c6f1SJed Brown if (fptr) { /* Clear the bits for this row */ 7835f80ce2aSJacob Faibussowitsch for (; fptr<current_space->array; fptr++) CHKERRQ(PetscBTClear(bt,*fptr)); 7848a07c6f1SJed Brown } else { /* We reallocated so we don't remember (easily) how to clear only the bits we changed */ 7855f80ce2aSJacob Faibussowitsch CHKERRQ(PetscBTMemzero(bn,bt)); 7868a07c6f1SJed Brown } 7878a07c6f1SJed Brown } 7885f80ce2aSJacob Faibussowitsch CHKERRQ(PetscFree(bb)); 7895f80ce2aSJacob Faibussowitsch CHKERRQ(PetscHeapDestroy(&h)); 7905f80ce2aSJacob Faibussowitsch CHKERRQ(PetscBTDestroy(&bt)); 7918a07c6f1SJed Brown 7928a07c6f1SJed Brown /* Column indices are in the list of free space */ 7938a07c6f1SJed Brown /* Allocate space for cj, initialize cj, and */ 7948a07c6f1SJed Brown /* destroy list of free space and other temporary array(s) */ 7955f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(ci[am],&cj)); 7965f80ce2aSJacob Faibussowitsch CHKERRQ(PetscFreeSpaceContiguous(&free_space,cj)); 7978a07c6f1SJed Brown 7988a07c6f1SJed Brown /* put together the new symbolic matrix */ 7995f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C)); 8005f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetBlockSizesFromMats(C,A,B)); 8018a07c6f1SJed Brown 8028a07c6f1SJed Brown /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 8038a07c6f1SJed Brown /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 8044222ddf1SHong Zhang c = (Mat_SeqAIJ*)(C->data); 8058a07c6f1SJed Brown c->free_a = PETSC_TRUE; 8068a07c6f1SJed Brown c->free_ij = PETSC_TRUE; 8078a07c6f1SJed Brown c->nonew = 0; 80826fbe8dcSKarl Rupp 8094222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 8108a07c6f1SJed Brown 8118a07c6f1SJed Brown /* set MatInfo */ 8128a07c6f1SJed Brown afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 8138a07c6f1SJed Brown if (afill < 1.0) afill = 1.0; 8144222ddf1SHong Zhang C->info.mallocs = ndouble; 8154222ddf1SHong Zhang C->info.fill_ratio_given = fill; 8164222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 8178a07c6f1SJed Brown 8188a07c6f1SJed Brown #if defined(PETSC_USE_INFO) 8198a07c6f1SJed Brown if (ci[am]) { 8205f80ce2aSJacob Faibussowitsch CHKERRQ(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill)); 8215f80ce2aSJacob Faibussowitsch CHKERRQ(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill)); 8228a07c6f1SJed Brown } else { 8235f80ce2aSJacob Faibussowitsch CHKERRQ(PetscInfo(C,"Empty matrix product\n")); 8248a07c6f1SJed Brown } 8258a07c6f1SJed Brown #endif 8268a07c6f1SJed Brown PetscFunctionReturn(0); 8278a07c6f1SJed Brown } 8288a07c6f1SJed Brown 8294222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A,Mat B,PetscReal fill,Mat C) 830d7ed1a4dSandi selinger { 831d7ed1a4dSandi selinger Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 832d7ed1a4dSandi selinger const PetscInt *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j,*inputi,*inputj,*inputcol,*inputcol_L1; 833d7ed1a4dSandi selinger PetscInt *ci,*cj,*outputj,worki_L1[9],worki_L2[9]; 834d7ed1a4dSandi selinger PetscInt c_maxmem,a_maxrownnz=0,a_rownnz; 835d7ed1a4dSandi selinger const PetscInt workcol[8]={0,1,2,3,4,5,6,7}; 836d7ed1a4dSandi selinger const PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 837d7ed1a4dSandi selinger const PetscInt *brow_ptr[8],*brow_end[8]; 838d7ed1a4dSandi selinger PetscInt window[8]; 839d7ed1a4dSandi selinger PetscInt window_min,old_window_min,ci_nnz,outputi_nnz=0,L1_nrows,L2_nrows; 840d7ed1a4dSandi selinger PetscInt i,k,ndouble=0,L1_rowsleft,rowsleft; 841d7ed1a4dSandi selinger PetscReal afill; 842f83700f2Sandi selinger PetscInt *workj_L1,*workj_L2,*workj_L3; 8437660c4dbSandi selinger PetscInt L1_nnz,L2_nnz; 844d7ed1a4dSandi selinger 845d7ed1a4dSandi selinger /* Step 1: Get upper bound on memory required for allocation. 846d7ed1a4dSandi selinger Because of the way virtual memory works, 847d7ed1a4dSandi selinger only the memory pages that are actually needed will be physically allocated. */ 848d7ed1a4dSandi selinger PetscFunctionBegin; 8495f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(am+1,&ci)); 850d7ed1a4dSandi selinger for (i=0; i<am; i++) { 851d7ed1a4dSandi selinger const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 852d7ed1a4dSandi selinger const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 853d7ed1a4dSandi selinger a_rownnz = 0; 854d7ed1a4dSandi selinger for (k=0; k<anzi; ++k) { 855d7ed1a4dSandi selinger a_rownnz += bi[acol[k]+1] - bi[acol[k]]; 856d7ed1a4dSandi selinger if (a_rownnz > bn) { 857d7ed1a4dSandi selinger a_rownnz = bn; 858d7ed1a4dSandi selinger break; 859d7ed1a4dSandi selinger } 860d7ed1a4dSandi selinger } 861d7ed1a4dSandi selinger a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz); 862d7ed1a4dSandi selinger } 863d7ed1a4dSandi selinger /* temporary work areas for merging rows */ 8645f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(a_maxrownnz*8,&workj_L1)); 8655f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(a_maxrownnz*8,&workj_L2)); 8665f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(a_maxrownnz,&workj_L3)); 867d7ed1a4dSandi selinger 8687660c4dbSandi selinger /* This should be enough for almost all matrices. If not, memory is reallocated later. */ 8697660c4dbSandi selinger c_maxmem = 8*(ai[am]+bi[bm]); 870d7ed1a4dSandi selinger /* Step 2: Populate pattern for C */ 8715f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(c_maxmem,&cj)); 872d7ed1a4dSandi selinger 873d7ed1a4dSandi selinger ci_nnz = 0; 874d7ed1a4dSandi selinger ci[0] = 0; 875d7ed1a4dSandi selinger worki_L1[0] = 0; 876d7ed1a4dSandi selinger worki_L2[0] = 0; 877d7ed1a4dSandi selinger for (i=0; i<am; i++) { 878d7ed1a4dSandi selinger const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 879d7ed1a4dSandi selinger const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 880d7ed1a4dSandi selinger rowsleft = anzi; 881d7ed1a4dSandi selinger inputcol_L1 = acol; 882d7ed1a4dSandi selinger L2_nnz = 0; 8837660c4dbSandi selinger L2_nrows = 1; /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1 */ 8847660c4dbSandi selinger worki_L2[1] = 0; 88508fe1b3cSKarl Rupp outputi_nnz = 0; 886d7ed1a4dSandi selinger 887d7ed1a4dSandi selinger /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem -> allocate more memory */ 888d7ed1a4dSandi selinger while (ci_nnz+a_maxrownnz > c_maxmem) { 889d7ed1a4dSandi selinger c_maxmem *= 2; 890d7ed1a4dSandi selinger ndouble++; 8915f80ce2aSJacob Faibussowitsch CHKERRQ(PetscRealloc(sizeof(PetscInt)*c_maxmem,&cj)); 892d7ed1a4dSandi selinger } 893d7ed1a4dSandi selinger 894d7ed1a4dSandi selinger while (rowsleft) { 895d7ed1a4dSandi selinger L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */ 896d7ed1a4dSandi selinger L1_nrows = 0; 8977660c4dbSandi selinger L1_nnz = 0; 898d7ed1a4dSandi selinger inputcol = inputcol_L1; 899d7ed1a4dSandi selinger inputi = bi; 900d7ed1a4dSandi selinger inputj = bj; 901d7ed1a4dSandi selinger 902d7ed1a4dSandi selinger /* The following macro is used to specialize for small rows in A. 903d7ed1a4dSandi selinger This helps with compiler unrolling, improving performance substantially. 904f83700f2Sandi selinger Input: inputj inputi inputcol bn 905d7ed1a4dSandi selinger Output: outputj outputi_nnz */ 906d7ed1a4dSandi selinger #define MatMatMultSymbolic_RowMergeMacro(ANNZ) \ 907d7ed1a4dSandi selinger window_min = bn; \ 9087660c4dbSandi selinger outputi_nnz = 0; \ 9097660c4dbSandi selinger for (k=0; k<ANNZ; ++k) { \ 910d7ed1a4dSandi selinger brow_ptr[k] = inputj + inputi[inputcol[k]]; \ 911d7ed1a4dSandi selinger brow_end[k] = inputj + inputi[inputcol[k]+1]; \ 912d7ed1a4dSandi selinger window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \ 913d7ed1a4dSandi selinger window_min = PetscMin(window[k], window_min); \ 914d7ed1a4dSandi selinger } \ 915d7ed1a4dSandi selinger while (window_min < bn) { \ 916d7ed1a4dSandi selinger outputj[outputi_nnz++] = window_min; \ 917d7ed1a4dSandi selinger /* advance front and compute new minimum */ \ 918d7ed1a4dSandi selinger old_window_min = window_min; \ 919d7ed1a4dSandi selinger window_min = bn; \ 920d7ed1a4dSandi selinger for (k=0; k<ANNZ; ++k) { \ 921d7ed1a4dSandi selinger if (window[k] == old_window_min) { \ 922d7ed1a4dSandi selinger brow_ptr[k]++; \ 923d7ed1a4dSandi selinger window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \ 924d7ed1a4dSandi selinger } \ 925d7ed1a4dSandi selinger window_min = PetscMin(window[k], window_min); \ 926d7ed1a4dSandi selinger } \ 927d7ed1a4dSandi selinger } 928d7ed1a4dSandi selinger 929d7ed1a4dSandi selinger /************** L E V E L 1 ***************/ 930d7ed1a4dSandi selinger /* Merge up to 8 rows of B to L1 work array*/ 931d7ed1a4dSandi selinger while (L1_rowsleft) { 9327660c4dbSandi selinger outputi_nnz = 0; 9337660c4dbSandi selinger if (anzi > 8) outputj = workj_L1 + L1_nnz; /* Level 1 rowmerge*/ 9347660c4dbSandi selinger else outputj = cj + ci_nnz; /* Merge directly to C */ 9357660c4dbSandi selinger 936d7ed1a4dSandi selinger switch (L1_rowsleft) { 937d7ed1a4dSandi selinger case 1: brow_ptr[0] = inputj + inputi[inputcol[0]]; 938d7ed1a4dSandi selinger brow_end[0] = inputj + inputi[inputcol[0]+1]; 939d7ed1a4dSandi selinger for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 940d7ed1a4dSandi selinger inputcol += L1_rowsleft; 941d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 942d7ed1a4dSandi selinger L1_rowsleft = 0; 943d7ed1a4dSandi selinger break; 944d7ed1a4dSandi selinger case 2: MatMatMultSymbolic_RowMergeMacro(2); 945d7ed1a4dSandi selinger inputcol += L1_rowsleft; 946d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 947d7ed1a4dSandi selinger L1_rowsleft = 0; 948d7ed1a4dSandi selinger break; 949d7ed1a4dSandi selinger case 3: MatMatMultSymbolic_RowMergeMacro(3); 950d7ed1a4dSandi selinger inputcol += L1_rowsleft; 951d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 952d7ed1a4dSandi selinger L1_rowsleft = 0; 953d7ed1a4dSandi selinger break; 954d7ed1a4dSandi selinger case 4: MatMatMultSymbolic_RowMergeMacro(4); 955d7ed1a4dSandi selinger inputcol += L1_rowsleft; 956d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 957d7ed1a4dSandi selinger L1_rowsleft = 0; 958d7ed1a4dSandi selinger break; 959d7ed1a4dSandi selinger case 5: MatMatMultSymbolic_RowMergeMacro(5); 960d7ed1a4dSandi selinger inputcol += L1_rowsleft; 961d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 962d7ed1a4dSandi selinger L1_rowsleft = 0; 963d7ed1a4dSandi selinger break; 964d7ed1a4dSandi selinger case 6: MatMatMultSymbolic_RowMergeMacro(6); 965d7ed1a4dSandi selinger inputcol += L1_rowsleft; 966d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 967d7ed1a4dSandi selinger L1_rowsleft = 0; 968d7ed1a4dSandi selinger break; 969d7ed1a4dSandi selinger case 7: MatMatMultSymbolic_RowMergeMacro(7); 970d7ed1a4dSandi selinger inputcol += L1_rowsleft; 971d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 972d7ed1a4dSandi selinger L1_rowsleft = 0; 973d7ed1a4dSandi selinger break; 974d7ed1a4dSandi selinger default: MatMatMultSymbolic_RowMergeMacro(8); 975d7ed1a4dSandi selinger inputcol += 8; 976d7ed1a4dSandi selinger rowsleft -= 8; 977d7ed1a4dSandi selinger L1_rowsleft -= 8; 978d7ed1a4dSandi selinger break; 979d7ed1a4dSandi selinger } 980d7ed1a4dSandi selinger inputcol_L1 = inputcol; 9817660c4dbSandi selinger L1_nnz += outputi_nnz; 9827660c4dbSandi selinger worki_L1[++L1_nrows] = L1_nnz; 983d7ed1a4dSandi selinger } 984d7ed1a4dSandi selinger 985d7ed1a4dSandi selinger /********************** L E V E L 2 ************************/ 986d7ed1a4dSandi selinger /* Merge from L1 work array to either C or to L2 work array */ 987d7ed1a4dSandi selinger if (anzi > 8) { 988d7ed1a4dSandi selinger inputi = worki_L1; 989d7ed1a4dSandi selinger inputj = workj_L1; 990d7ed1a4dSandi selinger inputcol = workcol; 991d7ed1a4dSandi selinger outputi_nnz = 0; 992d7ed1a4dSandi selinger 993d7ed1a4dSandi selinger if (anzi <= 64) outputj = cj + ci_nnz; /* Merge from L1 work array to C */ 994d7ed1a4dSandi selinger else outputj = workj_L2 + L2_nnz; /* Merge from L1 work array to L2 work array */ 995d7ed1a4dSandi selinger 996d7ed1a4dSandi selinger switch (L1_nrows) { 997d7ed1a4dSandi selinger case 1: brow_ptr[0] = inputj + inputi[inputcol[0]]; 998d7ed1a4dSandi selinger brow_end[0] = inputj + inputi[inputcol[0]+1]; 999d7ed1a4dSandi selinger for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 1000d7ed1a4dSandi selinger break; 1001d7ed1a4dSandi selinger case 2: MatMatMultSymbolic_RowMergeMacro(2); break; 1002d7ed1a4dSandi selinger case 3: MatMatMultSymbolic_RowMergeMacro(3); break; 1003d7ed1a4dSandi selinger case 4: MatMatMultSymbolic_RowMergeMacro(4); break; 1004d7ed1a4dSandi selinger case 5: MatMatMultSymbolic_RowMergeMacro(5); break; 1005d7ed1a4dSandi selinger case 6: MatMatMultSymbolic_RowMergeMacro(6); break; 1006d7ed1a4dSandi selinger case 7: MatMatMultSymbolic_RowMergeMacro(7); break; 1007d7ed1a4dSandi selinger case 8: MatMatMultSymbolic_RowMergeMacro(8); break; 1008d7ed1a4dSandi selinger default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L1 work array!"); 1009d7ed1a4dSandi selinger } 1010d7ed1a4dSandi selinger L2_nnz += outputi_nnz; 1011d7ed1a4dSandi selinger worki_L2[++L2_nrows] = L2_nnz; 1012d7ed1a4dSandi selinger 10137660c4dbSandi selinger /************************ L E V E L 3 **********************/ 10147660c4dbSandi selinger /* Merge from L2 work array to either C or to L2 work array */ 1015d7ed1a4dSandi selinger if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) { 1016d7ed1a4dSandi selinger inputi = worki_L2; 1017d7ed1a4dSandi selinger inputj = workj_L2; 1018d7ed1a4dSandi selinger inputcol = workcol; 1019d7ed1a4dSandi selinger outputi_nnz = 0; 1020f83700f2Sandi selinger if (rowsleft) outputj = workj_L3; 1021d7ed1a4dSandi selinger else outputj = cj + ci_nnz; 1022d7ed1a4dSandi selinger switch (L2_nrows) { 1023d7ed1a4dSandi selinger case 1: brow_ptr[0] = inputj + inputi[inputcol[0]]; 1024d7ed1a4dSandi selinger brow_end[0] = inputj + inputi[inputcol[0]+1]; 1025d7ed1a4dSandi selinger for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 1026d7ed1a4dSandi selinger break; 1027d7ed1a4dSandi selinger case 2: MatMatMultSymbolic_RowMergeMacro(2); break; 1028d7ed1a4dSandi selinger case 3: MatMatMultSymbolic_RowMergeMacro(3); break; 1029d7ed1a4dSandi selinger case 4: MatMatMultSymbolic_RowMergeMacro(4); break; 1030d7ed1a4dSandi selinger case 5: MatMatMultSymbolic_RowMergeMacro(5); break; 1031d7ed1a4dSandi selinger case 6: MatMatMultSymbolic_RowMergeMacro(6); break; 1032d7ed1a4dSandi selinger case 7: MatMatMultSymbolic_RowMergeMacro(7); break; 1033d7ed1a4dSandi selinger case 8: MatMatMultSymbolic_RowMergeMacro(8); break; 1034d7ed1a4dSandi selinger default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L2 work array!"); 1035d7ed1a4dSandi selinger } 1036d7ed1a4dSandi selinger L2_nrows = 1; 10377660c4dbSandi selinger L2_nnz = outputi_nnz; 10387660c4dbSandi selinger worki_L2[1] = outputi_nnz; 10397660c4dbSandi selinger /* Copy to workj_L2 */ 1040d7ed1a4dSandi selinger if (rowsleft) { 10417660c4dbSandi selinger for (k=0; k<outputi_nnz; ++k) workj_L2[k] = outputj[k]; 1042d7ed1a4dSandi selinger } 1043d7ed1a4dSandi selinger } 1044d7ed1a4dSandi selinger } 1045d7ed1a4dSandi selinger } /* while (rowsleft) */ 1046d7ed1a4dSandi selinger #undef MatMatMultSymbolic_RowMergeMacro 1047d7ed1a4dSandi selinger 1048d7ed1a4dSandi selinger /* terminate current row */ 1049d7ed1a4dSandi selinger ci_nnz += outputi_nnz; 1050d7ed1a4dSandi selinger ci[i+1] = ci_nnz; 1051d7ed1a4dSandi selinger } 1052d7ed1a4dSandi selinger 1053d7ed1a4dSandi selinger /* Step 3: Create the new symbolic matrix */ 10545f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C)); 10555f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetBlockSizesFromMats(C,A,B)); 1056d7ed1a4dSandi selinger 1057d7ed1a4dSandi selinger /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 1058d7ed1a4dSandi selinger /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 10594222ddf1SHong Zhang c = (Mat_SeqAIJ*)(C->data); 1060d7ed1a4dSandi selinger c->free_a = PETSC_TRUE; 1061d7ed1a4dSandi selinger c->free_ij = PETSC_TRUE; 1062d7ed1a4dSandi selinger c->nonew = 0; 1063d7ed1a4dSandi selinger 10644222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 1065d7ed1a4dSandi selinger 1066d7ed1a4dSandi selinger /* set MatInfo */ 1067d7ed1a4dSandi selinger afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 1068d7ed1a4dSandi selinger if (afill < 1.0) afill = 1.0; 10694222ddf1SHong Zhang C->info.mallocs = ndouble; 10704222ddf1SHong Zhang C->info.fill_ratio_given = fill; 10714222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 1072d7ed1a4dSandi selinger 1073d7ed1a4dSandi selinger #if defined(PETSC_USE_INFO) 1074d7ed1a4dSandi selinger if (ci[am]) { 10755f80ce2aSJacob Faibussowitsch CHKERRQ(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill)); 10765f80ce2aSJacob Faibussowitsch CHKERRQ(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill)); 1077d7ed1a4dSandi selinger } else { 10785f80ce2aSJacob Faibussowitsch CHKERRQ(PetscInfo(C,"Empty matrix product\n")); 1079d7ed1a4dSandi selinger } 1080d7ed1a4dSandi selinger #endif 1081d7ed1a4dSandi selinger 1082d7ed1a4dSandi selinger /* Step 4: Free temporary work areas */ 10835f80ce2aSJacob Faibussowitsch CHKERRQ(PetscFree(workj_L1)); 10845f80ce2aSJacob Faibussowitsch CHKERRQ(PetscFree(workj_L2)); 10855f80ce2aSJacob Faibussowitsch CHKERRQ(PetscFree(workj_L3)); 1086d7ed1a4dSandi selinger PetscFunctionReturn(0); 1087d7ed1a4dSandi selinger } 1088d7ed1a4dSandi selinger 1089cd093f1dSJed Brown /* concatenate unique entries and then sort */ 10904222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,PetscReal fill,Mat C) 1091cd093f1dSJed Brown { 1092cd093f1dSJed Brown Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 1093cd093f1dSJed Brown const PetscInt *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j; 10948c78258cSHong Zhang PetscInt *ci,*cj,bcol; 1095cd093f1dSJed Brown PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 1096cd093f1dSJed Brown PetscReal afill; 1097cd093f1dSJed Brown PetscInt i,j,ndouble = 0; 1098cd093f1dSJed Brown PetscSegBuffer seg,segrow; 1099cd093f1dSJed Brown char *seen; 1100cd093f1dSJed Brown 1101cd093f1dSJed Brown PetscFunctionBegin; 11025f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(am+1,&ci)); 1103cd093f1dSJed Brown ci[0] = 0; 1104cd093f1dSJed Brown 1105cd093f1dSJed Brown /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 11065f80ce2aSJacob Faibussowitsch CHKERRQ(PetscSegBufferCreate(sizeof(PetscInt),(PetscInt)(fill*(ai[am]+bi[bm])),&seg)); 11075f80ce2aSJacob Faibussowitsch CHKERRQ(PetscSegBufferCreate(sizeof(PetscInt),100,&segrow)); 11085f80ce2aSJacob Faibussowitsch CHKERRQ(PetscCalloc1(bn,&seen)); 1109cd093f1dSJed Brown 1110cd093f1dSJed Brown /* Determine ci and cj */ 1111cd093f1dSJed Brown for (i=0; i<am; i++) { 1112cd093f1dSJed Brown const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 1113cd093f1dSJed Brown const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 1114cd093f1dSJed Brown PetscInt packlen = 0,*PETSC_RESTRICT crow; 11158c78258cSHong Zhang 1116cd093f1dSJed Brown /* Pack segrow */ 1117cd093f1dSJed Brown for (j=0; j<anzi; j++) { 1118cd093f1dSJed Brown PetscInt brow = acol[j],bjstart = bi[brow],bjend = bi[brow+1],k; 1119cd093f1dSJed Brown for (k=bjstart; k<bjend; k++) { 11208c78258cSHong Zhang bcol = bj[k]; 1121cd093f1dSJed Brown if (!seen[bcol]) { /* new entry */ 1122cd093f1dSJed Brown PetscInt *PETSC_RESTRICT slot; 11235f80ce2aSJacob Faibussowitsch CHKERRQ(PetscSegBufferGetInts(segrow,1,&slot)); 1124cd093f1dSJed Brown *slot = bcol; 1125cd093f1dSJed Brown seen[bcol] = 1; 1126cd093f1dSJed Brown packlen++; 1127cd093f1dSJed Brown } 1128cd093f1dSJed Brown } 1129cd093f1dSJed Brown } 11308c78258cSHong Zhang 11318c78258cSHong Zhang /* Check i-th diagonal entry */ 11328c78258cSHong Zhang if (C->force_diagonals && !seen[i]) { 11338c78258cSHong Zhang PetscInt *PETSC_RESTRICT slot; 11345f80ce2aSJacob Faibussowitsch CHKERRQ(PetscSegBufferGetInts(segrow,1,&slot)); 11358c78258cSHong Zhang *slot = i; 11368c78258cSHong Zhang seen[i] = 1; 11378c78258cSHong Zhang packlen++; 11388c78258cSHong Zhang } 11398c78258cSHong Zhang 11405f80ce2aSJacob Faibussowitsch CHKERRQ(PetscSegBufferGetInts(seg,packlen,&crow)); 11415f80ce2aSJacob Faibussowitsch CHKERRQ(PetscSegBufferExtractTo(segrow,crow)); 11425f80ce2aSJacob Faibussowitsch CHKERRQ(PetscSortInt(packlen,crow)); 1143cd093f1dSJed Brown ci[i+1] = ci[i] + packlen; 1144cd093f1dSJed Brown for (j=0; j<packlen; j++) seen[crow[j]] = 0; 1145cd093f1dSJed Brown } 11465f80ce2aSJacob Faibussowitsch CHKERRQ(PetscSegBufferDestroy(&segrow)); 11475f80ce2aSJacob Faibussowitsch CHKERRQ(PetscFree(seen)); 1148cd093f1dSJed Brown 1149cd093f1dSJed Brown /* Column indices are in the segmented buffer */ 11505f80ce2aSJacob Faibussowitsch CHKERRQ(PetscSegBufferExtractAlloc(seg,&cj)); 11515f80ce2aSJacob Faibussowitsch CHKERRQ(PetscSegBufferDestroy(&seg)); 1152cd093f1dSJed Brown 1153cd093f1dSJed Brown /* put together the new symbolic matrix */ 11545f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C)); 11555f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetBlockSizesFromMats(C,A,B)); 1156cd093f1dSJed Brown 1157cd093f1dSJed Brown /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 1158cd093f1dSJed Brown /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 11594222ddf1SHong Zhang c = (Mat_SeqAIJ*)(C->data); 1160cd093f1dSJed Brown c->free_a = PETSC_TRUE; 1161cd093f1dSJed Brown c->free_ij = PETSC_TRUE; 1162cd093f1dSJed Brown c->nonew = 0; 1163cd093f1dSJed Brown 11644222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 1165cd093f1dSJed Brown 1166cd093f1dSJed Brown /* set MatInfo */ 11672a09556fSStefano Zampini afill = (PetscReal)ci[am]/PetscMax(ai[am]+bi[bm],1) + 1.e-5; 1168cd093f1dSJed Brown if (afill < 1.0) afill = 1.0; 11694222ddf1SHong Zhang C->info.mallocs = ndouble; 11704222ddf1SHong Zhang C->info.fill_ratio_given = fill; 11714222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 1172cd093f1dSJed Brown 1173cd093f1dSJed Brown #if defined(PETSC_USE_INFO) 1174cd093f1dSJed Brown if (ci[am]) { 11755f80ce2aSJacob Faibussowitsch CHKERRQ(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill)); 11765f80ce2aSJacob Faibussowitsch CHKERRQ(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill)); 1177cd093f1dSJed Brown } else { 11785f80ce2aSJacob Faibussowitsch CHKERRQ(PetscInfo(C,"Empty matrix product\n")); 1179cd093f1dSJed Brown } 1180cd093f1dSJed Brown #endif 1181cd093f1dSJed Brown PetscFunctionReturn(0); 1182cd093f1dSJed Brown } 1183cd093f1dSJed Brown 11846718818eSStefano Zampini PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(void *data) 11852128a86cSHong Zhang { 11866718818eSStefano Zampini Mat_MatMatTransMult *abt=(Mat_MatMatTransMult *)data; 11872128a86cSHong Zhang 11882128a86cSHong Zhang PetscFunctionBegin; 11895f80ce2aSJacob Faibussowitsch CHKERRQ(MatTransposeColoringDestroy(&abt->matcoloring)); 11905f80ce2aSJacob Faibussowitsch CHKERRQ(MatDestroy(&abt->Bt_den)); 11915f80ce2aSJacob Faibussowitsch CHKERRQ(MatDestroy(&abt->ABt_den)); 11925f80ce2aSJacob Faibussowitsch CHKERRQ(PetscFree(abt)); 11932128a86cSHong Zhang PetscFunctionReturn(0); 11942128a86cSHong Zhang } 11952128a86cSHong Zhang 11964222ddf1SHong Zhang PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C) 1197bc011b1eSHong Zhang { 119881d82fe4SHong Zhang Mat Bt; 119981d82fe4SHong Zhang PetscInt *bti,*btj; 120040192850SHong Zhang Mat_MatMatTransMult *abt; 12014222ddf1SHong Zhang Mat_Product *product = C->product; 12026718818eSStefano Zampini char *alg; 1203d2853540SHong Zhang 120481d82fe4SHong Zhang PetscFunctionBegin; 1205*28b400f6SJacob Faibussowitsch PetscCheck(product,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct"); 1206*28b400f6SJacob Faibussowitsch PetscCheck(!product->data,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Extra product struct not empty"); 12076718818eSStefano Zampini 120881d82fe4SHong Zhang /* create symbolic Bt */ 12095f80ce2aSJacob Faibussowitsch CHKERRQ(MatGetSymbolicTranspose_SeqAIJ(B,&bti,&btj)); 12105f80ce2aSJacob Faibussowitsch CHKERRQ(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,B->cmap->n,B->rmap->n,bti,btj,NULL,&Bt)); 12115f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetBlockSizes(Bt,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs))); 12125f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetType(Bt,((PetscObject)A)->type_name)); 121381d82fe4SHong Zhang 121481d82fe4SHong Zhang /* get symbolic C=A*Bt */ 12155f80ce2aSJacob Faibussowitsch CHKERRQ(PetscStrallocpy(product->alg,&alg)); 12165f80ce2aSJacob Faibussowitsch CHKERRQ(MatProductSetAlgorithm(C,"sorted")); /* set algorithm for C = A*Bt */ 12175f80ce2aSJacob Faibussowitsch CHKERRQ(MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C)); 12185f80ce2aSJacob Faibussowitsch CHKERRQ(MatProductSetAlgorithm(C,alg)); /* resume original algorithm for ABt product */ 12195f80ce2aSJacob Faibussowitsch CHKERRQ(PetscFree(alg)); 122081d82fe4SHong Zhang 1221a5b23f4aSJose E. Roman /* create a supporting struct for reuse intermediate dense matrices with matcoloring */ 12225f80ce2aSJacob Faibussowitsch CHKERRQ(PetscNew(&abt)); 12232128a86cSHong Zhang 12246718818eSStefano Zampini product->data = abt; 12256718818eSStefano Zampini product->destroy = MatDestroy_SeqAIJ_MatMatMultTrans; 12266718818eSStefano Zampini 12274222ddf1SHong Zhang C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ; 12282128a86cSHong Zhang 12294222ddf1SHong Zhang abt->usecoloring = PETSC_FALSE; 12305f80ce2aSJacob Faibussowitsch CHKERRQ(PetscStrcmp(product->alg,"color",&abt->usecoloring)); 123140192850SHong Zhang if (abt->usecoloring) { 1232b9af6bddSHong Zhang /* Create MatTransposeColoring from symbolic C=A*B^T */ 1233b9af6bddSHong Zhang MatTransposeColoring matcoloring; 1234335efc43SPeter Brune MatColoring coloring; 12352128a86cSHong Zhang ISColoring iscoloring; 12362128a86cSHong Zhang Mat Bt_dense,C_dense; 1237e8354b3eSHong Zhang 12384222ddf1SHong Zhang /* inode causes memory problem */ 12395f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetOption(C,MAT_USE_INODES,PETSC_FALSE)); 12404222ddf1SHong Zhang 12415f80ce2aSJacob Faibussowitsch CHKERRQ(MatColoringCreate(C,&coloring)); 12425f80ce2aSJacob Faibussowitsch CHKERRQ(MatColoringSetDistance(coloring,2)); 12435f80ce2aSJacob Faibussowitsch CHKERRQ(MatColoringSetType(coloring,MATCOLORINGSL)); 12445f80ce2aSJacob Faibussowitsch CHKERRQ(MatColoringSetFromOptions(coloring)); 12455f80ce2aSJacob Faibussowitsch CHKERRQ(MatColoringApply(coloring,&iscoloring)); 12465f80ce2aSJacob Faibussowitsch CHKERRQ(MatColoringDestroy(&coloring)); 12475f80ce2aSJacob Faibussowitsch CHKERRQ(MatTransposeColoringCreate(C,iscoloring,&matcoloring)); 12482205254eSKarl Rupp 124940192850SHong Zhang abt->matcoloring = matcoloring; 12502205254eSKarl Rupp 12515f80ce2aSJacob Faibussowitsch CHKERRQ(ISColoringDestroy(&iscoloring)); 12522128a86cSHong Zhang 12532128a86cSHong Zhang /* Create Bt_dense and C_dense = A*Bt_dense */ 12545f80ce2aSJacob Faibussowitsch CHKERRQ(MatCreate(PETSC_COMM_SELF,&Bt_dense)); 12555f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors)); 12565f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetType(Bt_dense,MATSEQDENSE)); 12575f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqDenseSetPreallocation(Bt_dense,NULL)); 12582205254eSKarl Rupp 12592128a86cSHong Zhang Bt_dense->assembled = PETSC_TRUE; 126040192850SHong Zhang abt->Bt_den = Bt_dense; 12612128a86cSHong Zhang 12625f80ce2aSJacob Faibussowitsch CHKERRQ(MatCreate(PETSC_COMM_SELF,&C_dense)); 12635f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors)); 12645f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetType(C_dense,MATSEQDENSE)); 12655f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqDenseSetPreallocation(C_dense,NULL)); 12662205254eSKarl Rupp 12672128a86cSHong Zhang Bt_dense->assembled = PETSC_TRUE; 126840192850SHong Zhang abt->ABt_den = C_dense; 1269f94ccd6cSHong Zhang 1270f94ccd6cSHong Zhang #if defined(PETSC_USE_INFO) 12711ce71dffSSatish Balay { 12724222ddf1SHong Zhang Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 12735f80ce2aSJacob Faibussowitsch CHKERRQ(PetscInfo(C,"Use coloring of C=A*B^T; B^T: %" PetscInt_FMT " %" PetscInt_FMT ", Bt_dense: %" PetscInt_FMT ",%" PetscInt_FMT "; Cnz %" PetscInt_FMT " / (cm*ncolors %" PetscInt_FMT ") = %g\n",B->cmap->n,B->rmap->n,Bt_dense->rmap->n,Bt_dense->cmap->n,c->nz,A->rmap->n*matcoloring->ncolors,(double)(((PetscReal)(c->nz))/((PetscReal)(A->rmap->n*matcoloring->ncolors))))); 12741ce71dffSSatish Balay } 1275f94ccd6cSHong Zhang #endif 12762128a86cSHong Zhang } 1277e99be685SHong Zhang /* clean up */ 12785f80ce2aSJacob Faibussowitsch CHKERRQ(MatDestroy(&Bt)); 12795f80ce2aSJacob Faibussowitsch CHKERRQ(MatRestoreSymbolicTranspose_SeqAIJ(B,&bti,&btj)); 12805df89d91SHong Zhang PetscFunctionReturn(0); 12815df89d91SHong Zhang } 12825df89d91SHong Zhang 12836fc122caSHong Zhang PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C) 12845df89d91SHong Zhang { 12855df89d91SHong Zhang Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data; 1286e2cac8caSJed Brown PetscInt *ai =a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow; 128781d82fe4SHong Zhang PetscInt cm =C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol; 12885df89d91SHong Zhang PetscLogDouble flops=0.0; 1289aa1aec99SHong Zhang MatScalar *aa =a->a,*aval,*ba=b->a,*bval,*ca,*cval; 12906718818eSStefano Zampini Mat_MatMatTransMult *abt; 12916718818eSStefano Zampini Mat_Product *product = C->product; 12925df89d91SHong Zhang 12935df89d91SHong Zhang PetscFunctionBegin; 1294*28b400f6SJacob Faibussowitsch PetscCheck(product,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct"); 12956718818eSStefano Zampini abt = (Mat_MatMatTransMult *)product->data; 1296*28b400f6SJacob Faibussowitsch PetscCheck(abt,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct"); 129758ed3ceaSHong Zhang /* clear old values in C */ 129858ed3ceaSHong Zhang if (!c->a) { 12995f80ce2aSJacob Faibussowitsch CHKERRQ(PetscCalloc1(ci[cm]+1,&ca)); 130058ed3ceaSHong Zhang c->a = ca; 130158ed3ceaSHong Zhang c->free_a = PETSC_TRUE; 130258ed3ceaSHong Zhang } else { 130358ed3ceaSHong Zhang ca = c->a; 13045f80ce2aSJacob Faibussowitsch CHKERRQ(PetscArrayzero(ca,ci[cm]+1)); 130558ed3ceaSHong Zhang } 130658ed3ceaSHong Zhang 130740192850SHong Zhang if (abt->usecoloring) { 130840192850SHong Zhang MatTransposeColoring matcoloring = abt->matcoloring; 130940192850SHong Zhang Mat Bt_dense,C_dense = abt->ABt_den; 1310c8db22f6SHong Zhang 1311b9af6bddSHong Zhang /* Get Bt_dense by Apply MatTransposeColoring to B */ 131240192850SHong Zhang Bt_dense = abt->Bt_den; 13135f80ce2aSJacob Faibussowitsch CHKERRQ(MatTransColoringApplySpToDen(matcoloring,B,Bt_dense)); 1314c8db22f6SHong Zhang 1315c8db22f6SHong Zhang /* C_dense = A*Bt_dense */ 13165f80ce2aSJacob Faibussowitsch CHKERRQ(MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense)); 1317c8db22f6SHong Zhang 13182128a86cSHong Zhang /* Recover C from C_dense */ 13195f80ce2aSJacob Faibussowitsch CHKERRQ(MatTransColoringApplyDenToSp(matcoloring,C_dense,C)); 1320c8db22f6SHong Zhang PetscFunctionReturn(0); 1321c8db22f6SHong Zhang } 1322c8db22f6SHong Zhang 13231fa1209cSHong Zhang for (i=0; i<cm; i++) { 132481d82fe4SHong Zhang anzi = ai[i+1] - ai[i]; 132581d82fe4SHong Zhang acol = aj + ai[i]; 13266973769bSHong Zhang aval = aa + ai[i]; 13271fa1209cSHong Zhang cnzi = ci[i+1] - ci[i]; 13281fa1209cSHong Zhang ccol = cj + ci[i]; 13296973769bSHong Zhang cval = ca + ci[i]; 13301fa1209cSHong Zhang for (j=0; j<cnzi; j++) { 133181d82fe4SHong Zhang brow = ccol[j]; 133281d82fe4SHong Zhang bnzj = bi[brow+1] - bi[brow]; 133381d82fe4SHong Zhang bcol = bj + bi[brow]; 13346973769bSHong Zhang bval = ba + bi[brow]; 13356973769bSHong Zhang 133681d82fe4SHong Zhang /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */ 133781d82fe4SHong Zhang nexta = 0; nextb = 0; 133881d82fe4SHong Zhang while (nexta<anzi && nextb<bnzj) { 13397b6d5e96SMark Adams while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++; 134081d82fe4SHong Zhang if (nexta == anzi) break; 13417b6d5e96SMark Adams while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++; 134281d82fe4SHong Zhang if (nextb == bnzj) break; 134381d82fe4SHong Zhang if (acol[nexta] == bcol[nextb]) { 13446973769bSHong Zhang cval[j] += aval[nexta]*bval[nextb]; 134581d82fe4SHong Zhang nexta++; nextb++; 134681d82fe4SHong Zhang flops += 2; 13471fa1209cSHong Zhang } 13481fa1209cSHong Zhang } 134981d82fe4SHong Zhang } 135081d82fe4SHong Zhang } 13515f80ce2aSJacob Faibussowitsch CHKERRQ(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY)); 13525f80ce2aSJacob Faibussowitsch CHKERRQ(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY)); 13535f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogFlops(flops)); 13545df89d91SHong Zhang PetscFunctionReturn(0); 13555df89d91SHong Zhang } 13565df89d91SHong Zhang 13576718818eSStefano Zampini PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(void *data) 13586d373c3eSHong Zhang { 13596718818eSStefano Zampini Mat_MatTransMatMult *atb = (Mat_MatTransMatMult*)data; 13606d373c3eSHong Zhang 13616d373c3eSHong Zhang PetscFunctionBegin; 13625f80ce2aSJacob Faibussowitsch CHKERRQ(MatDestroy(&atb->At)); 13636718818eSStefano Zampini if (atb->destroy) { 13645f80ce2aSJacob Faibussowitsch CHKERRQ((*atb->destroy)(atb->data)); 13656473ade5SStefano Zampini } 13665f80ce2aSJacob Faibussowitsch CHKERRQ(PetscFree(atb)); 13676d373c3eSHong Zhang PetscFunctionReturn(0); 13686d373c3eSHong Zhang } 13696d373c3eSHong Zhang 13704222ddf1SHong Zhang PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C) 13715df89d91SHong Zhang { 1372089a957eSStefano Zampini Mat At = NULL; 137338baddfdSBarry Smith PetscInt *ati,*atj; 13744222ddf1SHong Zhang Mat_Product *product = C->product; 1375089a957eSStefano Zampini PetscBool flg,def,square; 1376bc011b1eSHong Zhang 1377bc011b1eSHong Zhang PetscFunctionBegin; 1378089a957eSStefano Zampini MatCheckProduct(C,4); 1379089a957eSStefano Zampini square = (PetscBool)(A == B && A->symmetric && A->symmetric_set); 13804222ddf1SHong Zhang /* outerproduct */ 13815f80ce2aSJacob Faibussowitsch CHKERRQ(PetscStrcmp(product->alg,"outerproduct",&flg)); 13824222ddf1SHong Zhang if (flg) { 1383bc011b1eSHong Zhang /* create symbolic At */ 1384089a957eSStefano Zampini if (!square) { 13855f80ce2aSJacob Faibussowitsch CHKERRQ(MatGetSymbolicTranspose_SeqAIJ(A,&ati,&atj)); 13865f80ce2aSJacob Faibussowitsch CHKERRQ(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,A->cmap->n,A->rmap->n,ati,atj,NULL,&At)); 13875f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetBlockSizes(At,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs))); 13885f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetType(At,((PetscObject)A)->type_name)); 1389089a957eSStefano Zampini } 1390bc011b1eSHong Zhang /* get symbolic C=At*B */ 13915f80ce2aSJacob Faibussowitsch CHKERRQ(MatProductSetAlgorithm(C,"sorted")); 13925f80ce2aSJacob Faibussowitsch CHKERRQ(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At,B,fill,C)); 1393bc011b1eSHong Zhang 1394bc011b1eSHong Zhang /* clean up */ 1395089a957eSStefano Zampini if (!square) { 13965f80ce2aSJacob Faibussowitsch CHKERRQ(MatDestroy(&At)); 13975f80ce2aSJacob Faibussowitsch CHKERRQ(MatRestoreSymbolicTranspose_SeqAIJ(A,&ati,&atj)); 1398089a957eSStefano Zampini } 13996d373c3eSHong Zhang 14004222ddf1SHong Zhang C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */ 14015f80ce2aSJacob Faibussowitsch CHKERRQ(MatProductSetAlgorithm(C,"outerproduct")); 14024222ddf1SHong Zhang PetscFunctionReturn(0); 14034222ddf1SHong Zhang } 14044222ddf1SHong Zhang 14054222ddf1SHong Zhang /* matmatmult */ 14065f80ce2aSJacob Faibussowitsch CHKERRQ(PetscStrcmp(product->alg,"default",&def)); 14075f80ce2aSJacob Faibussowitsch CHKERRQ(PetscStrcmp(product->alg,"at*b",&flg)); 1408089a957eSStefano Zampini if (flg || def) { 14094222ddf1SHong Zhang Mat_MatTransMatMult *atb; 14104222ddf1SHong Zhang 1411*28b400f6SJacob Faibussowitsch PetscCheck(!product->data,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Extra product struct not empty"); 14125f80ce2aSJacob Faibussowitsch CHKERRQ(PetscNew(&atb)); 1413089a957eSStefano Zampini if (!square) { 14145f80ce2aSJacob Faibussowitsch CHKERRQ(MatTranspose_SeqAIJ(A,MAT_INITIAL_MATRIX,&At)); 1415089a957eSStefano Zampini } 14165f80ce2aSJacob Faibussowitsch CHKERRQ(MatProductSetAlgorithm(C,"sorted")); 14175f80ce2aSJacob Faibussowitsch CHKERRQ(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At,B,fill,C)); 14185f80ce2aSJacob Faibussowitsch CHKERRQ(MatProductSetAlgorithm(C,"at*b")); 14196718818eSStefano Zampini product->data = atb; 14206718818eSStefano Zampini product->destroy = MatDestroy_SeqAIJ_MatTransMatMult; 14214222ddf1SHong Zhang atb->At = At; 14224222ddf1SHong Zhang atb->updateAt = PETSC_FALSE; /* because At is computed here */ 14234222ddf1SHong Zhang 14244222ddf1SHong Zhang C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */ 14254222ddf1SHong Zhang PetscFunctionReturn(0); 14264222ddf1SHong Zhang } 14274222ddf1SHong Zhang 14284222ddf1SHong Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported"); 1429bc011b1eSHong Zhang } 1430bc011b1eSHong Zhang 143175648e8dSHong Zhang PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C) 1432bc011b1eSHong Zhang { 14330fbc74f4SHong Zhang Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data; 1434d0f46423SBarry Smith PetscInt am=A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb; 1435d0f46423SBarry Smith PetscInt cm=C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k; 1436d13dce4bSSatish Balay PetscLogDouble flops=0.0; 1437aa1aec99SHong Zhang MatScalar *aa=a->a,*ba,*ca,*caj; 1438bc011b1eSHong Zhang 1439bc011b1eSHong Zhang PetscFunctionBegin; 1440aa1aec99SHong Zhang if (!c->a) { 14415f80ce2aSJacob Faibussowitsch CHKERRQ(PetscCalloc1(ci[cm]+1,&ca)); 14422205254eSKarl Rupp 1443aa1aec99SHong Zhang c->a = ca; 1444aa1aec99SHong Zhang c->free_a = PETSC_TRUE; 1445aa1aec99SHong Zhang } else { 1446aa1aec99SHong Zhang ca = c->a; 14475f80ce2aSJacob Faibussowitsch CHKERRQ(PetscArrayzero(ca,ci[cm])); 1448aa1aec99SHong Zhang } 1449bc011b1eSHong Zhang 1450bc011b1eSHong Zhang /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */ 1451bc011b1eSHong Zhang for (i=0; i<am; i++) { 1452bc011b1eSHong Zhang bj = b->j + bi[i]; 1453bc011b1eSHong Zhang ba = b->a + bi[i]; 1454bc011b1eSHong Zhang bnzi = bi[i+1] - bi[i]; 1455bc011b1eSHong Zhang anzi = ai[i+1] - ai[i]; 1456bc011b1eSHong Zhang for (j=0; j<anzi; j++) { 1457bc011b1eSHong Zhang nextb = 0; 14580fbc74f4SHong Zhang crow = *aj++; 1459bc011b1eSHong Zhang cjj = cj + ci[crow]; 1460bc011b1eSHong Zhang caj = ca + ci[crow]; 1461bc011b1eSHong Zhang /* perform sparse axpy operation. Note cjj includes bj. */ 1462bc011b1eSHong Zhang for (k=0; nextb<bnzi; k++) { 14630fbc74f4SHong Zhang if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */ 14640fbc74f4SHong Zhang caj[k] += (*aa)*(*(ba+nextb)); 1465bc011b1eSHong Zhang nextb++; 1466bc011b1eSHong Zhang } 1467bc011b1eSHong Zhang } 1468bc011b1eSHong Zhang flops += 2*bnzi; 14690fbc74f4SHong Zhang aa++; 1470bc011b1eSHong Zhang } 1471bc011b1eSHong Zhang } 1472bc011b1eSHong Zhang 1473bc011b1eSHong Zhang /* Assemble the final matrix and clean up */ 14745f80ce2aSJacob Faibussowitsch CHKERRQ(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY)); 14755f80ce2aSJacob Faibussowitsch CHKERRQ(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY)); 14765f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogFlops(flops)); 1477bc011b1eSHong Zhang PetscFunctionReturn(0); 1478bc011b1eSHong Zhang } 14799123193aSHong Zhang 14804222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C) 14819123193aSHong Zhang { 14829123193aSHong Zhang PetscFunctionBegin; 14835f80ce2aSJacob Faibussowitsch CHKERRQ(MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C)); 14844222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense; 14859123193aSHong Zhang PetscFunctionReturn(0); 14869123193aSHong Zhang } 14879123193aSHong Zhang 148893aa15f2SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C,const PetscBool add) 14899123193aSHong Zhang { 1490f32d5d43SBarry Smith Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 14911ca9667aSStefano Zampini PetscScalar *c,r1,r2,r3,r4,*c1,*c2,*c3,*c4; 1492a4af7ca8SStefano Zampini const PetscScalar *aa,*b,*b1,*b2,*b3,*b4,*av; 1493daf57211SHong Zhang const PetscInt *aj; 149475f6d85dSStefano Zampini PetscInt cm=C->rmap->n,cn=B->cmap->n,bm,am=A->rmap->n; 149575f6d85dSStefano Zampini PetscInt clda; 149675f6d85dSStefano Zampini PetscInt am4,bm4,col,i,j,n; 14979123193aSHong Zhang 14989123193aSHong Zhang PetscFunctionBegin; 1499f32d5d43SBarry Smith if (!cm || !cn) PetscFunctionReturn(0); 15005f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJGetArrayRead(A,&av)); 150193aa15f2SStefano Zampini if (add) { 15025f80ce2aSJacob Faibussowitsch CHKERRQ(MatDenseGetArray(C,&c)); 150393aa15f2SStefano Zampini } else { 15045f80ce2aSJacob Faibussowitsch CHKERRQ(MatDenseGetArrayWrite(C,&c)); 150593aa15f2SStefano Zampini } 15065f80ce2aSJacob Faibussowitsch CHKERRQ(MatDenseGetArrayRead(B,&b)); 15075f80ce2aSJacob Faibussowitsch CHKERRQ(MatDenseGetLDA(B,&bm)); 15085f80ce2aSJacob Faibussowitsch CHKERRQ(MatDenseGetLDA(C,&clda)); 150975f6d85dSStefano Zampini am4 = 4*clda; 151075f6d85dSStefano Zampini bm4 = 4*bm; 1511f32d5d43SBarry Smith b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm; 15121ca9667aSStefano Zampini c1 = c; c2 = c1 + clda; c3 = c2 + clda; c4 = c3 + clda; 15131ca9667aSStefano Zampini for (col=0; col<(cn/4)*4; col += 4) { /* over columns of C */ 15141ca9667aSStefano Zampini for (i=0; i<am; i++) { /* over rows of A in those columns */ 1515f32d5d43SBarry Smith r1 = r2 = r3 = r4 = 0.0; 1516f32d5d43SBarry Smith n = a->i[i+1] - a->i[i]; 1517f32d5d43SBarry Smith aj = a->j + a->i[i]; 1518a4af7ca8SStefano Zampini aa = av + a->i[i]; 1519f32d5d43SBarry Smith for (j=0; j<n; j++) { 15201ca9667aSStefano Zampini const PetscScalar aatmp = aa[j]; 15211ca9667aSStefano Zampini const PetscInt ajtmp = aj[j]; 1522730858b9SHong Zhang r1 += aatmp*b1[ajtmp]; 1523730858b9SHong Zhang r2 += aatmp*b2[ajtmp]; 1524730858b9SHong Zhang r3 += aatmp*b3[ajtmp]; 1525730858b9SHong Zhang r4 += aatmp*b4[ajtmp]; 15269123193aSHong Zhang } 152793aa15f2SStefano Zampini if (add) { 152887753ddeSHong Zhang c1[i] += r1; 152987753ddeSHong Zhang c2[i] += r2; 153087753ddeSHong Zhang c3[i] += r3; 153187753ddeSHong Zhang c4[i] += r4; 153293aa15f2SStefano Zampini } else { 153393aa15f2SStefano Zampini c1[i] = r1; 153493aa15f2SStefano Zampini c2[i] = r2; 153593aa15f2SStefano Zampini c3[i] = r3; 153693aa15f2SStefano Zampini c4[i] = r4; 153793aa15f2SStefano Zampini } 1538f32d5d43SBarry Smith } 1539730858b9SHong Zhang b1 += bm4; b2 += bm4; b3 += bm4; b4 += bm4; 1540730858b9SHong Zhang c1 += am4; c2 += am4; c3 += am4; c4 += am4; 1541f32d5d43SBarry Smith } 154293aa15f2SStefano Zampini /* process remaining columns */ 154393aa15f2SStefano Zampini if (col != cn) { 154493aa15f2SStefano Zampini PetscInt rc = cn-col; 154593aa15f2SStefano Zampini 154693aa15f2SStefano Zampini if (rc == 1) { 154793aa15f2SStefano Zampini for (i=0; i<am; i++) { 1548f32d5d43SBarry Smith r1 = 0.0; 1549f32d5d43SBarry Smith n = a->i[i+1] - a->i[i]; 1550f32d5d43SBarry Smith aj = a->j + a->i[i]; 1551a4af7ca8SStefano Zampini aa = av + a->i[i]; 155293aa15f2SStefano Zampini for (j=0; j<n; j++) r1 += aa[j]*b1[aj[j]]; 155393aa15f2SStefano Zampini if (add) c1[i] += r1; 155493aa15f2SStefano Zampini else c1[i] = r1; 155593aa15f2SStefano Zampini } 155693aa15f2SStefano Zampini } else if (rc == 2) { 155793aa15f2SStefano Zampini for (i=0; i<am; i++) { 155893aa15f2SStefano Zampini r1 = r2 = 0.0; 155993aa15f2SStefano Zampini n = a->i[i+1] - a->i[i]; 156093aa15f2SStefano Zampini aj = a->j + a->i[i]; 156193aa15f2SStefano Zampini aa = av + a->i[i]; 1562f32d5d43SBarry Smith for (j=0; j<n; j++) { 156393aa15f2SStefano Zampini const PetscScalar aatmp = aa[j]; 156493aa15f2SStefano Zampini const PetscInt ajtmp = aj[j]; 156593aa15f2SStefano Zampini r1 += aatmp*b1[ajtmp]; 156693aa15f2SStefano Zampini r2 += aatmp*b2[ajtmp]; 1567f32d5d43SBarry Smith } 156893aa15f2SStefano Zampini if (add) { 156987753ddeSHong Zhang c1[i] += r1; 157093aa15f2SStefano Zampini c2[i] += r2; 157193aa15f2SStefano Zampini } else { 157293aa15f2SStefano Zampini c1[i] = r1; 157393aa15f2SStefano Zampini c2[i] = r2; 1574f32d5d43SBarry Smith } 157593aa15f2SStefano Zampini } 157693aa15f2SStefano Zampini } else { 157793aa15f2SStefano Zampini for (i=0; i<am; i++) { 157893aa15f2SStefano Zampini r1 = r2 = r3 = 0.0; 157993aa15f2SStefano Zampini n = a->i[i+1] - a->i[i]; 158093aa15f2SStefano Zampini aj = a->j + a->i[i]; 158193aa15f2SStefano Zampini aa = av + a->i[i]; 158293aa15f2SStefano Zampini for (j=0; j<n; j++) { 158393aa15f2SStefano Zampini const PetscScalar aatmp = aa[j]; 158493aa15f2SStefano Zampini const PetscInt ajtmp = aj[j]; 158593aa15f2SStefano Zampini r1 += aatmp*b1[ajtmp]; 158693aa15f2SStefano Zampini r2 += aatmp*b2[ajtmp]; 158793aa15f2SStefano Zampini r3 += aatmp*b3[ajtmp]; 158893aa15f2SStefano Zampini } 158993aa15f2SStefano Zampini if (add) { 159093aa15f2SStefano Zampini c1[i] += r1; 159193aa15f2SStefano Zampini c2[i] += r2; 159293aa15f2SStefano Zampini c3[i] += r3; 159393aa15f2SStefano Zampini } else { 159493aa15f2SStefano Zampini c1[i] = r1; 159593aa15f2SStefano Zampini c2[i] = r2; 159693aa15f2SStefano Zampini c3[i] = r3; 159793aa15f2SStefano Zampini } 159893aa15f2SStefano Zampini } 159993aa15f2SStefano Zampini } 1600f32d5d43SBarry Smith } 16015f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogFlops(cn*(2.0*a->nz))); 160293aa15f2SStefano Zampini if (add) { 16035f80ce2aSJacob Faibussowitsch CHKERRQ(MatDenseRestoreArray(C,&c)); 160493aa15f2SStefano Zampini } else { 16055f80ce2aSJacob Faibussowitsch CHKERRQ(MatDenseRestoreArrayWrite(C,&c)); 160693aa15f2SStefano Zampini } 16075f80ce2aSJacob Faibussowitsch CHKERRQ(MatDenseRestoreArrayRead(B,&b)); 16085f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJRestoreArrayRead(A,&av)); 1609f32d5d43SBarry Smith PetscFunctionReturn(0); 1610f32d5d43SBarry Smith } 1611f32d5d43SBarry Smith 161287753ddeSHong Zhang PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C) 1613f32d5d43SBarry Smith { 1614f32d5d43SBarry Smith PetscFunctionBegin; 16152c71b3e2SJacob Faibussowitsch PetscCheckFalse(B->rmap->n != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT,A->cmap->n,B->rmap->n); 16162c71b3e2SJacob Faibussowitsch PetscCheckFalse(A->rmap->n != C->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT,C->rmap->n,A->rmap->n); 16172c71b3e2SJacob Faibussowitsch PetscCheckFalse(B->cmap->n != C->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT,B->cmap->n,C->cmap->n); 16184324174eSBarry Smith 16195f80ce2aSJacob Faibussowitsch CHKERRQ(MatMatMultNumericAdd_SeqAIJ_SeqDense(A,B,C,PETSC_FALSE)); 16209123193aSHong Zhang PetscFunctionReturn(0); 16219123193aSHong Zhang } 1622b1683b59SHong Zhang 16234222ddf1SHong Zhang /* ------------------------------------------------------- */ 16244222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C) 16254222ddf1SHong Zhang { 16264222ddf1SHong Zhang PetscFunctionBegin; 16274222ddf1SHong Zhang C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense; 16284222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_AB; 16294222ddf1SHong Zhang PetscFunctionReturn(0); 16304222ddf1SHong Zhang } 16314222ddf1SHong Zhang 16326718818eSStefano Zampini PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat,Mat,PetscReal,Mat); 16336718818eSStefano Zampini 16344222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C) 16354222ddf1SHong Zhang { 16364222ddf1SHong Zhang PetscFunctionBegin; 16376718818eSStefano Zampini C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense; 16384222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_AtB; 16396718818eSStefano Zampini PetscFunctionReturn(0); 16406718818eSStefano Zampini } 16416718818eSStefano Zampini 16426718818eSStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C) 16436718818eSStefano Zampini { 16446718818eSStefano Zampini PetscFunctionBegin; 16456718818eSStefano Zampini C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense; 16466718818eSStefano Zampini C->ops->productsymbolic = MatProductSymbolic_ABt; 16474222ddf1SHong Zhang PetscFunctionReturn(0); 16484222ddf1SHong Zhang } 16494222ddf1SHong Zhang 16504222ddf1SHong Zhang PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C) 16514222ddf1SHong Zhang { 16524222ddf1SHong Zhang Mat_Product *product = C->product; 16534222ddf1SHong Zhang 16544222ddf1SHong Zhang PetscFunctionBegin; 16554222ddf1SHong Zhang switch (product->type) { 16564222ddf1SHong Zhang case MATPRODUCT_AB: 16575f80ce2aSJacob Faibussowitsch CHKERRQ(MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C)); 16584222ddf1SHong Zhang break; 16594222ddf1SHong Zhang case MATPRODUCT_AtB: 16605f80ce2aSJacob Faibussowitsch CHKERRQ(MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C)); 16614222ddf1SHong Zhang break; 16626718818eSStefano Zampini case MATPRODUCT_ABt: 16635f80ce2aSJacob Faibussowitsch CHKERRQ(MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C)); 16644222ddf1SHong Zhang break; 16654222ddf1SHong Zhang default: 16666718818eSStefano Zampini break; 16674222ddf1SHong Zhang } 16684222ddf1SHong Zhang PetscFunctionReturn(0); 16694222ddf1SHong Zhang } 16704222ddf1SHong Zhang /* ------------------------------------------------------- */ 16714222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C) 16724222ddf1SHong Zhang { 16734222ddf1SHong Zhang Mat_Product *product = C->product; 16744222ddf1SHong Zhang Mat A = product->A; 16754222ddf1SHong Zhang PetscBool baij; 16764222ddf1SHong Zhang 16774222ddf1SHong Zhang PetscFunctionBegin; 16785f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectTypeCompare((PetscObject)A,MATSEQBAIJ,&baij)); 16794222ddf1SHong Zhang if (!baij) { /* A is seqsbaij */ 16804222ddf1SHong Zhang PetscBool sbaij; 16815f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectTypeCompare((PetscObject)A,MATSEQSBAIJ,&sbaij)); 1682*28b400f6SJacob Faibussowitsch PetscCheck(sbaij,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"Mat must be either seqbaij or seqsbaij format"); 16834222ddf1SHong Zhang 16844222ddf1SHong Zhang C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense; 16854222ddf1SHong Zhang } else { /* A is seqbaij */ 16864222ddf1SHong Zhang C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense; 16874222ddf1SHong Zhang } 16884222ddf1SHong Zhang 16894222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_AB; 16904222ddf1SHong Zhang PetscFunctionReturn(0); 16914222ddf1SHong Zhang } 16924222ddf1SHong Zhang 16934222ddf1SHong Zhang PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C) 16944222ddf1SHong Zhang { 16954222ddf1SHong Zhang Mat_Product *product = C->product; 16964222ddf1SHong Zhang 16974222ddf1SHong Zhang PetscFunctionBegin; 16986718818eSStefano Zampini MatCheckProduct(C,1); 1699*28b400f6SJacob Faibussowitsch PetscCheck(product->A,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A"); 17006718818eSStefano Zampini if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric)) { 17015f80ce2aSJacob Faibussowitsch CHKERRQ(MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C)); 17026718818eSStefano Zampini } 17034222ddf1SHong Zhang PetscFunctionReturn(0); 17044222ddf1SHong Zhang } 17056718818eSStefano Zampini 17064222ddf1SHong Zhang /* ------------------------------------------------------- */ 17074222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C) 17084222ddf1SHong Zhang { 17094222ddf1SHong Zhang PetscFunctionBegin; 17104222ddf1SHong Zhang C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ; 17114222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_AB; 17124222ddf1SHong Zhang PetscFunctionReturn(0); 17134222ddf1SHong Zhang } 17144222ddf1SHong Zhang 17154222ddf1SHong Zhang PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C) 17164222ddf1SHong Zhang { 17174222ddf1SHong Zhang Mat_Product *product = C->product; 17184222ddf1SHong Zhang 17194222ddf1SHong Zhang PetscFunctionBegin; 17204222ddf1SHong Zhang if (product->type == MATPRODUCT_AB) { 17215f80ce2aSJacob Faibussowitsch CHKERRQ(MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C)); 17226718818eSStefano Zampini } 17234222ddf1SHong Zhang PetscFunctionReturn(0); 17244222ddf1SHong Zhang } 17254222ddf1SHong Zhang /* ------------------------------------------------------- */ 17264222ddf1SHong Zhang 1727b9af6bddSHong Zhang PetscErrorCode MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense) 1728c8db22f6SHong Zhang { 17292f5f1f90SHong Zhang Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 17302f5f1f90SHong Zhang Mat_SeqDense *btdense = (Mat_SeqDense*)Btdense->data; 17312f5f1f90SHong Zhang PetscInt *bi = b->i,*bj=b->j; 17322f5f1f90SHong Zhang PetscInt m = Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns; 17332f5f1f90SHong Zhang MatScalar *btval,*btval_den,*ba=b->a; 17342f5f1f90SHong Zhang PetscInt *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors; 1735c8db22f6SHong Zhang 1736c8db22f6SHong Zhang PetscFunctionBegin; 17372f5f1f90SHong Zhang btval_den=btdense->v; 17385f80ce2aSJacob Faibussowitsch CHKERRQ(PetscArrayzero(btval_den,m*n)); 17392f5f1f90SHong Zhang for (k=0; k<ncolors; k++) { 17402f5f1f90SHong Zhang ncolumns = coloring->ncolumns[k]; 17412f5f1f90SHong Zhang for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */ 1742d2853540SHong Zhang col = *(columns + colorforcol[k] + l); 17432f5f1f90SHong Zhang btcol = bj + bi[col]; 17442f5f1f90SHong Zhang btval = ba + bi[col]; 17452f5f1f90SHong Zhang anz = bi[col+1] - bi[col]; 1746d2853540SHong Zhang for (j=0; j<anz; j++) { 17472f5f1f90SHong Zhang brow = btcol[j]; 17482f5f1f90SHong Zhang btval_den[brow] = btval[j]; 1749c8db22f6SHong Zhang } 1750c8db22f6SHong Zhang } 17512f5f1f90SHong Zhang btval_den += m; 1752c8db22f6SHong Zhang } 1753c8db22f6SHong Zhang PetscFunctionReturn(0); 1754c8db22f6SHong Zhang } 1755c8db22f6SHong Zhang 1756b9af6bddSHong Zhang PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp) 17578972f759SHong Zhang { 1758b2d2b04fSHong Zhang Mat_SeqAIJ *csp = (Mat_SeqAIJ*)Csp->data; 17591683a169SBarry Smith const PetscScalar *ca_den,*ca_den_ptr; 17601683a169SBarry Smith PetscScalar *ca=csp->a; 1761f99a636bSHong Zhang PetscInt k,l,m=Cden->rmap->n,ncolors=matcoloring->ncolors; 1762e88777f2SHong Zhang PetscInt brows=matcoloring->brows,*den2sp=matcoloring->den2sp; 1763077f23c2SHong Zhang PetscInt nrows,*row,*idx; 1764077f23c2SHong Zhang PetscInt *rows=matcoloring->rows,*colorforrow=matcoloring->colorforrow; 17658972f759SHong Zhang 17668972f759SHong Zhang PetscFunctionBegin; 17675f80ce2aSJacob Faibussowitsch CHKERRQ(MatDenseGetArrayRead(Cden,&ca_den)); 1768f99a636bSHong Zhang 1769077f23c2SHong Zhang if (brows > 0) { 1770077f23c2SHong Zhang PetscInt *lstart,row_end,row_start; 1771980ae229SHong Zhang lstart = matcoloring->lstart; 17725f80ce2aSJacob Faibussowitsch CHKERRQ(PetscArrayzero(lstart,ncolors)); 1773eeb4fd02SHong Zhang 1774077f23c2SHong Zhang row_end = brows; 1775eeb4fd02SHong Zhang if (row_end > m) row_end = m; 1776077f23c2SHong Zhang for (row_start=0; row_start<m; row_start+=brows) { /* loop over row blocks of Csp */ 1777077f23c2SHong Zhang ca_den_ptr = ca_den; 1778980ae229SHong Zhang for (k=0; k<ncolors; k++) { /* loop over colors (columns of Cden) */ 1779eeb4fd02SHong Zhang nrows = matcoloring->nrows[k]; 1780eeb4fd02SHong Zhang row = rows + colorforrow[k]; 1781eeb4fd02SHong Zhang idx = den2sp + colorforrow[k]; 1782eeb4fd02SHong Zhang for (l=lstart[k]; l<nrows; l++) { 1783eeb4fd02SHong Zhang if (row[l] >= row_end) { 1784eeb4fd02SHong Zhang lstart[k] = l; 1785eeb4fd02SHong Zhang break; 1786eeb4fd02SHong Zhang } else { 1787077f23c2SHong Zhang ca[idx[l]] = ca_den_ptr[row[l]]; 1788eeb4fd02SHong Zhang } 1789eeb4fd02SHong Zhang } 1790077f23c2SHong Zhang ca_den_ptr += m; 1791eeb4fd02SHong Zhang } 1792077f23c2SHong Zhang row_end += brows; 1793eeb4fd02SHong Zhang if (row_end > m) row_end = m; 1794eeb4fd02SHong Zhang } 1795077f23c2SHong Zhang } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */ 1796077f23c2SHong Zhang ca_den_ptr = ca_den; 1797077f23c2SHong Zhang for (k=0; k<ncolors; k++) { 1798077f23c2SHong Zhang nrows = matcoloring->nrows[k]; 1799077f23c2SHong Zhang row = rows + colorforrow[k]; 1800077f23c2SHong Zhang idx = den2sp + colorforrow[k]; 1801077f23c2SHong Zhang for (l=0; l<nrows; l++) { 1802077f23c2SHong Zhang ca[idx[l]] = ca_den_ptr[row[l]]; 1803077f23c2SHong Zhang } 1804077f23c2SHong Zhang ca_den_ptr += m; 1805077f23c2SHong Zhang } 1806f99a636bSHong Zhang } 1807f99a636bSHong Zhang 18085f80ce2aSJacob Faibussowitsch CHKERRQ(MatDenseRestoreArrayRead(Cden,&ca_den)); 1809f99a636bSHong Zhang #if defined(PETSC_USE_INFO) 1810077f23c2SHong Zhang if (matcoloring->brows > 0) { 18115f80ce2aSJacob Faibussowitsch CHKERRQ(PetscInfo(Csp,"Loop over %" PetscInt_FMT " row blocks for den2sp\n",brows)); 1812e88777f2SHong Zhang } else { 18135f80ce2aSJacob Faibussowitsch CHKERRQ(PetscInfo(Csp,"Loop over colors/columns of Cden, inefficient for large sparse matrix product \n")); 1814e88777f2SHong Zhang } 1815f99a636bSHong Zhang #endif 18168972f759SHong Zhang PetscFunctionReturn(0); 18178972f759SHong Zhang } 18188972f759SHong Zhang 1819b9af6bddSHong Zhang PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c) 1820b1683b59SHong Zhang { 1821e88777f2SHong Zhang PetscInt i,n,nrows,Nbs,j,k,m,ncols,col,cm; 18221a83f524SJed Brown const PetscInt *is,*ci,*cj,*row_idx; 1823b28d80bdSHong Zhang PetscInt nis = iscoloring->n,*rowhit,bs = 1; 1824b1683b59SHong Zhang IS *isa; 1825b28d80bdSHong Zhang Mat_SeqAIJ *csp = (Mat_SeqAIJ*)mat->data; 1826e88777f2SHong Zhang PetscInt *colorforrow,*rows,*rows_i,*idxhit,*spidx,*den2sp,*den2sp_i; 1827e88777f2SHong Zhang PetscInt *colorforcol,*columns,*columns_i,brows; 1828e88777f2SHong Zhang PetscBool flg; 1829b1683b59SHong Zhang 1830b1683b59SHong Zhang PetscFunctionBegin; 18315f80ce2aSJacob Faibussowitsch CHKERRQ(ISColoringGetIS(iscoloring,PETSC_USE_POINTER,PETSC_IGNORE,&isa)); 1832e99be685SHong Zhang 18337ecccc15SHong Zhang /* bs >1 is not being tested yet! */ 1834e88777f2SHong Zhang Nbs = mat->cmap->N/bs; 1835b1683b59SHong Zhang c->M = mat->rmap->N/bs; /* set total rows, columns and local rows */ 1836e88777f2SHong Zhang c->N = Nbs; 1837e88777f2SHong Zhang c->m = c->M; 1838b1683b59SHong Zhang c->rstart = 0; 1839e88777f2SHong Zhang c->brows = 100; 1840b1683b59SHong Zhang 1841b1683b59SHong Zhang c->ncolors = nis; 18425f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc3(nis,&c->ncolumns,nis,&c->nrows,nis+1,&colorforrow)); 18435f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(csp->nz+1,&rows)); 18445f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(csp->nz+1,&den2sp)); 1845e88777f2SHong Zhang 1846e88777f2SHong Zhang brows = c->brows; 18475f80ce2aSJacob Faibussowitsch CHKERRQ(PetscOptionsGetInt(NULL,NULL,"-matden2sp_brows",&brows,&flg)); 1848e88777f2SHong Zhang if (flg) c->brows = brows; 1849eeb4fd02SHong Zhang if (brows > 0) { 18505f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(nis+1,&c->lstart)); 1851e88777f2SHong Zhang } 18522205254eSKarl Rupp 1853d2853540SHong Zhang colorforrow[0] = 0; 1854d2853540SHong Zhang rows_i = rows; 1855f99a636bSHong Zhang den2sp_i = den2sp; 1856b1683b59SHong Zhang 18575f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(nis+1,&colorforcol)); 18585f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(Nbs+1,&columns)); 18592205254eSKarl Rupp 1860d2853540SHong Zhang colorforcol[0] = 0; 1861d2853540SHong Zhang columns_i = columns; 1862d2853540SHong Zhang 1863077f23c2SHong Zhang /* get column-wise storage of mat */ 18645f80ce2aSJacob Faibussowitsch CHKERRQ(MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL)); 1865b1683b59SHong Zhang 1866b28d80bdSHong Zhang cm = c->m; 18675f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(cm+1,&rowhit)); 18685f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(cm+1,&idxhit)); 1869f99a636bSHong Zhang for (i=0; i<nis; i++) { /* loop over color */ 18705f80ce2aSJacob Faibussowitsch CHKERRQ(ISGetLocalSize(isa[i],&n)); 18715f80ce2aSJacob Faibussowitsch CHKERRQ(ISGetIndices(isa[i],&is)); 18722205254eSKarl Rupp 1873b1683b59SHong Zhang c->ncolumns[i] = n; 1874b1683b59SHong Zhang if (n) { 18755f80ce2aSJacob Faibussowitsch CHKERRQ(PetscArraycpy(columns_i,is,n)); 1876b1683b59SHong Zhang } 1877d2853540SHong Zhang colorforcol[i+1] = colorforcol[i] + n; 1878d2853540SHong Zhang columns_i += n; 1879b1683b59SHong Zhang 1880b1683b59SHong Zhang /* fast, crude version requires O(N*N) work */ 18815f80ce2aSJacob Faibussowitsch CHKERRQ(PetscArrayzero(rowhit,cm)); 1882e99be685SHong Zhang 1883b7caf3d6SHong Zhang for (j=0; j<n; j++) { /* loop over columns*/ 1884b1683b59SHong Zhang col = is[j]; 1885d2853540SHong Zhang row_idx = cj + ci[col]; 1886b1683b59SHong Zhang m = ci[col+1] - ci[col]; 1887b7caf3d6SHong Zhang for (k=0; k<m; k++) { /* loop over columns marking them in rowhit */ 1888e99be685SHong Zhang idxhit[*row_idx] = spidx[ci[col] + k]; 1889d2853540SHong Zhang rowhit[*row_idx++] = col + 1; 1890b1683b59SHong Zhang } 1891b1683b59SHong Zhang } 1892b1683b59SHong Zhang /* count the number of hits */ 1893b1683b59SHong Zhang nrows = 0; 1894e8354b3eSHong Zhang for (j=0; j<cm; j++) { 1895b1683b59SHong Zhang if (rowhit[j]) nrows++; 1896b1683b59SHong Zhang } 1897b1683b59SHong Zhang c->nrows[i] = nrows; 1898d2853540SHong Zhang colorforrow[i+1] = colorforrow[i] + nrows; 1899d2853540SHong Zhang 1900b1683b59SHong Zhang nrows = 0; 1901b7caf3d6SHong Zhang for (j=0; j<cm; j++) { /* loop over rows */ 1902b1683b59SHong Zhang if (rowhit[j]) { 1903d2853540SHong Zhang rows_i[nrows] = j; 190412b89a43SHong Zhang den2sp_i[nrows] = idxhit[j]; 1905b1683b59SHong Zhang nrows++; 1906b1683b59SHong Zhang } 1907b1683b59SHong Zhang } 1908e88777f2SHong Zhang den2sp_i += nrows; 1909077f23c2SHong Zhang 19105f80ce2aSJacob Faibussowitsch CHKERRQ(ISRestoreIndices(isa[i],&is)); 1911f99a636bSHong Zhang rows_i += nrows; 1912b1683b59SHong Zhang } 19135f80ce2aSJacob Faibussowitsch CHKERRQ(MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL)); 19145f80ce2aSJacob Faibussowitsch CHKERRQ(PetscFree(rowhit)); 19155f80ce2aSJacob Faibussowitsch CHKERRQ(ISColoringRestoreIS(iscoloring,PETSC_USE_POINTER,&isa)); 19162c71b3e2SJacob Faibussowitsch PetscCheck(csp->nz == colorforrow[nis],PETSC_COMM_SELF,PETSC_ERR_PLIB,"csp->nz %" PetscInt_FMT " != colorforrow[nis] %" PetscInt_FMT,csp->nz,colorforrow[nis]); 1917b28d80bdSHong Zhang 1918d2853540SHong Zhang c->colorforrow = colorforrow; 1919d2853540SHong Zhang c->rows = rows; 1920f99a636bSHong Zhang c->den2sp = den2sp; 1921d2853540SHong Zhang c->colorforcol = colorforcol; 1922d2853540SHong Zhang c->columns = columns; 19232205254eSKarl Rupp 19245f80ce2aSJacob Faibussowitsch CHKERRQ(PetscFree(idxhit)); 1925b1683b59SHong Zhang PetscFunctionReturn(0); 1926b1683b59SHong Zhang } 1927d013fc79Sandi selinger 19284222ddf1SHong Zhang /* --------------------------------------------------------------- */ 19294222ddf1SHong Zhang static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C) 1930df97dc6dSFande Kong { 19314222ddf1SHong Zhang Mat_Product *product = C->product; 19324222ddf1SHong Zhang Mat A=product->A,B=product->B; 19334222ddf1SHong Zhang 1934df97dc6dSFande Kong PetscFunctionBegin; 19354222ddf1SHong Zhang if (C->ops->mattransposemultnumeric) { 19364222ddf1SHong Zhang /* Alg: "outerproduct" */ 19375f80ce2aSJacob Faibussowitsch CHKERRQ((*C->ops->mattransposemultnumeric)(A,B,C)); 19384222ddf1SHong Zhang } else { 19394222ddf1SHong Zhang /* Alg: "matmatmult" -- C = At*B */ 19406718818eSStefano Zampini Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)product->data; 19416718818eSStefano Zampini Mat At; 19424222ddf1SHong Zhang 1943*28b400f6SJacob Faibussowitsch PetscCheck(atb,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct"); 19446718818eSStefano Zampini At = atb->At; 1945089a957eSStefano Zampini if (atb->updateAt && At) { /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ() */ 19465f80ce2aSJacob Faibussowitsch CHKERRQ(MatTranspose_SeqAIJ(A,MAT_REUSE_MATRIX,&At)); 19474222ddf1SHong Zhang } 19485f80ce2aSJacob Faibussowitsch CHKERRQ(MatMatMultNumeric_SeqAIJ_SeqAIJ(At ? At : A,B,C)); 19494222ddf1SHong Zhang atb->updateAt = PETSC_TRUE; 19504222ddf1SHong Zhang } 1951df97dc6dSFande Kong PetscFunctionReturn(0); 1952df97dc6dSFande Kong } 1953df97dc6dSFande Kong 19544222ddf1SHong Zhang static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C) 1955d013fc79Sandi selinger { 19564222ddf1SHong Zhang Mat_Product *product = C->product; 19574222ddf1SHong Zhang Mat A=product->A,B=product->B; 19584222ddf1SHong Zhang PetscReal fill=product->fill; 1959d013fc79Sandi selinger 1960d013fc79Sandi selinger PetscFunctionBegin; 19615f80ce2aSJacob Faibussowitsch CHKERRQ(MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C)); 19622869b61bSandi selinger 19634222ddf1SHong Zhang C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ; 19644222ddf1SHong Zhang PetscFunctionReturn(0); 19652869b61bSandi selinger } 1966d013fc79Sandi selinger 19674222ddf1SHong Zhang /* --------------------------------------------------------------- */ 19684222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C) 19694222ddf1SHong Zhang { 19704222ddf1SHong Zhang PetscErrorCode ierr; 19714222ddf1SHong Zhang Mat_Product *product = C->product; 19724222ddf1SHong Zhang PetscInt alg = 0; /* default algorithm */ 19734222ddf1SHong Zhang PetscBool flg = PETSC_FALSE; 19744222ddf1SHong Zhang #if !defined(PETSC_HAVE_HYPRE) 19754222ddf1SHong Zhang const char *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"}; 19764222ddf1SHong Zhang PetscInt nalg = 7; 19774222ddf1SHong Zhang #else 19784222ddf1SHong Zhang const char *algTypes[8] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge","hypre"}; 19794222ddf1SHong Zhang PetscInt nalg = 8; 19804222ddf1SHong Zhang #endif 19814222ddf1SHong Zhang 19824222ddf1SHong Zhang PetscFunctionBegin; 19834222ddf1SHong Zhang /* Set default algorithm */ 19845f80ce2aSJacob Faibussowitsch CHKERRQ(PetscStrcmp(C->product->alg,"default",&flg)); 19854222ddf1SHong Zhang if (flg) { 19865f80ce2aSJacob Faibussowitsch CHKERRQ(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg])); 1987d013fc79Sandi selinger } 1988d013fc79Sandi selinger 19894222ddf1SHong Zhang /* Get runtime option */ 19904222ddf1SHong Zhang if (product->api_user) { 19914222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMult","Mat");CHKERRQ(ierr); 19925f80ce2aSJacob Faibussowitsch CHKERRQ(PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,nalg,algTypes[0],&alg,&flg)); 19934222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 19944222ddf1SHong Zhang } else { 19954222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr); 19965f80ce2aSJacob Faibussowitsch CHKERRQ(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_AB",algTypes,nalg,algTypes[0],&alg,&flg)); 19974222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 1998d013fc79Sandi selinger } 19994222ddf1SHong Zhang if (flg) { 20005f80ce2aSJacob Faibussowitsch CHKERRQ(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg])); 2001d013fc79Sandi selinger } 2002d013fc79Sandi selinger 20034222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_AB; 20044222ddf1SHong Zhang C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ; 20054222ddf1SHong Zhang PetscFunctionReturn(0); 20064222ddf1SHong Zhang } 2007d013fc79Sandi selinger 20084222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C) 20094222ddf1SHong Zhang { 20104222ddf1SHong Zhang PetscErrorCode ierr; 20114222ddf1SHong Zhang Mat_Product *product = C->product; 20124222ddf1SHong Zhang PetscInt alg = 0; /* default algorithm */ 20134222ddf1SHong Zhang PetscBool flg = PETSC_FALSE; 2014089a957eSStefano Zampini const char *algTypes[3] = {"default","at*b","outerproduct"}; 2015089a957eSStefano Zampini PetscInt nalg = 3; 2016d013fc79Sandi selinger 20174222ddf1SHong Zhang PetscFunctionBegin; 20184222ddf1SHong Zhang /* Get runtime option */ 20194222ddf1SHong Zhang if (product->api_user) { 20204222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr); 20215f80ce2aSJacob Faibussowitsch CHKERRQ(PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,nalg,algTypes[alg],&alg,&flg)); 20224222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 20234222ddf1SHong Zhang } else { 20244222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr); 20255f80ce2aSJacob Faibussowitsch CHKERRQ(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_AtB",algTypes,nalg,algTypes[alg],&alg,&flg)); 20264222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 20274222ddf1SHong Zhang } 20284222ddf1SHong Zhang if (flg) { 20295f80ce2aSJacob Faibussowitsch CHKERRQ(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg])); 20304222ddf1SHong Zhang } 2031d013fc79Sandi selinger 20324222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ; 20334222ddf1SHong Zhang PetscFunctionReturn(0); 20344222ddf1SHong Zhang } 20354222ddf1SHong Zhang 20364222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C) 20374222ddf1SHong Zhang { 20384222ddf1SHong Zhang PetscErrorCode ierr; 20394222ddf1SHong Zhang Mat_Product *product = C->product; 20404222ddf1SHong Zhang PetscInt alg = 0; /* default algorithm */ 20414222ddf1SHong Zhang PetscBool flg = PETSC_FALSE; 20424222ddf1SHong Zhang const char *algTypes[2] = {"default","color"}; 20434222ddf1SHong Zhang PetscInt nalg = 2; 20444222ddf1SHong Zhang 20454222ddf1SHong Zhang PetscFunctionBegin; 20464222ddf1SHong Zhang /* Set default algorithm */ 20475f80ce2aSJacob Faibussowitsch CHKERRQ(PetscStrcmp(C->product->alg,"default",&flg)); 20484222ddf1SHong Zhang if (!flg) { 20494222ddf1SHong Zhang alg = 1; 20505f80ce2aSJacob Faibussowitsch CHKERRQ(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg])); 20514222ddf1SHong Zhang } 20524222ddf1SHong Zhang 20534222ddf1SHong Zhang /* Get runtime option */ 20544222ddf1SHong Zhang if (product->api_user) { 20554222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatTransposeMult","Mat");CHKERRQ(ierr); 20565f80ce2aSJacob Faibussowitsch CHKERRQ(PetscOptionsEList("-matmattransmult_via","Algorithmic approach","MatMatTransposeMult",algTypes,nalg,algTypes[alg],&alg,&flg)); 20574222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 20584222ddf1SHong Zhang } else { 20594222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABt","Mat");CHKERRQ(ierr); 20605f80ce2aSJacob Faibussowitsch CHKERRQ(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_ABt",algTypes,nalg,algTypes[alg],&alg,&flg)); 20614222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 20624222ddf1SHong Zhang } 20634222ddf1SHong Zhang if (flg) { 20645f80ce2aSJacob Faibussowitsch CHKERRQ(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg])); 20654222ddf1SHong Zhang } 20664222ddf1SHong Zhang 20674222ddf1SHong Zhang C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ; 20684222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_ABt; 20694222ddf1SHong Zhang PetscFunctionReturn(0); 20704222ddf1SHong Zhang } 20714222ddf1SHong Zhang 20724222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C) 20734222ddf1SHong Zhang { 20744222ddf1SHong Zhang PetscErrorCode ierr; 20754222ddf1SHong Zhang Mat_Product *product = C->product; 20764222ddf1SHong Zhang PetscBool flg = PETSC_FALSE; 20774222ddf1SHong Zhang PetscInt alg = 0; /* default algorithm -- alg=1 should be default!!! */ 20784222ddf1SHong Zhang #if !defined(PETSC_HAVE_HYPRE) 20794222ddf1SHong Zhang const char *algTypes[2] = {"scalable","rap"}; 20804222ddf1SHong Zhang PetscInt nalg = 2; 20814222ddf1SHong Zhang #else 20824222ddf1SHong Zhang const char *algTypes[3] = {"scalable","rap","hypre"}; 20834222ddf1SHong Zhang PetscInt nalg = 3; 20844222ddf1SHong Zhang #endif 20854222ddf1SHong Zhang 20864222ddf1SHong Zhang PetscFunctionBegin; 20874222ddf1SHong Zhang /* Set default algorithm */ 20885f80ce2aSJacob Faibussowitsch CHKERRQ(PetscStrcmp(product->alg,"default",&flg)); 20894222ddf1SHong Zhang if (flg) { 20905f80ce2aSJacob Faibussowitsch CHKERRQ(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg])); 20914222ddf1SHong Zhang } 20924222ddf1SHong Zhang 20934222ddf1SHong Zhang /* Get runtime option */ 20944222ddf1SHong Zhang if (product->api_user) { 20954222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatPtAP","Mat");CHKERRQ(ierr); 20965f80ce2aSJacob Faibussowitsch CHKERRQ(PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[0],&alg,&flg)); 20974222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 20984222ddf1SHong Zhang } else { 20994222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr); 21005f80ce2aSJacob Faibussowitsch CHKERRQ(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_PtAP",algTypes,nalg,algTypes[0],&alg,&flg)); 21014222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 21024222ddf1SHong Zhang } 21034222ddf1SHong Zhang if (flg) { 21045f80ce2aSJacob Faibussowitsch CHKERRQ(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg])); 21054222ddf1SHong Zhang } 21064222ddf1SHong Zhang 21074222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ; 21084222ddf1SHong Zhang PetscFunctionReturn(0); 21094222ddf1SHong Zhang } 21104222ddf1SHong Zhang 21114222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C) 21124222ddf1SHong Zhang { 21134222ddf1SHong Zhang PetscErrorCode ierr; 21144222ddf1SHong Zhang Mat_Product *product = C->product; 21154222ddf1SHong Zhang PetscBool flg = PETSC_FALSE; 21164222ddf1SHong Zhang PetscInt alg = 0; /* default algorithm */ 21174222ddf1SHong Zhang const char *algTypes[3] = {"r*a*rt","r*art","coloring_rart"}; 21184222ddf1SHong Zhang PetscInt nalg = 3; 21194222ddf1SHong Zhang 21204222ddf1SHong Zhang PetscFunctionBegin; 21214222ddf1SHong Zhang /* Set default algorithm */ 21225f80ce2aSJacob Faibussowitsch CHKERRQ(PetscStrcmp(product->alg,"default",&flg)); 21234222ddf1SHong Zhang if (flg) { 21245f80ce2aSJacob Faibussowitsch CHKERRQ(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg])); 21254222ddf1SHong Zhang } 21264222ddf1SHong Zhang 21274222ddf1SHong Zhang /* Get runtime option */ 21284222ddf1SHong Zhang if (product->api_user) { 21294222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatRARt","Mat");CHKERRQ(ierr); 21305f80ce2aSJacob Faibussowitsch CHKERRQ(PetscOptionsEList("-matrart_via","Algorithmic approach","MatRARt",algTypes,nalg,algTypes[0],&alg,&flg)); 21314222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 21324222ddf1SHong Zhang } else { 21334222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr); 21345f80ce2aSJacob Faibussowitsch CHKERRQ(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_RARt",algTypes,nalg,algTypes[0],&alg,&flg)); 21354222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 21364222ddf1SHong Zhang } 21374222ddf1SHong Zhang if (flg) { 21385f80ce2aSJacob Faibussowitsch CHKERRQ(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg])); 21394222ddf1SHong Zhang } 21404222ddf1SHong Zhang 21414222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ; 21424222ddf1SHong Zhang PetscFunctionReturn(0); 21434222ddf1SHong Zhang } 21444222ddf1SHong Zhang 21454222ddf1SHong Zhang /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */ 21464222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C) 21474222ddf1SHong Zhang { 21484222ddf1SHong Zhang PetscErrorCode ierr; 21494222ddf1SHong Zhang Mat_Product *product = C->product; 21504222ddf1SHong Zhang PetscInt alg = 0; /* default algorithm */ 21514222ddf1SHong Zhang PetscBool flg = PETSC_FALSE; 21524222ddf1SHong Zhang const char *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"}; 21534222ddf1SHong Zhang PetscInt nalg = 7; 21544222ddf1SHong Zhang 21554222ddf1SHong Zhang PetscFunctionBegin; 21564222ddf1SHong Zhang /* Set default algorithm */ 21575f80ce2aSJacob Faibussowitsch CHKERRQ(PetscStrcmp(product->alg,"default",&flg)); 21584222ddf1SHong Zhang if (flg) { 21595f80ce2aSJacob Faibussowitsch CHKERRQ(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg])); 21604222ddf1SHong Zhang } 21614222ddf1SHong Zhang 21624222ddf1SHong Zhang /* Get runtime option */ 21634222ddf1SHong Zhang if (product->api_user) { 21644222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr); 21655f80ce2aSJacob Faibussowitsch CHKERRQ(PetscOptionsEList("-matmatmatmult_via","Algorithmic approach","MatMatMatMult",algTypes,nalg,algTypes[alg],&alg,&flg)); 21664222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 21674222ddf1SHong Zhang } else { 21684222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr); 21695f80ce2aSJacob Faibussowitsch CHKERRQ(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_ABC",algTypes,nalg,algTypes[alg],&alg,&flg)); 21704222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 21714222ddf1SHong Zhang } 21724222ddf1SHong Zhang if (flg) { 21735f80ce2aSJacob Faibussowitsch CHKERRQ(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg])); 21744222ddf1SHong Zhang } 21754222ddf1SHong Zhang 21764222ddf1SHong Zhang C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ; 21774222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_ABC; 21784222ddf1SHong Zhang PetscFunctionReturn(0); 21794222ddf1SHong Zhang } 21804222ddf1SHong Zhang 21814222ddf1SHong Zhang PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C) 21824222ddf1SHong Zhang { 21834222ddf1SHong Zhang Mat_Product *product = C->product; 21844222ddf1SHong Zhang 21854222ddf1SHong Zhang PetscFunctionBegin; 21864222ddf1SHong Zhang switch (product->type) { 21874222ddf1SHong Zhang case MATPRODUCT_AB: 21885f80ce2aSJacob Faibussowitsch CHKERRQ(MatProductSetFromOptions_SeqAIJ_AB(C)); 21894222ddf1SHong Zhang break; 21904222ddf1SHong Zhang case MATPRODUCT_AtB: 21915f80ce2aSJacob Faibussowitsch CHKERRQ(MatProductSetFromOptions_SeqAIJ_AtB(C)); 21924222ddf1SHong Zhang break; 21934222ddf1SHong Zhang case MATPRODUCT_ABt: 21945f80ce2aSJacob Faibussowitsch CHKERRQ(MatProductSetFromOptions_SeqAIJ_ABt(C)); 21954222ddf1SHong Zhang break; 21964222ddf1SHong Zhang case MATPRODUCT_PtAP: 21975f80ce2aSJacob Faibussowitsch CHKERRQ(MatProductSetFromOptions_SeqAIJ_PtAP(C)); 21984222ddf1SHong Zhang break; 21994222ddf1SHong Zhang case MATPRODUCT_RARt: 22005f80ce2aSJacob Faibussowitsch CHKERRQ(MatProductSetFromOptions_SeqAIJ_RARt(C)); 22014222ddf1SHong Zhang break; 22024222ddf1SHong Zhang case MATPRODUCT_ABC: 22035f80ce2aSJacob Faibussowitsch CHKERRQ(MatProductSetFromOptions_SeqAIJ_ABC(C)); 22044222ddf1SHong Zhang break; 22056718818eSStefano Zampini default: 22066718818eSStefano Zampini break; 22074222ddf1SHong Zhang } 2208d013fc79Sandi selinger PetscFunctionReturn(0); 2209d013fc79Sandi selinger } 2210