1be1d678aSKris Buschelman 2d50806bdSBarry Smith /* 32499ec78SHong Zhang Defines matrix-matrix product routines for pairs of SeqAIJ matrices 4d50806bdSBarry Smith C = A * B 5d50806bdSBarry Smith */ 6d50806bdSBarry Smith 7c6db04a5SJed Brown #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 8c6db04a5SJed Brown #include <../src/mat/utils/freespace.h> 9c6db04a5SJed Brown #include <petscbt.h> 10af0996ceSBarry Smith #include <petsc/private/isimpl.h> 1107475bc1SBarry Smith #include <../src/mat/impls/dense/seq/dense.h> 127bab7c10SHong Zhang 13df97dc6dSFande Kong PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C) 14df97dc6dSFande Kong { 15df97dc6dSFande Kong PetscErrorCode ierr; 16df97dc6dSFande Kong 17df97dc6dSFande Kong PetscFunctionBegin; 18df97dc6dSFande Kong if (C->ops->matmultnumeric) { 192c71b3e2SJacob Faibussowitsch PetscCheckFalse(C->ops->matmultnumeric == MatMatMultNumeric_SeqAIJ_SeqAIJ,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Recursive call"); 20df97dc6dSFande Kong ierr = (*C->ops->matmultnumeric)(A,B,C);CHKERRQ(ierr); 21df97dc6dSFande Kong } else { 22df97dc6dSFande Kong ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A,B,C);CHKERRQ(ierr); 23df97dc6dSFande Kong } 24df97dc6dSFande Kong PetscFunctionReturn(0); 25df97dc6dSFande Kong } 26df97dc6dSFande Kong 274222ddf1SHong Zhang /* Modified from MatCreateSeqAIJWithArrays() */ 28e4e71118SStefano Zampini PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt i[],PetscInt j[],PetscScalar a[],MatType mtype,Mat mat) 29df97dc6dSFande Kong { 30df97dc6dSFande Kong PetscErrorCode ierr; 314222ddf1SHong Zhang PetscInt ii; 324222ddf1SHong Zhang Mat_SeqAIJ *aij; 33*cbc6b225SStefano Zampini PetscBool isseqaij, osingle, ofree_a, ofree_ij; 345c66b693SKris Buschelman 355c66b693SKris Buschelman PetscFunctionBegin; 362c71b3e2SJacob Faibussowitsch PetscCheckFalse(m > 0 && i[0],PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"i (row indices) must start with 0"); 374222ddf1SHong Zhang ierr = MatSetSizes(mat,m,n,m,n);CHKERRQ(ierr); 384222ddf1SHong Zhang 39e4e71118SStefano Zampini if (!mtype) { 40e4e71118SStefano Zampini ierr = PetscObjectBaseTypeCompare((PetscObject)mat,MATSEQAIJ,&isseqaij);CHKERRQ(ierr); 41e4e71118SStefano Zampini if (!isseqaij) { ierr = MatSetType(mat,MATSEQAIJ);CHKERRQ(ierr); } 42e4e71118SStefano Zampini } else { 43e4e71118SStefano Zampini ierr = MatSetType(mat,mtype);CHKERRQ(ierr); 44e4e71118SStefano Zampini } 45*cbc6b225SStefano Zampini 464222ddf1SHong Zhang aij = (Mat_SeqAIJ*)(mat)->data; 47*cbc6b225SStefano Zampini osingle = aij->singlemalloc; 48*cbc6b225SStefano Zampini ofree_a = aij->free_a; 49*cbc6b225SStefano Zampini ofree_ij = aij->free_ij; 50*cbc6b225SStefano Zampini /* changes the free flags */ 51*cbc6b225SStefano Zampini ierr = MatSeqAIJSetPreallocation_SeqAIJ(mat,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 52*cbc6b225SStefano Zampini 53*cbc6b225SStefano Zampini ierr = PetscFree(aij->ilen);CHKERRQ(ierr); 54*cbc6b225SStefano Zampini ierr = PetscFree(aij->imax);CHKERRQ(ierr); 554222ddf1SHong Zhang ierr = PetscMalloc1(m,&aij->imax);CHKERRQ(ierr); 564222ddf1SHong Zhang ierr = PetscMalloc1(m,&aij->ilen);CHKERRQ(ierr); 57*cbc6b225SStefano Zampini for (ii=0,aij->nonzerorowcnt=0,aij->rmax = 0; ii<m; ii++) { 58*cbc6b225SStefano Zampini const PetscInt rnz = i[ii+1] - i[ii]; 59*cbc6b225SStefano Zampini aij->nonzerorowcnt += !!rnz; 60*cbc6b225SStefano Zampini aij->rmax = PetscMax(aij->rmax,rnz); 61*cbc6b225SStefano Zampini aij->ilen[ii] = aij->imax[ii] = i[ii+1] - i[ii]; 62*cbc6b225SStefano Zampini } 63*cbc6b225SStefano Zampini aij->maxnz = i[m]; 64*cbc6b225SStefano Zampini aij->nz = i[m]; 654222ddf1SHong Zhang 66*cbc6b225SStefano Zampini if (osingle) { 67*cbc6b225SStefano Zampini ierr = PetscFree3(aij->a,aij->j,aij->i);CHKERRQ(ierr); 68*cbc6b225SStefano Zampini } else { 69*cbc6b225SStefano Zampini if (ofree_a) { ierr = PetscFree(aij->a);CHKERRQ(ierr); } 70*cbc6b225SStefano Zampini if (ofree_ij) { ierr = PetscFree(aij->j);CHKERRQ(ierr); } 71*cbc6b225SStefano Zampini if (ofree_ij) { ierr = PetscFree(aij->i);CHKERRQ(ierr); } 72*cbc6b225SStefano Zampini } 734222ddf1SHong Zhang aij->i = i; 744222ddf1SHong Zhang aij->j = j; 754222ddf1SHong Zhang aij->a = a; 764222ddf1SHong Zhang aij->nonew = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */ 77*cbc6b225SStefano Zampini /* default to not retain ownership */ 78*cbc6b225SStefano Zampini aij->singlemalloc = PETSC_FALSE; 794222ddf1SHong Zhang aij->free_a = PETSC_FALSE; 804222ddf1SHong Zhang aij->free_ij = PETSC_FALSE; 81*cbc6b225SStefano Zampini ierr = MatCheckCompressedRow(mat,aij->nonzerorowcnt,&aij->compressedrow,aij->i,m,0.6);CHKERRQ(ierr); 825c66b693SKris Buschelman PetscFunctionReturn(0); 835c66b693SKris Buschelman } 841c24bd37SHong Zhang 854222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C) 864222ddf1SHong Zhang { 874222ddf1SHong Zhang PetscErrorCode ierr; 884222ddf1SHong Zhang Mat_Product *product = C->product; 894222ddf1SHong Zhang MatProductAlgorithm alg; 904222ddf1SHong Zhang PetscBool flg; 914222ddf1SHong Zhang 924222ddf1SHong Zhang PetscFunctionBegin; 934222ddf1SHong Zhang if (product) { 944222ddf1SHong Zhang alg = product->alg; 954222ddf1SHong Zhang } else { 964222ddf1SHong Zhang alg = "sorted"; 974222ddf1SHong Zhang } 984222ddf1SHong Zhang /* sorted */ 994222ddf1SHong Zhang ierr = PetscStrcmp(alg,"sorted",&flg);CHKERRQ(ierr); 1004222ddf1SHong Zhang if (flg) { 1014222ddf1SHong Zhang ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A,B,fill,C);CHKERRQ(ierr); 1024222ddf1SHong Zhang PetscFunctionReturn(0); 1034222ddf1SHong Zhang } 1044222ddf1SHong Zhang 1054222ddf1SHong Zhang /* scalable */ 1064222ddf1SHong Zhang ierr = PetscStrcmp(alg,"scalable",&flg);CHKERRQ(ierr); 1074222ddf1SHong Zhang if (flg) { 1084222ddf1SHong Zhang ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A,B,fill,C);CHKERRQ(ierr); 1094222ddf1SHong Zhang PetscFunctionReturn(0); 1104222ddf1SHong Zhang } 1114222ddf1SHong Zhang 1124222ddf1SHong Zhang /* scalable_fast */ 1134222ddf1SHong Zhang ierr = PetscStrcmp(alg,"scalable_fast",&flg);CHKERRQ(ierr); 1144222ddf1SHong Zhang if (flg) { 1154222ddf1SHong Zhang ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A,B,fill,C);CHKERRQ(ierr); 1164222ddf1SHong Zhang PetscFunctionReturn(0); 1174222ddf1SHong Zhang } 1184222ddf1SHong Zhang 1194222ddf1SHong Zhang /* heap */ 1204222ddf1SHong Zhang ierr = PetscStrcmp(alg,"heap",&flg);CHKERRQ(ierr); 1214222ddf1SHong Zhang if (flg) { 1224222ddf1SHong Zhang ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A,B,fill,C);CHKERRQ(ierr); 1234222ddf1SHong Zhang PetscFunctionReturn(0); 1244222ddf1SHong Zhang } 1254222ddf1SHong Zhang 1264222ddf1SHong Zhang /* btheap */ 1274222ddf1SHong Zhang ierr = PetscStrcmp(alg,"btheap",&flg);CHKERRQ(ierr); 1284222ddf1SHong Zhang if (flg) { 1294222ddf1SHong Zhang ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A,B,fill,C);CHKERRQ(ierr); 1304222ddf1SHong Zhang PetscFunctionReturn(0); 1314222ddf1SHong Zhang } 1324222ddf1SHong Zhang 1334222ddf1SHong Zhang /* llcondensed */ 1344222ddf1SHong Zhang ierr = PetscStrcmp(alg,"llcondensed",&flg);CHKERRQ(ierr); 1354222ddf1SHong Zhang if (flg) { 1364222ddf1SHong Zhang ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A,B,fill,C);CHKERRQ(ierr); 1374222ddf1SHong Zhang PetscFunctionReturn(0); 1384222ddf1SHong Zhang } 1394222ddf1SHong Zhang 1404222ddf1SHong Zhang /* rowmerge */ 1414222ddf1SHong Zhang ierr = PetscStrcmp(alg,"rowmerge",&flg);CHKERRQ(ierr); 1424222ddf1SHong Zhang if (flg) { 1434222ddf1SHong Zhang ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A,B,fill,C);CHKERRQ(ierr); 1444222ddf1SHong Zhang PetscFunctionReturn(0); 1454222ddf1SHong Zhang } 1464222ddf1SHong Zhang 1474222ddf1SHong Zhang #if defined(PETSC_HAVE_HYPRE) 1484222ddf1SHong Zhang ierr = PetscStrcmp(alg,"hypre",&flg);CHKERRQ(ierr); 1494222ddf1SHong Zhang if (flg) { 1504222ddf1SHong Zhang ierr = MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A,B,fill,C);CHKERRQ(ierr); 1514222ddf1SHong Zhang PetscFunctionReturn(0); 1524222ddf1SHong Zhang } 1534222ddf1SHong Zhang #endif 1544222ddf1SHong Zhang 1554222ddf1SHong Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported"); 1564222ddf1SHong Zhang } 1574222ddf1SHong Zhang 1584222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A,Mat B,PetscReal fill,Mat C) 159b561aa9dSHong Zhang { 160b561aa9dSHong Zhang PetscErrorCode ierr; 161b561aa9dSHong Zhang Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 162971236abSHong Zhang PetscInt *ai=a->i,*bi=b->i,*ci,*cj; 163c1ba5575SJed Brown PetscInt am =A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 164b561aa9dSHong Zhang PetscReal afill; 165eca6b86aSHong Zhang PetscInt i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax; 166eca6b86aSHong Zhang PetscTable ta; 167fb908031SHong Zhang PetscBT lnkbt; 1680298fd71SBarry Smith PetscFreeSpaceList free_space=NULL,current_space=NULL; 169b561aa9dSHong Zhang 170b561aa9dSHong Zhang PetscFunctionBegin; 171bd958071SHong Zhang /* Get ci and cj */ 172bd958071SHong Zhang /*---------------*/ 173fb908031SHong Zhang /* Allocate ci array, arrays for fill computation and */ 174fb908031SHong Zhang /* free space for accumulating nonzero column info */ 175854ce69bSBarry Smith ierr = PetscMalloc1(am+2,&ci);CHKERRQ(ierr); 176fb908031SHong Zhang ci[0] = 0; 177fb908031SHong Zhang 178fb908031SHong Zhang /* create and initialize a linked list */ 179c373ccc6SHong Zhang ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr); 180c373ccc6SHong Zhang MatRowMergeMax_SeqAIJ(b,bm,ta); 181eca6b86aSHong Zhang ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); 182eca6b86aSHong Zhang ierr = PetscTableDestroy(&ta);CHKERRQ(ierr); 183eca6b86aSHong Zhang 184eca6b86aSHong Zhang ierr = PetscLLCondensedCreate(Crmax,bn,&lnk,&lnkbt);CHKERRQ(ierr); 185fb908031SHong Zhang 186fb908031SHong Zhang /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 187f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr); 1882205254eSKarl Rupp 189fb908031SHong Zhang current_space = free_space; 190fb908031SHong Zhang 191fb908031SHong Zhang /* Determine ci and cj */ 192fb908031SHong Zhang for (i=0; i<am; i++) { 193fb908031SHong Zhang anzi = ai[i+1] - ai[i]; 194fb908031SHong Zhang aj = a->j + ai[i]; 195fb908031SHong Zhang for (j=0; j<anzi; j++) { 196fb908031SHong Zhang brow = aj[j]; 197fb908031SHong Zhang bnzj = bi[brow+1] - bi[brow]; 198fb908031SHong Zhang bj = b->j + bi[brow]; 199fb908031SHong Zhang /* add non-zero cols of B into the sorted linked list lnk */ 200fb908031SHong Zhang ierr = PetscLLCondensedAddSorted(bnzj,bj,lnk,lnkbt);CHKERRQ(ierr); 201fb908031SHong Zhang } 2028c78258cSHong Zhang /* add possible missing diagonal entry */ 2038c78258cSHong Zhang if (C->force_diagonals) { 2048c78258cSHong Zhang ierr = PetscLLCondensedAddSorted(1,&i,lnk,lnkbt);CHKERRQ(ierr); 2058c78258cSHong Zhang } 206fb908031SHong Zhang cnzi = lnk[0]; 207fb908031SHong Zhang 208fb908031SHong Zhang /* If free space is not available, make more free space */ 209fb908031SHong Zhang /* Double the amount of total space in the list */ 210fb908031SHong Zhang if (current_space->local_remaining<cnzi) { 211f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),¤t_space);CHKERRQ(ierr); 212fb908031SHong Zhang ndouble++; 213fb908031SHong Zhang } 214fb908031SHong Zhang 215fb908031SHong Zhang /* Copy data into free space, then initialize lnk */ 216fb908031SHong Zhang ierr = PetscLLCondensedClean(bn,cnzi,current_space->array,lnk,lnkbt);CHKERRQ(ierr); 2172205254eSKarl Rupp 218fb908031SHong Zhang current_space->array += cnzi; 219fb908031SHong Zhang current_space->local_used += cnzi; 220fb908031SHong Zhang current_space->local_remaining -= cnzi; 2212205254eSKarl Rupp 222fb908031SHong Zhang ci[i+1] = ci[i] + cnzi; 223fb908031SHong Zhang } 224fb908031SHong Zhang 225fb908031SHong Zhang /* Column indices are in the list of free space */ 226fb908031SHong Zhang /* Allocate space for cj, initialize cj, and */ 227fb908031SHong Zhang /* destroy list of free space and other temporary array(s) */ 228854ce69bSBarry Smith ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr); 229fb908031SHong Zhang ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr); 230fb908031SHong Zhang ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr); 231b561aa9dSHong Zhang 23226be0446SHong Zhang /* put together the new symbolic matrix */ 233e4e71118SStefano Zampini ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr); 2344222ddf1SHong Zhang ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr); 23558c24d83SHong Zhang 23658c24d83SHong Zhang /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 23758c24d83SHong Zhang /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 2384222ddf1SHong Zhang c = (Mat_SeqAIJ*)(C->data); 239aa1aec99SHong Zhang c->free_a = PETSC_FALSE; 240e6b907acSBarry Smith c->free_ij = PETSC_TRUE; 24158c24d83SHong Zhang c->nonew = 0; 2424222ddf1SHong Zhang 2434222ddf1SHong Zhang /* fast, needs non-scalable O(bn) array 'abdense' */ 2444222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 2450b7e3e3dSHong Zhang 2467212da7cSHong Zhang /* set MatInfo */ 2477212da7cSHong Zhang afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 248f2b054eeSHong Zhang if (afill < 1.0) afill = 1.0; 2494222ddf1SHong Zhang C->info.mallocs = ndouble; 2504222ddf1SHong Zhang C->info.fill_ratio_given = fill; 2514222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 2527212da7cSHong Zhang 2537212da7cSHong Zhang #if defined(PETSC_USE_INFO) 2547212da7cSHong Zhang if (ci[am]) { 2557d3de750SJacob Faibussowitsch ierr = PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 2567d3de750SJacob Faibussowitsch ierr = PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 257f2b054eeSHong Zhang } else { 2584222ddf1SHong Zhang ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr); 259be0fcf8dSHong Zhang } 260f2b054eeSHong Zhang #endif 26158c24d83SHong Zhang PetscFunctionReturn(0); 26258c24d83SHong Zhang } 263d50806bdSBarry Smith 264df97dc6dSFande Kong PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,Mat C) 265d50806bdSBarry Smith { 266dfbe8321SBarry Smith PetscErrorCode ierr; 267d13dce4bSSatish Balay PetscLogDouble flops=0.0; 268d50806bdSBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 269d50806bdSBarry Smith Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 270d50806bdSBarry Smith Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 27138baddfdSBarry Smith PetscInt *ai =a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j; 272c58ca2e3SHong Zhang PetscInt am =A->rmap->n,cm=C->rmap->n; 273519eb980SHong Zhang PetscInt i,j,k,anzi,bnzi,cnzi,brow; 2742e5835c6SStefano Zampini PetscScalar *ca,valtmp; 275aa1aec99SHong Zhang PetscScalar *ab_dense; 2766718818eSStefano Zampini PetscContainer cab_dense; 2772e5835c6SStefano Zampini const PetscScalar *aa,*ba,*baj; 278d50806bdSBarry Smith 279d50806bdSBarry Smith PetscFunctionBegin; 2802e5835c6SStefano Zampini ierr = MatSeqAIJGetArrayRead(A,&aa);CHKERRQ(ierr); 2812e5835c6SStefano Zampini ierr = MatSeqAIJGetArrayRead(B,&ba);CHKERRQ(ierr); 282aa1aec99SHong Zhang if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */ 283854ce69bSBarry Smith ierr = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr); 284aa1aec99SHong Zhang c->a = ca; 285aa1aec99SHong Zhang c->free_a = PETSC_TRUE; 2866718818eSStefano Zampini } else ca = c->a; 2876718818eSStefano Zampini 2886718818eSStefano Zampini /* TODO this should be done in the symbolic phase */ 2896718818eSStefano Zampini /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers 2906718818eSStefano Zampini that is hard to eradicate) */ 2916718818eSStefano Zampini ierr = PetscObjectQuery((PetscObject)C,"__PETSc__ab_dense",(PetscObject*)&cab_dense);CHKERRQ(ierr); 2926718818eSStefano Zampini if (!cab_dense) { 2936718818eSStefano Zampini ierr = PetscMalloc1(B->cmap->N,&ab_dense);CHKERRQ(ierr); 2946718818eSStefano Zampini ierr = PetscContainerCreate(PETSC_COMM_SELF,&cab_dense);CHKERRQ(ierr); 2956718818eSStefano Zampini ierr = PetscContainerSetPointer(cab_dense,ab_dense);CHKERRQ(ierr); 2966718818eSStefano Zampini ierr = PetscContainerSetUserDestroy(cab_dense,PetscContainerUserDestroyDefault);CHKERRQ(ierr); 2976718818eSStefano Zampini ierr = PetscObjectCompose((PetscObject)C,"__PETSc__ab_dense",(PetscObject)cab_dense);CHKERRQ(ierr); 2986718818eSStefano Zampini ierr = PetscObjectDereference((PetscObject)cab_dense);CHKERRQ(ierr); 299d90d86d0SMatthew G. Knepley } 3006718818eSStefano Zampini ierr = PetscContainerGetPointer(cab_dense,(void**)&ab_dense);CHKERRQ(ierr); 3016718818eSStefano Zampini ierr = PetscArrayzero(ab_dense,B->cmap->N);CHKERRQ(ierr); 302aa1aec99SHong Zhang 303c124e916SHong Zhang /* clean old values in C */ 304580bdb30SBarry Smith ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr); 305d50806bdSBarry Smith /* Traverse A row-wise. */ 306d50806bdSBarry Smith /* Build the ith row in C by summing over nonzero columns in A, */ 307d50806bdSBarry Smith /* the rows of B corresponding to nonzeros of A. */ 308d50806bdSBarry Smith for (i=0; i<am; i++) { 309d50806bdSBarry Smith anzi = ai[i+1] - ai[i]; 310d50806bdSBarry Smith for (j=0; j<anzi; j++) { 311519eb980SHong Zhang brow = aj[j]; 312d50806bdSBarry Smith bnzi = bi[brow+1] - bi[brow]; 313d50806bdSBarry Smith bjj = bj + bi[brow]; 314d50806bdSBarry Smith baj = ba + bi[brow]; 315519eb980SHong Zhang /* perform dense axpy */ 31636ec6d2dSHong Zhang valtmp = aa[j]; 317519eb980SHong Zhang for (k=0; k<bnzi; k++) { 31836ec6d2dSHong Zhang ab_dense[bjj[k]] += valtmp*baj[k]; 319519eb980SHong Zhang } 320519eb980SHong Zhang flops += 2*bnzi; 321519eb980SHong Zhang } 322c58ca2e3SHong Zhang aj += anzi; aa += anzi; 323c58ca2e3SHong Zhang 324c58ca2e3SHong Zhang cnzi = ci[i+1] - ci[i]; 325519eb980SHong Zhang for (k=0; k<cnzi; k++) { 326519eb980SHong Zhang ca[k] += ab_dense[cj[k]]; 327519eb980SHong Zhang ab_dense[cj[k]] = 0.0; /* zero ab_dense */ 328519eb980SHong Zhang } 329519eb980SHong Zhang flops += cnzi; 330519eb980SHong Zhang cj += cnzi; ca += cnzi; 331519eb980SHong Zhang } 3322e5835c6SStefano Zampini #if defined(PETSC_HAVE_DEVICE) 3332e5835c6SStefano Zampini if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU; 3342e5835c6SStefano Zampini #endif 335c58ca2e3SHong Zhang ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 336c58ca2e3SHong Zhang ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 337c58ca2e3SHong Zhang ierr = PetscLogFlops(flops);CHKERRQ(ierr); 3382e5835c6SStefano Zampini ierr = MatSeqAIJRestoreArrayRead(A,&aa);CHKERRQ(ierr); 3392e5835c6SStefano Zampini ierr = MatSeqAIJRestoreArrayRead(B,&ba);CHKERRQ(ierr); 340c58ca2e3SHong Zhang PetscFunctionReturn(0); 341c58ca2e3SHong Zhang } 342c58ca2e3SHong Zhang 34325023028SHong Zhang PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C) 344c58ca2e3SHong Zhang { 345c58ca2e3SHong Zhang PetscErrorCode ierr; 346c58ca2e3SHong Zhang PetscLogDouble flops=0.0; 347c58ca2e3SHong Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 348c58ca2e3SHong Zhang Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 349c58ca2e3SHong Zhang Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 350c58ca2e3SHong Zhang PetscInt *ai = a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j; 351c58ca2e3SHong Zhang PetscInt am = A->rmap->N,cm=C->rmap->N; 352c58ca2e3SHong Zhang PetscInt i,j,k,anzi,bnzi,cnzi,brow; 3532e5835c6SStefano Zampini PetscScalar *ca=c->a,valtmp; 3542e5835c6SStefano Zampini const PetscScalar *aa,*ba,*baj; 355c58ca2e3SHong Zhang PetscInt nextb; 356c58ca2e3SHong Zhang 357c58ca2e3SHong Zhang PetscFunctionBegin; 3582e5835c6SStefano Zampini ierr = MatSeqAIJGetArrayRead(A,&aa);CHKERRQ(ierr); 3592e5835c6SStefano Zampini ierr = MatSeqAIJGetArrayRead(B,&ba);CHKERRQ(ierr); 360cf742fccSHong Zhang if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */ 361cf742fccSHong Zhang ierr = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr); 362cf742fccSHong Zhang c->a = ca; 363cf742fccSHong Zhang c->free_a = PETSC_TRUE; 364cf742fccSHong Zhang } 365cf742fccSHong Zhang 366c58ca2e3SHong Zhang /* clean old values in C */ 367580bdb30SBarry Smith ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr); 368c58ca2e3SHong Zhang /* Traverse A row-wise. */ 369c58ca2e3SHong Zhang /* Build the ith row in C by summing over nonzero columns in A, */ 370c58ca2e3SHong Zhang /* the rows of B corresponding to nonzeros of A. */ 371519eb980SHong Zhang for (i=0; i<am; i++) { 372519eb980SHong Zhang anzi = ai[i+1] - ai[i]; 373519eb980SHong Zhang cnzi = ci[i+1] - ci[i]; 374519eb980SHong Zhang for (j=0; j<anzi; j++) { 375519eb980SHong Zhang brow = aj[j]; 376519eb980SHong Zhang bnzi = bi[brow+1] - bi[brow]; 377519eb980SHong Zhang bjj = bj + bi[brow]; 378519eb980SHong Zhang baj = ba + bi[brow]; 379519eb980SHong Zhang /* perform sparse axpy */ 38036ec6d2dSHong Zhang valtmp = aa[j]; 381c124e916SHong Zhang nextb = 0; 382c124e916SHong Zhang for (k=0; nextb<bnzi; k++) { 383c124e916SHong Zhang if (cj[k] == bjj[nextb]) { /* ccol == bcol */ 38436ec6d2dSHong Zhang ca[k] += valtmp*baj[nextb++]; 385c124e916SHong Zhang } 386d50806bdSBarry Smith } 387d50806bdSBarry Smith flops += 2*bnzi; 388d50806bdSBarry Smith } 389519eb980SHong Zhang aj += anzi; aa += anzi; 390519eb980SHong Zhang cj += cnzi; ca += cnzi; 391d50806bdSBarry Smith } 3922e5835c6SStefano Zampini #if defined(PETSC_HAVE_DEVICE) 3932e5835c6SStefano Zampini if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU; 3942e5835c6SStefano Zampini #endif 395716bacf3SKris Buschelman ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 396716bacf3SKris Buschelman ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 397d50806bdSBarry Smith ierr = PetscLogFlops(flops);CHKERRQ(ierr); 3982e5835c6SStefano Zampini ierr = MatSeqAIJRestoreArrayRead(A,&aa);CHKERRQ(ierr); 3992e5835c6SStefano Zampini ierr = MatSeqAIJRestoreArrayRead(B,&ba);CHKERRQ(ierr); 400d50806bdSBarry Smith PetscFunctionReturn(0); 401d50806bdSBarry Smith } 402bc011b1eSHong Zhang 4034222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A,Mat B,PetscReal fill,Mat C) 40425296bd5SBarry Smith { 40525296bd5SBarry Smith PetscErrorCode ierr; 40625296bd5SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 40725296bd5SBarry Smith PetscInt *ai = a->i,*bi=b->i,*ci,*cj; 4083c50cad2SHong Zhang PetscInt am = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 40925296bd5SBarry Smith MatScalar *ca; 41025296bd5SBarry Smith PetscReal afill; 411eca6b86aSHong Zhang PetscInt i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax; 412eca6b86aSHong Zhang PetscTable ta; 4130298fd71SBarry Smith PetscFreeSpaceList free_space=NULL,current_space=NULL; 41425296bd5SBarry Smith 41525296bd5SBarry Smith PetscFunctionBegin; 4163c50cad2SHong Zhang /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */ 4173c50cad2SHong Zhang /*-----------------------------------------------------------------------------------------*/ 4183c50cad2SHong Zhang /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 419854ce69bSBarry Smith ierr = PetscMalloc1(am+2,&ci);CHKERRQ(ierr); 4203c50cad2SHong Zhang ci[0] = 0; 4213c50cad2SHong Zhang 4223c50cad2SHong Zhang /* create and initialize a linked list */ 423c373ccc6SHong Zhang ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr); 424c373ccc6SHong Zhang MatRowMergeMax_SeqAIJ(b,bm,ta); 425eca6b86aSHong Zhang ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); 426eca6b86aSHong Zhang ierr = PetscTableDestroy(&ta);CHKERRQ(ierr); 427eca6b86aSHong Zhang 428eca6b86aSHong Zhang ierr = PetscLLCondensedCreate_fast(Crmax,&lnk);CHKERRQ(ierr); 4293c50cad2SHong Zhang 4303c50cad2SHong Zhang /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 431f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr); 4323c50cad2SHong Zhang current_space = free_space; 4333c50cad2SHong Zhang 4343c50cad2SHong Zhang /* Determine ci and cj */ 4353c50cad2SHong Zhang for (i=0; i<am; i++) { 4363c50cad2SHong Zhang anzi = ai[i+1] - ai[i]; 4373c50cad2SHong Zhang aj = a->j + ai[i]; 4383c50cad2SHong Zhang for (j=0; j<anzi; j++) { 4393c50cad2SHong Zhang brow = aj[j]; 4403c50cad2SHong Zhang bnzj = bi[brow+1] - bi[brow]; 4413c50cad2SHong Zhang bj = b->j + bi[brow]; 4423c50cad2SHong Zhang /* add non-zero cols of B into the sorted linked list lnk */ 4433c50cad2SHong Zhang ierr = PetscLLCondensedAddSorted_fast(bnzj,bj,lnk);CHKERRQ(ierr); 4443c50cad2SHong Zhang } 4458c78258cSHong Zhang /* add possible missing diagonal entry */ 4468c78258cSHong Zhang if (C->force_diagonals) { 4478c78258cSHong Zhang ierr = PetscLLCondensedAddSorted_fast(1,&i,lnk);CHKERRQ(ierr); 4488c78258cSHong Zhang } 4493c50cad2SHong Zhang cnzi = lnk[1]; 4503c50cad2SHong Zhang 4513c50cad2SHong Zhang /* If free space is not available, make more free space */ 4523c50cad2SHong Zhang /* Double the amount of total space in the list */ 4533c50cad2SHong Zhang if (current_space->local_remaining<cnzi) { 454f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),¤t_space);CHKERRQ(ierr); 4553c50cad2SHong Zhang ndouble++; 4563c50cad2SHong Zhang } 4573c50cad2SHong Zhang 4583c50cad2SHong Zhang /* Copy data into free space, then initialize lnk */ 4593c50cad2SHong Zhang ierr = PetscLLCondensedClean_fast(cnzi,current_space->array,lnk);CHKERRQ(ierr); 4602205254eSKarl Rupp 4613c50cad2SHong Zhang current_space->array += cnzi; 4623c50cad2SHong Zhang current_space->local_used += cnzi; 4633c50cad2SHong Zhang current_space->local_remaining -= cnzi; 4642205254eSKarl Rupp 4653c50cad2SHong Zhang ci[i+1] = ci[i] + cnzi; 4663c50cad2SHong Zhang } 4673c50cad2SHong Zhang 4683c50cad2SHong Zhang /* Column indices are in the list of free space */ 4693c50cad2SHong Zhang /* Allocate space for cj, initialize cj, and */ 4703c50cad2SHong Zhang /* destroy list of free space and other temporary array(s) */ 471854ce69bSBarry Smith ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr); 4723c50cad2SHong Zhang ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr); 4733c50cad2SHong Zhang ierr = PetscLLCondensedDestroy_fast(lnk);CHKERRQ(ierr); 47425296bd5SBarry Smith 47525296bd5SBarry Smith /* Allocate space for ca */ 476580bdb30SBarry Smith ierr = PetscCalloc1(ci[am]+1,&ca);CHKERRQ(ierr); 47725296bd5SBarry Smith 47825296bd5SBarry Smith /* put together the new symbolic matrix */ 479e4e71118SStefano Zampini ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C);CHKERRQ(ierr); 4804222ddf1SHong Zhang ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr); 48125296bd5SBarry Smith 48225296bd5SBarry Smith /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 48325296bd5SBarry Smith /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 4844222ddf1SHong Zhang c = (Mat_SeqAIJ*)(C->data); 48525296bd5SBarry Smith c->free_a = PETSC_TRUE; 48625296bd5SBarry Smith c->free_ij = PETSC_TRUE; 48725296bd5SBarry Smith c->nonew = 0; 4882205254eSKarl Rupp 4894222ddf1SHong Zhang /* slower, less memory */ 4904222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; 49125296bd5SBarry Smith 49225296bd5SBarry Smith /* set MatInfo */ 49325296bd5SBarry Smith afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 49425296bd5SBarry Smith if (afill < 1.0) afill = 1.0; 4954222ddf1SHong Zhang C->info.mallocs = ndouble; 4964222ddf1SHong Zhang C->info.fill_ratio_given = fill; 4974222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 49825296bd5SBarry Smith 49925296bd5SBarry Smith #if defined(PETSC_USE_INFO) 50025296bd5SBarry Smith if (ci[am]) { 5017d3de750SJacob Faibussowitsch ierr = PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 5027d3de750SJacob Faibussowitsch ierr = PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 50325296bd5SBarry Smith } else { 5044222ddf1SHong Zhang ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr); 50525296bd5SBarry Smith } 50625296bd5SBarry Smith #endif 50725296bd5SBarry Smith PetscFunctionReturn(0); 50825296bd5SBarry Smith } 50925296bd5SBarry Smith 5104222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat C) 511e9e4536cSHong Zhang { 512e9e4536cSHong Zhang PetscErrorCode ierr; 513e9e4536cSHong Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 514bf9555e6SHong Zhang PetscInt *ai = a->i,*bi=b->i,*ci,*cj; 51525c41797SHong Zhang PetscInt am = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 516e9e4536cSHong Zhang MatScalar *ca; 517e9e4536cSHong Zhang PetscReal afill; 518eca6b86aSHong Zhang PetscInt i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax; 519eca6b86aSHong Zhang PetscTable ta; 5200298fd71SBarry Smith PetscFreeSpaceList free_space=NULL,current_space=NULL; 521e9e4536cSHong Zhang 522e9e4536cSHong Zhang PetscFunctionBegin; 523bd958071SHong Zhang /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */ 524bd958071SHong Zhang /*---------------------------------------------------------------------------------------------*/ 525bd958071SHong Zhang /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 526854ce69bSBarry Smith ierr = PetscMalloc1(am+2,&ci);CHKERRQ(ierr); 527bd958071SHong Zhang ci[0] = 0; 528bd958071SHong Zhang 529bd958071SHong Zhang /* create and initialize a linked list */ 530c373ccc6SHong Zhang ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr); 531c373ccc6SHong Zhang MatRowMergeMax_SeqAIJ(b,bm,ta); 532eca6b86aSHong Zhang ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); 533eca6b86aSHong Zhang ierr = PetscTableDestroy(&ta);CHKERRQ(ierr); 534eca6b86aSHong Zhang ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr); 535bd958071SHong Zhang 536bd958071SHong Zhang /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 537f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr); 538bd958071SHong Zhang current_space = free_space; 539bd958071SHong Zhang 540bd958071SHong Zhang /* Determine ci and cj */ 541bd958071SHong Zhang for (i=0; i<am; i++) { 542bd958071SHong Zhang anzi = ai[i+1] - ai[i]; 543bd958071SHong Zhang aj = a->j + ai[i]; 544bd958071SHong Zhang for (j=0; j<anzi; j++) { 545bd958071SHong Zhang brow = aj[j]; 546bd958071SHong Zhang bnzj = bi[brow+1] - bi[brow]; 547bd958071SHong Zhang bj = b->j + bi[brow]; 548bd958071SHong Zhang /* add non-zero cols of B into the sorted linked list lnk */ 549bd958071SHong Zhang ierr = PetscLLCondensedAddSorted_Scalable(bnzj,bj,lnk);CHKERRQ(ierr); 550bd958071SHong Zhang } 5518c78258cSHong Zhang /* add possible missing diagonal entry */ 5528c78258cSHong Zhang if (C->force_diagonals) { 5538c78258cSHong Zhang ierr = PetscLLCondensedAddSorted_Scalable(1,&i,lnk);CHKERRQ(ierr); 5548c78258cSHong Zhang } 5558c78258cSHong Zhang 556bd958071SHong Zhang cnzi = lnk[0]; 557bd958071SHong Zhang 558bd958071SHong Zhang /* If free space is not available, make more free space */ 559bd958071SHong Zhang /* Double the amount of total space in the list */ 560bd958071SHong Zhang if (current_space->local_remaining<cnzi) { 561f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),¤t_space);CHKERRQ(ierr); 562bd958071SHong Zhang ndouble++; 563bd958071SHong Zhang } 564bd958071SHong Zhang 565bd958071SHong Zhang /* Copy data into free space, then initialize lnk */ 566bd958071SHong Zhang ierr = PetscLLCondensedClean_Scalable(cnzi,current_space->array,lnk);CHKERRQ(ierr); 5672205254eSKarl Rupp 568bd958071SHong Zhang current_space->array += cnzi; 569bd958071SHong Zhang current_space->local_used += cnzi; 570bd958071SHong Zhang current_space->local_remaining -= cnzi; 5712205254eSKarl Rupp 572bd958071SHong Zhang ci[i+1] = ci[i] + cnzi; 573bd958071SHong Zhang } 574bd958071SHong Zhang 575bd958071SHong Zhang /* Column indices are in the list of free space */ 576bd958071SHong Zhang /* Allocate space for cj, initialize cj, and */ 577bd958071SHong Zhang /* destroy list of free space and other temporary array(s) */ 578854ce69bSBarry Smith ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr); 579bd958071SHong Zhang ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr); 580bd958071SHong Zhang ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr); 581e9e4536cSHong Zhang 582e9e4536cSHong Zhang /* Allocate space for ca */ 583bd958071SHong Zhang /*-----------------------*/ 584580bdb30SBarry Smith ierr = PetscCalloc1(ci[am]+1,&ca);CHKERRQ(ierr); 585e9e4536cSHong Zhang 586e9e4536cSHong Zhang /* put together the new symbolic matrix */ 587e4e71118SStefano Zampini ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C);CHKERRQ(ierr); 5884222ddf1SHong Zhang ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr); 589e9e4536cSHong Zhang 590e9e4536cSHong Zhang /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 591e9e4536cSHong Zhang /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 5924222ddf1SHong Zhang c = (Mat_SeqAIJ*)(C->data); 593e9e4536cSHong Zhang c->free_a = PETSC_TRUE; 594e9e4536cSHong Zhang c->free_ij = PETSC_TRUE; 595e9e4536cSHong Zhang c->nonew = 0; 5962205254eSKarl Rupp 5974222ddf1SHong Zhang /* slower, less memory */ 5984222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; 599e9e4536cSHong Zhang 600e9e4536cSHong Zhang /* set MatInfo */ 601e9e4536cSHong Zhang afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 602e9e4536cSHong Zhang if (afill < 1.0) afill = 1.0; 6034222ddf1SHong Zhang C->info.mallocs = ndouble; 6044222ddf1SHong Zhang C->info.fill_ratio_given = fill; 6054222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 606e9e4536cSHong Zhang 607e9e4536cSHong Zhang #if defined(PETSC_USE_INFO) 608e9e4536cSHong Zhang if (ci[am]) { 6097d3de750SJacob Faibussowitsch ierr = PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 6107d3de750SJacob Faibussowitsch ierr = PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 611e9e4536cSHong Zhang } else { 6124222ddf1SHong Zhang ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr); 613e9e4536cSHong Zhang } 614e9e4536cSHong Zhang #endif 615e9e4536cSHong Zhang PetscFunctionReturn(0); 616e9e4536cSHong Zhang } 617e9e4536cSHong Zhang 6184222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A,Mat B,PetscReal fill,Mat C) 6190ced3a2bSJed Brown { 6200ced3a2bSJed Brown PetscErrorCode ierr; 6210ced3a2bSJed Brown Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 6220ced3a2bSJed Brown const PetscInt *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j; 6230ced3a2bSJed Brown PetscInt *ci,*cj,*bb; 6240ced3a2bSJed Brown PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 6250ced3a2bSJed Brown PetscReal afill; 6260ced3a2bSJed Brown PetscInt i,j,col,ndouble = 0; 6270298fd71SBarry Smith PetscFreeSpaceList free_space=NULL,current_space=NULL; 6280ced3a2bSJed Brown PetscHeap h; 6290ced3a2bSJed Brown 6300ced3a2bSJed Brown PetscFunctionBegin; 631cd093f1dSJed Brown /* Get ci and cj - by merging sorted rows using a heap */ 6320ced3a2bSJed Brown /*---------------------------------------------------------------------------------------------*/ 6330ced3a2bSJed Brown /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 634854ce69bSBarry Smith ierr = PetscMalloc1(am+2,&ci);CHKERRQ(ierr); 6350ced3a2bSJed Brown ci[0] = 0; 6360ced3a2bSJed Brown 6370ced3a2bSJed Brown /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 638f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr); 6390ced3a2bSJed Brown current_space = free_space; 6400ced3a2bSJed Brown 6410ced3a2bSJed Brown ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr); 642785e854fSJed Brown ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr); 6430ced3a2bSJed Brown 6440ced3a2bSJed Brown /* Determine ci and cj */ 6450ced3a2bSJed Brown for (i=0; i<am; i++) { 6460ced3a2bSJed Brown const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 6470ced3a2bSJed Brown const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 6480ced3a2bSJed Brown ci[i+1] = ci[i]; 6490ced3a2bSJed Brown /* Populate the min heap */ 6500ced3a2bSJed Brown for (j=0; j<anzi; j++) { 6510ced3a2bSJed Brown bb[j] = bi[acol[j]]; /* bb points at the start of the row */ 6520ced3a2bSJed Brown if (bb[j] < bi[acol[j]+1]) { /* Add if row is nonempty */ 6530ced3a2bSJed Brown ierr = PetscHeapAdd(h,j,bj[bb[j]++]);CHKERRQ(ierr); 6540ced3a2bSJed Brown } 6550ced3a2bSJed Brown } 6560ced3a2bSJed Brown /* Pick off the min element, adding it to free space */ 6570ced3a2bSJed Brown ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr); 6580ced3a2bSJed Brown while (j >= 0) { 6590ced3a2bSJed Brown if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */ 660f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),¤t_space);CHKERRQ(ierr); 6610ced3a2bSJed Brown ndouble++; 6620ced3a2bSJed Brown } 6630ced3a2bSJed Brown *(current_space->array++) = col; 6640ced3a2bSJed Brown current_space->local_used++; 6650ced3a2bSJed Brown current_space->local_remaining--; 6660ced3a2bSJed Brown ci[i+1]++; 6670ced3a2bSJed Brown 6680ced3a2bSJed Brown /* stash if anything else remains in this row of B */ 6690ced3a2bSJed Brown if (bb[j] < bi[acol[j]+1]) {ierr = PetscHeapStash(h,j,bj[bb[j]++]);CHKERRQ(ierr);} 6700ced3a2bSJed Brown while (1) { /* pop and stash any other rows of B that also had an entry in this column */ 6710ced3a2bSJed Brown PetscInt j2,col2; 6720ced3a2bSJed Brown ierr = PetscHeapPeek(h,&j2,&col2);CHKERRQ(ierr); 6730ced3a2bSJed Brown if (col2 != col) break; 6740ced3a2bSJed Brown ierr = PetscHeapPop(h,&j2,&col2);CHKERRQ(ierr); 6750ced3a2bSJed Brown if (bb[j2] < bi[acol[j2]+1]) {ierr = PetscHeapStash(h,j2,bj[bb[j2]++]);CHKERRQ(ierr);} 6760ced3a2bSJed Brown } 6770ced3a2bSJed Brown /* Put any stashed elements back into the min heap */ 6780ced3a2bSJed Brown ierr = PetscHeapUnstash(h);CHKERRQ(ierr); 6790ced3a2bSJed Brown ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr); 6800ced3a2bSJed Brown } 6810ced3a2bSJed Brown } 6820ced3a2bSJed Brown ierr = PetscFree(bb);CHKERRQ(ierr); 6830ced3a2bSJed Brown ierr = PetscHeapDestroy(&h);CHKERRQ(ierr); 6840ced3a2bSJed Brown 6850ced3a2bSJed Brown /* Column indices are in the list of free space */ 6860ced3a2bSJed Brown /* Allocate space for cj, initialize cj, and */ 6870ced3a2bSJed Brown /* destroy list of free space and other temporary array(s) */ 688785e854fSJed Brown ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr); 6890ced3a2bSJed Brown ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr); 6900ced3a2bSJed Brown 6910ced3a2bSJed Brown /* put together the new symbolic matrix */ 692e4e71118SStefano Zampini ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr); 6934222ddf1SHong Zhang ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr); 6940ced3a2bSJed Brown 6950ced3a2bSJed Brown /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 6960ced3a2bSJed Brown /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 6974222ddf1SHong Zhang c = (Mat_SeqAIJ*)(C->data); 6980ced3a2bSJed Brown c->free_a = PETSC_TRUE; 6990ced3a2bSJed Brown c->free_ij = PETSC_TRUE; 7000ced3a2bSJed Brown c->nonew = 0; 70126fbe8dcSKarl Rupp 7024222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 7030ced3a2bSJed Brown 7040ced3a2bSJed Brown /* set MatInfo */ 7050ced3a2bSJed Brown afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 7060ced3a2bSJed Brown if (afill < 1.0) afill = 1.0; 7074222ddf1SHong Zhang C->info.mallocs = ndouble; 7084222ddf1SHong Zhang C->info.fill_ratio_given = fill; 7094222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 7100ced3a2bSJed Brown 7110ced3a2bSJed Brown #if defined(PETSC_USE_INFO) 7120ced3a2bSJed Brown if (ci[am]) { 7137d3de750SJacob Faibussowitsch ierr = PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 7147d3de750SJacob Faibussowitsch ierr = PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 7150ced3a2bSJed Brown } else { 7164222ddf1SHong Zhang ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr); 7170ced3a2bSJed Brown } 7180ced3a2bSJed Brown #endif 7190ced3a2bSJed Brown PetscFunctionReturn(0); 7200ced3a2bSJed Brown } 721e9e4536cSHong Zhang 7224222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A,Mat B,PetscReal fill,Mat C) 7238a07c6f1SJed Brown { 7248a07c6f1SJed Brown PetscErrorCode ierr; 7258a07c6f1SJed Brown Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 7268a07c6f1SJed Brown const PetscInt *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j; 7278a07c6f1SJed Brown PetscInt *ci,*cj,*bb; 7288a07c6f1SJed Brown PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 7298a07c6f1SJed Brown PetscReal afill; 7308a07c6f1SJed Brown PetscInt i,j,col,ndouble = 0; 7310298fd71SBarry Smith PetscFreeSpaceList free_space=NULL,current_space=NULL; 7328a07c6f1SJed Brown PetscHeap h; 7338a07c6f1SJed Brown PetscBT bt; 7348a07c6f1SJed Brown 7358a07c6f1SJed Brown PetscFunctionBegin; 736cd093f1dSJed Brown /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */ 7378a07c6f1SJed Brown /*---------------------------------------------------------------------------------------------*/ 7388a07c6f1SJed Brown /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 739854ce69bSBarry Smith ierr = PetscMalloc1(am+2,&ci);CHKERRQ(ierr); 7408a07c6f1SJed Brown ci[0] = 0; 7418a07c6f1SJed Brown 7428a07c6f1SJed Brown /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 743f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr); 7442205254eSKarl Rupp 7458a07c6f1SJed Brown current_space = free_space; 7468a07c6f1SJed Brown 7478a07c6f1SJed Brown ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr); 748785e854fSJed Brown ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr); 7498a07c6f1SJed Brown ierr = PetscBTCreate(bn,&bt);CHKERRQ(ierr); 7508a07c6f1SJed Brown 7518a07c6f1SJed Brown /* Determine ci and cj */ 7528a07c6f1SJed Brown for (i=0; i<am; i++) { 7538a07c6f1SJed Brown const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 7548a07c6f1SJed Brown const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 7558a07c6f1SJed Brown const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */ 7568a07c6f1SJed Brown ci[i+1] = ci[i]; 7578a07c6f1SJed Brown /* Populate the min heap */ 7588a07c6f1SJed Brown for (j=0; j<anzi; j++) { 7598a07c6f1SJed Brown PetscInt brow = acol[j]; 7608a07c6f1SJed Brown for (bb[j] = bi[brow]; bb[j] < bi[brow+1]; bb[j]++) { 7618a07c6f1SJed Brown PetscInt bcol = bj[bb[j]]; 7628a07c6f1SJed Brown if (!PetscBTLookupSet(bt,bcol)) { /* new entry */ 7638a07c6f1SJed Brown ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr); 7648a07c6f1SJed Brown bb[j]++; 7658a07c6f1SJed Brown break; 7668a07c6f1SJed Brown } 7678a07c6f1SJed Brown } 7688a07c6f1SJed Brown } 7698a07c6f1SJed Brown /* Pick off the min element, adding it to free space */ 7708a07c6f1SJed Brown ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr); 7718a07c6f1SJed Brown while (j >= 0) { 7728a07c6f1SJed Brown if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */ 7730298fd71SBarry Smith fptr = NULL; /* need PetscBTMemzero */ 774f91af8c7SBarry Smith ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),¤t_space);CHKERRQ(ierr); 7758a07c6f1SJed Brown ndouble++; 7768a07c6f1SJed Brown } 7778a07c6f1SJed Brown *(current_space->array++) = col; 7788a07c6f1SJed Brown current_space->local_used++; 7798a07c6f1SJed Brown current_space->local_remaining--; 7808a07c6f1SJed Brown ci[i+1]++; 7818a07c6f1SJed Brown 7828a07c6f1SJed Brown /* stash if anything else remains in this row of B */ 7838a07c6f1SJed Brown for (; bb[j] < bi[acol[j]+1]; bb[j]++) { 7848a07c6f1SJed Brown PetscInt bcol = bj[bb[j]]; 7858a07c6f1SJed Brown if (!PetscBTLookupSet(bt,bcol)) { /* new entry */ 7868a07c6f1SJed Brown ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr); 7878a07c6f1SJed Brown bb[j]++; 7888a07c6f1SJed Brown break; 7898a07c6f1SJed Brown } 7908a07c6f1SJed Brown } 7918a07c6f1SJed Brown ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr); 7928a07c6f1SJed Brown } 7938a07c6f1SJed Brown if (fptr) { /* Clear the bits for this row */ 7948a07c6f1SJed Brown for (; fptr<current_space->array; fptr++) {ierr = PetscBTClear(bt,*fptr);CHKERRQ(ierr);} 7958a07c6f1SJed Brown } else { /* We reallocated so we don't remember (easily) how to clear only the bits we changed */ 7968a07c6f1SJed Brown ierr = PetscBTMemzero(bn,bt);CHKERRQ(ierr); 7978a07c6f1SJed Brown } 7988a07c6f1SJed Brown } 7998a07c6f1SJed Brown ierr = PetscFree(bb);CHKERRQ(ierr); 8008a07c6f1SJed Brown ierr = PetscHeapDestroy(&h);CHKERRQ(ierr); 8018a07c6f1SJed Brown ierr = PetscBTDestroy(&bt);CHKERRQ(ierr); 8028a07c6f1SJed Brown 8038a07c6f1SJed Brown /* Column indices are in the list of free space */ 8048a07c6f1SJed Brown /* Allocate space for cj, initialize cj, and */ 8058a07c6f1SJed Brown /* destroy list of free space and other temporary array(s) */ 806785e854fSJed Brown ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr); 8078a07c6f1SJed Brown ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr); 8088a07c6f1SJed Brown 8098a07c6f1SJed Brown /* put together the new symbolic matrix */ 810e4e71118SStefano Zampini ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr); 8114222ddf1SHong Zhang ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr); 8128a07c6f1SJed Brown 8138a07c6f1SJed Brown /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 8148a07c6f1SJed Brown /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 8154222ddf1SHong Zhang c = (Mat_SeqAIJ*)(C->data); 8168a07c6f1SJed Brown c->free_a = PETSC_TRUE; 8178a07c6f1SJed Brown c->free_ij = PETSC_TRUE; 8188a07c6f1SJed Brown c->nonew = 0; 81926fbe8dcSKarl Rupp 8204222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 8218a07c6f1SJed Brown 8228a07c6f1SJed Brown /* set MatInfo */ 8238a07c6f1SJed Brown afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 8248a07c6f1SJed Brown if (afill < 1.0) afill = 1.0; 8254222ddf1SHong Zhang C->info.mallocs = ndouble; 8264222ddf1SHong Zhang C->info.fill_ratio_given = fill; 8274222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 8288a07c6f1SJed Brown 8298a07c6f1SJed Brown #if defined(PETSC_USE_INFO) 8308a07c6f1SJed Brown if (ci[am]) { 8317d3de750SJacob Faibussowitsch ierr = PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 8327d3de750SJacob Faibussowitsch ierr = PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 8338a07c6f1SJed Brown } else { 8344222ddf1SHong Zhang ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr); 8358a07c6f1SJed Brown } 8368a07c6f1SJed Brown #endif 8378a07c6f1SJed Brown PetscFunctionReturn(0); 8388a07c6f1SJed Brown } 8398a07c6f1SJed Brown 8404222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A,Mat B,PetscReal fill,Mat C) 841d7ed1a4dSandi selinger { 842d7ed1a4dSandi selinger PetscErrorCode ierr; 843d7ed1a4dSandi selinger Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 844d7ed1a4dSandi selinger const PetscInt *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j,*inputi,*inputj,*inputcol,*inputcol_L1; 845d7ed1a4dSandi selinger PetscInt *ci,*cj,*outputj,worki_L1[9],worki_L2[9]; 846d7ed1a4dSandi selinger PetscInt c_maxmem,a_maxrownnz=0,a_rownnz; 847d7ed1a4dSandi selinger const PetscInt workcol[8]={0,1,2,3,4,5,6,7}; 848d7ed1a4dSandi selinger const PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 849d7ed1a4dSandi selinger const PetscInt *brow_ptr[8],*brow_end[8]; 850d7ed1a4dSandi selinger PetscInt window[8]; 851d7ed1a4dSandi selinger PetscInt window_min,old_window_min,ci_nnz,outputi_nnz=0,L1_nrows,L2_nrows; 852d7ed1a4dSandi selinger PetscInt i,k,ndouble=0,L1_rowsleft,rowsleft; 853d7ed1a4dSandi selinger PetscReal afill; 854f83700f2Sandi selinger PetscInt *workj_L1,*workj_L2,*workj_L3; 8557660c4dbSandi selinger PetscInt L1_nnz,L2_nnz; 856d7ed1a4dSandi selinger 857d7ed1a4dSandi selinger /* Step 1: Get upper bound on memory required for allocation. 858d7ed1a4dSandi selinger Because of the way virtual memory works, 859d7ed1a4dSandi selinger only the memory pages that are actually needed will be physically allocated. */ 860d7ed1a4dSandi selinger PetscFunctionBegin; 861d7ed1a4dSandi selinger ierr = PetscMalloc1(am+1,&ci);CHKERRQ(ierr); 862d7ed1a4dSandi selinger for (i=0; i<am; i++) { 863d7ed1a4dSandi selinger const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 864d7ed1a4dSandi selinger const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 865d7ed1a4dSandi selinger a_rownnz = 0; 866d7ed1a4dSandi selinger for (k=0; k<anzi; ++k) { 867d7ed1a4dSandi selinger a_rownnz += bi[acol[k]+1] - bi[acol[k]]; 868d7ed1a4dSandi selinger if (a_rownnz > bn) { 869d7ed1a4dSandi selinger a_rownnz = bn; 870d7ed1a4dSandi selinger break; 871d7ed1a4dSandi selinger } 872d7ed1a4dSandi selinger } 873d7ed1a4dSandi selinger a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz); 874d7ed1a4dSandi selinger } 875d7ed1a4dSandi selinger /* temporary work areas for merging rows */ 876d7ed1a4dSandi selinger ierr = PetscMalloc1(a_maxrownnz*8,&workj_L1);CHKERRQ(ierr); 877f83700f2Sandi selinger ierr = PetscMalloc1(a_maxrownnz*8,&workj_L2);CHKERRQ(ierr); 878f83700f2Sandi selinger ierr = PetscMalloc1(a_maxrownnz,&workj_L3);CHKERRQ(ierr); 879d7ed1a4dSandi selinger 8807660c4dbSandi selinger /* This should be enough for almost all matrices. If not, memory is reallocated later. */ 8817660c4dbSandi selinger c_maxmem = 8*(ai[am]+bi[bm]); 882d7ed1a4dSandi selinger /* Step 2: Populate pattern for C */ 883d7ed1a4dSandi selinger ierr = PetscMalloc1(c_maxmem,&cj);CHKERRQ(ierr); 884d7ed1a4dSandi selinger 885d7ed1a4dSandi selinger ci_nnz = 0; 886d7ed1a4dSandi selinger ci[0] = 0; 887d7ed1a4dSandi selinger worki_L1[0] = 0; 888d7ed1a4dSandi selinger worki_L2[0] = 0; 889d7ed1a4dSandi selinger for (i=0; i<am; i++) { 890d7ed1a4dSandi selinger const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 891d7ed1a4dSandi selinger const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 892d7ed1a4dSandi selinger rowsleft = anzi; 893d7ed1a4dSandi selinger inputcol_L1 = acol; 894d7ed1a4dSandi selinger L2_nnz = 0; 8957660c4dbSandi selinger L2_nrows = 1; /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1 */ 8967660c4dbSandi selinger worki_L2[1] = 0; 89708fe1b3cSKarl Rupp outputi_nnz = 0; 898d7ed1a4dSandi selinger 899d7ed1a4dSandi selinger /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem -> allocate more memory */ 900d7ed1a4dSandi selinger while (ci_nnz+a_maxrownnz > c_maxmem) { 901d7ed1a4dSandi selinger c_maxmem *= 2; 902d7ed1a4dSandi selinger ndouble++; 903d7ed1a4dSandi selinger ierr = PetscRealloc(sizeof(PetscInt)*c_maxmem,&cj);CHKERRQ(ierr); 904d7ed1a4dSandi selinger } 905d7ed1a4dSandi selinger 906d7ed1a4dSandi selinger while (rowsleft) { 907d7ed1a4dSandi selinger L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */ 908d7ed1a4dSandi selinger L1_nrows = 0; 9097660c4dbSandi selinger L1_nnz = 0; 910d7ed1a4dSandi selinger inputcol = inputcol_L1; 911d7ed1a4dSandi selinger inputi = bi; 912d7ed1a4dSandi selinger inputj = bj; 913d7ed1a4dSandi selinger 914d7ed1a4dSandi selinger /* The following macro is used to specialize for small rows in A. 915d7ed1a4dSandi selinger This helps with compiler unrolling, improving performance substantially. 916f83700f2Sandi selinger Input: inputj inputi inputcol bn 917d7ed1a4dSandi selinger Output: outputj outputi_nnz */ 918d7ed1a4dSandi selinger #define MatMatMultSymbolic_RowMergeMacro(ANNZ) \ 919d7ed1a4dSandi selinger window_min = bn; \ 9207660c4dbSandi selinger outputi_nnz = 0; \ 9217660c4dbSandi selinger for (k=0; k<ANNZ; ++k) { \ 922d7ed1a4dSandi selinger brow_ptr[k] = inputj + inputi[inputcol[k]]; \ 923d7ed1a4dSandi selinger brow_end[k] = inputj + inputi[inputcol[k]+1]; \ 924d7ed1a4dSandi selinger window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \ 925d7ed1a4dSandi selinger window_min = PetscMin(window[k], window_min); \ 926d7ed1a4dSandi selinger } \ 927d7ed1a4dSandi selinger while (window_min < bn) { \ 928d7ed1a4dSandi selinger outputj[outputi_nnz++] = window_min; \ 929d7ed1a4dSandi selinger /* advance front and compute new minimum */ \ 930d7ed1a4dSandi selinger old_window_min = window_min; \ 931d7ed1a4dSandi selinger window_min = bn; \ 932d7ed1a4dSandi selinger for (k=0; k<ANNZ; ++k) { \ 933d7ed1a4dSandi selinger if (window[k] == old_window_min) { \ 934d7ed1a4dSandi selinger brow_ptr[k]++; \ 935d7ed1a4dSandi selinger window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \ 936d7ed1a4dSandi selinger } \ 937d7ed1a4dSandi selinger window_min = PetscMin(window[k], window_min); \ 938d7ed1a4dSandi selinger } \ 939d7ed1a4dSandi selinger } 940d7ed1a4dSandi selinger 941d7ed1a4dSandi selinger /************** L E V E L 1 ***************/ 942d7ed1a4dSandi selinger /* Merge up to 8 rows of B to L1 work array*/ 943d7ed1a4dSandi selinger while (L1_rowsleft) { 9447660c4dbSandi selinger outputi_nnz = 0; 9457660c4dbSandi selinger if (anzi > 8) outputj = workj_L1 + L1_nnz; /* Level 1 rowmerge*/ 9467660c4dbSandi selinger else outputj = cj + ci_nnz; /* Merge directly to C */ 9477660c4dbSandi selinger 948d7ed1a4dSandi selinger switch (L1_rowsleft) { 949d7ed1a4dSandi selinger case 1: brow_ptr[0] = inputj + inputi[inputcol[0]]; 950d7ed1a4dSandi selinger brow_end[0] = inputj + inputi[inputcol[0]+1]; 951d7ed1a4dSandi selinger for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 952d7ed1a4dSandi selinger inputcol += L1_rowsleft; 953d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 954d7ed1a4dSandi selinger L1_rowsleft = 0; 955d7ed1a4dSandi selinger break; 956d7ed1a4dSandi selinger case 2: MatMatMultSymbolic_RowMergeMacro(2); 957d7ed1a4dSandi selinger inputcol += L1_rowsleft; 958d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 959d7ed1a4dSandi selinger L1_rowsleft = 0; 960d7ed1a4dSandi selinger break; 961d7ed1a4dSandi selinger case 3: MatMatMultSymbolic_RowMergeMacro(3); 962d7ed1a4dSandi selinger inputcol += L1_rowsleft; 963d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 964d7ed1a4dSandi selinger L1_rowsleft = 0; 965d7ed1a4dSandi selinger break; 966d7ed1a4dSandi selinger case 4: MatMatMultSymbolic_RowMergeMacro(4); 967d7ed1a4dSandi selinger inputcol += L1_rowsleft; 968d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 969d7ed1a4dSandi selinger L1_rowsleft = 0; 970d7ed1a4dSandi selinger break; 971d7ed1a4dSandi selinger case 5: MatMatMultSymbolic_RowMergeMacro(5); 972d7ed1a4dSandi selinger inputcol += L1_rowsleft; 973d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 974d7ed1a4dSandi selinger L1_rowsleft = 0; 975d7ed1a4dSandi selinger break; 976d7ed1a4dSandi selinger case 6: MatMatMultSymbolic_RowMergeMacro(6); 977d7ed1a4dSandi selinger inputcol += L1_rowsleft; 978d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 979d7ed1a4dSandi selinger L1_rowsleft = 0; 980d7ed1a4dSandi selinger break; 981d7ed1a4dSandi selinger case 7: MatMatMultSymbolic_RowMergeMacro(7); 982d7ed1a4dSandi selinger inputcol += L1_rowsleft; 983d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 984d7ed1a4dSandi selinger L1_rowsleft = 0; 985d7ed1a4dSandi selinger break; 986d7ed1a4dSandi selinger default: MatMatMultSymbolic_RowMergeMacro(8); 987d7ed1a4dSandi selinger inputcol += 8; 988d7ed1a4dSandi selinger rowsleft -= 8; 989d7ed1a4dSandi selinger L1_rowsleft -= 8; 990d7ed1a4dSandi selinger break; 991d7ed1a4dSandi selinger } 992d7ed1a4dSandi selinger inputcol_L1 = inputcol; 9937660c4dbSandi selinger L1_nnz += outputi_nnz; 9947660c4dbSandi selinger worki_L1[++L1_nrows] = L1_nnz; 995d7ed1a4dSandi selinger } 996d7ed1a4dSandi selinger 997d7ed1a4dSandi selinger /********************** L E V E L 2 ************************/ 998d7ed1a4dSandi selinger /* Merge from L1 work array to either C or to L2 work array */ 999d7ed1a4dSandi selinger if (anzi > 8) { 1000d7ed1a4dSandi selinger inputi = worki_L1; 1001d7ed1a4dSandi selinger inputj = workj_L1; 1002d7ed1a4dSandi selinger inputcol = workcol; 1003d7ed1a4dSandi selinger outputi_nnz = 0; 1004d7ed1a4dSandi selinger 1005d7ed1a4dSandi selinger if (anzi <= 64) outputj = cj + ci_nnz; /* Merge from L1 work array to C */ 1006d7ed1a4dSandi selinger else outputj = workj_L2 + L2_nnz; /* Merge from L1 work array to L2 work array */ 1007d7ed1a4dSandi selinger 1008d7ed1a4dSandi selinger switch (L1_nrows) { 1009d7ed1a4dSandi selinger case 1: brow_ptr[0] = inputj + inputi[inputcol[0]]; 1010d7ed1a4dSandi selinger brow_end[0] = inputj + inputi[inputcol[0]+1]; 1011d7ed1a4dSandi selinger for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 1012d7ed1a4dSandi selinger break; 1013d7ed1a4dSandi selinger case 2: MatMatMultSymbolic_RowMergeMacro(2); break; 1014d7ed1a4dSandi selinger case 3: MatMatMultSymbolic_RowMergeMacro(3); break; 1015d7ed1a4dSandi selinger case 4: MatMatMultSymbolic_RowMergeMacro(4); break; 1016d7ed1a4dSandi selinger case 5: MatMatMultSymbolic_RowMergeMacro(5); break; 1017d7ed1a4dSandi selinger case 6: MatMatMultSymbolic_RowMergeMacro(6); break; 1018d7ed1a4dSandi selinger case 7: MatMatMultSymbolic_RowMergeMacro(7); break; 1019d7ed1a4dSandi selinger case 8: MatMatMultSymbolic_RowMergeMacro(8); break; 1020d7ed1a4dSandi selinger default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L1 work array!"); 1021d7ed1a4dSandi selinger } 1022d7ed1a4dSandi selinger L2_nnz += outputi_nnz; 1023d7ed1a4dSandi selinger worki_L2[++L2_nrows] = L2_nnz; 1024d7ed1a4dSandi selinger 10257660c4dbSandi selinger /************************ L E V E L 3 **********************/ 10267660c4dbSandi selinger /* Merge from L2 work array to either C or to L2 work array */ 1027d7ed1a4dSandi selinger if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) { 1028d7ed1a4dSandi selinger inputi = worki_L2; 1029d7ed1a4dSandi selinger inputj = workj_L2; 1030d7ed1a4dSandi selinger inputcol = workcol; 1031d7ed1a4dSandi selinger outputi_nnz = 0; 1032f83700f2Sandi selinger if (rowsleft) outputj = workj_L3; 1033d7ed1a4dSandi selinger else outputj = cj + ci_nnz; 1034d7ed1a4dSandi selinger switch (L2_nrows) { 1035d7ed1a4dSandi selinger case 1: brow_ptr[0] = inputj + inputi[inputcol[0]]; 1036d7ed1a4dSandi selinger brow_end[0] = inputj + inputi[inputcol[0]+1]; 1037d7ed1a4dSandi selinger for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 1038d7ed1a4dSandi selinger break; 1039d7ed1a4dSandi selinger case 2: MatMatMultSymbolic_RowMergeMacro(2); break; 1040d7ed1a4dSandi selinger case 3: MatMatMultSymbolic_RowMergeMacro(3); break; 1041d7ed1a4dSandi selinger case 4: MatMatMultSymbolic_RowMergeMacro(4); break; 1042d7ed1a4dSandi selinger case 5: MatMatMultSymbolic_RowMergeMacro(5); break; 1043d7ed1a4dSandi selinger case 6: MatMatMultSymbolic_RowMergeMacro(6); break; 1044d7ed1a4dSandi selinger case 7: MatMatMultSymbolic_RowMergeMacro(7); break; 1045d7ed1a4dSandi selinger case 8: MatMatMultSymbolic_RowMergeMacro(8); break; 1046d7ed1a4dSandi selinger default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L2 work array!"); 1047d7ed1a4dSandi selinger } 1048d7ed1a4dSandi selinger L2_nrows = 1; 10497660c4dbSandi selinger L2_nnz = outputi_nnz; 10507660c4dbSandi selinger worki_L2[1] = outputi_nnz; 10517660c4dbSandi selinger /* Copy to workj_L2 */ 1052d7ed1a4dSandi selinger if (rowsleft) { 10537660c4dbSandi selinger for (k=0; k<outputi_nnz; ++k) workj_L2[k] = outputj[k]; 1054d7ed1a4dSandi selinger } 1055d7ed1a4dSandi selinger } 1056d7ed1a4dSandi selinger } 1057d7ed1a4dSandi selinger } /* while (rowsleft) */ 1058d7ed1a4dSandi selinger #undef MatMatMultSymbolic_RowMergeMacro 1059d7ed1a4dSandi selinger 1060d7ed1a4dSandi selinger /* terminate current row */ 1061d7ed1a4dSandi selinger ci_nnz += outputi_nnz; 1062d7ed1a4dSandi selinger ci[i+1] = ci_nnz; 1063d7ed1a4dSandi selinger } 1064d7ed1a4dSandi selinger 1065d7ed1a4dSandi selinger /* Step 3: Create the new symbolic matrix */ 1066e4e71118SStefano Zampini ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr); 10674222ddf1SHong Zhang ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr); 1068d7ed1a4dSandi selinger 1069d7ed1a4dSandi selinger /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 1070d7ed1a4dSandi selinger /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 10714222ddf1SHong Zhang c = (Mat_SeqAIJ*)(C->data); 1072d7ed1a4dSandi selinger c->free_a = PETSC_TRUE; 1073d7ed1a4dSandi selinger c->free_ij = PETSC_TRUE; 1074d7ed1a4dSandi selinger c->nonew = 0; 1075d7ed1a4dSandi selinger 10764222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 1077d7ed1a4dSandi selinger 1078d7ed1a4dSandi selinger /* set MatInfo */ 1079d7ed1a4dSandi selinger afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5; 1080d7ed1a4dSandi selinger if (afill < 1.0) afill = 1.0; 10814222ddf1SHong Zhang C->info.mallocs = ndouble; 10824222ddf1SHong Zhang C->info.fill_ratio_given = fill; 10834222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 1084d7ed1a4dSandi selinger 1085d7ed1a4dSandi selinger #if defined(PETSC_USE_INFO) 1086d7ed1a4dSandi selinger if (ci[am]) { 10877d3de750SJacob Faibussowitsch ierr = PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 10887d3de750SJacob Faibussowitsch ierr = PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 1089d7ed1a4dSandi selinger } else { 10904222ddf1SHong Zhang ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr); 1091d7ed1a4dSandi selinger } 1092d7ed1a4dSandi selinger #endif 1093d7ed1a4dSandi selinger 1094d7ed1a4dSandi selinger /* Step 4: Free temporary work areas */ 1095d7ed1a4dSandi selinger ierr = PetscFree(workj_L1);CHKERRQ(ierr); 1096d7ed1a4dSandi selinger ierr = PetscFree(workj_L2);CHKERRQ(ierr); 1097f83700f2Sandi selinger ierr = PetscFree(workj_L3);CHKERRQ(ierr); 1098d7ed1a4dSandi selinger PetscFunctionReturn(0); 1099d7ed1a4dSandi selinger } 1100d7ed1a4dSandi selinger 1101cd093f1dSJed Brown /* concatenate unique entries and then sort */ 11024222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,PetscReal fill,Mat C) 1103cd093f1dSJed Brown { 1104cd093f1dSJed Brown PetscErrorCode ierr; 1105cd093f1dSJed Brown Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c; 1106cd093f1dSJed Brown const PetscInt *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j; 11078c78258cSHong Zhang PetscInt *ci,*cj,bcol; 1108cd093f1dSJed Brown PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N; 1109cd093f1dSJed Brown PetscReal afill; 1110cd093f1dSJed Brown PetscInt i,j,ndouble = 0; 1111cd093f1dSJed Brown PetscSegBuffer seg,segrow; 1112cd093f1dSJed Brown char *seen; 1113cd093f1dSJed Brown 1114cd093f1dSJed Brown PetscFunctionBegin; 1115854ce69bSBarry Smith ierr = PetscMalloc1(am+1,&ci);CHKERRQ(ierr); 1116cd093f1dSJed Brown ci[0] = 0; 1117cd093f1dSJed Brown 1118cd093f1dSJed Brown /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 1119cd093f1dSJed Brown ierr = PetscSegBufferCreate(sizeof(PetscInt),(PetscInt)(fill*(ai[am]+bi[bm])),&seg);CHKERRQ(ierr); 1120cd093f1dSJed Brown ierr = PetscSegBufferCreate(sizeof(PetscInt),100,&segrow);CHKERRQ(ierr); 1121580bdb30SBarry Smith ierr = PetscCalloc1(bn,&seen);CHKERRQ(ierr); 1122cd093f1dSJed Brown 1123cd093f1dSJed Brown /* Determine ci and cj */ 1124cd093f1dSJed Brown for (i=0; i<am; i++) { 1125cd093f1dSJed Brown const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 1126cd093f1dSJed Brown const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 1127cd093f1dSJed Brown PetscInt packlen = 0,*PETSC_RESTRICT crow; 11288c78258cSHong Zhang 1129cd093f1dSJed Brown /* Pack segrow */ 1130cd093f1dSJed Brown for (j=0; j<anzi; j++) { 1131cd093f1dSJed Brown PetscInt brow = acol[j],bjstart = bi[brow],bjend = bi[brow+1],k; 1132cd093f1dSJed Brown for (k=bjstart; k<bjend; k++) { 11338c78258cSHong Zhang bcol = bj[k]; 1134cd093f1dSJed Brown if (!seen[bcol]) { /* new entry */ 1135cd093f1dSJed Brown PetscInt *PETSC_RESTRICT slot; 1136cd093f1dSJed Brown ierr = PetscSegBufferGetInts(segrow,1,&slot);CHKERRQ(ierr); 1137cd093f1dSJed Brown *slot = bcol; 1138cd093f1dSJed Brown seen[bcol] = 1; 1139cd093f1dSJed Brown packlen++; 1140cd093f1dSJed Brown } 1141cd093f1dSJed Brown } 1142cd093f1dSJed Brown } 11438c78258cSHong Zhang 11448c78258cSHong Zhang /* Check i-th diagonal entry */ 11458c78258cSHong Zhang if (C->force_diagonals && !seen[i]) { 11468c78258cSHong Zhang PetscInt *PETSC_RESTRICT slot; 11478c78258cSHong Zhang ierr = PetscSegBufferGetInts(segrow,1,&slot);CHKERRQ(ierr); 11488c78258cSHong Zhang *slot = i; 11498c78258cSHong Zhang seen[i] = 1; 11508c78258cSHong Zhang packlen++; 11518c78258cSHong Zhang } 11528c78258cSHong Zhang 1153cd093f1dSJed Brown ierr = PetscSegBufferGetInts(seg,packlen,&crow);CHKERRQ(ierr); 1154cd093f1dSJed Brown ierr = PetscSegBufferExtractTo(segrow,crow);CHKERRQ(ierr); 1155cd093f1dSJed Brown ierr = PetscSortInt(packlen,crow);CHKERRQ(ierr); 1156cd093f1dSJed Brown ci[i+1] = ci[i] + packlen; 1157cd093f1dSJed Brown for (j=0; j<packlen; j++) seen[crow[j]] = 0; 1158cd093f1dSJed Brown } 1159cd093f1dSJed Brown ierr = PetscSegBufferDestroy(&segrow);CHKERRQ(ierr); 1160cd093f1dSJed Brown ierr = PetscFree(seen);CHKERRQ(ierr); 1161cd093f1dSJed Brown 1162cd093f1dSJed Brown /* Column indices are in the segmented buffer */ 1163cd093f1dSJed Brown ierr = PetscSegBufferExtractAlloc(seg,&cj);CHKERRQ(ierr); 1164cd093f1dSJed Brown ierr = PetscSegBufferDestroy(&seg);CHKERRQ(ierr); 1165cd093f1dSJed Brown 1166cd093f1dSJed Brown /* put together the new symbolic matrix */ 1167e4e71118SStefano Zampini ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr); 11684222ddf1SHong Zhang ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr); 1169cd093f1dSJed Brown 1170cd093f1dSJed Brown /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 1171cd093f1dSJed Brown /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 11724222ddf1SHong Zhang c = (Mat_SeqAIJ*)(C->data); 1173cd093f1dSJed Brown c->free_a = PETSC_TRUE; 1174cd093f1dSJed Brown c->free_ij = PETSC_TRUE; 1175cd093f1dSJed Brown c->nonew = 0; 1176cd093f1dSJed Brown 11774222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 1178cd093f1dSJed Brown 1179cd093f1dSJed Brown /* set MatInfo */ 11802a09556fSStefano Zampini afill = (PetscReal)ci[am]/PetscMax(ai[am]+bi[bm],1) + 1.e-5; 1181cd093f1dSJed Brown if (afill < 1.0) afill = 1.0; 11824222ddf1SHong Zhang C->info.mallocs = ndouble; 11834222ddf1SHong Zhang C->info.fill_ratio_given = fill; 11844222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 1185cd093f1dSJed Brown 1186cd093f1dSJed Brown #if defined(PETSC_USE_INFO) 1187cd093f1dSJed Brown if (ci[am]) { 11887d3de750SJacob Faibussowitsch ierr = PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr); 11897d3de750SJacob Faibussowitsch ierr = PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 1190cd093f1dSJed Brown } else { 11914222ddf1SHong Zhang ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr); 1192cd093f1dSJed Brown } 1193cd093f1dSJed Brown #endif 1194cd093f1dSJed Brown PetscFunctionReturn(0); 1195cd093f1dSJed Brown } 1196cd093f1dSJed Brown 11976718818eSStefano Zampini PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(void *data) 11982128a86cSHong Zhang { 11992128a86cSHong Zhang PetscErrorCode ierr; 12006718818eSStefano Zampini Mat_MatMatTransMult *abt=(Mat_MatMatTransMult *)data; 12012128a86cSHong Zhang 12022128a86cSHong Zhang PetscFunctionBegin; 120340192850SHong Zhang ierr = MatTransposeColoringDestroy(&abt->matcoloring);CHKERRQ(ierr); 120440192850SHong Zhang ierr = MatDestroy(&abt->Bt_den);CHKERRQ(ierr); 120540192850SHong Zhang ierr = MatDestroy(&abt->ABt_den);CHKERRQ(ierr); 120640192850SHong Zhang ierr = PetscFree(abt);CHKERRQ(ierr); 12072128a86cSHong Zhang PetscFunctionReturn(0); 12082128a86cSHong Zhang } 12092128a86cSHong Zhang 12104222ddf1SHong Zhang PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C) 1211bc011b1eSHong Zhang { 12125df89d91SHong Zhang PetscErrorCode ierr; 121381d82fe4SHong Zhang Mat Bt; 121481d82fe4SHong Zhang PetscInt *bti,*btj; 121540192850SHong Zhang Mat_MatMatTransMult *abt; 12164222ddf1SHong Zhang Mat_Product *product = C->product; 12176718818eSStefano Zampini char *alg; 1218d2853540SHong Zhang 121981d82fe4SHong Zhang PetscFunctionBegin; 12202c71b3e2SJacob Faibussowitsch PetscCheckFalse(!product,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct"); 12212c71b3e2SJacob Faibussowitsch PetscCheckFalse(product->data,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Extra product struct not empty"); 12226718818eSStefano Zampini 122381d82fe4SHong Zhang /* create symbolic Bt */ 122481d82fe4SHong Zhang ierr = MatGetSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr); 12250298fd71SBarry Smith ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,B->cmap->n,B->rmap->n,bti,btj,NULL,&Bt);CHKERRQ(ierr); 122633d57670SJed Brown ierr = MatSetBlockSizes(Bt,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr); 122704b858e0SBarry Smith ierr = MatSetType(Bt,((PetscObject)A)->type_name);CHKERRQ(ierr); 122881d82fe4SHong Zhang 122981d82fe4SHong Zhang /* get symbolic C=A*Bt */ 12306718818eSStefano Zampini ierr = PetscStrallocpy(product->alg,&alg);CHKERRQ(ierr); 12314222ddf1SHong Zhang ierr = MatProductSetAlgorithm(C,"sorted");CHKERRQ(ierr); /* set algorithm for C = A*Bt */ 123281d82fe4SHong Zhang ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C);CHKERRQ(ierr); 12334222ddf1SHong Zhang ierr = MatProductSetAlgorithm(C,alg);CHKERRQ(ierr); /* resume original algorithm for ABt product */ 12346718818eSStefano Zampini ierr = PetscFree(alg);CHKERRQ(ierr); 123581d82fe4SHong Zhang 1236a5b23f4aSJose E. Roman /* create a supporting struct for reuse intermediate dense matrices with matcoloring */ 1237b00a9115SJed Brown ierr = PetscNew(&abt);CHKERRQ(ierr); 12382128a86cSHong Zhang 12396718818eSStefano Zampini product->data = abt; 12406718818eSStefano Zampini product->destroy = MatDestroy_SeqAIJ_MatMatMultTrans; 12416718818eSStefano Zampini 12424222ddf1SHong Zhang C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ; 12432128a86cSHong Zhang 12444222ddf1SHong Zhang abt->usecoloring = PETSC_FALSE; 12454222ddf1SHong Zhang ierr = PetscStrcmp(product->alg,"color",&abt->usecoloring);CHKERRQ(ierr); 124640192850SHong Zhang if (abt->usecoloring) { 1247b9af6bddSHong Zhang /* Create MatTransposeColoring from symbolic C=A*B^T */ 1248b9af6bddSHong Zhang MatTransposeColoring matcoloring; 1249335efc43SPeter Brune MatColoring coloring; 12502128a86cSHong Zhang ISColoring iscoloring; 12512128a86cSHong Zhang Mat Bt_dense,C_dense; 1252e8354b3eSHong Zhang 12534222ddf1SHong Zhang /* inode causes memory problem */ 12544222ddf1SHong Zhang ierr = MatSetOption(C,MAT_USE_INODES,PETSC_FALSE);CHKERRQ(ierr); 12554222ddf1SHong Zhang 12564222ddf1SHong Zhang ierr = MatColoringCreate(C,&coloring);CHKERRQ(ierr); 1257335efc43SPeter Brune ierr = MatColoringSetDistance(coloring,2);CHKERRQ(ierr); 1258335efc43SPeter Brune ierr = MatColoringSetType(coloring,MATCOLORINGSL);CHKERRQ(ierr); 1259335efc43SPeter Brune ierr = MatColoringSetFromOptions(coloring);CHKERRQ(ierr); 1260335efc43SPeter Brune ierr = MatColoringApply(coloring,&iscoloring);CHKERRQ(ierr); 1261335efc43SPeter Brune ierr = MatColoringDestroy(&coloring);CHKERRQ(ierr); 12624222ddf1SHong Zhang ierr = MatTransposeColoringCreate(C,iscoloring,&matcoloring);CHKERRQ(ierr); 12632205254eSKarl Rupp 126440192850SHong Zhang abt->matcoloring = matcoloring; 12652205254eSKarl Rupp 12662128a86cSHong Zhang ierr = ISColoringDestroy(&iscoloring);CHKERRQ(ierr); 12672128a86cSHong Zhang 12682128a86cSHong Zhang /* Create Bt_dense and C_dense = A*Bt_dense */ 12692128a86cSHong Zhang ierr = MatCreate(PETSC_COMM_SELF,&Bt_dense);CHKERRQ(ierr); 12702128a86cSHong Zhang ierr = MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors);CHKERRQ(ierr); 12712128a86cSHong Zhang ierr = MatSetType(Bt_dense,MATSEQDENSE);CHKERRQ(ierr); 12720298fd71SBarry Smith ierr = MatSeqDenseSetPreallocation(Bt_dense,NULL);CHKERRQ(ierr); 12732205254eSKarl Rupp 12742128a86cSHong Zhang Bt_dense->assembled = PETSC_TRUE; 127540192850SHong Zhang abt->Bt_den = Bt_dense; 12762128a86cSHong Zhang 12772128a86cSHong Zhang ierr = MatCreate(PETSC_COMM_SELF,&C_dense);CHKERRQ(ierr); 12782128a86cSHong Zhang ierr = MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors);CHKERRQ(ierr); 12792128a86cSHong Zhang ierr = MatSetType(C_dense,MATSEQDENSE);CHKERRQ(ierr); 12800298fd71SBarry Smith ierr = MatSeqDenseSetPreallocation(C_dense,NULL);CHKERRQ(ierr); 12812205254eSKarl Rupp 12822128a86cSHong Zhang Bt_dense->assembled = PETSC_TRUE; 128340192850SHong Zhang abt->ABt_den = C_dense; 1284f94ccd6cSHong Zhang 1285f94ccd6cSHong Zhang #if defined(PETSC_USE_INFO) 12861ce71dffSSatish Balay { 12874222ddf1SHong Zhang Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 12887d3de750SJacob Faibussowitsch ierr = PetscInfo(C,"Use coloring of C=A*B^T; B^T: %" PetscInt_FMT " %" PetscInt_FMT ", Bt_dense: %" PetscInt_FMT ",%" PetscInt_FMT "; Cnz %" PetscInt_FMT " / (cm*ncolors %" PetscInt_FMT ") = %g\n",B->cmap->n,B->rmap->n,Bt_dense->rmap->n,Bt_dense->cmap->n,c->nz,A->rmap->n*matcoloring->ncolors,(double)(((PetscReal)(c->nz))/((PetscReal)(A->rmap->n*matcoloring->ncolors))));CHKERRQ(ierr); 12891ce71dffSSatish Balay } 1290f94ccd6cSHong Zhang #endif 12912128a86cSHong Zhang } 1292e99be685SHong Zhang /* clean up */ 1293e99be685SHong Zhang ierr = MatDestroy(&Bt);CHKERRQ(ierr); 1294e99be685SHong Zhang ierr = MatRestoreSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr); 12955df89d91SHong Zhang PetscFunctionReturn(0); 12965df89d91SHong Zhang } 12975df89d91SHong Zhang 12986fc122caSHong Zhang PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C) 12995df89d91SHong Zhang { 13005df89d91SHong Zhang PetscErrorCode ierr; 13015df89d91SHong Zhang Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data; 1302e2cac8caSJed Brown PetscInt *ai =a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow; 130381d82fe4SHong Zhang PetscInt cm =C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol; 13045df89d91SHong Zhang PetscLogDouble flops=0.0; 1305aa1aec99SHong Zhang MatScalar *aa =a->a,*aval,*ba=b->a,*bval,*ca,*cval; 13066718818eSStefano Zampini Mat_MatMatTransMult *abt; 13076718818eSStefano Zampini Mat_Product *product = C->product; 13085df89d91SHong Zhang 13095df89d91SHong Zhang PetscFunctionBegin; 13102c71b3e2SJacob Faibussowitsch PetscCheckFalse(!product,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct"); 13116718818eSStefano Zampini abt = (Mat_MatMatTransMult *)product->data; 13122c71b3e2SJacob Faibussowitsch PetscCheckFalse(!abt,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct"); 131358ed3ceaSHong Zhang /* clear old values in C */ 131458ed3ceaSHong Zhang if (!c->a) { 1315580bdb30SBarry Smith ierr = PetscCalloc1(ci[cm]+1,&ca);CHKERRQ(ierr); 131658ed3ceaSHong Zhang c->a = ca; 131758ed3ceaSHong Zhang c->free_a = PETSC_TRUE; 131858ed3ceaSHong Zhang } else { 131958ed3ceaSHong Zhang ca = c->a; 1320580bdb30SBarry Smith ierr = PetscArrayzero(ca,ci[cm]+1);CHKERRQ(ierr); 132158ed3ceaSHong Zhang } 132258ed3ceaSHong Zhang 132340192850SHong Zhang if (abt->usecoloring) { 132440192850SHong Zhang MatTransposeColoring matcoloring = abt->matcoloring; 132540192850SHong Zhang Mat Bt_dense,C_dense = abt->ABt_den; 1326c8db22f6SHong Zhang 1327b9af6bddSHong Zhang /* Get Bt_dense by Apply MatTransposeColoring to B */ 132840192850SHong Zhang Bt_dense = abt->Bt_den; 1329b9af6bddSHong Zhang ierr = MatTransColoringApplySpToDen(matcoloring,B,Bt_dense);CHKERRQ(ierr); 1330c8db22f6SHong Zhang 1331c8db22f6SHong Zhang /* C_dense = A*Bt_dense */ 13322128a86cSHong Zhang ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense);CHKERRQ(ierr); 1333c8db22f6SHong Zhang 13342128a86cSHong Zhang /* Recover C from C_dense */ 1335b9af6bddSHong Zhang ierr = MatTransColoringApplyDenToSp(matcoloring,C_dense,C);CHKERRQ(ierr); 1336c8db22f6SHong Zhang PetscFunctionReturn(0); 1337c8db22f6SHong Zhang } 1338c8db22f6SHong Zhang 13391fa1209cSHong Zhang for (i=0; i<cm; i++) { 134081d82fe4SHong Zhang anzi = ai[i+1] - ai[i]; 134181d82fe4SHong Zhang acol = aj + ai[i]; 13426973769bSHong Zhang aval = aa + ai[i]; 13431fa1209cSHong Zhang cnzi = ci[i+1] - ci[i]; 13441fa1209cSHong Zhang ccol = cj + ci[i]; 13456973769bSHong Zhang cval = ca + ci[i]; 13461fa1209cSHong Zhang for (j=0; j<cnzi; j++) { 134781d82fe4SHong Zhang brow = ccol[j]; 134881d82fe4SHong Zhang bnzj = bi[brow+1] - bi[brow]; 134981d82fe4SHong Zhang bcol = bj + bi[brow]; 13506973769bSHong Zhang bval = ba + bi[brow]; 13516973769bSHong Zhang 135281d82fe4SHong Zhang /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */ 135381d82fe4SHong Zhang nexta = 0; nextb = 0; 135481d82fe4SHong Zhang while (nexta<anzi && nextb<bnzj) { 13557b6d5e96SMark Adams while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++; 135681d82fe4SHong Zhang if (nexta == anzi) break; 13577b6d5e96SMark Adams while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++; 135881d82fe4SHong Zhang if (nextb == bnzj) break; 135981d82fe4SHong Zhang if (acol[nexta] == bcol[nextb]) { 13606973769bSHong Zhang cval[j] += aval[nexta]*bval[nextb]; 136181d82fe4SHong Zhang nexta++; nextb++; 136281d82fe4SHong Zhang flops += 2; 13631fa1209cSHong Zhang } 13641fa1209cSHong Zhang } 136581d82fe4SHong Zhang } 136681d82fe4SHong Zhang } 136781d82fe4SHong Zhang ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 136881d82fe4SHong Zhang ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 136981d82fe4SHong Zhang ierr = PetscLogFlops(flops);CHKERRQ(ierr); 13705df89d91SHong Zhang PetscFunctionReturn(0); 13715df89d91SHong Zhang } 13725df89d91SHong Zhang 13736718818eSStefano Zampini PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(void *data) 13746d373c3eSHong Zhang { 13756d373c3eSHong Zhang PetscErrorCode ierr; 13766718818eSStefano Zampini Mat_MatTransMatMult *atb = (Mat_MatTransMatMult*)data; 13776d373c3eSHong Zhang 13786d373c3eSHong Zhang PetscFunctionBegin; 13796d373c3eSHong Zhang ierr = MatDestroy(&atb->At);CHKERRQ(ierr); 13806718818eSStefano Zampini if (atb->destroy) { 13816718818eSStefano Zampini ierr = (*atb->destroy)(atb->data);CHKERRQ(ierr); 13826473ade5SStefano Zampini } 13836d373c3eSHong Zhang ierr = PetscFree(atb);CHKERRQ(ierr); 13846d373c3eSHong Zhang PetscFunctionReturn(0); 13856d373c3eSHong Zhang } 13866d373c3eSHong Zhang 13874222ddf1SHong Zhang PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C) 13885df89d91SHong Zhang { 1389bc011b1eSHong Zhang PetscErrorCode ierr; 1390089a957eSStefano Zampini Mat At = NULL; 139138baddfdSBarry Smith PetscInt *ati,*atj; 13924222ddf1SHong Zhang Mat_Product *product = C->product; 1393089a957eSStefano Zampini PetscBool flg,def,square; 1394bc011b1eSHong Zhang 1395bc011b1eSHong Zhang PetscFunctionBegin; 1396089a957eSStefano Zampini MatCheckProduct(C,4); 1397089a957eSStefano Zampini square = (PetscBool)(A == B && A->symmetric && A->symmetric_set); 13984222ddf1SHong Zhang /* outerproduct */ 1399089a957eSStefano Zampini ierr = PetscStrcmp(product->alg,"outerproduct",&flg);CHKERRQ(ierr); 14004222ddf1SHong Zhang if (flg) { 1401bc011b1eSHong Zhang /* create symbolic At */ 1402089a957eSStefano Zampini if (!square) { 1403bc011b1eSHong Zhang ierr = MatGetSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr); 14040298fd71SBarry Smith ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,A->cmap->n,A->rmap->n,ati,atj,NULL,&At);CHKERRQ(ierr); 140533d57670SJed Brown ierr = MatSetBlockSizes(At,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr); 140604b858e0SBarry Smith ierr = MatSetType(At,((PetscObject)A)->type_name);CHKERRQ(ierr); 1407089a957eSStefano Zampini } 1408bc011b1eSHong Zhang /* get symbolic C=At*B */ 14097a3c3d58SStefano Zampini ierr = MatProductSetAlgorithm(C,"sorted");CHKERRQ(ierr); 1410089a957eSStefano Zampini ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At,B,fill,C);CHKERRQ(ierr); 1411bc011b1eSHong Zhang 1412bc011b1eSHong Zhang /* clean up */ 1413089a957eSStefano Zampini if (!square) { 14146bf464f9SBarry Smith ierr = MatDestroy(&At);CHKERRQ(ierr); 1415bc011b1eSHong Zhang ierr = MatRestoreSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr); 1416089a957eSStefano Zampini } 14176d373c3eSHong Zhang 14184222ddf1SHong Zhang C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */ 14197a3c3d58SStefano Zampini ierr = MatProductSetAlgorithm(C,"outerproduct");CHKERRQ(ierr); 14204222ddf1SHong Zhang PetscFunctionReturn(0); 14214222ddf1SHong Zhang } 14224222ddf1SHong Zhang 14234222ddf1SHong Zhang /* matmatmult */ 1424089a957eSStefano Zampini ierr = PetscStrcmp(product->alg,"default",&def);CHKERRQ(ierr); 1425089a957eSStefano Zampini ierr = PetscStrcmp(product->alg,"at*b",&flg);CHKERRQ(ierr); 1426089a957eSStefano Zampini if (flg || def) { 14274222ddf1SHong Zhang Mat_MatTransMatMult *atb; 14284222ddf1SHong Zhang 14292c71b3e2SJacob Faibussowitsch PetscCheckFalse(product->data,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Extra product struct not empty"); 14304222ddf1SHong Zhang ierr = PetscNew(&atb);CHKERRQ(ierr); 1431089a957eSStefano Zampini if (!square) { 14324222ddf1SHong Zhang ierr = MatTranspose_SeqAIJ(A,MAT_INITIAL_MATRIX,&At);CHKERRQ(ierr); 1433089a957eSStefano Zampini } 14347a3c3d58SStefano Zampini ierr = MatProductSetAlgorithm(C,"sorted");CHKERRQ(ierr); 1435089a957eSStefano Zampini ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At,B,fill,C);CHKERRQ(ierr); 14366718818eSStefano Zampini ierr = MatProductSetAlgorithm(C,"at*b");CHKERRQ(ierr); 14376718818eSStefano Zampini product->data = atb; 14386718818eSStefano Zampini product->destroy = MatDestroy_SeqAIJ_MatTransMatMult; 14394222ddf1SHong Zhang atb->At = At; 14404222ddf1SHong Zhang atb->updateAt = PETSC_FALSE; /* because At is computed here */ 14414222ddf1SHong Zhang 14424222ddf1SHong Zhang C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */ 14434222ddf1SHong Zhang PetscFunctionReturn(0); 14444222ddf1SHong Zhang } 14454222ddf1SHong Zhang 14464222ddf1SHong Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported"); 1447bc011b1eSHong Zhang } 1448bc011b1eSHong Zhang 144975648e8dSHong Zhang PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C) 1450bc011b1eSHong Zhang { 1451bc011b1eSHong Zhang PetscErrorCode ierr; 14520fbc74f4SHong Zhang Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data; 1453d0f46423SBarry Smith PetscInt am=A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb; 1454d0f46423SBarry Smith PetscInt cm=C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k; 1455d13dce4bSSatish Balay PetscLogDouble flops=0.0; 1456aa1aec99SHong Zhang MatScalar *aa=a->a,*ba,*ca,*caj; 1457bc011b1eSHong Zhang 1458bc011b1eSHong Zhang PetscFunctionBegin; 1459aa1aec99SHong Zhang if (!c->a) { 1460580bdb30SBarry Smith ierr = PetscCalloc1(ci[cm]+1,&ca);CHKERRQ(ierr); 14612205254eSKarl Rupp 1462aa1aec99SHong Zhang c->a = ca; 1463aa1aec99SHong Zhang c->free_a = PETSC_TRUE; 1464aa1aec99SHong Zhang } else { 1465aa1aec99SHong Zhang ca = c->a; 1466580bdb30SBarry Smith ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr); 1467aa1aec99SHong Zhang } 1468bc011b1eSHong Zhang 1469bc011b1eSHong Zhang /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */ 1470bc011b1eSHong Zhang for (i=0; i<am; i++) { 1471bc011b1eSHong Zhang bj = b->j + bi[i]; 1472bc011b1eSHong Zhang ba = b->a + bi[i]; 1473bc011b1eSHong Zhang bnzi = bi[i+1] - bi[i]; 1474bc011b1eSHong Zhang anzi = ai[i+1] - ai[i]; 1475bc011b1eSHong Zhang for (j=0; j<anzi; j++) { 1476bc011b1eSHong Zhang nextb = 0; 14770fbc74f4SHong Zhang crow = *aj++; 1478bc011b1eSHong Zhang cjj = cj + ci[crow]; 1479bc011b1eSHong Zhang caj = ca + ci[crow]; 1480bc011b1eSHong Zhang /* perform sparse axpy operation. Note cjj includes bj. */ 1481bc011b1eSHong Zhang for (k=0; nextb<bnzi; k++) { 14820fbc74f4SHong Zhang if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */ 14830fbc74f4SHong Zhang caj[k] += (*aa)*(*(ba+nextb)); 1484bc011b1eSHong Zhang nextb++; 1485bc011b1eSHong Zhang } 1486bc011b1eSHong Zhang } 1487bc011b1eSHong Zhang flops += 2*bnzi; 14880fbc74f4SHong Zhang aa++; 1489bc011b1eSHong Zhang } 1490bc011b1eSHong Zhang } 1491bc011b1eSHong Zhang 1492bc011b1eSHong Zhang /* Assemble the final matrix and clean up */ 1493bc011b1eSHong Zhang ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1494bc011b1eSHong Zhang ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1495bc011b1eSHong Zhang ierr = PetscLogFlops(flops);CHKERRQ(ierr); 1496bc011b1eSHong Zhang PetscFunctionReturn(0); 1497bc011b1eSHong Zhang } 14989123193aSHong Zhang 14994222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C) 15009123193aSHong Zhang { 15019123193aSHong Zhang PetscErrorCode ierr; 15029123193aSHong Zhang 15039123193aSHong Zhang PetscFunctionBegin; 15045a586d82SBarry Smith ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr); 15054222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense; 15069123193aSHong Zhang PetscFunctionReturn(0); 15079123193aSHong Zhang } 15089123193aSHong Zhang 150993aa15f2SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C,const PetscBool add) 15109123193aSHong Zhang { 1511f32d5d43SBarry Smith Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 1512612438f1SStefano Zampini Mat_SeqDense *bd=(Mat_SeqDense*)B->data; 15131ca9667aSStefano Zampini Mat_SeqDense *cd=(Mat_SeqDense*)C->data; 15149123193aSHong Zhang PetscErrorCode ierr; 15151ca9667aSStefano Zampini PetscScalar *c,r1,r2,r3,r4,*c1,*c2,*c3,*c4; 1516a4af7ca8SStefano Zampini const PetscScalar *aa,*b,*b1,*b2,*b3,*b4,*av; 1517daf57211SHong Zhang const PetscInt *aj; 1518612438f1SStefano Zampini PetscInt cm=C->rmap->n,cn=B->cmap->n,bm=bd->lda,am=A->rmap->n; 15191ca9667aSStefano Zampini PetscInt clda=cd->lda; 15201ca9667aSStefano Zampini PetscInt am4=4*clda,bm4=4*bm,col,i,j,n; 15219123193aSHong Zhang 15229123193aSHong Zhang PetscFunctionBegin; 1523f32d5d43SBarry Smith if (!cm || !cn) PetscFunctionReturn(0); 1524a4af7ca8SStefano Zampini ierr = MatSeqAIJGetArrayRead(A,&av);CHKERRQ(ierr); 152593aa15f2SStefano Zampini if (add) { 15268c778c55SBarry Smith ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr); 152793aa15f2SStefano Zampini } else { 152893aa15f2SStefano Zampini ierr = MatDenseGetArrayWrite(C,&c);CHKERRQ(ierr); 152993aa15f2SStefano Zampini } 1530a4af7ca8SStefano Zampini ierr = MatDenseGetArrayRead(B,&b);CHKERRQ(ierr); 1531f32d5d43SBarry Smith b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm; 15321ca9667aSStefano Zampini c1 = c; c2 = c1 + clda; c3 = c2 + clda; c4 = c3 + clda; 15331ca9667aSStefano Zampini for (col=0; col<(cn/4)*4; col += 4) { /* over columns of C */ 15341ca9667aSStefano Zampini for (i=0; i<am; i++) { /* over rows of A in those columns */ 1535f32d5d43SBarry Smith r1 = r2 = r3 = r4 = 0.0; 1536f32d5d43SBarry Smith n = a->i[i+1] - a->i[i]; 1537f32d5d43SBarry Smith aj = a->j + a->i[i]; 1538a4af7ca8SStefano Zampini aa = av + a->i[i]; 1539f32d5d43SBarry Smith for (j=0; j<n; j++) { 15401ca9667aSStefano Zampini const PetscScalar aatmp = aa[j]; 15411ca9667aSStefano Zampini const PetscInt ajtmp = aj[j]; 1542730858b9SHong Zhang r1 += aatmp*b1[ajtmp]; 1543730858b9SHong Zhang r2 += aatmp*b2[ajtmp]; 1544730858b9SHong Zhang r3 += aatmp*b3[ajtmp]; 1545730858b9SHong Zhang r4 += aatmp*b4[ajtmp]; 15469123193aSHong Zhang } 154793aa15f2SStefano Zampini if (add) { 154887753ddeSHong Zhang c1[i] += r1; 154987753ddeSHong Zhang c2[i] += r2; 155087753ddeSHong Zhang c3[i] += r3; 155187753ddeSHong Zhang c4[i] += r4; 155293aa15f2SStefano Zampini } else { 155393aa15f2SStefano Zampini c1[i] = r1; 155493aa15f2SStefano Zampini c2[i] = r2; 155593aa15f2SStefano Zampini c3[i] = r3; 155693aa15f2SStefano Zampini c4[i] = r4; 155793aa15f2SStefano Zampini } 1558f32d5d43SBarry Smith } 1559730858b9SHong Zhang b1 += bm4; b2 += bm4; b3 += bm4; b4 += bm4; 1560730858b9SHong Zhang c1 += am4; c2 += am4; c3 += am4; c4 += am4; 1561f32d5d43SBarry Smith } 156293aa15f2SStefano Zampini /* process remaining columns */ 156393aa15f2SStefano Zampini if (col != cn) { 156493aa15f2SStefano Zampini PetscInt rc = cn-col; 156593aa15f2SStefano Zampini 156693aa15f2SStefano Zampini if (rc == 1) { 156793aa15f2SStefano Zampini for (i=0; i<am; i++) { 1568f32d5d43SBarry Smith r1 = 0.0; 1569f32d5d43SBarry Smith n = a->i[i+1] - a->i[i]; 1570f32d5d43SBarry Smith aj = a->j + a->i[i]; 1571a4af7ca8SStefano Zampini aa = av + a->i[i]; 157293aa15f2SStefano Zampini for (j=0; j<n; j++) r1 += aa[j]*b1[aj[j]]; 157393aa15f2SStefano Zampini if (add) c1[i] += r1; 157493aa15f2SStefano Zampini else c1[i] = r1; 157593aa15f2SStefano Zampini } 157693aa15f2SStefano Zampini } else if (rc == 2) { 157793aa15f2SStefano Zampini for (i=0; i<am; i++) { 157893aa15f2SStefano Zampini r1 = r2 = 0.0; 157993aa15f2SStefano Zampini n = a->i[i+1] - a->i[i]; 158093aa15f2SStefano Zampini aj = a->j + a->i[i]; 158193aa15f2SStefano Zampini aa = av + a->i[i]; 1582f32d5d43SBarry Smith for (j=0; j<n; j++) { 158393aa15f2SStefano Zampini const PetscScalar aatmp = aa[j]; 158493aa15f2SStefano Zampini const PetscInt ajtmp = aj[j]; 158593aa15f2SStefano Zampini r1 += aatmp*b1[ajtmp]; 158693aa15f2SStefano Zampini r2 += aatmp*b2[ajtmp]; 1587f32d5d43SBarry Smith } 158893aa15f2SStefano Zampini if (add) { 158987753ddeSHong Zhang c1[i] += r1; 159093aa15f2SStefano Zampini c2[i] += r2; 159193aa15f2SStefano Zampini } else { 159293aa15f2SStefano Zampini c1[i] = r1; 159393aa15f2SStefano Zampini c2[i] = r2; 1594f32d5d43SBarry Smith } 159593aa15f2SStefano Zampini } 159693aa15f2SStefano Zampini } else { 159793aa15f2SStefano Zampini for (i=0; i<am; i++) { 159893aa15f2SStefano Zampini r1 = r2 = r3 = 0.0; 159993aa15f2SStefano Zampini n = a->i[i+1] - a->i[i]; 160093aa15f2SStefano Zampini aj = a->j + a->i[i]; 160193aa15f2SStefano Zampini aa = av + a->i[i]; 160293aa15f2SStefano Zampini for (j=0; j<n; j++) { 160393aa15f2SStefano Zampini const PetscScalar aatmp = aa[j]; 160493aa15f2SStefano Zampini const PetscInt ajtmp = aj[j]; 160593aa15f2SStefano Zampini r1 += aatmp*b1[ajtmp]; 160693aa15f2SStefano Zampini r2 += aatmp*b2[ajtmp]; 160793aa15f2SStefano Zampini r3 += aatmp*b3[ajtmp]; 160893aa15f2SStefano Zampini } 160993aa15f2SStefano Zampini if (add) { 161093aa15f2SStefano Zampini c1[i] += r1; 161193aa15f2SStefano Zampini c2[i] += r2; 161293aa15f2SStefano Zampini c3[i] += r3; 161393aa15f2SStefano Zampini } else { 161493aa15f2SStefano Zampini c1[i] = r1; 161593aa15f2SStefano Zampini c2[i] = r2; 161693aa15f2SStefano Zampini c3[i] = r3; 161793aa15f2SStefano Zampini } 161893aa15f2SStefano Zampini } 161993aa15f2SStefano Zampini } 1620f32d5d43SBarry Smith } 1621dc0b31edSSatish Balay ierr = PetscLogFlops(cn*(2.0*a->nz));CHKERRQ(ierr); 162293aa15f2SStefano Zampini if (add) { 16238c778c55SBarry Smith ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr); 162493aa15f2SStefano Zampini } else { 162593aa15f2SStefano Zampini ierr = MatDenseRestoreArrayWrite(C,&c);CHKERRQ(ierr); 162693aa15f2SStefano Zampini } 1627a4af7ca8SStefano Zampini ierr = MatDenseRestoreArrayRead(B,&b);CHKERRQ(ierr); 1628a4af7ca8SStefano Zampini ierr = MatSeqAIJRestoreArrayRead(A,&av);CHKERRQ(ierr); 1629f32d5d43SBarry Smith PetscFunctionReturn(0); 1630f32d5d43SBarry Smith } 1631f32d5d43SBarry Smith 163287753ddeSHong Zhang PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C) 1633f32d5d43SBarry Smith { 1634f32d5d43SBarry Smith PetscErrorCode ierr; 1635f32d5d43SBarry Smith 1636f32d5d43SBarry Smith PetscFunctionBegin; 16372c71b3e2SJacob Faibussowitsch PetscCheckFalse(B->rmap->n != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT,A->cmap->n,B->rmap->n); 16382c71b3e2SJacob Faibussowitsch PetscCheckFalse(A->rmap->n != C->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT,C->rmap->n,A->rmap->n); 16392c71b3e2SJacob Faibussowitsch PetscCheckFalse(B->cmap->n != C->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT,B->cmap->n,C->cmap->n); 16404324174eSBarry Smith 164193aa15f2SStefano Zampini ierr = MatMatMultNumericAdd_SeqAIJ_SeqDense(A,B,C,PETSC_FALSE);CHKERRQ(ierr); 16429123193aSHong Zhang PetscFunctionReturn(0); 16439123193aSHong Zhang } 1644b1683b59SHong Zhang 16454222ddf1SHong Zhang /* ------------------------------------------------------- */ 16464222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C) 16474222ddf1SHong Zhang { 16484222ddf1SHong Zhang PetscFunctionBegin; 16494222ddf1SHong Zhang C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense; 16504222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_AB; 16514222ddf1SHong Zhang PetscFunctionReturn(0); 16524222ddf1SHong Zhang } 16534222ddf1SHong Zhang 16546718818eSStefano Zampini PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat,Mat,PetscReal,Mat); 16556718818eSStefano Zampini 16564222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C) 16574222ddf1SHong Zhang { 16584222ddf1SHong Zhang PetscFunctionBegin; 16596718818eSStefano Zampini C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense; 16604222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_AtB; 16616718818eSStefano Zampini PetscFunctionReturn(0); 16626718818eSStefano Zampini } 16636718818eSStefano Zampini 16646718818eSStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C) 16656718818eSStefano Zampini { 16666718818eSStefano Zampini PetscFunctionBegin; 16676718818eSStefano Zampini C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense; 16686718818eSStefano Zampini C->ops->productsymbolic = MatProductSymbolic_ABt; 16694222ddf1SHong Zhang PetscFunctionReturn(0); 16704222ddf1SHong Zhang } 16714222ddf1SHong Zhang 16724222ddf1SHong Zhang PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C) 16734222ddf1SHong Zhang { 16744222ddf1SHong Zhang PetscErrorCode ierr; 16754222ddf1SHong Zhang Mat_Product *product = C->product; 16764222ddf1SHong Zhang 16774222ddf1SHong Zhang PetscFunctionBegin; 16784222ddf1SHong Zhang switch (product->type) { 16794222ddf1SHong Zhang case MATPRODUCT_AB: 16804222ddf1SHong Zhang ierr = MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C);CHKERRQ(ierr); 16814222ddf1SHong Zhang break; 16824222ddf1SHong Zhang case MATPRODUCT_AtB: 16834222ddf1SHong Zhang ierr = MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C);CHKERRQ(ierr); 16844222ddf1SHong Zhang break; 16856718818eSStefano Zampini case MATPRODUCT_ABt: 16866718818eSStefano Zampini ierr = MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C);CHKERRQ(ierr); 16874222ddf1SHong Zhang break; 16884222ddf1SHong Zhang default: 16896718818eSStefano Zampini break; 16904222ddf1SHong Zhang } 16914222ddf1SHong Zhang PetscFunctionReturn(0); 16924222ddf1SHong Zhang } 16934222ddf1SHong Zhang /* ------------------------------------------------------- */ 16944222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C) 16954222ddf1SHong Zhang { 16964222ddf1SHong Zhang PetscErrorCode ierr; 16974222ddf1SHong Zhang Mat_Product *product = C->product; 16984222ddf1SHong Zhang Mat A = product->A; 16994222ddf1SHong Zhang PetscBool baij; 17004222ddf1SHong Zhang 17014222ddf1SHong Zhang PetscFunctionBegin; 17024222ddf1SHong Zhang ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQBAIJ,&baij);CHKERRQ(ierr); 17034222ddf1SHong Zhang if (!baij) { /* A is seqsbaij */ 17044222ddf1SHong Zhang PetscBool sbaij; 17054222ddf1SHong Zhang ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQSBAIJ,&sbaij);CHKERRQ(ierr); 17062c71b3e2SJacob Faibussowitsch PetscCheckFalse(!sbaij,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"Mat must be either seqbaij or seqsbaij format"); 17074222ddf1SHong Zhang 17084222ddf1SHong Zhang C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense; 17094222ddf1SHong Zhang } else { /* A is seqbaij */ 17104222ddf1SHong Zhang C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense; 17114222ddf1SHong Zhang } 17124222ddf1SHong Zhang 17134222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_AB; 17144222ddf1SHong Zhang PetscFunctionReturn(0); 17154222ddf1SHong Zhang } 17164222ddf1SHong Zhang 17174222ddf1SHong Zhang PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C) 17184222ddf1SHong Zhang { 17194222ddf1SHong Zhang PetscErrorCode ierr; 17204222ddf1SHong Zhang Mat_Product *product = C->product; 17214222ddf1SHong Zhang 17224222ddf1SHong Zhang PetscFunctionBegin; 17236718818eSStefano Zampini MatCheckProduct(C,1); 17242c71b3e2SJacob Faibussowitsch PetscCheckFalse(!product->A,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A"); 17256718818eSStefano Zampini if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric)) { 17264222ddf1SHong Zhang ierr = MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C);CHKERRQ(ierr); 17276718818eSStefano Zampini } 17284222ddf1SHong Zhang PetscFunctionReturn(0); 17294222ddf1SHong Zhang } 17306718818eSStefano Zampini 17314222ddf1SHong Zhang /* ------------------------------------------------------- */ 17324222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C) 17334222ddf1SHong Zhang { 17344222ddf1SHong Zhang PetscFunctionBegin; 17354222ddf1SHong Zhang C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ; 17364222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_AB; 17374222ddf1SHong Zhang PetscFunctionReturn(0); 17384222ddf1SHong Zhang } 17394222ddf1SHong Zhang 17404222ddf1SHong Zhang PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C) 17414222ddf1SHong Zhang { 17424222ddf1SHong Zhang PetscErrorCode ierr; 17434222ddf1SHong Zhang Mat_Product *product = C->product; 17444222ddf1SHong Zhang 17454222ddf1SHong Zhang PetscFunctionBegin; 17464222ddf1SHong Zhang if (product->type == MATPRODUCT_AB) { 17474222ddf1SHong Zhang ierr = MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C);CHKERRQ(ierr); 17486718818eSStefano Zampini } 17494222ddf1SHong Zhang PetscFunctionReturn(0); 17504222ddf1SHong Zhang } 17514222ddf1SHong Zhang /* ------------------------------------------------------- */ 17524222ddf1SHong Zhang 1753b9af6bddSHong Zhang PetscErrorCode MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense) 1754c8db22f6SHong Zhang { 1755c8db22f6SHong Zhang PetscErrorCode ierr; 17562f5f1f90SHong Zhang Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 17572f5f1f90SHong Zhang Mat_SeqDense *btdense = (Mat_SeqDense*)Btdense->data; 17582f5f1f90SHong Zhang PetscInt *bi = b->i,*bj=b->j; 17592f5f1f90SHong Zhang PetscInt m = Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns; 17602f5f1f90SHong Zhang MatScalar *btval,*btval_den,*ba=b->a; 17612f5f1f90SHong Zhang PetscInt *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors; 1762c8db22f6SHong Zhang 1763c8db22f6SHong Zhang PetscFunctionBegin; 17642f5f1f90SHong Zhang btval_den=btdense->v; 1765580bdb30SBarry Smith ierr = PetscArrayzero(btval_den,m*n);CHKERRQ(ierr); 17662f5f1f90SHong Zhang for (k=0; k<ncolors; k++) { 17672f5f1f90SHong Zhang ncolumns = coloring->ncolumns[k]; 17682f5f1f90SHong Zhang for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */ 1769d2853540SHong Zhang col = *(columns + colorforcol[k] + l); 17702f5f1f90SHong Zhang btcol = bj + bi[col]; 17712f5f1f90SHong Zhang btval = ba + bi[col]; 17722f5f1f90SHong Zhang anz = bi[col+1] - bi[col]; 1773d2853540SHong Zhang for (j=0; j<anz; j++) { 17742f5f1f90SHong Zhang brow = btcol[j]; 17752f5f1f90SHong Zhang btval_den[brow] = btval[j]; 1776c8db22f6SHong Zhang } 1777c8db22f6SHong Zhang } 17782f5f1f90SHong Zhang btval_den += m; 1779c8db22f6SHong Zhang } 1780c8db22f6SHong Zhang PetscFunctionReturn(0); 1781c8db22f6SHong Zhang } 1782c8db22f6SHong Zhang 1783b9af6bddSHong Zhang PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp) 17848972f759SHong Zhang { 17858972f759SHong Zhang PetscErrorCode ierr; 1786b2d2b04fSHong Zhang Mat_SeqAIJ *csp = (Mat_SeqAIJ*)Csp->data; 17871683a169SBarry Smith const PetscScalar *ca_den,*ca_den_ptr; 17881683a169SBarry Smith PetscScalar *ca=csp->a; 1789f99a636bSHong Zhang PetscInt k,l,m=Cden->rmap->n,ncolors=matcoloring->ncolors; 1790e88777f2SHong Zhang PetscInt brows=matcoloring->brows,*den2sp=matcoloring->den2sp; 1791077f23c2SHong Zhang PetscInt nrows,*row,*idx; 1792077f23c2SHong Zhang PetscInt *rows=matcoloring->rows,*colorforrow=matcoloring->colorforrow; 17938972f759SHong Zhang 17948972f759SHong Zhang PetscFunctionBegin; 17951683a169SBarry Smith ierr = MatDenseGetArrayRead(Cden,&ca_den);CHKERRQ(ierr); 1796f99a636bSHong Zhang 1797077f23c2SHong Zhang if (brows > 0) { 1798077f23c2SHong Zhang PetscInt *lstart,row_end,row_start; 1799980ae229SHong Zhang lstart = matcoloring->lstart; 1800580bdb30SBarry Smith ierr = PetscArrayzero(lstart,ncolors);CHKERRQ(ierr); 1801eeb4fd02SHong Zhang 1802077f23c2SHong Zhang row_end = brows; 1803eeb4fd02SHong Zhang if (row_end > m) row_end = m; 1804077f23c2SHong Zhang for (row_start=0; row_start<m; row_start+=brows) { /* loop over row blocks of Csp */ 1805077f23c2SHong Zhang ca_den_ptr = ca_den; 1806980ae229SHong Zhang for (k=0; k<ncolors; k++) { /* loop over colors (columns of Cden) */ 1807eeb4fd02SHong Zhang nrows = matcoloring->nrows[k]; 1808eeb4fd02SHong Zhang row = rows + colorforrow[k]; 1809eeb4fd02SHong Zhang idx = den2sp + colorforrow[k]; 1810eeb4fd02SHong Zhang for (l=lstart[k]; l<nrows; l++) { 1811eeb4fd02SHong Zhang if (row[l] >= row_end) { 1812eeb4fd02SHong Zhang lstart[k] = l; 1813eeb4fd02SHong Zhang break; 1814eeb4fd02SHong Zhang } else { 1815077f23c2SHong Zhang ca[idx[l]] = ca_den_ptr[row[l]]; 1816eeb4fd02SHong Zhang } 1817eeb4fd02SHong Zhang } 1818077f23c2SHong Zhang ca_den_ptr += m; 1819eeb4fd02SHong Zhang } 1820077f23c2SHong Zhang row_end += brows; 1821eeb4fd02SHong Zhang if (row_end > m) row_end = m; 1822eeb4fd02SHong Zhang } 1823077f23c2SHong Zhang } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */ 1824077f23c2SHong Zhang ca_den_ptr = ca_den; 1825077f23c2SHong Zhang for (k=0; k<ncolors; k++) { 1826077f23c2SHong Zhang nrows = matcoloring->nrows[k]; 1827077f23c2SHong Zhang row = rows + colorforrow[k]; 1828077f23c2SHong Zhang idx = den2sp + colorforrow[k]; 1829077f23c2SHong Zhang for (l=0; l<nrows; l++) { 1830077f23c2SHong Zhang ca[idx[l]] = ca_den_ptr[row[l]]; 1831077f23c2SHong Zhang } 1832077f23c2SHong Zhang ca_den_ptr += m; 1833077f23c2SHong Zhang } 1834f99a636bSHong Zhang } 1835f99a636bSHong Zhang 18361683a169SBarry Smith ierr = MatDenseRestoreArrayRead(Cden,&ca_den);CHKERRQ(ierr); 1837f99a636bSHong Zhang #if defined(PETSC_USE_INFO) 1838077f23c2SHong Zhang if (matcoloring->brows > 0) { 18397d3de750SJacob Faibussowitsch ierr = PetscInfo(Csp,"Loop over %" PetscInt_FMT " row blocks for den2sp\n",brows);CHKERRQ(ierr); 1840e88777f2SHong Zhang } else { 1841077f23c2SHong Zhang ierr = PetscInfo(Csp,"Loop over colors/columns of Cden, inefficient for large sparse matrix product \n");CHKERRQ(ierr); 1842e88777f2SHong Zhang } 1843f99a636bSHong Zhang #endif 18448972f759SHong Zhang PetscFunctionReturn(0); 18458972f759SHong Zhang } 18468972f759SHong Zhang 1847b9af6bddSHong Zhang PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c) 1848b1683b59SHong Zhang { 1849b1683b59SHong Zhang PetscErrorCode ierr; 1850e88777f2SHong Zhang PetscInt i,n,nrows,Nbs,j,k,m,ncols,col,cm; 18511a83f524SJed Brown const PetscInt *is,*ci,*cj,*row_idx; 1852b28d80bdSHong Zhang PetscInt nis = iscoloring->n,*rowhit,bs = 1; 1853b1683b59SHong Zhang IS *isa; 1854b28d80bdSHong Zhang Mat_SeqAIJ *csp = (Mat_SeqAIJ*)mat->data; 1855e88777f2SHong Zhang PetscInt *colorforrow,*rows,*rows_i,*idxhit,*spidx,*den2sp,*den2sp_i; 1856e88777f2SHong Zhang PetscInt *colorforcol,*columns,*columns_i,brows; 1857e88777f2SHong Zhang PetscBool flg; 1858b1683b59SHong Zhang 1859b1683b59SHong Zhang PetscFunctionBegin; 1860071fcb05SBarry Smith ierr = ISColoringGetIS(iscoloring,PETSC_USE_POINTER,PETSC_IGNORE,&isa);CHKERRQ(ierr); 1861e99be685SHong Zhang 18627ecccc15SHong Zhang /* bs >1 is not being tested yet! */ 1863e88777f2SHong Zhang Nbs = mat->cmap->N/bs; 1864b1683b59SHong Zhang c->M = mat->rmap->N/bs; /* set total rows, columns and local rows */ 1865e88777f2SHong Zhang c->N = Nbs; 1866e88777f2SHong Zhang c->m = c->M; 1867b1683b59SHong Zhang c->rstart = 0; 1868e88777f2SHong Zhang c->brows = 100; 1869b1683b59SHong Zhang 1870b1683b59SHong Zhang c->ncolors = nis; 1871dcca6d9dSJed Brown ierr = PetscMalloc3(nis,&c->ncolumns,nis,&c->nrows,nis+1,&colorforrow);CHKERRQ(ierr); 1872854ce69bSBarry Smith ierr = PetscMalloc1(csp->nz+1,&rows);CHKERRQ(ierr); 1873854ce69bSBarry Smith ierr = PetscMalloc1(csp->nz+1,&den2sp);CHKERRQ(ierr); 1874e88777f2SHong Zhang 1875e88777f2SHong Zhang brows = c->brows; 1876c5929fdfSBarry Smith ierr = PetscOptionsGetInt(NULL,NULL,"-matden2sp_brows",&brows,&flg);CHKERRQ(ierr); 1877e88777f2SHong Zhang if (flg) c->brows = brows; 1878eeb4fd02SHong Zhang if (brows > 0) { 1879854ce69bSBarry Smith ierr = PetscMalloc1(nis+1,&c->lstart);CHKERRQ(ierr); 1880e88777f2SHong Zhang } 18812205254eSKarl Rupp 1882d2853540SHong Zhang colorforrow[0] = 0; 1883d2853540SHong Zhang rows_i = rows; 1884f99a636bSHong Zhang den2sp_i = den2sp; 1885b1683b59SHong Zhang 1886854ce69bSBarry Smith ierr = PetscMalloc1(nis+1,&colorforcol);CHKERRQ(ierr); 1887854ce69bSBarry Smith ierr = PetscMalloc1(Nbs+1,&columns);CHKERRQ(ierr); 18882205254eSKarl Rupp 1889d2853540SHong Zhang colorforcol[0] = 0; 1890d2853540SHong Zhang columns_i = columns; 1891d2853540SHong Zhang 1892077f23c2SHong Zhang /* get column-wise storage of mat */ 1893077f23c2SHong Zhang ierr = MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr); 1894b1683b59SHong Zhang 1895b28d80bdSHong Zhang cm = c->m; 1896854ce69bSBarry Smith ierr = PetscMalloc1(cm+1,&rowhit);CHKERRQ(ierr); 1897854ce69bSBarry Smith ierr = PetscMalloc1(cm+1,&idxhit);CHKERRQ(ierr); 1898f99a636bSHong Zhang for (i=0; i<nis; i++) { /* loop over color */ 1899b1683b59SHong Zhang ierr = ISGetLocalSize(isa[i],&n);CHKERRQ(ierr); 1900b1683b59SHong Zhang ierr = ISGetIndices(isa[i],&is);CHKERRQ(ierr); 19012205254eSKarl Rupp 1902b1683b59SHong Zhang c->ncolumns[i] = n; 1903b1683b59SHong Zhang if (n) { 1904580bdb30SBarry Smith ierr = PetscArraycpy(columns_i,is,n);CHKERRQ(ierr); 1905b1683b59SHong Zhang } 1906d2853540SHong Zhang colorforcol[i+1] = colorforcol[i] + n; 1907d2853540SHong Zhang columns_i += n; 1908b1683b59SHong Zhang 1909b1683b59SHong Zhang /* fast, crude version requires O(N*N) work */ 1910580bdb30SBarry Smith ierr = PetscArrayzero(rowhit,cm);CHKERRQ(ierr); 1911e99be685SHong Zhang 1912b7caf3d6SHong Zhang for (j=0; j<n; j++) { /* loop over columns*/ 1913b1683b59SHong Zhang col = is[j]; 1914d2853540SHong Zhang row_idx = cj + ci[col]; 1915b1683b59SHong Zhang m = ci[col+1] - ci[col]; 1916b7caf3d6SHong Zhang for (k=0; k<m; k++) { /* loop over columns marking them in rowhit */ 1917e99be685SHong Zhang idxhit[*row_idx] = spidx[ci[col] + k]; 1918d2853540SHong Zhang rowhit[*row_idx++] = col + 1; 1919b1683b59SHong Zhang } 1920b1683b59SHong Zhang } 1921b1683b59SHong Zhang /* count the number of hits */ 1922b1683b59SHong Zhang nrows = 0; 1923e8354b3eSHong Zhang for (j=0; j<cm; j++) { 1924b1683b59SHong Zhang if (rowhit[j]) nrows++; 1925b1683b59SHong Zhang } 1926b1683b59SHong Zhang c->nrows[i] = nrows; 1927d2853540SHong Zhang colorforrow[i+1] = colorforrow[i] + nrows; 1928d2853540SHong Zhang 1929b1683b59SHong Zhang nrows = 0; 1930b7caf3d6SHong Zhang for (j=0; j<cm; j++) { /* loop over rows */ 1931b1683b59SHong Zhang if (rowhit[j]) { 1932d2853540SHong Zhang rows_i[nrows] = j; 193312b89a43SHong Zhang den2sp_i[nrows] = idxhit[j]; 1934b1683b59SHong Zhang nrows++; 1935b1683b59SHong Zhang } 1936b1683b59SHong Zhang } 1937e88777f2SHong Zhang den2sp_i += nrows; 1938077f23c2SHong Zhang 1939b1683b59SHong Zhang ierr = ISRestoreIndices(isa[i],&is);CHKERRQ(ierr); 1940f99a636bSHong Zhang rows_i += nrows; 1941b1683b59SHong Zhang } 19420298fd71SBarry Smith ierr = MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr); 1943b28d80bdSHong Zhang ierr = PetscFree(rowhit);CHKERRQ(ierr); 1944071fcb05SBarry Smith ierr = ISColoringRestoreIS(iscoloring,PETSC_USE_POINTER,&isa);CHKERRQ(ierr); 19452c71b3e2SJacob Faibussowitsch PetscCheck(csp->nz == colorforrow[nis],PETSC_COMM_SELF,PETSC_ERR_PLIB,"csp->nz %" PetscInt_FMT " != colorforrow[nis] %" PetscInt_FMT,csp->nz,colorforrow[nis]); 1946b28d80bdSHong Zhang 1947d2853540SHong Zhang c->colorforrow = colorforrow; 1948d2853540SHong Zhang c->rows = rows; 1949f99a636bSHong Zhang c->den2sp = den2sp; 1950d2853540SHong Zhang c->colorforcol = colorforcol; 1951d2853540SHong Zhang c->columns = columns; 19522205254eSKarl Rupp 1953f94ccd6cSHong Zhang ierr = PetscFree(idxhit);CHKERRQ(ierr); 1954b1683b59SHong Zhang PetscFunctionReturn(0); 1955b1683b59SHong Zhang } 1956d013fc79Sandi selinger 19574222ddf1SHong Zhang /* --------------------------------------------------------------- */ 19584222ddf1SHong Zhang static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C) 1959df97dc6dSFande Kong { 19604222ddf1SHong Zhang PetscErrorCode ierr; 19614222ddf1SHong Zhang Mat_Product *product = C->product; 19624222ddf1SHong Zhang Mat A=product->A,B=product->B; 19634222ddf1SHong Zhang 1964df97dc6dSFande Kong PetscFunctionBegin; 19654222ddf1SHong Zhang if (C->ops->mattransposemultnumeric) { 19664222ddf1SHong Zhang /* Alg: "outerproduct" */ 19676718818eSStefano Zampini ierr = (*C->ops->mattransposemultnumeric)(A,B,C);CHKERRQ(ierr); 19684222ddf1SHong Zhang } else { 19694222ddf1SHong Zhang /* Alg: "matmatmult" -- C = At*B */ 19706718818eSStefano Zampini Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)product->data; 19716718818eSStefano Zampini Mat At; 19724222ddf1SHong Zhang 19732c71b3e2SJacob Faibussowitsch PetscCheckFalse(!atb,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct"); 19746718818eSStefano Zampini At = atb->At; 1975089a957eSStefano Zampini if (atb->updateAt && At) { /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ() */ 19764222ddf1SHong Zhang ierr = MatTranspose_SeqAIJ(A,MAT_REUSE_MATRIX,&At);CHKERRQ(ierr); 19774222ddf1SHong Zhang } 1978089a957eSStefano Zampini ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ(At ? At : A,B,C);CHKERRQ(ierr); 19794222ddf1SHong Zhang atb->updateAt = PETSC_TRUE; 19804222ddf1SHong Zhang } 1981df97dc6dSFande Kong PetscFunctionReturn(0); 1982df97dc6dSFande Kong } 1983df97dc6dSFande Kong 19844222ddf1SHong Zhang static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C) 1985d013fc79Sandi selinger { 1986d013fc79Sandi selinger PetscErrorCode ierr; 19874222ddf1SHong Zhang Mat_Product *product = C->product; 19884222ddf1SHong Zhang Mat A=product->A,B=product->B; 19894222ddf1SHong Zhang PetscReal fill=product->fill; 1990d013fc79Sandi selinger 1991d013fc79Sandi selinger PetscFunctionBegin; 19924222ddf1SHong Zhang ierr = MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr); 19932869b61bSandi selinger 19944222ddf1SHong Zhang C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ; 19954222ddf1SHong Zhang PetscFunctionReturn(0); 19962869b61bSandi selinger } 1997d013fc79Sandi selinger 19984222ddf1SHong Zhang /* --------------------------------------------------------------- */ 19994222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C) 20004222ddf1SHong Zhang { 20014222ddf1SHong Zhang PetscErrorCode ierr; 20024222ddf1SHong Zhang Mat_Product *product = C->product; 20034222ddf1SHong Zhang PetscInt alg = 0; /* default algorithm */ 20044222ddf1SHong Zhang PetscBool flg = PETSC_FALSE; 20054222ddf1SHong Zhang #if !defined(PETSC_HAVE_HYPRE) 20064222ddf1SHong Zhang const char *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"}; 20074222ddf1SHong Zhang PetscInt nalg = 7; 20084222ddf1SHong Zhang #else 20094222ddf1SHong Zhang const char *algTypes[8] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge","hypre"}; 20104222ddf1SHong Zhang PetscInt nalg = 8; 20114222ddf1SHong Zhang #endif 20124222ddf1SHong Zhang 20134222ddf1SHong Zhang PetscFunctionBegin; 20144222ddf1SHong Zhang /* Set default algorithm */ 20154222ddf1SHong Zhang ierr = PetscStrcmp(C->product->alg,"default",&flg);CHKERRQ(ierr); 20164222ddf1SHong Zhang if (flg) { 20174222ddf1SHong Zhang ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 2018d013fc79Sandi selinger } 2019d013fc79Sandi selinger 20204222ddf1SHong Zhang /* Get runtime option */ 20214222ddf1SHong Zhang if (product->api_user) { 20224222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMult","Mat");CHKERRQ(ierr); 20234222ddf1SHong Zhang ierr = PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr); 20244222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 20254222ddf1SHong Zhang } else { 20264222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr); 20273e662e0bSHong Zhang ierr = PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_AB",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr); 20284222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 2029d013fc79Sandi selinger } 20304222ddf1SHong Zhang if (flg) { 20314222ddf1SHong Zhang ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 2032d013fc79Sandi selinger } 2033d013fc79Sandi selinger 20344222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_AB; 20354222ddf1SHong Zhang C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ; 20364222ddf1SHong Zhang PetscFunctionReturn(0); 20374222ddf1SHong Zhang } 2038d013fc79Sandi selinger 20394222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C) 20404222ddf1SHong Zhang { 20414222ddf1SHong Zhang PetscErrorCode ierr; 20424222ddf1SHong Zhang Mat_Product *product = C->product; 20434222ddf1SHong Zhang PetscInt alg = 0; /* default algorithm */ 20444222ddf1SHong Zhang PetscBool flg = PETSC_FALSE; 2045089a957eSStefano Zampini const char *algTypes[3] = {"default","at*b","outerproduct"}; 2046089a957eSStefano Zampini PetscInt nalg = 3; 2047d013fc79Sandi selinger 20484222ddf1SHong Zhang PetscFunctionBegin; 20494222ddf1SHong Zhang /* Get runtime option */ 20504222ddf1SHong Zhang if (product->api_user) { 20514222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr); 20524222ddf1SHong Zhang ierr = PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr); 20534222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 20544222ddf1SHong Zhang } else { 20554222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr); 20563e662e0bSHong Zhang ierr = PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_AtB",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr); 20574222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 20584222ddf1SHong Zhang } 20594222ddf1SHong Zhang if (flg) { 20604222ddf1SHong Zhang ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 20614222ddf1SHong Zhang } 2062d013fc79Sandi selinger 20634222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ; 20644222ddf1SHong Zhang PetscFunctionReturn(0); 20654222ddf1SHong Zhang } 20664222ddf1SHong Zhang 20674222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C) 20684222ddf1SHong Zhang { 20694222ddf1SHong Zhang PetscErrorCode ierr; 20704222ddf1SHong Zhang Mat_Product *product = C->product; 20714222ddf1SHong Zhang PetscInt alg = 0; /* default algorithm */ 20724222ddf1SHong Zhang PetscBool flg = PETSC_FALSE; 20734222ddf1SHong Zhang const char *algTypes[2] = {"default","color"}; 20744222ddf1SHong Zhang PetscInt nalg = 2; 20754222ddf1SHong Zhang 20764222ddf1SHong Zhang PetscFunctionBegin; 20774222ddf1SHong Zhang /* Set default algorithm */ 20784222ddf1SHong Zhang ierr = PetscStrcmp(C->product->alg,"default",&flg);CHKERRQ(ierr); 20794222ddf1SHong Zhang if (!flg) { 20804222ddf1SHong Zhang alg = 1; 20814222ddf1SHong Zhang ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 20824222ddf1SHong Zhang } 20834222ddf1SHong Zhang 20844222ddf1SHong Zhang /* Get runtime option */ 20854222ddf1SHong Zhang if (product->api_user) { 20864222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatTransposeMult","Mat");CHKERRQ(ierr); 20874222ddf1SHong Zhang ierr = PetscOptionsEList("-matmattransmult_via","Algorithmic approach","MatMatTransposeMult",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr); 20884222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 20894222ddf1SHong Zhang } else { 20904222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABt","Mat");CHKERRQ(ierr); 20913e662e0bSHong Zhang ierr = PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_ABt",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr); 20924222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 20934222ddf1SHong Zhang } 20944222ddf1SHong Zhang if (flg) { 20954222ddf1SHong Zhang ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 20964222ddf1SHong Zhang } 20974222ddf1SHong Zhang 20984222ddf1SHong Zhang C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ; 20994222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_ABt; 21004222ddf1SHong Zhang PetscFunctionReturn(0); 21014222ddf1SHong Zhang } 21024222ddf1SHong Zhang 21034222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C) 21044222ddf1SHong Zhang { 21054222ddf1SHong Zhang PetscErrorCode ierr; 21064222ddf1SHong Zhang Mat_Product *product = C->product; 21074222ddf1SHong Zhang PetscBool flg = PETSC_FALSE; 21084222ddf1SHong Zhang PetscInt alg = 0; /* default algorithm -- alg=1 should be default!!! */ 21094222ddf1SHong Zhang #if !defined(PETSC_HAVE_HYPRE) 21104222ddf1SHong Zhang const char *algTypes[2] = {"scalable","rap"}; 21114222ddf1SHong Zhang PetscInt nalg = 2; 21124222ddf1SHong Zhang #else 21134222ddf1SHong Zhang const char *algTypes[3] = {"scalable","rap","hypre"}; 21144222ddf1SHong Zhang PetscInt nalg = 3; 21154222ddf1SHong Zhang #endif 21164222ddf1SHong Zhang 21174222ddf1SHong Zhang PetscFunctionBegin; 21184222ddf1SHong Zhang /* Set default algorithm */ 21194222ddf1SHong Zhang ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr); 21204222ddf1SHong Zhang if (flg) { 21214222ddf1SHong Zhang ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 21224222ddf1SHong Zhang } 21234222ddf1SHong Zhang 21244222ddf1SHong Zhang /* Get runtime option */ 21254222ddf1SHong Zhang if (product->api_user) { 21264222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatPtAP","Mat");CHKERRQ(ierr); 21274222ddf1SHong Zhang ierr = PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr); 21284222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 21294222ddf1SHong Zhang } else { 21304222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr); 21313e662e0bSHong Zhang ierr = PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_PtAP",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr); 21324222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 21334222ddf1SHong Zhang } 21344222ddf1SHong Zhang if (flg) { 21354222ddf1SHong Zhang ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 21364222ddf1SHong Zhang } 21374222ddf1SHong Zhang 21384222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ; 21394222ddf1SHong Zhang PetscFunctionReturn(0); 21404222ddf1SHong Zhang } 21414222ddf1SHong Zhang 21424222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C) 21434222ddf1SHong Zhang { 21444222ddf1SHong Zhang PetscErrorCode ierr; 21454222ddf1SHong Zhang Mat_Product *product = C->product; 21464222ddf1SHong Zhang PetscBool flg = PETSC_FALSE; 21474222ddf1SHong Zhang PetscInt alg = 0; /* default algorithm */ 21484222ddf1SHong Zhang const char *algTypes[3] = {"r*a*rt","r*art","coloring_rart"}; 21494222ddf1SHong Zhang PetscInt nalg = 3; 21504222ddf1SHong Zhang 21514222ddf1SHong Zhang PetscFunctionBegin; 21524222ddf1SHong Zhang /* Set default algorithm */ 21534222ddf1SHong Zhang ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr); 21544222ddf1SHong Zhang if (flg) { 21554222ddf1SHong Zhang ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 21564222ddf1SHong Zhang } 21574222ddf1SHong Zhang 21584222ddf1SHong Zhang /* Get runtime option */ 21594222ddf1SHong Zhang if (product->api_user) { 21604222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatRARt","Mat");CHKERRQ(ierr); 21614222ddf1SHong Zhang ierr = PetscOptionsEList("-matrart_via","Algorithmic approach","MatRARt",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr); 21624222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 21634222ddf1SHong Zhang } else { 21644222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr); 21653e662e0bSHong Zhang ierr = PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_RARt",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr); 21664222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 21674222ddf1SHong Zhang } 21684222ddf1SHong Zhang if (flg) { 21694222ddf1SHong Zhang ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 21704222ddf1SHong Zhang } 21714222ddf1SHong Zhang 21724222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ; 21734222ddf1SHong Zhang PetscFunctionReturn(0); 21744222ddf1SHong Zhang } 21754222ddf1SHong Zhang 21764222ddf1SHong Zhang /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */ 21774222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C) 21784222ddf1SHong Zhang { 21794222ddf1SHong Zhang PetscErrorCode ierr; 21804222ddf1SHong Zhang Mat_Product *product = C->product; 21814222ddf1SHong Zhang PetscInt alg = 0; /* default algorithm */ 21824222ddf1SHong Zhang PetscBool flg = PETSC_FALSE; 21834222ddf1SHong Zhang const char *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"}; 21844222ddf1SHong Zhang PetscInt nalg = 7; 21854222ddf1SHong Zhang 21864222ddf1SHong Zhang PetscFunctionBegin; 21874222ddf1SHong Zhang /* Set default algorithm */ 21884222ddf1SHong Zhang ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr); 21894222ddf1SHong Zhang if (flg) { 21904222ddf1SHong Zhang ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 21914222ddf1SHong Zhang } 21924222ddf1SHong Zhang 21934222ddf1SHong Zhang /* Get runtime option */ 21944222ddf1SHong Zhang if (product->api_user) { 21954222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr); 21964222ddf1SHong Zhang ierr = PetscOptionsEList("-matmatmatmult_via","Algorithmic approach","MatMatMatMult",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr); 21974222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 21984222ddf1SHong Zhang } else { 21994222ddf1SHong Zhang ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr); 22003e662e0bSHong Zhang ierr = PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_ABC",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr); 22014222ddf1SHong Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 22024222ddf1SHong Zhang } 22034222ddf1SHong Zhang if (flg) { 22044222ddf1SHong Zhang ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr); 22054222ddf1SHong Zhang } 22064222ddf1SHong Zhang 22074222ddf1SHong Zhang C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ; 22084222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_ABC; 22094222ddf1SHong Zhang PetscFunctionReturn(0); 22104222ddf1SHong Zhang } 22114222ddf1SHong Zhang 22124222ddf1SHong Zhang PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C) 22134222ddf1SHong Zhang { 22144222ddf1SHong Zhang PetscErrorCode ierr; 22154222ddf1SHong Zhang Mat_Product *product = C->product; 22164222ddf1SHong Zhang 22174222ddf1SHong Zhang PetscFunctionBegin; 22184222ddf1SHong Zhang switch (product->type) { 22194222ddf1SHong Zhang case MATPRODUCT_AB: 22204222ddf1SHong Zhang ierr = MatProductSetFromOptions_SeqAIJ_AB(C);CHKERRQ(ierr); 22214222ddf1SHong Zhang break; 22224222ddf1SHong Zhang case MATPRODUCT_AtB: 22234222ddf1SHong Zhang ierr = MatProductSetFromOptions_SeqAIJ_AtB(C);CHKERRQ(ierr); 22244222ddf1SHong Zhang break; 22254222ddf1SHong Zhang case MATPRODUCT_ABt: 22264222ddf1SHong Zhang ierr = MatProductSetFromOptions_SeqAIJ_ABt(C);CHKERRQ(ierr); 22274222ddf1SHong Zhang break; 22284222ddf1SHong Zhang case MATPRODUCT_PtAP: 22294222ddf1SHong Zhang ierr = MatProductSetFromOptions_SeqAIJ_PtAP(C);CHKERRQ(ierr); 22304222ddf1SHong Zhang break; 22314222ddf1SHong Zhang case MATPRODUCT_RARt: 22324222ddf1SHong Zhang ierr = MatProductSetFromOptions_SeqAIJ_RARt(C);CHKERRQ(ierr); 22334222ddf1SHong Zhang break; 22344222ddf1SHong Zhang case MATPRODUCT_ABC: 22354222ddf1SHong Zhang ierr = MatProductSetFromOptions_SeqAIJ_ABC(C);CHKERRQ(ierr); 22364222ddf1SHong Zhang break; 22376718818eSStefano Zampini default: 22386718818eSStefano Zampini break; 22394222ddf1SHong Zhang } 2240d013fc79Sandi selinger PetscFunctionReturn(0); 2241d013fc79Sandi selinger } 2242