1d50806bdSBarry Smith /* 22499ec78SHong Zhang Defines matrix-matrix product routines for pairs of SeqAIJ matrices 3d50806bdSBarry Smith C = A * B 4d50806bdSBarry Smith */ 5d50806bdSBarry Smith 6c6db04a5SJed Brown #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 7c6db04a5SJed Brown #include <../src/mat/utils/freespace.h> 8c6db04a5SJed Brown #include <petscbt.h> 9af0996ceSBarry Smith #include <petsc/private/isimpl.h> 1007475bc1SBarry Smith #include <../src/mat/impls/dense/seq/dense.h> 117bab7c10SHong Zhang 12d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C) 13d71ae5a4SJacob Faibussowitsch { 14df97dc6dSFande Kong PetscFunctionBegin; 15dbbe0bcdSBarry Smith if (C->ops->matmultnumeric) PetscCall((*C->ops->matmultnumeric)(A, B, C)); 16dbbe0bcdSBarry Smith else PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A, B, C)); 173ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 18df97dc6dSFande Kong } 19df97dc6dSFande Kong 204222ddf1SHong Zhang /* Modified from MatCreateSeqAIJWithArrays() */ 21d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt i[], PetscInt j[], PetscScalar a[], MatType mtype, Mat mat) 22d71ae5a4SJacob Faibussowitsch { 234222ddf1SHong Zhang PetscInt ii; 244222ddf1SHong Zhang Mat_SeqAIJ *aij; 259f0612e4SBarry Smith PetscBool isseqaij, ofree_a, ofree_ij; 265c66b693SKris Buschelman 275c66b693SKris Buschelman PetscFunctionBegin; 28aed4548fSBarry Smith PetscCheck(m <= 0 || !i[0], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0"); 299566063dSJacob Faibussowitsch PetscCall(MatSetSizes(mat, m, n, m, n)); 304222ddf1SHong Zhang 31e4e71118SStefano Zampini if (!mtype) { 329566063dSJacob Faibussowitsch PetscCall(PetscObjectBaseTypeCompare((PetscObject)mat, MATSEQAIJ, &isseqaij)); 339566063dSJacob Faibussowitsch if (!isseqaij) PetscCall(MatSetType(mat, MATSEQAIJ)); 34e4e71118SStefano Zampini } else { 359566063dSJacob Faibussowitsch PetscCall(MatSetType(mat, mtype)); 36e4e71118SStefano Zampini } 37cbc6b225SStefano Zampini 38*57508eceSPierre Jolivet aij = (Mat_SeqAIJ *)mat->data; 39cbc6b225SStefano Zampini ofree_a = aij->free_a; 40cbc6b225SStefano Zampini ofree_ij = aij->free_ij; 41cbc6b225SStefano Zampini /* changes the free flags */ 429566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(mat, MAT_SKIP_ALLOCATION, NULL)); 43cbc6b225SStefano Zampini 449566063dSJacob Faibussowitsch PetscCall(PetscFree(aij->ilen)); 459566063dSJacob Faibussowitsch PetscCall(PetscFree(aij->imax)); 469566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &aij->imax)); 479566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &aij->ilen)); 48cbc6b225SStefano Zampini for (ii = 0, aij->nonzerorowcnt = 0, aij->rmax = 0; ii < m; ii++) { 49cbc6b225SStefano Zampini const PetscInt rnz = i[ii + 1] - i[ii]; 50cbc6b225SStefano Zampini aij->nonzerorowcnt += !!rnz; 51cbc6b225SStefano Zampini aij->rmax = PetscMax(aij->rmax, rnz); 52cbc6b225SStefano Zampini aij->ilen[ii] = aij->imax[ii] = i[ii + 1] - i[ii]; 53cbc6b225SStefano Zampini } 54cbc6b225SStefano Zampini aij->maxnz = i[m]; 55cbc6b225SStefano Zampini aij->nz = i[m]; 564222ddf1SHong Zhang 579f0612e4SBarry Smith if (ofree_a) PetscCall(PetscShmgetDeallocateArray((void **)&aij->a)); 589f0612e4SBarry Smith if (ofree_ij) PetscCall(PetscShmgetDeallocateArray((void **)&aij->j)); 599f0612e4SBarry Smith if (ofree_ij) PetscCall(PetscShmgetDeallocateArray((void **)&aij->i)); 609f0612e4SBarry Smith 614222ddf1SHong Zhang aij->i = i; 624222ddf1SHong Zhang aij->j = j; 634222ddf1SHong Zhang aij->a = a; 644222ddf1SHong Zhang aij->nonew = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */ 654222ddf1SHong Zhang aij->free_a = PETSC_FALSE; 664222ddf1SHong Zhang aij->free_ij = PETSC_FALSE; 679566063dSJacob Faibussowitsch PetscCall(MatCheckCompressedRow(mat, aij->nonzerorowcnt, &aij->compressedrow, aij->i, m, 0.6)); 686dc08606SJunchao Zhang // Always build the diag info when i, j are set 696dc08606SJunchao Zhang PetscCall(MatMarkDiagonal_SeqAIJ(mat)); 703ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 715c66b693SKris Buschelman } 721c24bd37SHong Zhang 73d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C) 74d71ae5a4SJacob Faibussowitsch { 754222ddf1SHong Zhang Mat_Product *product = C->product; 764222ddf1SHong Zhang MatProductAlgorithm alg; 774222ddf1SHong Zhang PetscBool flg; 784222ddf1SHong Zhang 794222ddf1SHong Zhang PetscFunctionBegin; 804222ddf1SHong Zhang if (product) { 814222ddf1SHong Zhang alg = product->alg; 824222ddf1SHong Zhang } else { 834222ddf1SHong Zhang alg = "sorted"; 844222ddf1SHong Zhang } 854222ddf1SHong Zhang /* sorted */ 869566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(alg, "sorted", &flg)); 874222ddf1SHong Zhang if (flg) { 889566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A, B, fill, C)); 893ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 904222ddf1SHong Zhang } 914222ddf1SHong Zhang 924222ddf1SHong Zhang /* scalable */ 939566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(alg, "scalable", &flg)); 944222ddf1SHong Zhang if (flg) { 959566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A, B, fill, C)); 963ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 974222ddf1SHong Zhang } 984222ddf1SHong Zhang 994222ddf1SHong Zhang /* scalable_fast */ 1009566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(alg, "scalable_fast", &flg)); 1014222ddf1SHong Zhang if (flg) { 1029566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A, B, fill, C)); 1033ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1044222ddf1SHong Zhang } 1054222ddf1SHong Zhang 1064222ddf1SHong Zhang /* heap */ 1079566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(alg, "heap", &flg)); 1084222ddf1SHong Zhang if (flg) { 1099566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A, B, fill, C)); 1103ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1114222ddf1SHong Zhang } 1124222ddf1SHong Zhang 1134222ddf1SHong Zhang /* btheap */ 1149566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(alg, "btheap", &flg)); 1154222ddf1SHong Zhang if (flg) { 1169566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A, B, fill, C)); 1173ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1184222ddf1SHong Zhang } 1194222ddf1SHong Zhang 1204222ddf1SHong Zhang /* llcondensed */ 1219566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(alg, "llcondensed", &flg)); 1224222ddf1SHong Zhang if (flg) { 1239566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A, B, fill, C)); 1243ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1254222ddf1SHong Zhang } 1264222ddf1SHong Zhang 1274222ddf1SHong Zhang /* rowmerge */ 1289566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(alg, "rowmerge", &flg)); 1294222ddf1SHong Zhang if (flg) { 1309566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A, B, fill, C)); 1313ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1324222ddf1SHong Zhang } 1334222ddf1SHong Zhang 1344222ddf1SHong Zhang #if defined(PETSC_HAVE_HYPRE) 1359566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(alg, "hypre", &flg)); 1364222ddf1SHong Zhang if (flg) { 1379566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A, B, fill, C)); 1383ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1394222ddf1SHong Zhang } 1404222ddf1SHong Zhang #endif 1414222ddf1SHong Zhang 1424222ddf1SHong Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported"); 1434222ddf1SHong Zhang } 1444222ddf1SHong Zhang 145d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A, Mat B, PetscReal fill, Mat C) 146d71ae5a4SJacob Faibussowitsch { 147b561aa9dSHong Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 148971236abSHong Zhang PetscInt *ai = a->i, *bi = b->i, *ci, *cj; 149c1ba5575SJed Brown PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 150b561aa9dSHong Zhang PetscReal afill; 151eca6b86aSHong Zhang PetscInt i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax; 152eec179cfSJacob Faibussowitsch PetscHMapI ta; 153fb908031SHong Zhang PetscBT lnkbt; 1540298fd71SBarry Smith PetscFreeSpaceList free_space = NULL, current_space = NULL; 155b561aa9dSHong Zhang 156b561aa9dSHong Zhang PetscFunctionBegin; 157bd958071SHong Zhang /* Get ci and cj */ 158fb908031SHong Zhang /* Allocate ci array, arrays for fill computation and */ 159fb908031SHong Zhang /* free space for accumulating nonzero column info */ 1609566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(am + 2, &ci)); 161fb908031SHong Zhang ci[0] = 0; 162fb908031SHong Zhang 163fb908031SHong Zhang /* create and initialize a linked list */ 164eec179cfSJacob Faibussowitsch PetscCall(PetscHMapICreateWithSize(bn, &ta)); 165c373ccc6SHong Zhang MatRowMergeMax_SeqAIJ(b, bm, ta); 166eec179cfSJacob Faibussowitsch PetscCall(PetscHMapIGetSize(ta, &Crmax)); 167eec179cfSJacob Faibussowitsch PetscCall(PetscHMapIDestroy(&ta)); 168eca6b86aSHong Zhang 1699566063dSJacob Faibussowitsch PetscCall(PetscLLCondensedCreate(Crmax, bn, &lnk, &lnkbt)); 170fb908031SHong Zhang 171fb908031SHong Zhang /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 1729566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 1732205254eSKarl Rupp 174fb908031SHong Zhang current_space = free_space; 175fb908031SHong Zhang 176fb908031SHong Zhang /* Determine ci and cj */ 177fb908031SHong Zhang for (i = 0; i < am; i++) { 178fb908031SHong Zhang anzi = ai[i + 1] - ai[i]; 179fb908031SHong Zhang aj = a->j + ai[i]; 180fb908031SHong Zhang for (j = 0; j < anzi; j++) { 181fb908031SHong Zhang brow = aj[j]; 182fb908031SHong Zhang bnzj = bi[brow + 1] - bi[brow]; 183fb908031SHong Zhang bj = b->j + bi[brow]; 184fb908031SHong Zhang /* add non-zero cols of B into the sorted linked list lnk */ 1859566063dSJacob Faibussowitsch PetscCall(PetscLLCondensedAddSorted(bnzj, bj, lnk, lnkbt)); 186fb908031SHong Zhang } 1878c78258cSHong Zhang /* add possible missing diagonal entry */ 18848a46eb9SPierre Jolivet if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted(1, &i, lnk, lnkbt)); 189fb908031SHong Zhang cnzi = lnk[0]; 190fb908031SHong Zhang 191fb908031SHong Zhang /* If free space is not available, make more free space */ 192fb908031SHong Zhang /* Double the amount of total space in the list */ 193fb908031SHong Zhang if (current_space->local_remaining < cnzi) { 1949566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), ¤t_space)); 195fb908031SHong Zhang ndouble++; 196fb908031SHong Zhang } 197fb908031SHong Zhang 198fb908031SHong Zhang /* Copy data into free space, then initialize lnk */ 1999566063dSJacob Faibussowitsch PetscCall(PetscLLCondensedClean(bn, cnzi, current_space->array, lnk, lnkbt)); 2002205254eSKarl Rupp 201fb908031SHong Zhang current_space->array += cnzi; 202fb908031SHong Zhang current_space->local_used += cnzi; 203fb908031SHong Zhang current_space->local_remaining -= cnzi; 2042205254eSKarl Rupp 205fb908031SHong Zhang ci[i + 1] = ci[i] + cnzi; 206fb908031SHong Zhang } 207fb908031SHong Zhang 208fb908031SHong Zhang /* Column indices are in the list of free space */ 209fb908031SHong Zhang /* Allocate space for cj, initialize cj, and */ 210fb908031SHong Zhang /* destroy list of free space and other temporary array(s) */ 2119566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(ci[am] + 1, &cj)); 2129566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 2139566063dSJacob Faibussowitsch PetscCall(PetscLLCondensedDestroy(lnk, lnkbt)); 214b561aa9dSHong Zhang 21526be0446SHong Zhang /* put together the new symbolic matrix */ 2169566063dSJacob Faibussowitsch PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 2179566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(C, A, B)); 21858c24d83SHong Zhang 21958c24d83SHong Zhang /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 22058c24d83SHong Zhang /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 221f4f49eeaSPierre Jolivet c = (Mat_SeqAIJ *)C->data; 222aa1aec99SHong Zhang c->free_a = PETSC_FALSE; 223e6b907acSBarry Smith c->free_ij = PETSC_TRUE; 22458c24d83SHong Zhang c->nonew = 0; 2254222ddf1SHong Zhang 2264222ddf1SHong Zhang /* fast, needs non-scalable O(bn) array 'abdense' */ 2274222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 2280b7e3e3dSHong Zhang 2297212da7cSHong Zhang /* set MatInfo */ 2307212da7cSHong Zhang afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 231f2b054eeSHong Zhang if (afill < 1.0) afill = 1.0; 2324222ddf1SHong Zhang C->info.mallocs = ndouble; 2334222ddf1SHong Zhang C->info.fill_ratio_given = fill; 2344222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 2357212da7cSHong Zhang 2367212da7cSHong Zhang #if defined(PETSC_USE_INFO) 2377212da7cSHong Zhang if (ci[am]) { 2389566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 2399566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 240f2b054eeSHong Zhang } else { 2419566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Empty matrix product\n")); 242be0fcf8dSHong Zhang } 243f2b054eeSHong Zhang #endif 2443ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 24558c24d83SHong Zhang } 246d50806bdSBarry Smith 247d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, Mat C) 248d71ae5a4SJacob Faibussowitsch { 249d13dce4bSSatish Balay PetscLogDouble flops = 0.0; 250d50806bdSBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 251d50806bdSBarry Smith Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 252d50806bdSBarry Smith Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 25338baddfdSBarry Smith PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j; 254c58ca2e3SHong Zhang PetscInt am = A->rmap->n, cm = C->rmap->n; 255519eb980SHong Zhang PetscInt i, j, k, anzi, bnzi, cnzi, brow; 2562e5835c6SStefano Zampini PetscScalar *ca, valtmp; 257aa1aec99SHong Zhang PetscScalar *ab_dense; 2586718818eSStefano Zampini PetscContainer cab_dense; 2592e5835c6SStefano Zampini const PetscScalar *aa, *ba, *baj; 260d50806bdSBarry Smith 261d50806bdSBarry Smith PetscFunctionBegin; 2629566063dSJacob Faibussowitsch PetscCall(MatSeqAIJGetArrayRead(A, &aa)); 2639566063dSJacob Faibussowitsch PetscCall(MatSeqAIJGetArrayRead(B, &ba)); 264aa1aec99SHong Zhang if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */ 2659566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(ci[cm] + 1, &ca)); 266aa1aec99SHong Zhang c->a = ca; 267aa1aec99SHong Zhang c->free_a = PETSC_TRUE; 2686718818eSStefano Zampini } else ca = c->a; 2696718818eSStefano Zampini 2706718818eSStefano Zampini /* TODO this should be done in the symbolic phase */ 2716718818eSStefano Zampini /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers 2726718818eSStefano Zampini that is hard to eradicate) */ 2739566063dSJacob Faibussowitsch PetscCall(PetscObjectQuery((PetscObject)C, "__PETSc__ab_dense", (PetscObject *)&cab_dense)); 2746718818eSStefano Zampini if (!cab_dense) { 2759566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(B->cmap->N, &ab_dense)); 2769566063dSJacob Faibussowitsch PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &cab_dense)); 2779566063dSJacob Faibussowitsch PetscCall(PetscContainerSetPointer(cab_dense, ab_dense)); 2789566063dSJacob Faibussowitsch PetscCall(PetscContainerSetUserDestroy(cab_dense, PetscContainerUserDestroyDefault)); 2799566063dSJacob Faibussowitsch PetscCall(PetscObjectCompose((PetscObject)C, "__PETSc__ab_dense", (PetscObject)cab_dense)); 2809566063dSJacob Faibussowitsch PetscCall(PetscObjectDereference((PetscObject)cab_dense)); 281d90d86d0SMatthew G. Knepley } 2829566063dSJacob Faibussowitsch PetscCall(PetscContainerGetPointer(cab_dense, (void **)&ab_dense)); 2839566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(ab_dense, B->cmap->N)); 284aa1aec99SHong Zhang 285c124e916SHong Zhang /* clean old values in C */ 2869566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(ca, ci[cm])); 287d50806bdSBarry Smith /* Traverse A row-wise. */ 288d50806bdSBarry Smith /* Build the ith row in C by summing over nonzero columns in A, */ 289d50806bdSBarry Smith /* the rows of B corresponding to nonzeros of A. */ 290d50806bdSBarry Smith for (i = 0; i < am; i++) { 291d50806bdSBarry Smith anzi = ai[i + 1] - ai[i]; 292d50806bdSBarry Smith for (j = 0; j < anzi; j++) { 293519eb980SHong Zhang brow = aj[j]; 294d50806bdSBarry Smith bnzi = bi[brow + 1] - bi[brow]; 2953b51c7ffSStefano Zampini bjj = PetscSafePointerPlusOffset(bj, bi[brow]); 2963b51c7ffSStefano Zampini baj = PetscSafePointerPlusOffset(ba, bi[brow]); 297519eb980SHong Zhang /* perform dense axpy */ 29836ec6d2dSHong Zhang valtmp = aa[j]; 299ad540459SPierre Jolivet for (k = 0; k < bnzi; k++) ab_dense[bjj[k]] += valtmp * baj[k]; 300519eb980SHong Zhang flops += 2 * bnzi; 301519eb980SHong Zhang } 3028e3a54c0SPierre Jolivet aj = PetscSafePointerPlusOffset(aj, anzi); 3038e3a54c0SPierre Jolivet aa = PetscSafePointerPlusOffset(aa, anzi); 304c58ca2e3SHong Zhang 305c58ca2e3SHong Zhang cnzi = ci[i + 1] - ci[i]; 306519eb980SHong Zhang for (k = 0; k < cnzi; k++) { 307519eb980SHong Zhang ca[k] += ab_dense[cj[k]]; 308519eb980SHong Zhang ab_dense[cj[k]] = 0.0; /* zero ab_dense */ 309519eb980SHong Zhang } 310519eb980SHong Zhang flops += cnzi; 3118e3a54c0SPierre Jolivet cj = PetscSafePointerPlusOffset(cj, cnzi); 3129371c9d4SSatish Balay ca += cnzi; 313519eb980SHong Zhang } 3142e5835c6SStefano Zampini #if defined(PETSC_HAVE_DEVICE) 3152e5835c6SStefano Zampini if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU; 3162e5835c6SStefano Zampini #endif 3179566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 3189566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 3199566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(flops)); 3209566063dSJacob Faibussowitsch PetscCall(MatSeqAIJRestoreArrayRead(A, &aa)); 3219566063dSJacob Faibussowitsch PetscCall(MatSeqAIJRestoreArrayRead(B, &ba)); 3223ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 323c58ca2e3SHong Zhang } 324c58ca2e3SHong Zhang 325d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, Mat C) 326d71ae5a4SJacob Faibussowitsch { 327c58ca2e3SHong Zhang PetscLogDouble flops = 0.0; 328c58ca2e3SHong Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 329c58ca2e3SHong Zhang Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 330c58ca2e3SHong Zhang Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 331c58ca2e3SHong Zhang PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j; 332c58ca2e3SHong Zhang PetscInt am = A->rmap->N, cm = C->rmap->N; 333c58ca2e3SHong Zhang PetscInt i, j, k, anzi, bnzi, cnzi, brow; 3342e5835c6SStefano Zampini PetscScalar *ca = c->a, valtmp; 3352e5835c6SStefano Zampini const PetscScalar *aa, *ba, *baj; 336c58ca2e3SHong Zhang PetscInt nextb; 337c58ca2e3SHong Zhang 338c58ca2e3SHong Zhang PetscFunctionBegin; 3399566063dSJacob Faibussowitsch PetscCall(MatSeqAIJGetArrayRead(A, &aa)); 3409566063dSJacob Faibussowitsch PetscCall(MatSeqAIJGetArrayRead(B, &ba)); 341cf742fccSHong Zhang if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */ 3429566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(ci[cm] + 1, &ca)); 343cf742fccSHong Zhang c->a = ca; 344cf742fccSHong Zhang c->free_a = PETSC_TRUE; 345cf742fccSHong Zhang } 346cf742fccSHong Zhang 347c58ca2e3SHong Zhang /* clean old values in C */ 3489566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(ca, ci[cm])); 349c58ca2e3SHong Zhang /* Traverse A row-wise. */ 350c58ca2e3SHong Zhang /* Build the ith row in C by summing over nonzero columns in A, */ 351c58ca2e3SHong Zhang /* the rows of B corresponding to nonzeros of A. */ 352519eb980SHong Zhang for (i = 0; i < am; i++) { 353519eb980SHong Zhang anzi = ai[i + 1] - ai[i]; 354519eb980SHong Zhang cnzi = ci[i + 1] - ci[i]; 355519eb980SHong Zhang for (j = 0; j < anzi; j++) { 356519eb980SHong Zhang brow = aj[j]; 357519eb980SHong Zhang bnzi = bi[brow + 1] - bi[brow]; 358519eb980SHong Zhang bjj = bj + bi[brow]; 359519eb980SHong Zhang baj = ba + bi[brow]; 360519eb980SHong Zhang /* perform sparse axpy */ 36136ec6d2dSHong Zhang valtmp = aa[j]; 362c124e916SHong Zhang nextb = 0; 363c124e916SHong Zhang for (k = 0; nextb < bnzi; k++) { 364c124e916SHong Zhang if (cj[k] == bjj[nextb]) { /* ccol == bcol */ 36536ec6d2dSHong Zhang ca[k] += valtmp * baj[nextb++]; 366c124e916SHong Zhang } 367d50806bdSBarry Smith } 368d50806bdSBarry Smith flops += 2 * bnzi; 369d50806bdSBarry Smith } 3709371c9d4SSatish Balay aj += anzi; 3719371c9d4SSatish Balay aa += anzi; 3729371c9d4SSatish Balay cj += cnzi; 3739371c9d4SSatish Balay ca += cnzi; 374d50806bdSBarry Smith } 3752e5835c6SStefano Zampini #if defined(PETSC_HAVE_DEVICE) 3762e5835c6SStefano Zampini if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU; 3772e5835c6SStefano Zampini #endif 3789566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 3799566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 3809566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(flops)); 3819566063dSJacob Faibussowitsch PetscCall(MatSeqAIJRestoreArrayRead(A, &aa)); 3829566063dSJacob Faibussowitsch PetscCall(MatSeqAIJRestoreArrayRead(B, &ba)); 3833ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 384d50806bdSBarry Smith } 385bc011b1eSHong Zhang 386d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A, Mat B, PetscReal fill, Mat C) 387d71ae5a4SJacob Faibussowitsch { 38825296bd5SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 38925296bd5SBarry Smith PetscInt *ai = a->i, *bi = b->i, *ci, *cj; 3903c50cad2SHong Zhang PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 39125296bd5SBarry Smith MatScalar *ca; 39225296bd5SBarry Smith PetscReal afill; 393eca6b86aSHong Zhang PetscInt i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax; 394eec179cfSJacob Faibussowitsch PetscHMapI ta; 3950298fd71SBarry Smith PetscFreeSpaceList free_space = NULL, current_space = NULL; 39625296bd5SBarry Smith 39725296bd5SBarry Smith PetscFunctionBegin; 3983c50cad2SHong Zhang /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */ 3993c50cad2SHong Zhang /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 4009566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(am + 2, &ci)); 4013c50cad2SHong Zhang ci[0] = 0; 4023c50cad2SHong Zhang 4033c50cad2SHong Zhang /* create and initialize a linked list */ 404eec179cfSJacob Faibussowitsch PetscCall(PetscHMapICreateWithSize(bn, &ta)); 405c373ccc6SHong Zhang MatRowMergeMax_SeqAIJ(b, bm, ta); 406eec179cfSJacob Faibussowitsch PetscCall(PetscHMapIGetSize(ta, &Crmax)); 407eec179cfSJacob Faibussowitsch PetscCall(PetscHMapIDestroy(&ta)); 408eca6b86aSHong Zhang 4099566063dSJacob Faibussowitsch PetscCall(PetscLLCondensedCreate_fast(Crmax, &lnk)); 4103c50cad2SHong Zhang 4113c50cad2SHong Zhang /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 4129566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 4133c50cad2SHong Zhang current_space = free_space; 4143c50cad2SHong Zhang 4153c50cad2SHong Zhang /* Determine ci and cj */ 4163c50cad2SHong Zhang for (i = 0; i < am; i++) { 4173c50cad2SHong Zhang anzi = ai[i + 1] - ai[i]; 4183c50cad2SHong Zhang aj = a->j + ai[i]; 4193c50cad2SHong Zhang for (j = 0; j < anzi; j++) { 4203c50cad2SHong Zhang brow = aj[j]; 4213c50cad2SHong Zhang bnzj = bi[brow + 1] - bi[brow]; 4223c50cad2SHong Zhang bj = b->j + bi[brow]; 4233c50cad2SHong Zhang /* add non-zero cols of B into the sorted linked list lnk */ 4249566063dSJacob Faibussowitsch PetscCall(PetscLLCondensedAddSorted_fast(bnzj, bj, lnk)); 4253c50cad2SHong Zhang } 4268c78258cSHong Zhang /* add possible missing diagonal entry */ 42748a46eb9SPierre Jolivet if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_fast(1, &i, lnk)); 4283c50cad2SHong Zhang cnzi = lnk[1]; 4293c50cad2SHong Zhang 4303c50cad2SHong Zhang /* If free space is not available, make more free space */ 4313c50cad2SHong Zhang /* Double the amount of total space in the list */ 4323c50cad2SHong Zhang if (current_space->local_remaining < cnzi) { 4339566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), ¤t_space)); 4343c50cad2SHong Zhang ndouble++; 4353c50cad2SHong Zhang } 4363c50cad2SHong Zhang 4373c50cad2SHong Zhang /* Copy data into free space, then initialize lnk */ 4389566063dSJacob Faibussowitsch PetscCall(PetscLLCondensedClean_fast(cnzi, current_space->array, lnk)); 4392205254eSKarl Rupp 4403c50cad2SHong Zhang current_space->array += cnzi; 4413c50cad2SHong Zhang current_space->local_used += cnzi; 4423c50cad2SHong Zhang current_space->local_remaining -= cnzi; 4432205254eSKarl Rupp 4443c50cad2SHong Zhang ci[i + 1] = ci[i] + cnzi; 4453c50cad2SHong Zhang } 4463c50cad2SHong Zhang 4473c50cad2SHong Zhang /* Column indices are in the list of free space */ 4483c50cad2SHong Zhang /* Allocate space for cj, initialize cj, and */ 4493c50cad2SHong Zhang /* destroy list of free space and other temporary array(s) */ 4509566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(ci[am] + 1, &cj)); 4519566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 4529566063dSJacob Faibussowitsch PetscCall(PetscLLCondensedDestroy_fast(lnk)); 45325296bd5SBarry Smith 45425296bd5SBarry Smith /* Allocate space for ca */ 4559566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(ci[am] + 1, &ca)); 45625296bd5SBarry Smith 45725296bd5SBarry Smith /* put together the new symbolic matrix */ 4589566063dSJacob Faibussowitsch PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C)); 4599566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(C, A, B)); 46025296bd5SBarry Smith 46125296bd5SBarry Smith /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 46225296bd5SBarry Smith /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 463f4f49eeaSPierre Jolivet c = (Mat_SeqAIJ *)C->data; 46425296bd5SBarry Smith c->free_a = PETSC_TRUE; 46525296bd5SBarry Smith c->free_ij = PETSC_TRUE; 46625296bd5SBarry Smith c->nonew = 0; 4672205254eSKarl Rupp 4684222ddf1SHong Zhang /* slower, less memory */ 4694222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; 47025296bd5SBarry Smith 47125296bd5SBarry Smith /* set MatInfo */ 47225296bd5SBarry Smith afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 47325296bd5SBarry Smith if (afill < 1.0) afill = 1.0; 4744222ddf1SHong Zhang C->info.mallocs = ndouble; 4754222ddf1SHong Zhang C->info.fill_ratio_given = fill; 4764222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 47725296bd5SBarry Smith 47825296bd5SBarry Smith #if defined(PETSC_USE_INFO) 47925296bd5SBarry Smith if (ci[am]) { 4809566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 4819566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 48225296bd5SBarry Smith } else { 4839566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Empty matrix product\n")); 48425296bd5SBarry Smith } 48525296bd5SBarry Smith #endif 4863ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 48725296bd5SBarry Smith } 48825296bd5SBarry Smith 489d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, PetscReal fill, Mat C) 490d71ae5a4SJacob Faibussowitsch { 491e9e4536cSHong Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 492bf9555e6SHong Zhang PetscInt *ai = a->i, *bi = b->i, *ci, *cj; 49325c41797SHong Zhang PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 494e9e4536cSHong Zhang MatScalar *ca; 495e9e4536cSHong Zhang PetscReal afill; 496eca6b86aSHong Zhang PetscInt i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax; 497eec179cfSJacob Faibussowitsch PetscHMapI ta; 4980298fd71SBarry Smith PetscFreeSpaceList free_space = NULL, current_space = NULL; 499e9e4536cSHong Zhang 500e9e4536cSHong Zhang PetscFunctionBegin; 501bd958071SHong Zhang /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */ 502bd958071SHong Zhang /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 5039566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(am + 2, &ci)); 504bd958071SHong Zhang ci[0] = 0; 505bd958071SHong Zhang 506bd958071SHong Zhang /* create and initialize a linked list */ 507eec179cfSJacob Faibussowitsch PetscCall(PetscHMapICreateWithSize(bn, &ta)); 508c373ccc6SHong Zhang MatRowMergeMax_SeqAIJ(b, bm, ta); 509eec179cfSJacob Faibussowitsch PetscCall(PetscHMapIGetSize(ta, &Crmax)); 510eec179cfSJacob Faibussowitsch PetscCall(PetscHMapIDestroy(&ta)); 5119566063dSJacob Faibussowitsch PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk)); 512bd958071SHong Zhang 513bd958071SHong Zhang /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 5149566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 515bd958071SHong Zhang current_space = free_space; 516bd958071SHong Zhang 517bd958071SHong Zhang /* Determine ci and cj */ 518bd958071SHong Zhang for (i = 0; i < am; i++) { 519bd958071SHong Zhang anzi = ai[i + 1] - ai[i]; 520bd958071SHong Zhang aj = a->j + ai[i]; 521bd958071SHong Zhang for (j = 0; j < anzi; j++) { 522bd958071SHong Zhang brow = aj[j]; 523bd958071SHong Zhang bnzj = bi[brow + 1] - bi[brow]; 524bd958071SHong Zhang bj = b->j + bi[brow]; 525bd958071SHong Zhang /* add non-zero cols of B into the sorted linked list lnk */ 5269566063dSJacob Faibussowitsch PetscCall(PetscLLCondensedAddSorted_Scalable(bnzj, bj, lnk)); 527bd958071SHong Zhang } 5288c78258cSHong Zhang /* add possible missing diagonal entry */ 52948a46eb9SPierre Jolivet if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_Scalable(1, &i, lnk)); 5308c78258cSHong Zhang 531bd958071SHong Zhang cnzi = lnk[0]; 532bd958071SHong Zhang 533bd958071SHong Zhang /* If free space is not available, make more free space */ 534bd958071SHong Zhang /* Double the amount of total space in the list */ 535bd958071SHong Zhang if (current_space->local_remaining < cnzi) { 5369566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), ¤t_space)); 537bd958071SHong Zhang ndouble++; 538bd958071SHong Zhang } 539bd958071SHong Zhang 540bd958071SHong Zhang /* Copy data into free space, then initialize lnk */ 5419566063dSJacob Faibussowitsch PetscCall(PetscLLCondensedClean_Scalable(cnzi, current_space->array, lnk)); 5422205254eSKarl Rupp 543bd958071SHong Zhang current_space->array += cnzi; 544bd958071SHong Zhang current_space->local_used += cnzi; 545bd958071SHong Zhang current_space->local_remaining -= cnzi; 5462205254eSKarl Rupp 547bd958071SHong Zhang ci[i + 1] = ci[i] + cnzi; 548bd958071SHong Zhang } 549bd958071SHong Zhang 550bd958071SHong Zhang /* Column indices are in the list of free space */ 551bd958071SHong Zhang /* Allocate space for cj, initialize cj, and */ 552bd958071SHong Zhang /* destroy list of free space and other temporary array(s) */ 5539566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(ci[am] + 1, &cj)); 5549566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 5559566063dSJacob Faibussowitsch PetscCall(PetscLLCondensedDestroy_Scalable(lnk)); 556e9e4536cSHong Zhang 557e9e4536cSHong Zhang /* Allocate space for ca */ 5589566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(ci[am] + 1, &ca)); 559e9e4536cSHong Zhang 560e9e4536cSHong Zhang /* put together the new symbolic matrix */ 5619566063dSJacob Faibussowitsch PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C)); 5629566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(C, A, B)); 563e9e4536cSHong Zhang 564e9e4536cSHong Zhang /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 565e9e4536cSHong Zhang /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 566f4f49eeaSPierre Jolivet c = (Mat_SeqAIJ *)C->data; 567e9e4536cSHong Zhang c->free_a = PETSC_TRUE; 568e9e4536cSHong Zhang c->free_ij = PETSC_TRUE; 569e9e4536cSHong Zhang c->nonew = 0; 5702205254eSKarl Rupp 5714222ddf1SHong Zhang /* slower, less memory */ 5724222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; 573e9e4536cSHong Zhang 574e9e4536cSHong Zhang /* set MatInfo */ 575e9e4536cSHong Zhang afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 576e9e4536cSHong Zhang if (afill < 1.0) afill = 1.0; 5774222ddf1SHong Zhang C->info.mallocs = ndouble; 5784222ddf1SHong Zhang C->info.fill_ratio_given = fill; 5794222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 580e9e4536cSHong Zhang 581e9e4536cSHong Zhang #if defined(PETSC_USE_INFO) 582e9e4536cSHong Zhang if (ci[am]) { 5839566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 5849566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 585e9e4536cSHong Zhang } else { 5869566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Empty matrix product\n")); 587e9e4536cSHong Zhang } 588e9e4536cSHong Zhang #endif 5893ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 590e9e4536cSHong Zhang } 591e9e4536cSHong Zhang 592d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A, Mat B, PetscReal fill, Mat C) 593d71ae5a4SJacob Faibussowitsch { 5940ced3a2bSJed Brown Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 5950ced3a2bSJed Brown const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j; 5960ced3a2bSJed Brown PetscInt *ci, *cj, *bb; 5970ced3a2bSJed Brown PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 5980ced3a2bSJed Brown PetscReal afill; 5990ced3a2bSJed Brown PetscInt i, j, col, ndouble = 0; 6000298fd71SBarry Smith PetscFreeSpaceList free_space = NULL, current_space = NULL; 6010ced3a2bSJed Brown PetscHeap h; 6020ced3a2bSJed Brown 6030ced3a2bSJed Brown PetscFunctionBegin; 604cd093f1dSJed Brown /* Get ci and cj - by merging sorted rows using a heap */ 6050ced3a2bSJed Brown /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 6069566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(am + 2, &ci)); 6070ced3a2bSJed Brown ci[0] = 0; 6080ced3a2bSJed Brown 6090ced3a2bSJed Brown /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 6109566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 6110ced3a2bSJed Brown current_space = free_space; 6120ced3a2bSJed Brown 6139566063dSJacob Faibussowitsch PetscCall(PetscHeapCreate(a->rmax, &h)); 6149566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(a->rmax, &bb)); 6150ced3a2bSJed Brown 6160ced3a2bSJed Brown /* Determine ci and cj */ 6170ced3a2bSJed Brown for (i = 0; i < am; i++) { 6180ced3a2bSJed Brown const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 6190ced3a2bSJed Brown const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 6200ced3a2bSJed Brown ci[i + 1] = ci[i]; 6210ced3a2bSJed Brown /* Populate the min heap */ 6220ced3a2bSJed Brown for (j = 0; j < anzi; j++) { 6230ced3a2bSJed Brown bb[j] = bi[acol[j]]; /* bb points at the start of the row */ 6240ced3a2bSJed Brown if (bb[j] < bi[acol[j] + 1]) { /* Add if row is nonempty */ 6259566063dSJacob Faibussowitsch PetscCall(PetscHeapAdd(h, j, bj[bb[j]++])); 6260ced3a2bSJed Brown } 6270ced3a2bSJed Brown } 6280ced3a2bSJed Brown /* Pick off the min element, adding it to free space */ 6299566063dSJacob Faibussowitsch PetscCall(PetscHeapPop(h, &j, &col)); 6300ced3a2bSJed Brown while (j >= 0) { 6310ced3a2bSJed Brown if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */ 6329566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), ¤t_space)); 6330ced3a2bSJed Brown ndouble++; 6340ced3a2bSJed Brown } 6350ced3a2bSJed Brown *(current_space->array++) = col; 6360ced3a2bSJed Brown current_space->local_used++; 6370ced3a2bSJed Brown current_space->local_remaining--; 6380ced3a2bSJed Brown ci[i + 1]++; 6390ced3a2bSJed Brown 6400ced3a2bSJed Brown /* stash if anything else remains in this row of B */ 6419566063dSJacob Faibussowitsch if (bb[j] < bi[acol[j] + 1]) PetscCall(PetscHeapStash(h, j, bj[bb[j]++])); 6420ced3a2bSJed Brown while (1) { /* pop and stash any other rows of B that also had an entry in this column */ 6430ced3a2bSJed Brown PetscInt j2, col2; 6449566063dSJacob Faibussowitsch PetscCall(PetscHeapPeek(h, &j2, &col2)); 6450ced3a2bSJed Brown if (col2 != col) break; 6469566063dSJacob Faibussowitsch PetscCall(PetscHeapPop(h, &j2, &col2)); 6479566063dSJacob Faibussowitsch if (bb[j2] < bi[acol[j2] + 1]) PetscCall(PetscHeapStash(h, j2, bj[bb[j2]++])); 6480ced3a2bSJed Brown } 6490ced3a2bSJed Brown /* Put any stashed elements back into the min heap */ 6509566063dSJacob Faibussowitsch PetscCall(PetscHeapUnstash(h)); 6519566063dSJacob Faibussowitsch PetscCall(PetscHeapPop(h, &j, &col)); 6520ced3a2bSJed Brown } 6530ced3a2bSJed Brown } 6549566063dSJacob Faibussowitsch PetscCall(PetscFree(bb)); 6559566063dSJacob Faibussowitsch PetscCall(PetscHeapDestroy(&h)); 6560ced3a2bSJed Brown 6570ced3a2bSJed Brown /* Column indices are in the list of free space */ 6580ced3a2bSJed Brown /* Allocate space for cj, initialize cj, and */ 6590ced3a2bSJed Brown /* destroy list of free space and other temporary array(s) */ 6609566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(ci[am], &cj)); 6619566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 6620ced3a2bSJed Brown 6630ced3a2bSJed Brown /* put together the new symbolic matrix */ 6649566063dSJacob Faibussowitsch PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 6659566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(C, A, B)); 6660ced3a2bSJed Brown 6670ced3a2bSJed Brown /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 6680ced3a2bSJed Brown /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 669f4f49eeaSPierre Jolivet c = (Mat_SeqAIJ *)C->data; 6700ced3a2bSJed Brown c->free_a = PETSC_TRUE; 6710ced3a2bSJed Brown c->free_ij = PETSC_TRUE; 6720ced3a2bSJed Brown c->nonew = 0; 67326fbe8dcSKarl Rupp 6744222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 6750ced3a2bSJed Brown 6760ced3a2bSJed Brown /* set MatInfo */ 6770ced3a2bSJed Brown afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 6780ced3a2bSJed Brown if (afill < 1.0) afill = 1.0; 6794222ddf1SHong Zhang C->info.mallocs = ndouble; 6804222ddf1SHong Zhang C->info.fill_ratio_given = fill; 6814222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 6820ced3a2bSJed Brown 6830ced3a2bSJed Brown #if defined(PETSC_USE_INFO) 6840ced3a2bSJed Brown if (ci[am]) { 6859566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 6869566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 6870ced3a2bSJed Brown } else { 6889566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Empty matrix product\n")); 6890ced3a2bSJed Brown } 6900ced3a2bSJed Brown #endif 6913ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 6920ced3a2bSJed Brown } 693e9e4536cSHong Zhang 694d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A, Mat B, PetscReal fill, Mat C) 695d71ae5a4SJacob Faibussowitsch { 6968a07c6f1SJed Brown Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 6978a07c6f1SJed Brown const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j; 6988a07c6f1SJed Brown PetscInt *ci, *cj, *bb; 6998a07c6f1SJed Brown PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 7008a07c6f1SJed Brown PetscReal afill; 7018a07c6f1SJed Brown PetscInt i, j, col, ndouble = 0; 7020298fd71SBarry Smith PetscFreeSpaceList free_space = NULL, current_space = NULL; 7038a07c6f1SJed Brown PetscHeap h; 7048a07c6f1SJed Brown PetscBT bt; 7058a07c6f1SJed Brown 7068a07c6f1SJed Brown PetscFunctionBegin; 707cd093f1dSJed Brown /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */ 7088a07c6f1SJed Brown /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 7099566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(am + 2, &ci)); 7108a07c6f1SJed Brown ci[0] = 0; 7118a07c6f1SJed Brown 7128a07c6f1SJed Brown /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 7139566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 7142205254eSKarl Rupp 7158a07c6f1SJed Brown current_space = free_space; 7168a07c6f1SJed Brown 7179566063dSJacob Faibussowitsch PetscCall(PetscHeapCreate(a->rmax, &h)); 7189566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(a->rmax, &bb)); 7199566063dSJacob Faibussowitsch PetscCall(PetscBTCreate(bn, &bt)); 7208a07c6f1SJed Brown 7218a07c6f1SJed Brown /* Determine ci and cj */ 7228a07c6f1SJed Brown for (i = 0; i < am; i++) { 7238a07c6f1SJed Brown const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 7248a07c6f1SJed Brown const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 7258a07c6f1SJed Brown const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */ 7268a07c6f1SJed Brown ci[i + 1] = ci[i]; 7278a07c6f1SJed Brown /* Populate the min heap */ 7288a07c6f1SJed Brown for (j = 0; j < anzi; j++) { 7298a07c6f1SJed Brown PetscInt brow = acol[j]; 7308a07c6f1SJed Brown for (bb[j] = bi[brow]; bb[j] < bi[brow + 1]; bb[j]++) { 7318a07c6f1SJed Brown PetscInt bcol = bj[bb[j]]; 7328a07c6f1SJed Brown if (!PetscBTLookupSet(bt, bcol)) { /* new entry */ 7339566063dSJacob Faibussowitsch PetscCall(PetscHeapAdd(h, j, bcol)); 7348a07c6f1SJed Brown bb[j]++; 7358a07c6f1SJed Brown break; 7368a07c6f1SJed Brown } 7378a07c6f1SJed Brown } 7388a07c6f1SJed Brown } 7398a07c6f1SJed Brown /* Pick off the min element, adding it to free space */ 7409566063dSJacob Faibussowitsch PetscCall(PetscHeapPop(h, &j, &col)); 7418a07c6f1SJed Brown while (j >= 0) { 7428a07c6f1SJed Brown if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */ 7430298fd71SBarry Smith fptr = NULL; /* need PetscBTMemzero */ 7449566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), ¤t_space)); 7458a07c6f1SJed Brown ndouble++; 7468a07c6f1SJed Brown } 7478a07c6f1SJed Brown *(current_space->array++) = col; 7488a07c6f1SJed Brown current_space->local_used++; 7498a07c6f1SJed Brown current_space->local_remaining--; 7508a07c6f1SJed Brown ci[i + 1]++; 7518a07c6f1SJed Brown 7528a07c6f1SJed Brown /* stash if anything else remains in this row of B */ 7538a07c6f1SJed Brown for (; bb[j] < bi[acol[j] + 1]; bb[j]++) { 7548a07c6f1SJed Brown PetscInt bcol = bj[bb[j]]; 7558a07c6f1SJed Brown if (!PetscBTLookupSet(bt, bcol)) { /* new entry */ 7569566063dSJacob Faibussowitsch PetscCall(PetscHeapAdd(h, j, bcol)); 7578a07c6f1SJed Brown bb[j]++; 7588a07c6f1SJed Brown break; 7598a07c6f1SJed Brown } 7608a07c6f1SJed Brown } 7619566063dSJacob Faibussowitsch PetscCall(PetscHeapPop(h, &j, &col)); 7628a07c6f1SJed Brown } 7638a07c6f1SJed Brown if (fptr) { /* Clear the bits for this row */ 7649566063dSJacob Faibussowitsch for (; fptr < current_space->array; fptr++) PetscCall(PetscBTClear(bt, *fptr)); 7658a07c6f1SJed Brown } else { /* We reallocated so we don't remember (easily) how to clear only the bits we changed */ 7669566063dSJacob Faibussowitsch PetscCall(PetscBTMemzero(bn, bt)); 7678a07c6f1SJed Brown } 7688a07c6f1SJed Brown } 7699566063dSJacob Faibussowitsch PetscCall(PetscFree(bb)); 7709566063dSJacob Faibussowitsch PetscCall(PetscHeapDestroy(&h)); 7719566063dSJacob Faibussowitsch PetscCall(PetscBTDestroy(&bt)); 7728a07c6f1SJed Brown 7738a07c6f1SJed Brown /* Column indices are in the list of free space */ 7748a07c6f1SJed Brown /* Allocate space for cj, initialize cj, and */ 7758a07c6f1SJed Brown /* destroy list of free space and other temporary array(s) */ 7769566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(ci[am], &cj)); 7779566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 7788a07c6f1SJed Brown 7798a07c6f1SJed Brown /* put together the new symbolic matrix */ 7809566063dSJacob Faibussowitsch PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 7819566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(C, A, B)); 7828a07c6f1SJed Brown 7838a07c6f1SJed Brown /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 7848a07c6f1SJed Brown /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 785f4f49eeaSPierre Jolivet c = (Mat_SeqAIJ *)C->data; 7868a07c6f1SJed Brown c->free_a = PETSC_TRUE; 7878a07c6f1SJed Brown c->free_ij = PETSC_TRUE; 7888a07c6f1SJed Brown c->nonew = 0; 78926fbe8dcSKarl Rupp 7904222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 7918a07c6f1SJed Brown 7928a07c6f1SJed Brown /* set MatInfo */ 7938a07c6f1SJed Brown afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 7948a07c6f1SJed Brown if (afill < 1.0) afill = 1.0; 7954222ddf1SHong Zhang C->info.mallocs = ndouble; 7964222ddf1SHong Zhang C->info.fill_ratio_given = fill; 7974222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 7988a07c6f1SJed Brown 7998a07c6f1SJed Brown #if defined(PETSC_USE_INFO) 8008a07c6f1SJed Brown if (ci[am]) { 8019566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 8029566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 8038a07c6f1SJed Brown } else { 8049566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Empty matrix product\n")); 8058a07c6f1SJed Brown } 8068a07c6f1SJed Brown #endif 8073ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 8088a07c6f1SJed Brown } 8098a07c6f1SJed Brown 810d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A, Mat B, PetscReal fill, Mat C) 811d71ae5a4SJacob Faibussowitsch { 812d7ed1a4dSandi selinger Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 813d7ed1a4dSandi selinger const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j, *inputi, *inputj, *inputcol, *inputcol_L1; 814d7ed1a4dSandi selinger PetscInt *ci, *cj, *outputj, worki_L1[9], worki_L2[9]; 815d7ed1a4dSandi selinger PetscInt c_maxmem, a_maxrownnz = 0, a_rownnz; 816d7ed1a4dSandi selinger const PetscInt workcol[8] = {0, 1, 2, 3, 4, 5, 6, 7}; 817d7ed1a4dSandi selinger const PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 818d7ed1a4dSandi selinger const PetscInt *brow_ptr[8], *brow_end[8]; 819d7ed1a4dSandi selinger PetscInt window[8]; 820d7ed1a4dSandi selinger PetscInt window_min, old_window_min, ci_nnz, outputi_nnz = 0, L1_nrows, L2_nrows; 821d7ed1a4dSandi selinger PetscInt i, k, ndouble = 0, L1_rowsleft, rowsleft; 822d7ed1a4dSandi selinger PetscReal afill; 823f83700f2Sandi selinger PetscInt *workj_L1, *workj_L2, *workj_L3; 8247660c4dbSandi selinger PetscInt L1_nnz, L2_nnz; 825d7ed1a4dSandi selinger 826d7ed1a4dSandi selinger /* Step 1: Get upper bound on memory required for allocation. 827d7ed1a4dSandi selinger Because of the way virtual memory works, 828d7ed1a4dSandi selinger only the memory pages that are actually needed will be physically allocated. */ 829d7ed1a4dSandi selinger PetscFunctionBegin; 8309566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(am + 1, &ci)); 831d7ed1a4dSandi selinger for (i = 0; i < am; i++) { 832d7ed1a4dSandi selinger const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 833d7ed1a4dSandi selinger const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 834d7ed1a4dSandi selinger a_rownnz = 0; 835d7ed1a4dSandi selinger for (k = 0; k < anzi; ++k) { 836d7ed1a4dSandi selinger a_rownnz += bi[acol[k] + 1] - bi[acol[k]]; 837d7ed1a4dSandi selinger if (a_rownnz > bn) { 838d7ed1a4dSandi selinger a_rownnz = bn; 839d7ed1a4dSandi selinger break; 840d7ed1a4dSandi selinger } 841d7ed1a4dSandi selinger } 842d7ed1a4dSandi selinger a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz); 843d7ed1a4dSandi selinger } 844d7ed1a4dSandi selinger /* temporary work areas for merging rows */ 8459566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L1)); 8469566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L2)); 8479566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(a_maxrownnz, &workj_L3)); 848d7ed1a4dSandi selinger 8497660c4dbSandi selinger /* This should be enough for almost all matrices. If not, memory is reallocated later. */ 8507660c4dbSandi selinger c_maxmem = 8 * (ai[am] + bi[bm]); 851d7ed1a4dSandi selinger /* Step 2: Populate pattern for C */ 8529566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(c_maxmem, &cj)); 853d7ed1a4dSandi selinger 854d7ed1a4dSandi selinger ci_nnz = 0; 855d7ed1a4dSandi selinger ci[0] = 0; 856d7ed1a4dSandi selinger worki_L1[0] = 0; 857d7ed1a4dSandi selinger worki_L2[0] = 0; 858d7ed1a4dSandi selinger for (i = 0; i < am; i++) { 859d7ed1a4dSandi selinger const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 860d7ed1a4dSandi selinger const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 861d7ed1a4dSandi selinger rowsleft = anzi; 862d7ed1a4dSandi selinger inputcol_L1 = acol; 863d7ed1a4dSandi selinger L2_nnz = 0; 8647660c4dbSandi selinger L2_nrows = 1; /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1 */ 8657660c4dbSandi selinger worki_L2[1] = 0; 86608fe1b3cSKarl Rupp outputi_nnz = 0; 867d7ed1a4dSandi selinger 868d7ed1a4dSandi selinger /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem -> allocate more memory */ 869d7ed1a4dSandi selinger while (ci_nnz + a_maxrownnz > c_maxmem) { 870d7ed1a4dSandi selinger c_maxmem *= 2; 871d7ed1a4dSandi selinger ndouble++; 8729566063dSJacob Faibussowitsch PetscCall(PetscRealloc(sizeof(PetscInt) * c_maxmem, &cj)); 873d7ed1a4dSandi selinger } 874d7ed1a4dSandi selinger 875d7ed1a4dSandi selinger while (rowsleft) { 876d7ed1a4dSandi selinger L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */ 877d7ed1a4dSandi selinger L1_nrows = 0; 8787660c4dbSandi selinger L1_nnz = 0; 879d7ed1a4dSandi selinger inputcol = inputcol_L1; 880d7ed1a4dSandi selinger inputi = bi; 881d7ed1a4dSandi selinger inputj = bj; 882d7ed1a4dSandi selinger 883d7ed1a4dSandi selinger /* The following macro is used to specialize for small rows in A. 884d7ed1a4dSandi selinger This helps with compiler unrolling, improving performance substantially. 885f83700f2Sandi selinger Input: inputj inputi inputcol bn 886d7ed1a4dSandi selinger Output: outputj outputi_nnz */ 887d7ed1a4dSandi selinger #define MatMatMultSymbolic_RowMergeMacro(ANNZ) \ 888a8f51744SPierre Jolivet do { \ 889d7ed1a4dSandi selinger window_min = bn; \ 8907660c4dbSandi selinger outputi_nnz = 0; \ 8917660c4dbSandi selinger for (k = 0; k < ANNZ; ++k) { \ 892d7ed1a4dSandi selinger brow_ptr[k] = inputj + inputi[inputcol[k]]; \ 893d7ed1a4dSandi selinger brow_end[k] = inputj + inputi[inputcol[k] + 1]; \ 894d7ed1a4dSandi selinger window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \ 895d7ed1a4dSandi selinger window_min = PetscMin(window[k], window_min); \ 896d7ed1a4dSandi selinger } \ 897d7ed1a4dSandi selinger while (window_min < bn) { \ 898d7ed1a4dSandi selinger outputj[outputi_nnz++] = window_min; \ 899d7ed1a4dSandi selinger /* advance front and compute new minimum */ \ 900d7ed1a4dSandi selinger old_window_min = window_min; \ 901d7ed1a4dSandi selinger window_min = bn; \ 902d7ed1a4dSandi selinger for (k = 0; k < ANNZ; ++k) { \ 903d7ed1a4dSandi selinger if (window[k] == old_window_min) { \ 904d7ed1a4dSandi selinger brow_ptr[k]++; \ 905d7ed1a4dSandi selinger window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \ 906d7ed1a4dSandi selinger } \ 907d7ed1a4dSandi selinger window_min = PetscMin(window[k], window_min); \ 908d7ed1a4dSandi selinger } \ 909a8f51744SPierre Jolivet } \ 910a8f51744SPierre Jolivet } while (0) 911d7ed1a4dSandi selinger 912d7ed1a4dSandi selinger /************** L E V E L 1 ***************/ 913d7ed1a4dSandi selinger /* Merge up to 8 rows of B to L1 work array*/ 914d7ed1a4dSandi selinger while (L1_rowsleft) { 9157660c4dbSandi selinger outputi_nnz = 0; 9167660c4dbSandi selinger if (anzi > 8) outputj = workj_L1 + L1_nnz; /* Level 1 rowmerge*/ 9177660c4dbSandi selinger else outputj = cj + ci_nnz; /* Merge directly to C */ 9187660c4dbSandi selinger 919d7ed1a4dSandi selinger switch (L1_rowsleft) { 9209371c9d4SSatish Balay case 1: 9219371c9d4SSatish Balay brow_ptr[0] = inputj + inputi[inputcol[0]]; 922d7ed1a4dSandi selinger brow_end[0] = inputj + inputi[inputcol[0] + 1]; 923d7ed1a4dSandi selinger for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 924d7ed1a4dSandi selinger inputcol += L1_rowsleft; 925d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 926d7ed1a4dSandi selinger L1_rowsleft = 0; 927d7ed1a4dSandi selinger break; 9289371c9d4SSatish Balay case 2: 9299371c9d4SSatish Balay MatMatMultSymbolic_RowMergeMacro(2); 930d7ed1a4dSandi selinger inputcol += L1_rowsleft; 931d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 932d7ed1a4dSandi selinger L1_rowsleft = 0; 933d7ed1a4dSandi selinger break; 9349371c9d4SSatish Balay case 3: 9359371c9d4SSatish Balay MatMatMultSymbolic_RowMergeMacro(3); 936d7ed1a4dSandi selinger inputcol += L1_rowsleft; 937d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 938d7ed1a4dSandi selinger L1_rowsleft = 0; 939d7ed1a4dSandi selinger break; 9409371c9d4SSatish Balay case 4: 9419371c9d4SSatish Balay MatMatMultSymbolic_RowMergeMacro(4); 942d7ed1a4dSandi selinger inputcol += L1_rowsleft; 943d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 944d7ed1a4dSandi selinger L1_rowsleft = 0; 945d7ed1a4dSandi selinger break; 9469371c9d4SSatish Balay case 5: 9479371c9d4SSatish Balay MatMatMultSymbolic_RowMergeMacro(5); 948d7ed1a4dSandi selinger inputcol += L1_rowsleft; 949d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 950d7ed1a4dSandi selinger L1_rowsleft = 0; 951d7ed1a4dSandi selinger break; 9529371c9d4SSatish Balay case 6: 9539371c9d4SSatish Balay MatMatMultSymbolic_RowMergeMacro(6); 954d7ed1a4dSandi selinger inputcol += L1_rowsleft; 955d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 956d7ed1a4dSandi selinger L1_rowsleft = 0; 957d7ed1a4dSandi selinger break; 9589371c9d4SSatish Balay case 7: 9599371c9d4SSatish Balay MatMatMultSymbolic_RowMergeMacro(7); 960d7ed1a4dSandi selinger inputcol += L1_rowsleft; 961d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 962d7ed1a4dSandi selinger L1_rowsleft = 0; 963d7ed1a4dSandi selinger break; 9649371c9d4SSatish Balay default: 9659371c9d4SSatish Balay MatMatMultSymbolic_RowMergeMacro(8); 966d7ed1a4dSandi selinger inputcol += 8; 967d7ed1a4dSandi selinger rowsleft -= 8; 968d7ed1a4dSandi selinger L1_rowsleft -= 8; 969d7ed1a4dSandi selinger break; 970d7ed1a4dSandi selinger } 971d7ed1a4dSandi selinger inputcol_L1 = inputcol; 9727660c4dbSandi selinger L1_nnz += outputi_nnz; 9737660c4dbSandi selinger worki_L1[++L1_nrows] = L1_nnz; 974d7ed1a4dSandi selinger } 975d7ed1a4dSandi selinger 976d7ed1a4dSandi selinger /********************** L E V E L 2 ************************/ 977d7ed1a4dSandi selinger /* Merge from L1 work array to either C or to L2 work array */ 978d7ed1a4dSandi selinger if (anzi > 8) { 979d7ed1a4dSandi selinger inputi = worki_L1; 980d7ed1a4dSandi selinger inputj = workj_L1; 981d7ed1a4dSandi selinger inputcol = workcol; 982d7ed1a4dSandi selinger outputi_nnz = 0; 983d7ed1a4dSandi selinger 984d7ed1a4dSandi selinger if (anzi <= 64) outputj = cj + ci_nnz; /* Merge from L1 work array to C */ 985d7ed1a4dSandi selinger else outputj = workj_L2 + L2_nnz; /* Merge from L1 work array to L2 work array */ 986d7ed1a4dSandi selinger 987d7ed1a4dSandi selinger switch (L1_nrows) { 9889371c9d4SSatish Balay case 1: 9899371c9d4SSatish Balay brow_ptr[0] = inputj + inputi[inputcol[0]]; 990d7ed1a4dSandi selinger brow_end[0] = inputj + inputi[inputcol[0] + 1]; 991d7ed1a4dSandi selinger for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 992d7ed1a4dSandi selinger break; 993d71ae5a4SJacob Faibussowitsch case 2: 994d71ae5a4SJacob Faibussowitsch MatMatMultSymbolic_RowMergeMacro(2); 995d71ae5a4SJacob Faibussowitsch break; 996d71ae5a4SJacob Faibussowitsch case 3: 997d71ae5a4SJacob Faibussowitsch MatMatMultSymbolic_RowMergeMacro(3); 998d71ae5a4SJacob Faibussowitsch break; 999d71ae5a4SJacob Faibussowitsch case 4: 1000d71ae5a4SJacob Faibussowitsch MatMatMultSymbolic_RowMergeMacro(4); 1001d71ae5a4SJacob Faibussowitsch break; 1002d71ae5a4SJacob Faibussowitsch case 5: 1003d71ae5a4SJacob Faibussowitsch MatMatMultSymbolic_RowMergeMacro(5); 1004d71ae5a4SJacob Faibussowitsch break; 1005d71ae5a4SJacob Faibussowitsch case 6: 1006d71ae5a4SJacob Faibussowitsch MatMatMultSymbolic_RowMergeMacro(6); 1007d71ae5a4SJacob Faibussowitsch break; 1008d71ae5a4SJacob Faibussowitsch case 7: 1009d71ae5a4SJacob Faibussowitsch MatMatMultSymbolic_RowMergeMacro(7); 1010d71ae5a4SJacob Faibussowitsch break; 1011d71ae5a4SJacob Faibussowitsch case 8: 1012d71ae5a4SJacob Faibussowitsch MatMatMultSymbolic_RowMergeMacro(8); 1013d71ae5a4SJacob Faibussowitsch break; 1014d71ae5a4SJacob Faibussowitsch default: 1015d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L1 work array!"); 1016d7ed1a4dSandi selinger } 1017d7ed1a4dSandi selinger L2_nnz += outputi_nnz; 1018d7ed1a4dSandi selinger worki_L2[++L2_nrows] = L2_nnz; 1019d7ed1a4dSandi selinger 10207660c4dbSandi selinger /************************ L E V E L 3 **********************/ 10217660c4dbSandi selinger /* Merge from L2 work array to either C or to L2 work array */ 1022d7ed1a4dSandi selinger if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) { 1023d7ed1a4dSandi selinger inputi = worki_L2; 1024d7ed1a4dSandi selinger inputj = workj_L2; 1025d7ed1a4dSandi selinger inputcol = workcol; 1026d7ed1a4dSandi selinger outputi_nnz = 0; 1027f83700f2Sandi selinger if (rowsleft) outputj = workj_L3; 1028d7ed1a4dSandi selinger else outputj = cj + ci_nnz; 1029d7ed1a4dSandi selinger switch (L2_nrows) { 10309371c9d4SSatish Balay case 1: 10319371c9d4SSatish Balay brow_ptr[0] = inputj + inputi[inputcol[0]]; 1032d7ed1a4dSandi selinger brow_end[0] = inputj + inputi[inputcol[0] + 1]; 1033d7ed1a4dSandi selinger for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 1034d7ed1a4dSandi selinger break; 1035d71ae5a4SJacob Faibussowitsch case 2: 1036d71ae5a4SJacob Faibussowitsch MatMatMultSymbolic_RowMergeMacro(2); 1037d71ae5a4SJacob Faibussowitsch break; 1038d71ae5a4SJacob Faibussowitsch case 3: 1039d71ae5a4SJacob Faibussowitsch MatMatMultSymbolic_RowMergeMacro(3); 1040d71ae5a4SJacob Faibussowitsch break; 1041d71ae5a4SJacob Faibussowitsch case 4: 1042d71ae5a4SJacob Faibussowitsch MatMatMultSymbolic_RowMergeMacro(4); 1043d71ae5a4SJacob Faibussowitsch break; 1044d71ae5a4SJacob Faibussowitsch case 5: 1045d71ae5a4SJacob Faibussowitsch MatMatMultSymbolic_RowMergeMacro(5); 1046d71ae5a4SJacob Faibussowitsch break; 1047d71ae5a4SJacob Faibussowitsch case 6: 1048d71ae5a4SJacob Faibussowitsch MatMatMultSymbolic_RowMergeMacro(6); 1049d71ae5a4SJacob Faibussowitsch break; 1050d71ae5a4SJacob Faibussowitsch case 7: 1051d71ae5a4SJacob Faibussowitsch MatMatMultSymbolic_RowMergeMacro(7); 1052d71ae5a4SJacob Faibussowitsch break; 1053d71ae5a4SJacob Faibussowitsch case 8: 1054d71ae5a4SJacob Faibussowitsch MatMatMultSymbolic_RowMergeMacro(8); 1055d71ae5a4SJacob Faibussowitsch break; 1056d71ae5a4SJacob Faibussowitsch default: 1057d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L2 work array!"); 1058d7ed1a4dSandi selinger } 1059d7ed1a4dSandi selinger L2_nrows = 1; 10607660c4dbSandi selinger L2_nnz = outputi_nnz; 10617660c4dbSandi selinger worki_L2[1] = outputi_nnz; 10627660c4dbSandi selinger /* Copy to workj_L2 */ 1063d7ed1a4dSandi selinger if (rowsleft) { 10647660c4dbSandi selinger for (k = 0; k < outputi_nnz; ++k) workj_L2[k] = outputj[k]; 1065d7ed1a4dSandi selinger } 1066d7ed1a4dSandi selinger } 1067d7ed1a4dSandi selinger } 1068d7ed1a4dSandi selinger } /* while (rowsleft) */ 1069d7ed1a4dSandi selinger #undef MatMatMultSymbolic_RowMergeMacro 1070d7ed1a4dSandi selinger 1071d7ed1a4dSandi selinger /* terminate current row */ 1072d7ed1a4dSandi selinger ci_nnz += outputi_nnz; 1073d7ed1a4dSandi selinger ci[i + 1] = ci_nnz; 1074d7ed1a4dSandi selinger } 1075d7ed1a4dSandi selinger 1076d7ed1a4dSandi selinger /* Step 3: Create the new symbolic matrix */ 10779566063dSJacob Faibussowitsch PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 10789566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(C, A, B)); 1079d7ed1a4dSandi selinger 1080d7ed1a4dSandi selinger /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 1081d7ed1a4dSandi selinger /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 1082f4f49eeaSPierre Jolivet c = (Mat_SeqAIJ *)C->data; 1083d7ed1a4dSandi selinger c->free_a = PETSC_TRUE; 1084d7ed1a4dSandi selinger c->free_ij = PETSC_TRUE; 1085d7ed1a4dSandi selinger c->nonew = 0; 1086d7ed1a4dSandi selinger 10874222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 1088d7ed1a4dSandi selinger 1089d7ed1a4dSandi selinger /* set MatInfo */ 1090d7ed1a4dSandi selinger afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 1091d7ed1a4dSandi selinger if (afill < 1.0) afill = 1.0; 10924222ddf1SHong Zhang C->info.mallocs = ndouble; 10934222ddf1SHong Zhang C->info.fill_ratio_given = fill; 10944222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 1095d7ed1a4dSandi selinger 1096d7ed1a4dSandi selinger #if defined(PETSC_USE_INFO) 1097d7ed1a4dSandi selinger if (ci[am]) { 10989566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 10999566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 1100d7ed1a4dSandi selinger } else { 11019566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Empty matrix product\n")); 1102d7ed1a4dSandi selinger } 1103d7ed1a4dSandi selinger #endif 1104d7ed1a4dSandi selinger 1105d7ed1a4dSandi selinger /* Step 4: Free temporary work areas */ 11069566063dSJacob Faibussowitsch PetscCall(PetscFree(workj_L1)); 11079566063dSJacob Faibussowitsch PetscCall(PetscFree(workj_L2)); 11089566063dSJacob Faibussowitsch PetscCall(PetscFree(workj_L3)); 11093ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1110d7ed1a4dSandi selinger } 1111d7ed1a4dSandi selinger 1112cd093f1dSJed Brown /* concatenate unique entries and then sort */ 1113d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, PetscReal fill, Mat C) 1114d71ae5a4SJacob Faibussowitsch { 1115cd093f1dSJed Brown Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 1116cd093f1dSJed Brown const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j; 11178c78258cSHong Zhang PetscInt *ci, *cj, bcol; 1118cd093f1dSJed Brown PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 1119cd093f1dSJed Brown PetscReal afill; 1120cd093f1dSJed Brown PetscInt i, j, ndouble = 0; 1121cd093f1dSJed Brown PetscSegBuffer seg, segrow; 1122cd093f1dSJed Brown char *seen; 1123cd093f1dSJed Brown 1124cd093f1dSJed Brown PetscFunctionBegin; 11259566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(am + 1, &ci)); 1126cd093f1dSJed Brown ci[0] = 0; 1127cd093f1dSJed Brown 1128cd093f1dSJed Brown /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 11299566063dSJacob Faibussowitsch PetscCall(PetscSegBufferCreate(sizeof(PetscInt), (PetscInt)(fill * (ai[am] + bi[bm])), &seg)); 11309566063dSJacob Faibussowitsch PetscCall(PetscSegBufferCreate(sizeof(PetscInt), 100, &segrow)); 11319566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(bn, &seen)); 1132cd093f1dSJed Brown 1133cd093f1dSJed Brown /* Determine ci and cj */ 1134cd093f1dSJed Brown for (i = 0; i < am; i++) { 1135cd093f1dSJed Brown const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 11368e3a54c0SPierre Jolivet const PetscInt *acol = PetscSafePointerPlusOffset(aj, ai[i]); /* column indices of nonzero entries in this row */ 1137cd093f1dSJed Brown PetscInt packlen = 0, *PETSC_RESTRICT crow; 11388c78258cSHong Zhang 1139cd093f1dSJed Brown /* Pack segrow */ 1140cd093f1dSJed Brown for (j = 0; j < anzi; j++) { 1141cd093f1dSJed Brown PetscInt brow = acol[j], bjstart = bi[brow], bjend = bi[brow + 1], k; 1142cd093f1dSJed Brown for (k = bjstart; k < bjend; k++) { 11438c78258cSHong Zhang bcol = bj[k]; 1144cd093f1dSJed Brown if (!seen[bcol]) { /* new entry */ 1145cd093f1dSJed Brown PetscInt *PETSC_RESTRICT slot; 11469566063dSJacob Faibussowitsch PetscCall(PetscSegBufferGetInts(segrow, 1, &slot)); 1147cd093f1dSJed Brown *slot = bcol; 1148cd093f1dSJed Brown seen[bcol] = 1; 1149cd093f1dSJed Brown packlen++; 1150cd093f1dSJed Brown } 1151cd093f1dSJed Brown } 1152cd093f1dSJed Brown } 11538c78258cSHong Zhang 11548c78258cSHong Zhang /* Check i-th diagonal entry */ 11558c78258cSHong Zhang if (C->force_diagonals && !seen[i]) { 11568c78258cSHong Zhang PetscInt *PETSC_RESTRICT slot; 11579566063dSJacob Faibussowitsch PetscCall(PetscSegBufferGetInts(segrow, 1, &slot)); 11588c78258cSHong Zhang *slot = i; 11598c78258cSHong Zhang seen[i] = 1; 11608c78258cSHong Zhang packlen++; 11618c78258cSHong Zhang } 11628c78258cSHong Zhang 11639566063dSJacob Faibussowitsch PetscCall(PetscSegBufferGetInts(seg, packlen, &crow)); 11649566063dSJacob Faibussowitsch PetscCall(PetscSegBufferExtractTo(segrow, crow)); 11659566063dSJacob Faibussowitsch PetscCall(PetscSortInt(packlen, crow)); 1166cd093f1dSJed Brown ci[i + 1] = ci[i] + packlen; 1167cd093f1dSJed Brown for (j = 0; j < packlen; j++) seen[crow[j]] = 0; 1168cd093f1dSJed Brown } 11699566063dSJacob Faibussowitsch PetscCall(PetscSegBufferDestroy(&segrow)); 11709566063dSJacob Faibussowitsch PetscCall(PetscFree(seen)); 1171cd093f1dSJed Brown 1172cd093f1dSJed Brown /* Column indices are in the segmented buffer */ 11739566063dSJacob Faibussowitsch PetscCall(PetscSegBufferExtractAlloc(seg, &cj)); 11749566063dSJacob Faibussowitsch PetscCall(PetscSegBufferDestroy(&seg)); 1175cd093f1dSJed Brown 1176cd093f1dSJed Brown /* put together the new symbolic matrix */ 11779566063dSJacob Faibussowitsch PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 11789566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(C, A, B)); 1179cd093f1dSJed Brown 1180cd093f1dSJed Brown /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 1181cd093f1dSJed Brown /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 1182f4f49eeaSPierre Jolivet c = (Mat_SeqAIJ *)C->data; 1183cd093f1dSJed Brown c->free_a = PETSC_TRUE; 1184cd093f1dSJed Brown c->free_ij = PETSC_TRUE; 1185cd093f1dSJed Brown c->nonew = 0; 1186cd093f1dSJed Brown 11874222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 1188cd093f1dSJed Brown 1189cd093f1dSJed Brown /* set MatInfo */ 11902a09556fSStefano Zampini afill = (PetscReal)ci[am] / PetscMax(ai[am] + bi[bm], 1) + 1.e-5; 1191cd093f1dSJed Brown if (afill < 1.0) afill = 1.0; 11924222ddf1SHong Zhang C->info.mallocs = ndouble; 11934222ddf1SHong Zhang C->info.fill_ratio_given = fill; 11944222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 1195cd093f1dSJed Brown 1196cd093f1dSJed Brown #if defined(PETSC_USE_INFO) 1197cd093f1dSJed Brown if (ci[am]) { 11989566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 11999566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 1200cd093f1dSJed Brown } else { 12019566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Empty matrix product\n")); 1202cd093f1dSJed Brown } 1203cd093f1dSJed Brown #endif 12043ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1205cd093f1dSJed Brown } 1206cd093f1dSJed Brown 120766976f2fSJacob Faibussowitsch static PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(void *data) 1208d71ae5a4SJacob Faibussowitsch { 12096718818eSStefano Zampini Mat_MatMatTransMult *abt = (Mat_MatMatTransMult *)data; 12102128a86cSHong Zhang 12112128a86cSHong Zhang PetscFunctionBegin; 12129566063dSJacob Faibussowitsch PetscCall(MatTransposeColoringDestroy(&abt->matcoloring)); 12139566063dSJacob Faibussowitsch PetscCall(MatDestroy(&abt->Bt_den)); 12149566063dSJacob Faibussowitsch PetscCall(MatDestroy(&abt->ABt_den)); 12159566063dSJacob Faibussowitsch PetscCall(PetscFree(abt)); 12163ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 12172128a86cSHong Zhang } 12182128a86cSHong Zhang 1219d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C) 1220d71ae5a4SJacob Faibussowitsch { 122181d82fe4SHong Zhang Mat Bt; 122240192850SHong Zhang Mat_MatMatTransMult *abt; 12234222ddf1SHong Zhang Mat_Product *product = C->product; 12246718818eSStefano Zampini char *alg; 1225d2853540SHong Zhang 122681d82fe4SHong Zhang PetscFunctionBegin; 122728b400f6SJacob Faibussowitsch PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 122828b400f6SJacob Faibussowitsch PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty"); 12296718818eSStefano Zampini 123081d82fe4SHong Zhang /* create symbolic Bt */ 12317fb60732SBarry Smith PetscCall(MatTransposeSymbolic(B, &Bt)); 12329566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizes(Bt, PetscAbs(A->cmap->bs), PetscAbs(B->cmap->bs))); 12339566063dSJacob Faibussowitsch PetscCall(MatSetType(Bt, ((PetscObject)A)->type_name)); 123481d82fe4SHong Zhang 123581d82fe4SHong Zhang /* get symbolic C=A*Bt */ 12369566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(product->alg, &alg)); 12379566063dSJacob Faibussowitsch PetscCall(MatProductSetAlgorithm(C, "sorted")); /* set algorithm for C = A*Bt */ 12389566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(A, Bt, fill, C)); 12399566063dSJacob Faibussowitsch PetscCall(MatProductSetAlgorithm(C, alg)); /* resume original algorithm for ABt product */ 12409566063dSJacob Faibussowitsch PetscCall(PetscFree(alg)); 124181d82fe4SHong Zhang 1242a5b23f4aSJose E. Roman /* create a supporting struct for reuse intermediate dense matrices with matcoloring */ 12439566063dSJacob Faibussowitsch PetscCall(PetscNew(&abt)); 12442128a86cSHong Zhang 12456718818eSStefano Zampini product->data = abt; 12466718818eSStefano Zampini product->destroy = MatDestroy_SeqAIJ_MatMatMultTrans; 12476718818eSStefano Zampini 12484222ddf1SHong Zhang C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ; 12492128a86cSHong Zhang 12504222ddf1SHong Zhang abt->usecoloring = PETSC_FALSE; 12519566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(product->alg, "color", &abt->usecoloring)); 125240192850SHong Zhang if (abt->usecoloring) { 1253b9af6bddSHong Zhang /* Create MatTransposeColoring from symbolic C=A*B^T */ 1254b9af6bddSHong Zhang MatTransposeColoring matcoloring; 1255335efc43SPeter Brune MatColoring coloring; 12562128a86cSHong Zhang ISColoring iscoloring; 12572128a86cSHong Zhang Mat Bt_dense, C_dense; 1258e8354b3eSHong Zhang 12594222ddf1SHong Zhang /* inode causes memory problem */ 12609566063dSJacob Faibussowitsch PetscCall(MatSetOption(C, MAT_USE_INODES, PETSC_FALSE)); 12614222ddf1SHong Zhang 12629566063dSJacob Faibussowitsch PetscCall(MatColoringCreate(C, &coloring)); 12639566063dSJacob Faibussowitsch PetscCall(MatColoringSetDistance(coloring, 2)); 12649566063dSJacob Faibussowitsch PetscCall(MatColoringSetType(coloring, MATCOLORINGSL)); 12659566063dSJacob Faibussowitsch PetscCall(MatColoringSetFromOptions(coloring)); 12669566063dSJacob Faibussowitsch PetscCall(MatColoringApply(coloring, &iscoloring)); 12679566063dSJacob Faibussowitsch PetscCall(MatColoringDestroy(&coloring)); 12689566063dSJacob Faibussowitsch PetscCall(MatTransposeColoringCreate(C, iscoloring, &matcoloring)); 12692205254eSKarl Rupp 127040192850SHong Zhang abt->matcoloring = matcoloring; 12712205254eSKarl Rupp 12729566063dSJacob Faibussowitsch PetscCall(ISColoringDestroy(&iscoloring)); 12732128a86cSHong Zhang 12742128a86cSHong Zhang /* Create Bt_dense and C_dense = A*Bt_dense */ 12759566063dSJacob Faibussowitsch PetscCall(MatCreate(PETSC_COMM_SELF, &Bt_dense)); 12769566063dSJacob Faibussowitsch PetscCall(MatSetSizes(Bt_dense, A->cmap->n, matcoloring->ncolors, A->cmap->n, matcoloring->ncolors)); 12779566063dSJacob Faibussowitsch PetscCall(MatSetType(Bt_dense, MATSEQDENSE)); 12789566063dSJacob Faibussowitsch PetscCall(MatSeqDenseSetPreallocation(Bt_dense, NULL)); 12792205254eSKarl Rupp 12802128a86cSHong Zhang Bt_dense->assembled = PETSC_TRUE; 128140192850SHong Zhang abt->Bt_den = Bt_dense; 12822128a86cSHong Zhang 12839566063dSJacob Faibussowitsch PetscCall(MatCreate(PETSC_COMM_SELF, &C_dense)); 12849566063dSJacob Faibussowitsch PetscCall(MatSetSizes(C_dense, A->rmap->n, matcoloring->ncolors, A->rmap->n, matcoloring->ncolors)); 12859566063dSJacob Faibussowitsch PetscCall(MatSetType(C_dense, MATSEQDENSE)); 12869566063dSJacob Faibussowitsch PetscCall(MatSeqDenseSetPreallocation(C_dense, NULL)); 12872205254eSKarl Rupp 12882128a86cSHong Zhang Bt_dense->assembled = PETSC_TRUE; 128940192850SHong Zhang abt->ABt_den = C_dense; 1290f94ccd6cSHong Zhang 1291f94ccd6cSHong Zhang #if defined(PETSC_USE_INFO) 12921ce71dffSSatish Balay { 12934222ddf1SHong Zhang Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 12949371c9d4SSatish Balay PetscCall(PetscInfo(C, "Use coloring of C=A*B^T; B^T: %" PetscInt_FMT " %" PetscInt_FMT ", Bt_dense: %" PetscInt_FMT ",%" PetscInt_FMT "; Cnz %" PetscInt_FMT " / (cm*ncolors %" PetscInt_FMT ") = %g\n", B->cmap->n, B->rmap->n, Bt_dense->rmap->n, 1295f4f49eeaSPierre Jolivet Bt_dense->cmap->n, c->nz, A->rmap->n * matcoloring->ncolors, (double)(((PetscReal)c->nz) / ((PetscReal)(A->rmap->n * matcoloring->ncolors))))); 12961ce71dffSSatish Balay } 1297f94ccd6cSHong Zhang #endif 12982128a86cSHong Zhang } 1299e99be685SHong Zhang /* clean up */ 13009566063dSJacob Faibussowitsch PetscCall(MatDestroy(&Bt)); 13013ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 13025df89d91SHong Zhang } 13035df89d91SHong Zhang 1304d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C) 1305d71ae5a4SJacob Faibussowitsch { 13065df89d91SHong Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data; 1307e2cac8caSJed Brown PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, anzi, bnzj, nexta, nextb, *acol, *bcol, brow; 130881d82fe4SHong Zhang PetscInt cm = C->rmap->n, *ci = c->i, *cj = c->j, i, j, cnzi, *ccol; 13095df89d91SHong Zhang PetscLogDouble flops = 0.0; 1310aa1aec99SHong Zhang MatScalar *aa = a->a, *aval, *ba = b->a, *bval, *ca, *cval; 13116718818eSStefano Zampini Mat_MatMatTransMult *abt; 13126718818eSStefano Zampini Mat_Product *product = C->product; 13135df89d91SHong Zhang 13145df89d91SHong Zhang PetscFunctionBegin; 131528b400f6SJacob Faibussowitsch PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 13166718818eSStefano Zampini abt = (Mat_MatMatTransMult *)product->data; 131728b400f6SJacob Faibussowitsch PetscCheck(abt, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 131858ed3ceaSHong Zhang /* clear old values in C */ 131958ed3ceaSHong Zhang if (!c->a) { 13209566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(ci[cm] + 1, &ca)); 132158ed3ceaSHong Zhang c->a = ca; 132258ed3ceaSHong Zhang c->free_a = PETSC_TRUE; 132358ed3ceaSHong Zhang } else { 132458ed3ceaSHong Zhang ca = c->a; 13259566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(ca, ci[cm] + 1)); 132658ed3ceaSHong Zhang } 132758ed3ceaSHong Zhang 132840192850SHong Zhang if (abt->usecoloring) { 132940192850SHong Zhang MatTransposeColoring matcoloring = abt->matcoloring; 133040192850SHong Zhang Mat Bt_dense, C_dense = abt->ABt_den; 1331c8db22f6SHong Zhang 1332b9af6bddSHong Zhang /* Get Bt_dense by Apply MatTransposeColoring to B */ 133340192850SHong Zhang Bt_dense = abt->Bt_den; 13349566063dSJacob Faibussowitsch PetscCall(MatTransColoringApplySpToDen(matcoloring, B, Bt_dense)); 1335c8db22f6SHong Zhang 1336c8db22f6SHong Zhang /* C_dense = A*Bt_dense */ 13379566063dSJacob Faibussowitsch PetscCall(MatMatMultNumeric_SeqAIJ_SeqDense(A, Bt_dense, C_dense)); 1338c8db22f6SHong Zhang 13392128a86cSHong Zhang /* Recover C from C_dense */ 13409566063dSJacob Faibussowitsch PetscCall(MatTransColoringApplyDenToSp(matcoloring, C_dense, C)); 13413ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1342c8db22f6SHong Zhang } 1343c8db22f6SHong Zhang 13441fa1209cSHong Zhang for (i = 0; i < cm; i++) { 134581d82fe4SHong Zhang anzi = ai[i + 1] - ai[i]; 13468e3a54c0SPierre Jolivet acol = PetscSafePointerPlusOffset(aj, ai[i]); 13478e3a54c0SPierre Jolivet aval = PetscSafePointerPlusOffset(aa, ai[i]); 13481fa1209cSHong Zhang cnzi = ci[i + 1] - ci[i]; 13498e3a54c0SPierre Jolivet ccol = PetscSafePointerPlusOffset(cj, ci[i]); 13506973769bSHong Zhang cval = ca + ci[i]; 13511fa1209cSHong Zhang for (j = 0; j < cnzi; j++) { 135281d82fe4SHong Zhang brow = ccol[j]; 135381d82fe4SHong Zhang bnzj = bi[brow + 1] - bi[brow]; 135481d82fe4SHong Zhang bcol = bj + bi[brow]; 13556973769bSHong Zhang bval = ba + bi[brow]; 13566973769bSHong Zhang 135781d82fe4SHong Zhang /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */ 13589371c9d4SSatish Balay nexta = 0; 13599371c9d4SSatish Balay nextb = 0; 136081d82fe4SHong Zhang while (nexta < anzi && nextb < bnzj) { 13617b6d5e96SMark Adams while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++; 136281d82fe4SHong Zhang if (nexta == anzi) break; 13637b6d5e96SMark Adams while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++; 136481d82fe4SHong Zhang if (nextb == bnzj) break; 136581d82fe4SHong Zhang if (acol[nexta] == bcol[nextb]) { 13666973769bSHong Zhang cval[j] += aval[nexta] * bval[nextb]; 13679371c9d4SSatish Balay nexta++; 13689371c9d4SSatish Balay nextb++; 136981d82fe4SHong Zhang flops += 2; 13701fa1209cSHong Zhang } 13711fa1209cSHong Zhang } 137281d82fe4SHong Zhang } 137381d82fe4SHong Zhang } 13749566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 13759566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 13769566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(flops)); 13773ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 13785df89d91SHong Zhang } 13795df89d91SHong Zhang 1380d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(void *data) 1381d71ae5a4SJacob Faibussowitsch { 13826718818eSStefano Zampini Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)data; 13836d373c3eSHong Zhang 13846d373c3eSHong Zhang PetscFunctionBegin; 13859566063dSJacob Faibussowitsch PetscCall(MatDestroy(&atb->At)); 13861baa6e33SBarry Smith if (atb->destroy) PetscCall((*atb->destroy)(atb->data)); 13879566063dSJacob Faibussowitsch PetscCall(PetscFree(atb)); 13883ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 13896d373c3eSHong Zhang } 13906d373c3eSHong Zhang 1391d71ae5a4SJacob Faibussowitsch PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C) 1392d71ae5a4SJacob Faibussowitsch { 1393089a957eSStefano Zampini Mat At = NULL; 13944222ddf1SHong Zhang Mat_Product *product = C->product; 1395089a957eSStefano Zampini PetscBool flg, def, square; 1396bc011b1eSHong Zhang 1397bc011b1eSHong Zhang PetscFunctionBegin; 1398089a957eSStefano Zampini MatCheckProduct(C, 4); 1399b94d7dedSBarry Smith square = (PetscBool)(A == B && A->symmetric == PETSC_BOOL3_TRUE); 14004222ddf1SHong Zhang /* outerproduct */ 14019566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(product->alg, "outerproduct", &flg)); 14024222ddf1SHong Zhang if (flg) { 1403bc011b1eSHong Zhang /* create symbolic At */ 1404089a957eSStefano Zampini if (!square) { 14057fb60732SBarry Smith PetscCall(MatTransposeSymbolic(A, &At)); 14069566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizes(At, PetscAbs(A->cmap->bs), PetscAbs(B->cmap->bs))); 14079566063dSJacob Faibussowitsch PetscCall(MatSetType(At, ((PetscObject)A)->type_name)); 1408089a957eSStefano Zampini } 1409bc011b1eSHong Zhang /* get symbolic C=At*B */ 14109566063dSJacob Faibussowitsch PetscCall(MatProductSetAlgorithm(C, "sorted")); 14119566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C)); 1412bc011b1eSHong Zhang 1413bc011b1eSHong Zhang /* clean up */ 141448a46eb9SPierre Jolivet if (!square) PetscCall(MatDestroy(&At)); 14156d373c3eSHong Zhang 14164222ddf1SHong Zhang C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */ 14179566063dSJacob Faibussowitsch PetscCall(MatProductSetAlgorithm(C, "outerproduct")); 14183ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 14194222ddf1SHong Zhang } 14204222ddf1SHong Zhang 14214222ddf1SHong Zhang /* matmatmult */ 14229566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(product->alg, "default", &def)); 14239566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(product->alg, "at*b", &flg)); 1424089a957eSStefano Zampini if (flg || def) { 14254222ddf1SHong Zhang Mat_MatTransMatMult *atb; 14264222ddf1SHong Zhang 142728b400f6SJacob Faibussowitsch PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty"); 14289566063dSJacob Faibussowitsch PetscCall(PetscNew(&atb)); 142948a46eb9SPierre Jolivet if (!square) PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &At)); 14309566063dSJacob Faibussowitsch PetscCall(MatProductSetAlgorithm(C, "sorted")); 14319566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C)); 14329566063dSJacob Faibussowitsch PetscCall(MatProductSetAlgorithm(C, "at*b")); 14336718818eSStefano Zampini product->data = atb; 14346718818eSStefano Zampini product->destroy = MatDestroy_SeqAIJ_MatTransMatMult; 14354222ddf1SHong Zhang atb->At = At; 14364222ddf1SHong Zhang 14374222ddf1SHong Zhang C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */ 14383ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 14394222ddf1SHong Zhang } 14404222ddf1SHong Zhang 14414222ddf1SHong Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported"); 1442bc011b1eSHong Zhang } 1443bc011b1eSHong Zhang 1444d71ae5a4SJacob Faibussowitsch PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C) 1445d71ae5a4SJacob Faibussowitsch { 14460fbc74f4SHong Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data; 1447d0f46423SBarry Smith PetscInt am = A->rmap->n, anzi, *ai = a->i, *aj = a->j, *bi = b->i, *bj, bnzi, nextb; 1448d0f46423SBarry Smith PetscInt cm = C->rmap->n, *ci = c->i, *cj = c->j, crow, *cjj, i, j, k; 1449d13dce4bSSatish Balay PetscLogDouble flops = 0.0; 1450aa1aec99SHong Zhang MatScalar *aa = a->a, *ba, *ca, *caj; 1451bc011b1eSHong Zhang 1452bc011b1eSHong Zhang PetscFunctionBegin; 1453aa1aec99SHong Zhang if (!c->a) { 14549566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(ci[cm] + 1, &ca)); 14552205254eSKarl Rupp 1456aa1aec99SHong Zhang c->a = ca; 1457aa1aec99SHong Zhang c->free_a = PETSC_TRUE; 1458aa1aec99SHong Zhang } else { 1459aa1aec99SHong Zhang ca = c->a; 14609566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(ca, ci[cm])); 1461aa1aec99SHong Zhang } 1462bc011b1eSHong Zhang 1463bc011b1eSHong Zhang /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */ 1464bc011b1eSHong Zhang for (i = 0; i < am; i++) { 1465bc011b1eSHong Zhang bj = b->j + bi[i]; 1466bc011b1eSHong Zhang ba = b->a + bi[i]; 1467bc011b1eSHong Zhang bnzi = bi[i + 1] - bi[i]; 1468bc011b1eSHong Zhang anzi = ai[i + 1] - ai[i]; 1469bc011b1eSHong Zhang for (j = 0; j < anzi; j++) { 1470bc011b1eSHong Zhang nextb = 0; 14710fbc74f4SHong Zhang crow = *aj++; 1472bc011b1eSHong Zhang cjj = cj + ci[crow]; 1473bc011b1eSHong Zhang caj = ca + ci[crow]; 1474bc011b1eSHong Zhang /* perform sparse axpy operation. Note cjj includes bj. */ 1475bc011b1eSHong Zhang for (k = 0; nextb < bnzi; k++) { 14760fbc74f4SHong Zhang if (cjj[k] == *(bj + nextb)) { /* ccol == bcol */ 14770fbc74f4SHong Zhang caj[k] += (*aa) * (*(ba + nextb)); 1478bc011b1eSHong Zhang nextb++; 1479bc011b1eSHong Zhang } 1480bc011b1eSHong Zhang } 1481bc011b1eSHong Zhang flops += 2 * bnzi; 14820fbc74f4SHong Zhang aa++; 1483bc011b1eSHong Zhang } 1484bc011b1eSHong Zhang } 1485bc011b1eSHong Zhang 1486bc011b1eSHong Zhang /* Assemble the final matrix and clean up */ 14879566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 14889566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 14899566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(flops)); 14903ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1491bc011b1eSHong Zhang } 14929123193aSHong Zhang 1493d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) 1494d71ae5a4SJacob Faibussowitsch { 14959123193aSHong Zhang PetscFunctionBegin; 14969566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C)); 14974222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense; 14983ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 14999123193aSHong Zhang } 15009123193aSHong Zhang 1501d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A, Mat B, Mat C, const PetscBool add) 1502d71ae5a4SJacob Faibussowitsch { 1503f32d5d43SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 15041ca9667aSStefano Zampini PetscScalar *c, r1, r2, r3, r4, *c1, *c2, *c3, *c4; 1505a4af7ca8SStefano Zampini const PetscScalar *aa, *b, *b1, *b2, *b3, *b4, *av; 1506daf57211SHong Zhang const PetscInt *aj; 150775f6d85dSStefano Zampini PetscInt cm = C->rmap->n, cn = B->cmap->n, bm, am = A->rmap->n; 150875f6d85dSStefano Zampini PetscInt clda; 150975f6d85dSStefano Zampini PetscInt am4, bm4, col, i, j, n; 15109123193aSHong Zhang 15119123193aSHong Zhang PetscFunctionBegin; 15123ba16761SJacob Faibussowitsch if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS); 15139566063dSJacob Faibussowitsch PetscCall(MatSeqAIJGetArrayRead(A, &av)); 151493aa15f2SStefano Zampini if (add) { 15159566063dSJacob Faibussowitsch PetscCall(MatDenseGetArray(C, &c)); 151693aa15f2SStefano Zampini } else { 15179566063dSJacob Faibussowitsch PetscCall(MatDenseGetArrayWrite(C, &c)); 151893aa15f2SStefano Zampini } 15199566063dSJacob Faibussowitsch PetscCall(MatDenseGetArrayRead(B, &b)); 15209566063dSJacob Faibussowitsch PetscCall(MatDenseGetLDA(B, &bm)); 15219566063dSJacob Faibussowitsch PetscCall(MatDenseGetLDA(C, &clda)); 152275f6d85dSStefano Zampini am4 = 4 * clda; 152375f6d85dSStefano Zampini bm4 = 4 * bm; 15245c0db29aSPierre Jolivet if (b) { 15259371c9d4SSatish Balay b1 = b; 15269371c9d4SSatish Balay b2 = b1 + bm; 15279371c9d4SSatish Balay b3 = b2 + bm; 15289371c9d4SSatish Balay b4 = b3 + bm; 15295c0db29aSPierre Jolivet } else b1 = b2 = b3 = b4 = NULL; 15309371c9d4SSatish Balay c1 = c; 15319371c9d4SSatish Balay c2 = c1 + clda; 15329371c9d4SSatish Balay c3 = c2 + clda; 15339371c9d4SSatish Balay c4 = c3 + clda; 15341ca9667aSStefano Zampini for (col = 0; col < (cn / 4) * 4; col += 4) { /* over columns of C */ 15351ca9667aSStefano Zampini for (i = 0; i < am; i++) { /* over rows of A in those columns */ 1536f32d5d43SBarry Smith r1 = r2 = r3 = r4 = 0.0; 1537f32d5d43SBarry Smith n = a->i[i + 1] - a->i[i]; 15388e3a54c0SPierre Jolivet aj = PetscSafePointerPlusOffset(a->j, a->i[i]); 15398e3a54c0SPierre Jolivet aa = PetscSafePointerPlusOffset(av, a->i[i]); 1540f32d5d43SBarry Smith for (j = 0; j < n; j++) { 15411ca9667aSStefano Zampini const PetscScalar aatmp = aa[j]; 15421ca9667aSStefano Zampini const PetscInt ajtmp = aj[j]; 1543730858b9SHong Zhang r1 += aatmp * b1[ajtmp]; 1544730858b9SHong Zhang r2 += aatmp * b2[ajtmp]; 1545730858b9SHong Zhang r3 += aatmp * b3[ajtmp]; 1546730858b9SHong Zhang r4 += aatmp * b4[ajtmp]; 15479123193aSHong Zhang } 154893aa15f2SStefano Zampini if (add) { 154987753ddeSHong Zhang c1[i] += r1; 155087753ddeSHong Zhang c2[i] += r2; 155187753ddeSHong Zhang c3[i] += r3; 155287753ddeSHong Zhang c4[i] += r4; 155393aa15f2SStefano Zampini } else { 155493aa15f2SStefano Zampini c1[i] = r1; 155593aa15f2SStefano Zampini c2[i] = r2; 155693aa15f2SStefano Zampini c3[i] = r3; 155793aa15f2SStefano Zampini c4[i] = r4; 155893aa15f2SStefano Zampini } 1559f32d5d43SBarry Smith } 15605c0db29aSPierre Jolivet if (b) { 15619371c9d4SSatish Balay b1 += bm4; 15629371c9d4SSatish Balay b2 += bm4; 15639371c9d4SSatish Balay b3 += bm4; 15649371c9d4SSatish Balay b4 += bm4; 15655c0db29aSPierre Jolivet } 15669371c9d4SSatish Balay c1 += am4; 15679371c9d4SSatish Balay c2 += am4; 15689371c9d4SSatish Balay c3 += am4; 15699371c9d4SSatish Balay c4 += am4; 1570f32d5d43SBarry Smith } 157193aa15f2SStefano Zampini /* process remaining columns */ 157293aa15f2SStefano Zampini if (col != cn) { 157393aa15f2SStefano Zampini PetscInt rc = cn - col; 157493aa15f2SStefano Zampini 157593aa15f2SStefano Zampini if (rc == 1) { 157693aa15f2SStefano Zampini for (i = 0; i < am; i++) { 1577f32d5d43SBarry Smith r1 = 0.0; 1578f32d5d43SBarry Smith n = a->i[i + 1] - a->i[i]; 15798e3a54c0SPierre Jolivet aj = PetscSafePointerPlusOffset(a->j, a->i[i]); 15808e3a54c0SPierre Jolivet aa = PetscSafePointerPlusOffset(av, a->i[i]); 158193aa15f2SStefano Zampini for (j = 0; j < n; j++) r1 += aa[j] * b1[aj[j]]; 158293aa15f2SStefano Zampini if (add) c1[i] += r1; 158393aa15f2SStefano Zampini else c1[i] = r1; 158493aa15f2SStefano Zampini } 158593aa15f2SStefano Zampini } else if (rc == 2) { 158693aa15f2SStefano Zampini for (i = 0; i < am; i++) { 158793aa15f2SStefano Zampini r1 = r2 = 0.0; 158893aa15f2SStefano Zampini n = a->i[i + 1] - a->i[i]; 15898e3a54c0SPierre Jolivet aj = PetscSafePointerPlusOffset(a->j, a->i[i]); 15908e3a54c0SPierre Jolivet aa = PetscSafePointerPlusOffset(av, a->i[i]); 1591f32d5d43SBarry Smith for (j = 0; j < n; j++) { 159293aa15f2SStefano Zampini const PetscScalar aatmp = aa[j]; 159393aa15f2SStefano Zampini const PetscInt ajtmp = aj[j]; 159493aa15f2SStefano Zampini r1 += aatmp * b1[ajtmp]; 159593aa15f2SStefano Zampini r2 += aatmp * b2[ajtmp]; 1596f32d5d43SBarry Smith } 159793aa15f2SStefano Zampini if (add) { 159887753ddeSHong Zhang c1[i] += r1; 159993aa15f2SStefano Zampini c2[i] += r2; 160093aa15f2SStefano Zampini } else { 160193aa15f2SStefano Zampini c1[i] = r1; 160293aa15f2SStefano Zampini c2[i] = r2; 1603f32d5d43SBarry Smith } 160493aa15f2SStefano Zampini } 160593aa15f2SStefano Zampini } else { 160693aa15f2SStefano Zampini for (i = 0; i < am; i++) { 160793aa15f2SStefano Zampini r1 = r2 = r3 = 0.0; 160893aa15f2SStefano Zampini n = a->i[i + 1] - a->i[i]; 16098e3a54c0SPierre Jolivet aj = PetscSafePointerPlusOffset(a->j, a->i[i]); 16108e3a54c0SPierre Jolivet aa = PetscSafePointerPlusOffset(av, a->i[i]); 161193aa15f2SStefano Zampini for (j = 0; j < n; j++) { 161293aa15f2SStefano Zampini const PetscScalar aatmp = aa[j]; 161393aa15f2SStefano Zampini const PetscInt ajtmp = aj[j]; 161493aa15f2SStefano Zampini r1 += aatmp * b1[ajtmp]; 161593aa15f2SStefano Zampini r2 += aatmp * b2[ajtmp]; 161693aa15f2SStefano Zampini r3 += aatmp * b3[ajtmp]; 161793aa15f2SStefano Zampini } 161893aa15f2SStefano Zampini if (add) { 161993aa15f2SStefano Zampini c1[i] += r1; 162093aa15f2SStefano Zampini c2[i] += r2; 162193aa15f2SStefano Zampini c3[i] += r3; 162293aa15f2SStefano Zampini } else { 162393aa15f2SStefano Zampini c1[i] = r1; 162493aa15f2SStefano Zampini c2[i] = r2; 162593aa15f2SStefano Zampini c3[i] = r3; 162693aa15f2SStefano Zampini } 162793aa15f2SStefano Zampini } 162893aa15f2SStefano Zampini } 1629f32d5d43SBarry Smith } 16309566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(cn * (2.0 * a->nz))); 163193aa15f2SStefano Zampini if (add) { 16329566063dSJacob Faibussowitsch PetscCall(MatDenseRestoreArray(C, &c)); 163393aa15f2SStefano Zampini } else { 16349566063dSJacob Faibussowitsch PetscCall(MatDenseRestoreArrayWrite(C, &c)); 163593aa15f2SStefano Zampini } 16369566063dSJacob Faibussowitsch PetscCall(MatDenseRestoreArrayRead(B, &b)); 16379566063dSJacob Faibussowitsch PetscCall(MatSeqAIJRestoreArrayRead(A, &av)); 16383ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1639f32d5d43SBarry Smith } 1640f32d5d43SBarry Smith 1641d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A, Mat B, Mat C) 1642d71ae5a4SJacob Faibussowitsch { 1643f32d5d43SBarry Smith PetscFunctionBegin; 164408401ef6SPierre Jolivet PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n); 164508401ef6SPierre Jolivet PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n); 164608401ef6SPierre Jolivet PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n); 16474324174eSBarry Smith 16489566063dSJacob Faibussowitsch PetscCall(MatMatMultNumericAdd_SeqAIJ_SeqDense(A, B, C, PETSC_FALSE)); 16493ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 16509123193aSHong Zhang } 1651b1683b59SHong Zhang 1652d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C) 1653d71ae5a4SJacob Faibussowitsch { 16544222ddf1SHong Zhang PetscFunctionBegin; 16554222ddf1SHong Zhang C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense; 16564222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_AB; 16573ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 16584222ddf1SHong Zhang } 16594222ddf1SHong Zhang 16606718818eSStefano Zampini PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat, Mat, PetscReal, Mat); 16616718818eSStefano Zampini 1662d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C) 1663d71ae5a4SJacob Faibussowitsch { 16644222ddf1SHong Zhang PetscFunctionBegin; 16656718818eSStefano Zampini C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense; 16664222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_AtB; 16673ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 16686718818eSStefano Zampini } 16696718818eSStefano Zampini 1670d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C) 1671d71ae5a4SJacob Faibussowitsch { 16726718818eSStefano Zampini PetscFunctionBegin; 16736718818eSStefano Zampini C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense; 16746718818eSStefano Zampini C->ops->productsymbolic = MatProductSymbolic_ABt; 16753ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 16764222ddf1SHong Zhang } 16774222ddf1SHong Zhang 1678d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C) 1679d71ae5a4SJacob Faibussowitsch { 16804222ddf1SHong Zhang Mat_Product *product = C->product; 16814222ddf1SHong Zhang 16824222ddf1SHong Zhang PetscFunctionBegin; 16834222ddf1SHong Zhang switch (product->type) { 1684d71ae5a4SJacob Faibussowitsch case MATPRODUCT_AB: 1685d71ae5a4SJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C)); 1686d71ae5a4SJacob Faibussowitsch break; 1687d71ae5a4SJacob Faibussowitsch case MATPRODUCT_AtB: 1688d71ae5a4SJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C)); 1689d71ae5a4SJacob Faibussowitsch break; 1690d71ae5a4SJacob Faibussowitsch case MATPRODUCT_ABt: 1691d71ae5a4SJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C)); 1692d71ae5a4SJacob Faibussowitsch break; 1693d71ae5a4SJacob Faibussowitsch default: 1694d71ae5a4SJacob Faibussowitsch break; 16954222ddf1SHong Zhang } 16963ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 16974222ddf1SHong Zhang } 16982ef1f0ffSBarry Smith 1699d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C) 1700d71ae5a4SJacob Faibussowitsch { 17014222ddf1SHong Zhang Mat_Product *product = C->product; 17024222ddf1SHong Zhang Mat A = product->A; 17034222ddf1SHong Zhang PetscBool baij; 17044222ddf1SHong Zhang 17054222ddf1SHong Zhang PetscFunctionBegin; 17069566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQBAIJ, &baij)); 17074222ddf1SHong Zhang if (!baij) { /* A is seqsbaij */ 17084222ddf1SHong Zhang PetscBool sbaij; 17099566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQSBAIJ, &sbaij)); 171028b400f6SJacob Faibussowitsch PetscCheck(sbaij, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "Mat must be either seqbaij or seqsbaij format"); 17114222ddf1SHong Zhang 17124222ddf1SHong Zhang C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense; 17134222ddf1SHong Zhang } else { /* A is seqbaij */ 17144222ddf1SHong Zhang C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense; 17154222ddf1SHong Zhang } 17164222ddf1SHong Zhang 17174222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_AB; 17183ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 17194222ddf1SHong Zhang } 17204222ddf1SHong Zhang 1721d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C) 1722d71ae5a4SJacob Faibussowitsch { 17234222ddf1SHong Zhang Mat_Product *product = C->product; 17244222ddf1SHong Zhang 17254222ddf1SHong Zhang PetscFunctionBegin; 17266718818eSStefano Zampini MatCheckProduct(C, 1); 172728b400f6SJacob Faibussowitsch PetscCheck(product->A, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing A"); 1728b94d7dedSBarry Smith if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric == PETSC_BOOL3_TRUE)) PetscCall(MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C)); 17293ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 17304222ddf1SHong Zhang } 17316718818eSStefano Zampini 1732d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C) 1733d71ae5a4SJacob Faibussowitsch { 17344222ddf1SHong Zhang PetscFunctionBegin; 17354222ddf1SHong Zhang C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ; 17364222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_AB; 17373ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 17384222ddf1SHong Zhang } 17394222ddf1SHong Zhang 1740d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C) 1741d71ae5a4SJacob Faibussowitsch { 17424222ddf1SHong Zhang Mat_Product *product = C->product; 17434222ddf1SHong Zhang 17444222ddf1SHong Zhang PetscFunctionBegin; 174548a46eb9SPierre Jolivet if (product->type == MATPRODUCT_AB) PetscCall(MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C)); 17463ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 17474222ddf1SHong Zhang } 17484222ddf1SHong Zhang 1749d71ae5a4SJacob Faibussowitsch PetscErrorCode MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring, Mat B, Mat Btdense) 1750d71ae5a4SJacob Faibussowitsch { 17512f5f1f90SHong Zhang Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 17522f5f1f90SHong Zhang Mat_SeqDense *btdense = (Mat_SeqDense *)Btdense->data; 17532f5f1f90SHong Zhang PetscInt *bi = b->i, *bj = b->j; 17542f5f1f90SHong Zhang PetscInt m = Btdense->rmap->n, n = Btdense->cmap->n, j, k, l, col, anz, *btcol, brow, ncolumns; 17552f5f1f90SHong Zhang MatScalar *btval, *btval_den, *ba = b->a; 17562f5f1f90SHong Zhang PetscInt *columns = coloring->columns, *colorforcol = coloring->colorforcol, ncolors = coloring->ncolors; 1757c8db22f6SHong Zhang 1758c8db22f6SHong Zhang PetscFunctionBegin; 17592f5f1f90SHong Zhang btval_den = btdense->v; 17609566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(btval_den, m * n)); 17612f5f1f90SHong Zhang for (k = 0; k < ncolors; k++) { 17622f5f1f90SHong Zhang ncolumns = coloring->ncolumns[k]; 17632f5f1f90SHong Zhang for (l = 0; l < ncolumns; l++) { /* insert a row of B to a column of Btdense */ 1764d2853540SHong Zhang col = *(columns + colorforcol[k] + l); 17652f5f1f90SHong Zhang btcol = bj + bi[col]; 17662f5f1f90SHong Zhang btval = ba + bi[col]; 17672f5f1f90SHong Zhang anz = bi[col + 1] - bi[col]; 1768d2853540SHong Zhang for (j = 0; j < anz; j++) { 17692f5f1f90SHong Zhang brow = btcol[j]; 17702f5f1f90SHong Zhang btval_den[brow] = btval[j]; 1771c8db22f6SHong Zhang } 1772c8db22f6SHong Zhang } 17732f5f1f90SHong Zhang btval_den += m; 1774c8db22f6SHong Zhang } 17753ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1776c8db22f6SHong Zhang } 1777c8db22f6SHong Zhang 1778d71ae5a4SJacob Faibussowitsch PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring, Mat Cden, Mat Csp) 1779d71ae5a4SJacob Faibussowitsch { 1780b2d2b04fSHong Zhang Mat_SeqAIJ *csp = (Mat_SeqAIJ *)Csp->data; 17811683a169SBarry Smith const PetscScalar *ca_den, *ca_den_ptr; 17821683a169SBarry Smith PetscScalar *ca = csp->a; 1783f99a636bSHong Zhang PetscInt k, l, m = Cden->rmap->n, ncolors = matcoloring->ncolors; 1784e88777f2SHong Zhang PetscInt brows = matcoloring->brows, *den2sp = matcoloring->den2sp; 1785077f23c2SHong Zhang PetscInt nrows, *row, *idx; 1786077f23c2SHong Zhang PetscInt *rows = matcoloring->rows, *colorforrow = matcoloring->colorforrow; 17878972f759SHong Zhang 17888972f759SHong Zhang PetscFunctionBegin; 17899566063dSJacob Faibussowitsch PetscCall(MatDenseGetArrayRead(Cden, &ca_den)); 1790f99a636bSHong Zhang 1791077f23c2SHong Zhang if (brows > 0) { 1792077f23c2SHong Zhang PetscInt *lstart, row_end, row_start; 1793980ae229SHong Zhang lstart = matcoloring->lstart; 17949566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(lstart, ncolors)); 1795eeb4fd02SHong Zhang 1796077f23c2SHong Zhang row_end = brows; 1797eeb4fd02SHong Zhang if (row_end > m) row_end = m; 1798077f23c2SHong Zhang for (row_start = 0; row_start < m; row_start += brows) { /* loop over row blocks of Csp */ 1799077f23c2SHong Zhang ca_den_ptr = ca_den; 1800980ae229SHong Zhang for (k = 0; k < ncolors; k++) { /* loop over colors (columns of Cden) */ 1801eeb4fd02SHong Zhang nrows = matcoloring->nrows[k]; 1802eeb4fd02SHong Zhang row = rows + colorforrow[k]; 1803eeb4fd02SHong Zhang idx = den2sp + colorforrow[k]; 1804eeb4fd02SHong Zhang for (l = lstart[k]; l < nrows; l++) { 1805eeb4fd02SHong Zhang if (row[l] >= row_end) { 1806eeb4fd02SHong Zhang lstart[k] = l; 1807eeb4fd02SHong Zhang break; 1808eeb4fd02SHong Zhang } else { 1809077f23c2SHong Zhang ca[idx[l]] = ca_den_ptr[row[l]]; 1810eeb4fd02SHong Zhang } 1811eeb4fd02SHong Zhang } 1812077f23c2SHong Zhang ca_den_ptr += m; 1813eeb4fd02SHong Zhang } 1814077f23c2SHong Zhang row_end += brows; 1815eeb4fd02SHong Zhang if (row_end > m) row_end = m; 1816eeb4fd02SHong Zhang } 1817077f23c2SHong Zhang } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */ 1818077f23c2SHong Zhang ca_den_ptr = ca_den; 1819077f23c2SHong Zhang for (k = 0; k < ncolors; k++) { 1820077f23c2SHong Zhang nrows = matcoloring->nrows[k]; 1821077f23c2SHong Zhang row = rows + colorforrow[k]; 1822077f23c2SHong Zhang idx = den2sp + colorforrow[k]; 1823ad540459SPierre Jolivet for (l = 0; l < nrows; l++) ca[idx[l]] = ca_den_ptr[row[l]]; 1824077f23c2SHong Zhang ca_den_ptr += m; 1825077f23c2SHong Zhang } 1826f99a636bSHong Zhang } 1827f99a636bSHong Zhang 18289566063dSJacob Faibussowitsch PetscCall(MatDenseRestoreArrayRead(Cden, &ca_den)); 1829f99a636bSHong Zhang #if defined(PETSC_USE_INFO) 1830077f23c2SHong Zhang if (matcoloring->brows > 0) { 18319566063dSJacob Faibussowitsch PetscCall(PetscInfo(Csp, "Loop over %" PetscInt_FMT " row blocks for den2sp\n", brows)); 1832e88777f2SHong Zhang } else { 18339566063dSJacob Faibussowitsch PetscCall(PetscInfo(Csp, "Loop over colors/columns of Cden, inefficient for large sparse matrix product \n")); 1834e88777f2SHong Zhang } 1835f99a636bSHong Zhang #endif 18363ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 18378972f759SHong Zhang } 18388972f759SHong Zhang 1839d71ae5a4SJacob Faibussowitsch PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat, ISColoring iscoloring, MatTransposeColoring c) 1840d71ae5a4SJacob Faibussowitsch { 1841e88777f2SHong Zhang PetscInt i, n, nrows, Nbs, j, k, m, ncols, col, cm; 18421a83f524SJed Brown const PetscInt *is, *ci, *cj, *row_idx; 1843b28d80bdSHong Zhang PetscInt nis = iscoloring->n, *rowhit, bs = 1; 1844b1683b59SHong Zhang IS *isa; 1845b28d80bdSHong Zhang Mat_SeqAIJ *csp = (Mat_SeqAIJ *)mat->data; 1846e88777f2SHong Zhang PetscInt *colorforrow, *rows, *rows_i, *idxhit, *spidx, *den2sp, *den2sp_i; 1847e88777f2SHong Zhang PetscInt *colorforcol, *columns, *columns_i, brows; 1848e88777f2SHong Zhang PetscBool flg; 1849b1683b59SHong Zhang 1850b1683b59SHong Zhang PetscFunctionBegin; 18519566063dSJacob Faibussowitsch PetscCall(ISColoringGetIS(iscoloring, PETSC_USE_POINTER, PETSC_IGNORE, &isa)); 1852e99be685SHong Zhang 18537ecccc15SHong Zhang /* bs >1 is not being tested yet! */ 1854e88777f2SHong Zhang Nbs = mat->cmap->N / bs; 1855b1683b59SHong Zhang c->M = mat->rmap->N / bs; /* set total rows, columns and local rows */ 1856e88777f2SHong Zhang c->N = Nbs; 1857e88777f2SHong Zhang c->m = c->M; 1858b1683b59SHong Zhang c->rstart = 0; 1859e88777f2SHong Zhang c->brows = 100; 1860b1683b59SHong Zhang 1861b1683b59SHong Zhang c->ncolors = nis; 18629566063dSJacob Faibussowitsch PetscCall(PetscMalloc3(nis, &c->ncolumns, nis, &c->nrows, nis + 1, &colorforrow)); 18639566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(csp->nz + 1, &rows)); 18649566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(csp->nz + 1, &den2sp)); 1865e88777f2SHong Zhang 1866e88777f2SHong Zhang brows = c->brows; 18679566063dSJacob Faibussowitsch PetscCall(PetscOptionsGetInt(NULL, NULL, "-matden2sp_brows", &brows, &flg)); 1868e88777f2SHong Zhang if (flg) c->brows = brows; 186948a46eb9SPierre Jolivet if (brows > 0) PetscCall(PetscMalloc1(nis + 1, &c->lstart)); 18702205254eSKarl Rupp 1871d2853540SHong Zhang colorforrow[0] = 0; 1872d2853540SHong Zhang rows_i = rows; 1873f99a636bSHong Zhang den2sp_i = den2sp; 1874b1683b59SHong Zhang 18759566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(nis + 1, &colorforcol)); 18769566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(Nbs + 1, &columns)); 18772205254eSKarl Rupp 1878d2853540SHong Zhang colorforcol[0] = 0; 1879d2853540SHong Zhang columns_i = columns; 1880d2853540SHong Zhang 1881077f23c2SHong Zhang /* get column-wise storage of mat */ 18829566063dSJacob Faibussowitsch PetscCall(MatGetColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL)); 1883b1683b59SHong Zhang 1884b28d80bdSHong Zhang cm = c->m; 18859566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(cm + 1, &rowhit)); 18869566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(cm + 1, &idxhit)); 1887f99a636bSHong Zhang for (i = 0; i < nis; i++) { /* loop over color */ 18889566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(isa[i], &n)); 18899566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isa[i], &is)); 18902205254eSKarl Rupp 1891b1683b59SHong Zhang c->ncolumns[i] = n; 18921baa6e33SBarry Smith if (n) PetscCall(PetscArraycpy(columns_i, is, n)); 1893d2853540SHong Zhang colorforcol[i + 1] = colorforcol[i] + n; 1894d2853540SHong Zhang columns_i += n; 1895b1683b59SHong Zhang 1896b1683b59SHong Zhang /* fast, crude version requires O(N*N) work */ 18979566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(rowhit, cm)); 1898e99be685SHong Zhang 1899b7caf3d6SHong Zhang for (j = 0; j < n; j++) { /* loop over columns*/ 1900b1683b59SHong Zhang col = is[j]; 1901d2853540SHong Zhang row_idx = cj + ci[col]; 1902b1683b59SHong Zhang m = ci[col + 1] - ci[col]; 1903b7caf3d6SHong Zhang for (k = 0; k < m; k++) { /* loop over columns marking them in rowhit */ 1904e99be685SHong Zhang idxhit[*row_idx] = spidx[ci[col] + k]; 1905d2853540SHong Zhang rowhit[*row_idx++] = col + 1; 1906b1683b59SHong Zhang } 1907b1683b59SHong Zhang } 1908b1683b59SHong Zhang /* count the number of hits */ 1909b1683b59SHong Zhang nrows = 0; 1910e8354b3eSHong Zhang for (j = 0; j < cm; j++) { 1911b1683b59SHong Zhang if (rowhit[j]) nrows++; 1912b1683b59SHong Zhang } 1913b1683b59SHong Zhang c->nrows[i] = nrows; 1914d2853540SHong Zhang colorforrow[i + 1] = colorforrow[i] + nrows; 1915d2853540SHong Zhang 1916b1683b59SHong Zhang nrows = 0; 1917b7caf3d6SHong Zhang for (j = 0; j < cm; j++) { /* loop over rows */ 1918b1683b59SHong Zhang if (rowhit[j]) { 1919d2853540SHong Zhang rows_i[nrows] = j; 192012b89a43SHong Zhang den2sp_i[nrows] = idxhit[j]; 1921b1683b59SHong Zhang nrows++; 1922b1683b59SHong Zhang } 1923b1683b59SHong Zhang } 1924e88777f2SHong Zhang den2sp_i += nrows; 1925077f23c2SHong Zhang 19269566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isa[i], &is)); 1927f99a636bSHong Zhang rows_i += nrows; 1928b1683b59SHong Zhang } 19299566063dSJacob Faibussowitsch PetscCall(MatRestoreColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL)); 19309566063dSJacob Faibussowitsch PetscCall(PetscFree(rowhit)); 19319566063dSJacob Faibussowitsch PetscCall(ISColoringRestoreIS(iscoloring, PETSC_USE_POINTER, &isa)); 19322c71b3e2SJacob Faibussowitsch PetscCheck(csp->nz == colorforrow[nis], PETSC_COMM_SELF, PETSC_ERR_PLIB, "csp->nz %" PetscInt_FMT " != colorforrow[nis] %" PetscInt_FMT, csp->nz, colorforrow[nis]); 1933b28d80bdSHong Zhang 1934d2853540SHong Zhang c->colorforrow = colorforrow; 1935d2853540SHong Zhang c->rows = rows; 1936f99a636bSHong Zhang c->den2sp = den2sp; 1937d2853540SHong Zhang c->colorforcol = colorforcol; 1938d2853540SHong Zhang c->columns = columns; 19392205254eSKarl Rupp 19409566063dSJacob Faibussowitsch PetscCall(PetscFree(idxhit)); 19413ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1942b1683b59SHong Zhang } 1943d013fc79Sandi selinger 1944d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C) 1945d71ae5a4SJacob Faibussowitsch { 19464222ddf1SHong Zhang Mat_Product *product = C->product; 19474222ddf1SHong Zhang Mat A = product->A, B = product->B; 19484222ddf1SHong Zhang 1949df97dc6dSFande Kong PetscFunctionBegin; 19504222ddf1SHong Zhang if (C->ops->mattransposemultnumeric) { 19514222ddf1SHong Zhang /* Alg: "outerproduct" */ 19529566063dSJacob Faibussowitsch PetscCall((*C->ops->mattransposemultnumeric)(A, B, C)); 19534222ddf1SHong Zhang } else { 19544222ddf1SHong Zhang /* Alg: "matmatmult" -- C = At*B */ 19556718818eSStefano Zampini Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)product->data; 19564222ddf1SHong Zhang 195728b400f6SJacob Faibussowitsch PetscCheck(atb, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 19581cdffd5eSHong Zhang if (atb->At) { 19591cdffd5eSHong Zhang /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(); 19601cdffd5eSHong Zhang user may have called MatProductReplaceMats() to get this A=product->A */ 19611cdffd5eSHong Zhang PetscCall(MatTransposeSetPrecursor(A, atb->At)); 19627fb60732SBarry Smith PetscCall(MatTranspose(A, MAT_REUSE_MATRIX, &atb->At)); 19634222ddf1SHong Zhang } 19647fb60732SBarry Smith PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ(atb->At ? atb->At : A, B, C)); 19654222ddf1SHong Zhang } 19663ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1967df97dc6dSFande Kong } 1968df97dc6dSFande Kong 1969d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C) 1970d71ae5a4SJacob Faibussowitsch { 19714222ddf1SHong Zhang Mat_Product *product = C->product; 19724222ddf1SHong Zhang Mat A = product->A, B = product->B; 19734222ddf1SHong Zhang PetscReal fill = product->fill; 1974d013fc79Sandi selinger 1975d013fc79Sandi selinger PetscFunctionBegin; 19769566063dSJacob Faibussowitsch PetscCall(MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A, B, fill, C)); 19772869b61bSandi selinger 19784222ddf1SHong Zhang C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ; 19793ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 19802869b61bSandi selinger } 1981d013fc79Sandi selinger 1982d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C) 1983d71ae5a4SJacob Faibussowitsch { 19844222ddf1SHong Zhang Mat_Product *product = C->product; 19854222ddf1SHong Zhang PetscInt alg = 0; /* default algorithm */ 19864222ddf1SHong Zhang PetscBool flg = PETSC_FALSE; 19874222ddf1SHong Zhang #if !defined(PETSC_HAVE_HYPRE) 19884222ddf1SHong Zhang const char *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"}; 19894222ddf1SHong Zhang PetscInt nalg = 7; 19904222ddf1SHong Zhang #else 19914222ddf1SHong Zhang const char *algTypes[8] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge", "hypre"}; 19924222ddf1SHong Zhang PetscInt nalg = 8; 19934222ddf1SHong Zhang #endif 19944222ddf1SHong Zhang 19954222ddf1SHong Zhang PetscFunctionBegin; 19964222ddf1SHong Zhang /* Set default algorithm */ 19979566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(C->product->alg, "default", &flg)); 199848a46eb9SPierre Jolivet if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 1999d013fc79Sandi selinger 20004222ddf1SHong Zhang /* Get runtime option */ 20014222ddf1SHong Zhang if (product->api_user) { 2002d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMult", "Mat"); 20039566063dSJacob Faibussowitsch PetscCall(PetscOptionsEList("-matmatmult_via", "Algorithmic approach", "MatMatMult", algTypes, nalg, algTypes[0], &alg, &flg)); 2004d0609cedSBarry Smith PetscOptionsEnd(); 20054222ddf1SHong Zhang } else { 2006d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AB", "Mat"); 20079566063dSJacob Faibussowitsch PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AB", algTypes, nalg, algTypes[0], &alg, &flg)); 2008d0609cedSBarry Smith PetscOptionsEnd(); 2009d013fc79Sandi selinger } 201048a46eb9SPierre Jolivet if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2011d013fc79Sandi selinger 20124222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_AB; 20134222ddf1SHong Zhang C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ; 20143ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 20154222ddf1SHong Zhang } 2016d013fc79Sandi selinger 2017d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C) 2018d71ae5a4SJacob Faibussowitsch { 20194222ddf1SHong Zhang Mat_Product *product = C->product; 20204222ddf1SHong Zhang PetscInt alg = 0; /* default algorithm */ 20214222ddf1SHong Zhang PetscBool flg = PETSC_FALSE; 2022089a957eSStefano Zampini const char *algTypes[3] = {"default", "at*b", "outerproduct"}; 2023089a957eSStefano Zampini PetscInt nalg = 3; 2024d013fc79Sandi selinger 20254222ddf1SHong Zhang PetscFunctionBegin; 20264222ddf1SHong Zhang /* Get runtime option */ 20274222ddf1SHong Zhang if (product->api_user) { 2028d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatTransposeMatMult", "Mat"); 20299566063dSJacob Faibussowitsch PetscCall(PetscOptionsEList("-mattransposematmult_via", "Algorithmic approach", "MatTransposeMatMult", algTypes, nalg, algTypes[alg], &alg, &flg)); 2030d0609cedSBarry Smith PetscOptionsEnd(); 20314222ddf1SHong Zhang } else { 2032d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AtB", "Mat"); 20339566063dSJacob Faibussowitsch PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AtB", algTypes, nalg, algTypes[alg], &alg, &flg)); 2034d0609cedSBarry Smith PetscOptionsEnd(); 20354222ddf1SHong Zhang } 203648a46eb9SPierre Jolivet if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2037d013fc79Sandi selinger 20384222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ; 20393ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 20404222ddf1SHong Zhang } 20414222ddf1SHong Zhang 2042d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C) 2043d71ae5a4SJacob Faibussowitsch { 20444222ddf1SHong Zhang Mat_Product *product = C->product; 20454222ddf1SHong Zhang PetscInt alg = 0; /* default algorithm */ 20464222ddf1SHong Zhang PetscBool flg = PETSC_FALSE; 20474222ddf1SHong Zhang const char *algTypes[2] = {"default", "color"}; 20484222ddf1SHong Zhang PetscInt nalg = 2; 20494222ddf1SHong Zhang 20504222ddf1SHong Zhang PetscFunctionBegin; 20514222ddf1SHong Zhang /* Set default algorithm */ 20529566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(C->product->alg, "default", &flg)); 20534222ddf1SHong Zhang if (!flg) { 20544222ddf1SHong Zhang alg = 1; 20559566063dSJacob Faibussowitsch PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 20564222ddf1SHong Zhang } 20574222ddf1SHong Zhang 20584222ddf1SHong Zhang /* Get runtime option */ 20594222ddf1SHong Zhang if (product->api_user) { 2060d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatTransposeMult", "Mat"); 20619566063dSJacob Faibussowitsch PetscCall(PetscOptionsEList("-matmattransmult_via", "Algorithmic approach", "MatMatTransposeMult", algTypes, nalg, algTypes[alg], &alg, &flg)); 2062d0609cedSBarry Smith PetscOptionsEnd(); 20634222ddf1SHong Zhang } else { 2064d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABt", "Mat"); 20659566063dSJacob Faibussowitsch PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABt", algTypes, nalg, algTypes[alg], &alg, &flg)); 2066d0609cedSBarry Smith PetscOptionsEnd(); 20674222ddf1SHong Zhang } 206848a46eb9SPierre Jolivet if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 20694222ddf1SHong Zhang 20704222ddf1SHong Zhang C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ; 20714222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_ABt; 20723ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 20734222ddf1SHong Zhang } 20744222ddf1SHong Zhang 2075d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C) 2076d71ae5a4SJacob Faibussowitsch { 20774222ddf1SHong Zhang Mat_Product *product = C->product; 20784222ddf1SHong Zhang PetscBool flg = PETSC_FALSE; 20794222ddf1SHong Zhang PetscInt alg = 0; /* default algorithm -- alg=1 should be default!!! */ 20804222ddf1SHong Zhang #if !defined(PETSC_HAVE_HYPRE) 20814222ddf1SHong Zhang const char *algTypes[2] = {"scalable", "rap"}; 20824222ddf1SHong Zhang PetscInt nalg = 2; 20834222ddf1SHong Zhang #else 20844222ddf1SHong Zhang const char *algTypes[3] = {"scalable", "rap", "hypre"}; 20854222ddf1SHong Zhang PetscInt nalg = 3; 20864222ddf1SHong Zhang #endif 20874222ddf1SHong Zhang 20884222ddf1SHong Zhang PetscFunctionBegin; 20894222ddf1SHong Zhang /* Set default algorithm */ 20909566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(product->alg, "default", &flg)); 209148a46eb9SPierre Jolivet if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 20924222ddf1SHong Zhang 20934222ddf1SHong Zhang /* Get runtime option */ 20944222ddf1SHong Zhang if (product->api_user) { 2095d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatPtAP", "Mat"); 20969566063dSJacob Faibussowitsch PetscCall(PetscOptionsEList("-matptap_via", "Algorithmic approach", "MatPtAP", algTypes, nalg, algTypes[0], &alg, &flg)); 2097d0609cedSBarry Smith PetscOptionsEnd(); 20984222ddf1SHong Zhang } else { 2099d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_PtAP", "Mat"); 21009566063dSJacob Faibussowitsch PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_PtAP", algTypes, nalg, algTypes[0], &alg, &flg)); 2101d0609cedSBarry Smith PetscOptionsEnd(); 21024222ddf1SHong Zhang } 210348a46eb9SPierre Jolivet if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 21044222ddf1SHong Zhang 21054222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ; 21063ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 21074222ddf1SHong Zhang } 21084222ddf1SHong Zhang 2109d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C) 2110d71ae5a4SJacob Faibussowitsch { 21114222ddf1SHong Zhang Mat_Product *product = C->product; 21124222ddf1SHong Zhang PetscBool flg = PETSC_FALSE; 21134222ddf1SHong Zhang PetscInt alg = 0; /* default algorithm */ 21144222ddf1SHong Zhang const char *algTypes[3] = {"r*a*rt", "r*art", "coloring_rart"}; 21154222ddf1SHong Zhang PetscInt nalg = 3; 21164222ddf1SHong Zhang 21174222ddf1SHong Zhang PetscFunctionBegin; 21184222ddf1SHong Zhang /* Set default algorithm */ 21199566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(product->alg, "default", &flg)); 212048a46eb9SPierre Jolivet if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 21214222ddf1SHong Zhang 21224222ddf1SHong Zhang /* Get runtime option */ 21234222ddf1SHong Zhang if (product->api_user) { 2124d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatRARt", "Mat"); 21259566063dSJacob Faibussowitsch PetscCall(PetscOptionsEList("-matrart_via", "Algorithmic approach", "MatRARt", algTypes, nalg, algTypes[0], &alg, &flg)); 2126d0609cedSBarry Smith PetscOptionsEnd(); 21274222ddf1SHong Zhang } else { 2128d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_RARt", "Mat"); 21299566063dSJacob Faibussowitsch PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_RARt", algTypes, nalg, algTypes[0], &alg, &flg)); 2130d0609cedSBarry Smith PetscOptionsEnd(); 21314222ddf1SHong Zhang } 213248a46eb9SPierre Jolivet if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 21334222ddf1SHong Zhang 21344222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ; 21353ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 21364222ddf1SHong Zhang } 21374222ddf1SHong Zhang 21384222ddf1SHong Zhang /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */ 2139d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C) 2140d71ae5a4SJacob Faibussowitsch { 21414222ddf1SHong Zhang Mat_Product *product = C->product; 21424222ddf1SHong Zhang PetscInt alg = 0; /* default algorithm */ 21434222ddf1SHong Zhang PetscBool flg = PETSC_FALSE; 21444222ddf1SHong Zhang const char *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"}; 21454222ddf1SHong Zhang PetscInt nalg = 7; 21464222ddf1SHong Zhang 21474222ddf1SHong Zhang PetscFunctionBegin; 21484222ddf1SHong Zhang /* Set default algorithm */ 21499566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(product->alg, "default", &flg)); 215048a46eb9SPierre Jolivet if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 21514222ddf1SHong Zhang 21524222ddf1SHong Zhang /* Get runtime option */ 21534222ddf1SHong Zhang if (product->api_user) { 2154d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMatMult", "Mat"); 21559566063dSJacob Faibussowitsch PetscCall(PetscOptionsEList("-matmatmatmult_via", "Algorithmic approach", "MatMatMatMult", algTypes, nalg, algTypes[alg], &alg, &flg)); 2156d0609cedSBarry Smith PetscOptionsEnd(); 21574222ddf1SHong Zhang } else { 2158d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABC", "Mat"); 21599566063dSJacob Faibussowitsch PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABC", algTypes, nalg, algTypes[alg], &alg, &flg)); 2160d0609cedSBarry Smith PetscOptionsEnd(); 21614222ddf1SHong Zhang } 216248a46eb9SPierre Jolivet if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 21634222ddf1SHong Zhang 21644222ddf1SHong Zhang C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ; 21654222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_ABC; 21663ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 21674222ddf1SHong Zhang } 21684222ddf1SHong Zhang 2169d71ae5a4SJacob Faibussowitsch PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C) 2170d71ae5a4SJacob Faibussowitsch { 21714222ddf1SHong Zhang Mat_Product *product = C->product; 21724222ddf1SHong Zhang 21734222ddf1SHong Zhang PetscFunctionBegin; 21744222ddf1SHong Zhang switch (product->type) { 2175d71ae5a4SJacob Faibussowitsch case MATPRODUCT_AB: 2176d71ae5a4SJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ_AB(C)); 2177d71ae5a4SJacob Faibussowitsch break; 2178d71ae5a4SJacob Faibussowitsch case MATPRODUCT_AtB: 2179d71ae5a4SJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ_AtB(C)); 2180d71ae5a4SJacob Faibussowitsch break; 2181d71ae5a4SJacob Faibussowitsch case MATPRODUCT_ABt: 2182d71ae5a4SJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ_ABt(C)); 2183d71ae5a4SJacob Faibussowitsch break; 2184d71ae5a4SJacob Faibussowitsch case MATPRODUCT_PtAP: 2185d71ae5a4SJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ_PtAP(C)); 2186d71ae5a4SJacob Faibussowitsch break; 2187d71ae5a4SJacob Faibussowitsch case MATPRODUCT_RARt: 2188d71ae5a4SJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ_RARt(C)); 2189d71ae5a4SJacob Faibussowitsch break; 2190d71ae5a4SJacob Faibussowitsch case MATPRODUCT_ABC: 2191d71ae5a4SJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ_ABC(C)); 2192d71ae5a4SJacob Faibussowitsch break; 2193d71ae5a4SJacob Faibussowitsch default: 2194d71ae5a4SJacob Faibussowitsch break; 21954222ddf1SHong Zhang } 21963ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2197d013fc79Sandi selinger } 2198