1be1d678aSKris Buschelman 2d50806bdSBarry Smith /* 32499ec78SHong Zhang Defines matrix-matrix product routines for pairs of SeqAIJ matrices 4d50806bdSBarry Smith C = A * B 5d50806bdSBarry Smith */ 6d50806bdSBarry Smith 7c6db04a5SJed Brown #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 8c6db04a5SJed Brown #include <../src/mat/utils/freespace.h> 9c6db04a5SJed Brown #include <petscbt.h> 10af0996ceSBarry Smith #include <petsc/private/isimpl.h> 1107475bc1SBarry Smith #include <../src/mat/impls/dense/seq/dense.h> 127bab7c10SHong Zhang 13d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C) 14d71ae5a4SJacob Faibussowitsch { 15df97dc6dSFande Kong PetscFunctionBegin; 16dbbe0bcdSBarry Smith if (C->ops->matmultnumeric) PetscCall((*C->ops->matmultnumeric)(A, B, C)); 17dbbe0bcdSBarry Smith else PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A, B, C)); 183ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 19df97dc6dSFande Kong } 20df97dc6dSFande Kong 214222ddf1SHong Zhang /* Modified from MatCreateSeqAIJWithArrays() */ 22d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt i[], PetscInt j[], PetscScalar a[], MatType mtype, Mat mat) 23d71ae5a4SJacob Faibussowitsch { 244222ddf1SHong Zhang PetscInt ii; 254222ddf1SHong Zhang Mat_SeqAIJ *aij; 26cbc6b225SStefano Zampini PetscBool isseqaij, osingle, ofree_a, ofree_ij; 275c66b693SKris Buschelman 285c66b693SKris Buschelman PetscFunctionBegin; 29aed4548fSBarry Smith PetscCheck(m <= 0 || !i[0], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0"); 309566063dSJacob Faibussowitsch PetscCall(MatSetSizes(mat, m, n, m, n)); 314222ddf1SHong Zhang 32e4e71118SStefano Zampini if (!mtype) { 339566063dSJacob Faibussowitsch PetscCall(PetscObjectBaseTypeCompare((PetscObject)mat, MATSEQAIJ, &isseqaij)); 349566063dSJacob Faibussowitsch if (!isseqaij) PetscCall(MatSetType(mat, MATSEQAIJ)); 35e4e71118SStefano Zampini } else { 369566063dSJacob Faibussowitsch PetscCall(MatSetType(mat, mtype)); 37e4e71118SStefano Zampini } 38cbc6b225SStefano Zampini 394222ddf1SHong Zhang aij = (Mat_SeqAIJ *)(mat)->data; 40cbc6b225SStefano Zampini osingle = aij->singlemalloc; 41cbc6b225SStefano Zampini ofree_a = aij->free_a; 42cbc6b225SStefano Zampini ofree_ij = aij->free_ij; 43cbc6b225SStefano Zampini /* changes the free flags */ 449566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(mat, MAT_SKIP_ALLOCATION, NULL)); 45cbc6b225SStefano Zampini 469566063dSJacob Faibussowitsch PetscCall(PetscFree(aij->ilen)); 479566063dSJacob Faibussowitsch PetscCall(PetscFree(aij->imax)); 489566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &aij->imax)); 499566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &aij->ilen)); 50cbc6b225SStefano Zampini for (ii = 0, aij->nonzerorowcnt = 0, aij->rmax = 0; ii < m; ii++) { 51cbc6b225SStefano Zampini const PetscInt rnz = i[ii + 1] - i[ii]; 52cbc6b225SStefano Zampini aij->nonzerorowcnt += !!rnz; 53cbc6b225SStefano Zampini aij->rmax = PetscMax(aij->rmax, rnz); 54cbc6b225SStefano Zampini aij->ilen[ii] = aij->imax[ii] = i[ii + 1] - i[ii]; 55cbc6b225SStefano Zampini } 56cbc6b225SStefano Zampini aij->maxnz = i[m]; 57cbc6b225SStefano Zampini aij->nz = i[m]; 584222ddf1SHong Zhang 59cbc6b225SStefano Zampini if (osingle) { 609566063dSJacob Faibussowitsch PetscCall(PetscFree3(aij->a, aij->j, aij->i)); 61cbc6b225SStefano Zampini } else { 629566063dSJacob Faibussowitsch if (ofree_a) PetscCall(PetscFree(aij->a)); 639566063dSJacob Faibussowitsch if (ofree_ij) PetscCall(PetscFree(aij->j)); 649566063dSJacob Faibussowitsch if (ofree_ij) PetscCall(PetscFree(aij->i)); 65cbc6b225SStefano Zampini } 664222ddf1SHong Zhang aij->i = i; 674222ddf1SHong Zhang aij->j = j; 684222ddf1SHong Zhang aij->a = a; 694222ddf1SHong Zhang aij->nonew = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */ 70cbc6b225SStefano Zampini /* default to not retain ownership */ 71cbc6b225SStefano Zampini aij->singlemalloc = PETSC_FALSE; 724222ddf1SHong Zhang aij->free_a = PETSC_FALSE; 734222ddf1SHong Zhang aij->free_ij = PETSC_FALSE; 749566063dSJacob Faibussowitsch PetscCall(MatCheckCompressedRow(mat, aij->nonzerorowcnt, &aij->compressedrow, aij->i, m, 0.6)); 753ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 765c66b693SKris Buschelman } 771c24bd37SHong Zhang 78d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C) 79d71ae5a4SJacob Faibussowitsch { 804222ddf1SHong Zhang Mat_Product *product = C->product; 814222ddf1SHong Zhang MatProductAlgorithm alg; 824222ddf1SHong Zhang PetscBool flg; 834222ddf1SHong Zhang 844222ddf1SHong Zhang PetscFunctionBegin; 854222ddf1SHong Zhang if (product) { 864222ddf1SHong Zhang alg = product->alg; 874222ddf1SHong Zhang } else { 884222ddf1SHong Zhang alg = "sorted"; 894222ddf1SHong Zhang } 904222ddf1SHong Zhang /* sorted */ 919566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(alg, "sorted", &flg)); 924222ddf1SHong Zhang if (flg) { 939566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A, B, fill, C)); 943ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 954222ddf1SHong Zhang } 964222ddf1SHong Zhang 974222ddf1SHong Zhang /* scalable */ 989566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(alg, "scalable", &flg)); 994222ddf1SHong Zhang if (flg) { 1009566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A, B, fill, C)); 1013ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1024222ddf1SHong Zhang } 1034222ddf1SHong Zhang 1044222ddf1SHong Zhang /* scalable_fast */ 1059566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(alg, "scalable_fast", &flg)); 1064222ddf1SHong Zhang if (flg) { 1079566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A, B, fill, C)); 1083ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1094222ddf1SHong Zhang } 1104222ddf1SHong Zhang 1114222ddf1SHong Zhang /* heap */ 1129566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(alg, "heap", &flg)); 1134222ddf1SHong Zhang if (flg) { 1149566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A, B, fill, C)); 1153ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1164222ddf1SHong Zhang } 1174222ddf1SHong Zhang 1184222ddf1SHong Zhang /* btheap */ 1199566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(alg, "btheap", &flg)); 1204222ddf1SHong Zhang if (flg) { 1219566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A, B, fill, C)); 1223ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1234222ddf1SHong Zhang } 1244222ddf1SHong Zhang 1254222ddf1SHong Zhang /* llcondensed */ 1269566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(alg, "llcondensed", &flg)); 1274222ddf1SHong Zhang if (flg) { 1289566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A, B, fill, C)); 1293ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1304222ddf1SHong Zhang } 1314222ddf1SHong Zhang 1324222ddf1SHong Zhang /* rowmerge */ 1339566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(alg, "rowmerge", &flg)); 1344222ddf1SHong Zhang if (flg) { 1359566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A, B, fill, C)); 1363ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1374222ddf1SHong Zhang } 1384222ddf1SHong Zhang 1394222ddf1SHong Zhang #if defined(PETSC_HAVE_HYPRE) 1409566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(alg, "hypre", &flg)); 1414222ddf1SHong Zhang if (flg) { 1429566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A, B, fill, C)); 1433ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1444222ddf1SHong Zhang } 1454222ddf1SHong Zhang #endif 1464222ddf1SHong Zhang 1474222ddf1SHong Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported"); 1484222ddf1SHong Zhang } 1494222ddf1SHong Zhang 150d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A, Mat B, PetscReal fill, Mat C) 151d71ae5a4SJacob Faibussowitsch { 152b561aa9dSHong Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 153971236abSHong Zhang PetscInt *ai = a->i, *bi = b->i, *ci, *cj; 154c1ba5575SJed Brown PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 155b561aa9dSHong Zhang PetscReal afill; 156eca6b86aSHong Zhang PetscInt i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax; 157eec179cfSJacob Faibussowitsch PetscHMapI ta; 158fb908031SHong Zhang PetscBT lnkbt; 1590298fd71SBarry Smith PetscFreeSpaceList free_space = NULL, current_space = NULL; 160b561aa9dSHong Zhang 161b561aa9dSHong Zhang PetscFunctionBegin; 162bd958071SHong Zhang /* Get ci and cj */ 163fb908031SHong Zhang /* Allocate ci array, arrays for fill computation and */ 164fb908031SHong Zhang /* free space for accumulating nonzero column info */ 1659566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(am + 2, &ci)); 166fb908031SHong Zhang ci[0] = 0; 167fb908031SHong Zhang 168fb908031SHong Zhang /* create and initialize a linked list */ 169eec179cfSJacob Faibussowitsch PetscCall(PetscHMapICreateWithSize(bn, &ta)); 170c373ccc6SHong Zhang MatRowMergeMax_SeqAIJ(b, bm, ta); 171eec179cfSJacob Faibussowitsch PetscCall(PetscHMapIGetSize(ta, &Crmax)); 172eec179cfSJacob Faibussowitsch PetscCall(PetscHMapIDestroy(&ta)); 173eca6b86aSHong Zhang 1749566063dSJacob Faibussowitsch PetscCall(PetscLLCondensedCreate(Crmax, bn, &lnk, &lnkbt)); 175fb908031SHong Zhang 176fb908031SHong Zhang /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 1779566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 1782205254eSKarl Rupp 179fb908031SHong Zhang current_space = free_space; 180fb908031SHong Zhang 181fb908031SHong Zhang /* Determine ci and cj */ 182fb908031SHong Zhang for (i = 0; i < am; i++) { 183fb908031SHong Zhang anzi = ai[i + 1] - ai[i]; 184fb908031SHong Zhang aj = a->j + ai[i]; 185fb908031SHong Zhang for (j = 0; j < anzi; j++) { 186fb908031SHong Zhang brow = aj[j]; 187fb908031SHong Zhang bnzj = bi[brow + 1] - bi[brow]; 188fb908031SHong Zhang bj = b->j + bi[brow]; 189fb908031SHong Zhang /* add non-zero cols of B into the sorted linked list lnk */ 1909566063dSJacob Faibussowitsch PetscCall(PetscLLCondensedAddSorted(bnzj, bj, lnk, lnkbt)); 191fb908031SHong Zhang } 1928c78258cSHong Zhang /* add possible missing diagonal entry */ 19348a46eb9SPierre Jolivet if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted(1, &i, lnk, lnkbt)); 194fb908031SHong Zhang cnzi = lnk[0]; 195fb908031SHong Zhang 196fb908031SHong Zhang /* If free space is not available, make more free space */ 197fb908031SHong Zhang /* Double the amount of total space in the list */ 198fb908031SHong Zhang if (current_space->local_remaining < cnzi) { 1999566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), ¤t_space)); 200fb908031SHong Zhang ndouble++; 201fb908031SHong Zhang } 202fb908031SHong Zhang 203fb908031SHong Zhang /* Copy data into free space, then initialize lnk */ 2049566063dSJacob Faibussowitsch PetscCall(PetscLLCondensedClean(bn, cnzi, current_space->array, lnk, lnkbt)); 2052205254eSKarl Rupp 206fb908031SHong Zhang current_space->array += cnzi; 207fb908031SHong Zhang current_space->local_used += cnzi; 208fb908031SHong Zhang current_space->local_remaining -= cnzi; 2092205254eSKarl Rupp 210fb908031SHong Zhang ci[i + 1] = ci[i] + cnzi; 211fb908031SHong Zhang } 212fb908031SHong Zhang 213fb908031SHong Zhang /* Column indices are in the list of free space */ 214fb908031SHong Zhang /* Allocate space for cj, initialize cj, and */ 215fb908031SHong Zhang /* destroy list of free space and other temporary array(s) */ 2169566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(ci[am] + 1, &cj)); 2179566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 2189566063dSJacob Faibussowitsch PetscCall(PetscLLCondensedDestroy(lnk, lnkbt)); 219b561aa9dSHong Zhang 22026be0446SHong Zhang /* put together the new symbolic matrix */ 2219566063dSJacob Faibussowitsch PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 2229566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(C, A, B)); 22358c24d83SHong Zhang 22458c24d83SHong Zhang /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 22558c24d83SHong Zhang /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 2264222ddf1SHong Zhang c = (Mat_SeqAIJ *)(C->data); 227aa1aec99SHong Zhang c->free_a = PETSC_FALSE; 228e6b907acSBarry Smith c->free_ij = PETSC_TRUE; 22958c24d83SHong Zhang c->nonew = 0; 2304222ddf1SHong Zhang 2314222ddf1SHong Zhang /* fast, needs non-scalable O(bn) array 'abdense' */ 2324222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 2330b7e3e3dSHong Zhang 2347212da7cSHong Zhang /* set MatInfo */ 2357212da7cSHong Zhang afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 236f2b054eeSHong Zhang if (afill < 1.0) afill = 1.0; 2374222ddf1SHong Zhang C->info.mallocs = ndouble; 2384222ddf1SHong Zhang C->info.fill_ratio_given = fill; 2394222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 2407212da7cSHong Zhang 2417212da7cSHong Zhang #if defined(PETSC_USE_INFO) 2427212da7cSHong Zhang if (ci[am]) { 2439566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 2449566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 245f2b054eeSHong Zhang } else { 2469566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Empty matrix product\n")); 247be0fcf8dSHong Zhang } 248f2b054eeSHong Zhang #endif 2493ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 25058c24d83SHong Zhang } 251d50806bdSBarry Smith 252d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, Mat C) 253d71ae5a4SJacob Faibussowitsch { 254d13dce4bSSatish Balay PetscLogDouble flops = 0.0; 255d50806bdSBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 256d50806bdSBarry Smith Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 257d50806bdSBarry Smith Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 25838baddfdSBarry Smith PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j; 259c58ca2e3SHong Zhang PetscInt am = A->rmap->n, cm = C->rmap->n; 260519eb980SHong Zhang PetscInt i, j, k, anzi, bnzi, cnzi, brow; 2612e5835c6SStefano Zampini PetscScalar *ca, valtmp; 262aa1aec99SHong Zhang PetscScalar *ab_dense; 2636718818eSStefano Zampini PetscContainer cab_dense; 2642e5835c6SStefano Zampini const PetscScalar *aa, *ba, *baj; 265d50806bdSBarry Smith 266d50806bdSBarry Smith PetscFunctionBegin; 2679566063dSJacob Faibussowitsch PetscCall(MatSeqAIJGetArrayRead(A, &aa)); 2689566063dSJacob Faibussowitsch PetscCall(MatSeqAIJGetArrayRead(B, &ba)); 269aa1aec99SHong Zhang if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */ 2709566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(ci[cm] + 1, &ca)); 271aa1aec99SHong Zhang c->a = ca; 272aa1aec99SHong Zhang c->free_a = PETSC_TRUE; 2736718818eSStefano Zampini } else ca = c->a; 2746718818eSStefano Zampini 2756718818eSStefano Zampini /* TODO this should be done in the symbolic phase */ 2766718818eSStefano Zampini /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers 2776718818eSStefano Zampini that is hard to eradicate) */ 2789566063dSJacob Faibussowitsch PetscCall(PetscObjectQuery((PetscObject)C, "__PETSc__ab_dense", (PetscObject *)&cab_dense)); 2796718818eSStefano Zampini if (!cab_dense) { 2809566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(B->cmap->N, &ab_dense)); 2819566063dSJacob Faibussowitsch PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &cab_dense)); 2829566063dSJacob Faibussowitsch PetscCall(PetscContainerSetPointer(cab_dense, ab_dense)); 2839566063dSJacob Faibussowitsch PetscCall(PetscContainerSetUserDestroy(cab_dense, PetscContainerUserDestroyDefault)); 2849566063dSJacob Faibussowitsch PetscCall(PetscObjectCompose((PetscObject)C, "__PETSc__ab_dense", (PetscObject)cab_dense)); 2859566063dSJacob Faibussowitsch PetscCall(PetscObjectDereference((PetscObject)cab_dense)); 286d90d86d0SMatthew G. Knepley } 2879566063dSJacob Faibussowitsch PetscCall(PetscContainerGetPointer(cab_dense, (void **)&ab_dense)); 2889566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(ab_dense, B->cmap->N)); 289aa1aec99SHong Zhang 290c124e916SHong Zhang /* clean old values in C */ 2919566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(ca, ci[cm])); 292d50806bdSBarry Smith /* Traverse A row-wise. */ 293d50806bdSBarry Smith /* Build the ith row in C by summing over nonzero columns in A, */ 294d50806bdSBarry Smith /* the rows of B corresponding to nonzeros of A. */ 295d50806bdSBarry Smith for (i = 0; i < am; i++) { 296d50806bdSBarry Smith anzi = ai[i + 1] - ai[i]; 297d50806bdSBarry Smith for (j = 0; j < anzi; j++) { 298519eb980SHong Zhang brow = aj[j]; 299d50806bdSBarry Smith bnzi = bi[brow + 1] - bi[brow]; 300d50806bdSBarry Smith bjj = bj + bi[brow]; 301d50806bdSBarry Smith baj = ba + bi[brow]; 302519eb980SHong Zhang /* perform dense axpy */ 30336ec6d2dSHong Zhang valtmp = aa[j]; 304ad540459SPierre Jolivet for (k = 0; k < bnzi; k++) ab_dense[bjj[k]] += valtmp * baj[k]; 305519eb980SHong Zhang flops += 2 * bnzi; 306519eb980SHong Zhang } 3079371c9d4SSatish Balay aj += anzi; 3089371c9d4SSatish Balay aa += anzi; 309c58ca2e3SHong Zhang 310c58ca2e3SHong Zhang cnzi = ci[i + 1] - ci[i]; 311519eb980SHong Zhang for (k = 0; k < cnzi; k++) { 312519eb980SHong Zhang ca[k] += ab_dense[cj[k]]; 313519eb980SHong Zhang ab_dense[cj[k]] = 0.0; /* zero ab_dense */ 314519eb980SHong Zhang } 315519eb980SHong Zhang flops += cnzi; 3169371c9d4SSatish Balay cj += cnzi; 3179371c9d4SSatish Balay ca += cnzi; 318519eb980SHong Zhang } 3192e5835c6SStefano Zampini #if defined(PETSC_HAVE_DEVICE) 3202e5835c6SStefano Zampini if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU; 3212e5835c6SStefano Zampini #endif 3229566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 3239566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 3249566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(flops)); 3259566063dSJacob Faibussowitsch PetscCall(MatSeqAIJRestoreArrayRead(A, &aa)); 3269566063dSJacob Faibussowitsch PetscCall(MatSeqAIJRestoreArrayRead(B, &ba)); 3273ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 328c58ca2e3SHong Zhang } 329c58ca2e3SHong Zhang 330d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, Mat C) 331d71ae5a4SJacob Faibussowitsch { 332c58ca2e3SHong Zhang PetscLogDouble flops = 0.0; 333c58ca2e3SHong Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 334c58ca2e3SHong Zhang Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 335c58ca2e3SHong Zhang Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 336c58ca2e3SHong Zhang PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j; 337c58ca2e3SHong Zhang PetscInt am = A->rmap->N, cm = C->rmap->N; 338c58ca2e3SHong Zhang PetscInt i, j, k, anzi, bnzi, cnzi, brow; 3392e5835c6SStefano Zampini PetscScalar *ca = c->a, valtmp; 3402e5835c6SStefano Zampini const PetscScalar *aa, *ba, *baj; 341c58ca2e3SHong Zhang PetscInt nextb; 342c58ca2e3SHong Zhang 343c58ca2e3SHong Zhang PetscFunctionBegin; 3449566063dSJacob Faibussowitsch PetscCall(MatSeqAIJGetArrayRead(A, &aa)); 3459566063dSJacob Faibussowitsch PetscCall(MatSeqAIJGetArrayRead(B, &ba)); 346cf742fccSHong Zhang if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */ 3479566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(ci[cm] + 1, &ca)); 348cf742fccSHong Zhang c->a = ca; 349cf742fccSHong Zhang c->free_a = PETSC_TRUE; 350cf742fccSHong Zhang } 351cf742fccSHong Zhang 352c58ca2e3SHong Zhang /* clean old values in C */ 3539566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(ca, ci[cm])); 354c58ca2e3SHong Zhang /* Traverse A row-wise. */ 355c58ca2e3SHong Zhang /* Build the ith row in C by summing over nonzero columns in A, */ 356c58ca2e3SHong Zhang /* the rows of B corresponding to nonzeros of A. */ 357519eb980SHong Zhang for (i = 0; i < am; i++) { 358519eb980SHong Zhang anzi = ai[i + 1] - ai[i]; 359519eb980SHong Zhang cnzi = ci[i + 1] - ci[i]; 360519eb980SHong Zhang for (j = 0; j < anzi; j++) { 361519eb980SHong Zhang brow = aj[j]; 362519eb980SHong Zhang bnzi = bi[brow + 1] - bi[brow]; 363519eb980SHong Zhang bjj = bj + bi[brow]; 364519eb980SHong Zhang baj = ba + bi[brow]; 365519eb980SHong Zhang /* perform sparse axpy */ 36636ec6d2dSHong Zhang valtmp = aa[j]; 367c124e916SHong Zhang nextb = 0; 368c124e916SHong Zhang for (k = 0; nextb < bnzi; k++) { 369c124e916SHong Zhang if (cj[k] == bjj[nextb]) { /* ccol == bcol */ 37036ec6d2dSHong Zhang ca[k] += valtmp * baj[nextb++]; 371c124e916SHong Zhang } 372d50806bdSBarry Smith } 373d50806bdSBarry Smith flops += 2 * bnzi; 374d50806bdSBarry Smith } 3759371c9d4SSatish Balay aj += anzi; 3769371c9d4SSatish Balay aa += anzi; 3779371c9d4SSatish Balay cj += cnzi; 3789371c9d4SSatish Balay ca += cnzi; 379d50806bdSBarry Smith } 3802e5835c6SStefano Zampini #if defined(PETSC_HAVE_DEVICE) 3812e5835c6SStefano Zampini if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU; 3822e5835c6SStefano Zampini #endif 3839566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 3849566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 3859566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(flops)); 3869566063dSJacob Faibussowitsch PetscCall(MatSeqAIJRestoreArrayRead(A, &aa)); 3879566063dSJacob Faibussowitsch PetscCall(MatSeqAIJRestoreArrayRead(B, &ba)); 3883ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 389d50806bdSBarry Smith } 390bc011b1eSHong Zhang 391d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A, Mat B, PetscReal fill, Mat C) 392d71ae5a4SJacob Faibussowitsch { 39325296bd5SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 39425296bd5SBarry Smith PetscInt *ai = a->i, *bi = b->i, *ci, *cj; 3953c50cad2SHong Zhang PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 39625296bd5SBarry Smith MatScalar *ca; 39725296bd5SBarry Smith PetscReal afill; 398eca6b86aSHong Zhang PetscInt i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax; 399eec179cfSJacob Faibussowitsch PetscHMapI ta; 4000298fd71SBarry Smith PetscFreeSpaceList free_space = NULL, current_space = NULL; 40125296bd5SBarry Smith 40225296bd5SBarry Smith PetscFunctionBegin; 4033c50cad2SHong Zhang /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */ 4043c50cad2SHong Zhang /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 4059566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(am + 2, &ci)); 4063c50cad2SHong Zhang ci[0] = 0; 4073c50cad2SHong Zhang 4083c50cad2SHong Zhang /* create and initialize a linked list */ 409eec179cfSJacob Faibussowitsch PetscCall(PetscHMapICreateWithSize(bn, &ta)); 410c373ccc6SHong Zhang MatRowMergeMax_SeqAIJ(b, bm, ta); 411eec179cfSJacob Faibussowitsch PetscCall(PetscHMapIGetSize(ta, &Crmax)); 412eec179cfSJacob Faibussowitsch PetscCall(PetscHMapIDestroy(&ta)); 413eca6b86aSHong Zhang 4149566063dSJacob Faibussowitsch PetscCall(PetscLLCondensedCreate_fast(Crmax, &lnk)); 4153c50cad2SHong Zhang 4163c50cad2SHong Zhang /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 4179566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 4183c50cad2SHong Zhang current_space = free_space; 4193c50cad2SHong Zhang 4203c50cad2SHong Zhang /* Determine ci and cj */ 4213c50cad2SHong Zhang for (i = 0; i < am; i++) { 4223c50cad2SHong Zhang anzi = ai[i + 1] - ai[i]; 4233c50cad2SHong Zhang aj = a->j + ai[i]; 4243c50cad2SHong Zhang for (j = 0; j < anzi; j++) { 4253c50cad2SHong Zhang brow = aj[j]; 4263c50cad2SHong Zhang bnzj = bi[brow + 1] - bi[brow]; 4273c50cad2SHong Zhang bj = b->j + bi[brow]; 4283c50cad2SHong Zhang /* add non-zero cols of B into the sorted linked list lnk */ 4299566063dSJacob Faibussowitsch PetscCall(PetscLLCondensedAddSorted_fast(bnzj, bj, lnk)); 4303c50cad2SHong Zhang } 4318c78258cSHong Zhang /* add possible missing diagonal entry */ 43248a46eb9SPierre Jolivet if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_fast(1, &i, lnk)); 4333c50cad2SHong Zhang cnzi = lnk[1]; 4343c50cad2SHong Zhang 4353c50cad2SHong Zhang /* If free space is not available, make more free space */ 4363c50cad2SHong Zhang /* Double the amount of total space in the list */ 4373c50cad2SHong Zhang if (current_space->local_remaining < cnzi) { 4389566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), ¤t_space)); 4393c50cad2SHong Zhang ndouble++; 4403c50cad2SHong Zhang } 4413c50cad2SHong Zhang 4423c50cad2SHong Zhang /* Copy data into free space, then initialize lnk */ 4439566063dSJacob Faibussowitsch PetscCall(PetscLLCondensedClean_fast(cnzi, current_space->array, lnk)); 4442205254eSKarl Rupp 4453c50cad2SHong Zhang current_space->array += cnzi; 4463c50cad2SHong Zhang current_space->local_used += cnzi; 4473c50cad2SHong Zhang current_space->local_remaining -= cnzi; 4482205254eSKarl Rupp 4493c50cad2SHong Zhang ci[i + 1] = ci[i] + cnzi; 4503c50cad2SHong Zhang } 4513c50cad2SHong Zhang 4523c50cad2SHong Zhang /* Column indices are in the list of free space */ 4533c50cad2SHong Zhang /* Allocate space for cj, initialize cj, and */ 4543c50cad2SHong Zhang /* destroy list of free space and other temporary array(s) */ 4559566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(ci[am] + 1, &cj)); 4569566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 4579566063dSJacob Faibussowitsch PetscCall(PetscLLCondensedDestroy_fast(lnk)); 45825296bd5SBarry Smith 45925296bd5SBarry Smith /* Allocate space for ca */ 4609566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(ci[am] + 1, &ca)); 46125296bd5SBarry Smith 46225296bd5SBarry Smith /* put together the new symbolic matrix */ 4639566063dSJacob Faibussowitsch PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C)); 4649566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(C, A, B)); 46525296bd5SBarry Smith 46625296bd5SBarry Smith /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 46725296bd5SBarry Smith /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 4684222ddf1SHong Zhang c = (Mat_SeqAIJ *)(C->data); 46925296bd5SBarry Smith c->free_a = PETSC_TRUE; 47025296bd5SBarry Smith c->free_ij = PETSC_TRUE; 47125296bd5SBarry Smith c->nonew = 0; 4722205254eSKarl Rupp 4734222ddf1SHong Zhang /* slower, less memory */ 4744222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; 47525296bd5SBarry Smith 47625296bd5SBarry Smith /* set MatInfo */ 47725296bd5SBarry Smith afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 47825296bd5SBarry Smith if (afill < 1.0) afill = 1.0; 4794222ddf1SHong Zhang C->info.mallocs = ndouble; 4804222ddf1SHong Zhang C->info.fill_ratio_given = fill; 4814222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 48225296bd5SBarry Smith 48325296bd5SBarry Smith #if defined(PETSC_USE_INFO) 48425296bd5SBarry Smith if (ci[am]) { 4859566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 4869566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 48725296bd5SBarry Smith } else { 4889566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Empty matrix product\n")); 48925296bd5SBarry Smith } 49025296bd5SBarry Smith #endif 4913ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 49225296bd5SBarry Smith } 49325296bd5SBarry Smith 494d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, PetscReal fill, Mat C) 495d71ae5a4SJacob Faibussowitsch { 496e9e4536cSHong Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 497bf9555e6SHong Zhang PetscInt *ai = a->i, *bi = b->i, *ci, *cj; 49825c41797SHong Zhang PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 499e9e4536cSHong Zhang MatScalar *ca; 500e9e4536cSHong Zhang PetscReal afill; 501eca6b86aSHong Zhang PetscInt i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax; 502eec179cfSJacob Faibussowitsch PetscHMapI ta; 5030298fd71SBarry Smith PetscFreeSpaceList free_space = NULL, current_space = NULL; 504e9e4536cSHong Zhang 505e9e4536cSHong Zhang PetscFunctionBegin; 506bd958071SHong Zhang /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */ 507bd958071SHong Zhang /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 5089566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(am + 2, &ci)); 509bd958071SHong Zhang ci[0] = 0; 510bd958071SHong Zhang 511bd958071SHong Zhang /* create and initialize a linked list */ 512eec179cfSJacob Faibussowitsch PetscCall(PetscHMapICreateWithSize(bn, &ta)); 513c373ccc6SHong Zhang MatRowMergeMax_SeqAIJ(b, bm, ta); 514eec179cfSJacob Faibussowitsch PetscCall(PetscHMapIGetSize(ta, &Crmax)); 515eec179cfSJacob Faibussowitsch PetscCall(PetscHMapIDestroy(&ta)); 5169566063dSJacob Faibussowitsch PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk)); 517bd958071SHong Zhang 518bd958071SHong Zhang /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 5199566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 520bd958071SHong Zhang current_space = free_space; 521bd958071SHong Zhang 522bd958071SHong Zhang /* Determine ci and cj */ 523bd958071SHong Zhang for (i = 0; i < am; i++) { 524bd958071SHong Zhang anzi = ai[i + 1] - ai[i]; 525bd958071SHong Zhang aj = a->j + ai[i]; 526bd958071SHong Zhang for (j = 0; j < anzi; j++) { 527bd958071SHong Zhang brow = aj[j]; 528bd958071SHong Zhang bnzj = bi[brow + 1] - bi[brow]; 529bd958071SHong Zhang bj = b->j + bi[brow]; 530bd958071SHong Zhang /* add non-zero cols of B into the sorted linked list lnk */ 5319566063dSJacob Faibussowitsch PetscCall(PetscLLCondensedAddSorted_Scalable(bnzj, bj, lnk)); 532bd958071SHong Zhang } 5338c78258cSHong Zhang /* add possible missing diagonal entry */ 53448a46eb9SPierre Jolivet if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_Scalable(1, &i, lnk)); 5358c78258cSHong Zhang 536bd958071SHong Zhang cnzi = lnk[0]; 537bd958071SHong Zhang 538bd958071SHong Zhang /* If free space is not available, make more free space */ 539bd958071SHong Zhang /* Double the amount of total space in the list */ 540bd958071SHong Zhang if (current_space->local_remaining < cnzi) { 5419566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), ¤t_space)); 542bd958071SHong Zhang ndouble++; 543bd958071SHong Zhang } 544bd958071SHong Zhang 545bd958071SHong Zhang /* Copy data into free space, then initialize lnk */ 5469566063dSJacob Faibussowitsch PetscCall(PetscLLCondensedClean_Scalable(cnzi, current_space->array, lnk)); 5472205254eSKarl Rupp 548bd958071SHong Zhang current_space->array += cnzi; 549bd958071SHong Zhang current_space->local_used += cnzi; 550bd958071SHong Zhang current_space->local_remaining -= cnzi; 5512205254eSKarl Rupp 552bd958071SHong Zhang ci[i + 1] = ci[i] + cnzi; 553bd958071SHong Zhang } 554bd958071SHong Zhang 555bd958071SHong Zhang /* Column indices are in the list of free space */ 556bd958071SHong Zhang /* Allocate space for cj, initialize cj, and */ 557bd958071SHong Zhang /* destroy list of free space and other temporary array(s) */ 5589566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(ci[am] + 1, &cj)); 5599566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 5609566063dSJacob Faibussowitsch PetscCall(PetscLLCondensedDestroy_Scalable(lnk)); 561e9e4536cSHong Zhang 562e9e4536cSHong Zhang /* Allocate space for ca */ 5639566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(ci[am] + 1, &ca)); 564e9e4536cSHong Zhang 565e9e4536cSHong Zhang /* put together the new symbolic matrix */ 5669566063dSJacob Faibussowitsch PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C)); 5679566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(C, A, B)); 568e9e4536cSHong Zhang 569e9e4536cSHong Zhang /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 570e9e4536cSHong Zhang /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 5714222ddf1SHong Zhang c = (Mat_SeqAIJ *)(C->data); 572e9e4536cSHong Zhang c->free_a = PETSC_TRUE; 573e9e4536cSHong Zhang c->free_ij = PETSC_TRUE; 574e9e4536cSHong Zhang c->nonew = 0; 5752205254eSKarl Rupp 5764222ddf1SHong Zhang /* slower, less memory */ 5774222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; 578e9e4536cSHong Zhang 579e9e4536cSHong Zhang /* set MatInfo */ 580e9e4536cSHong Zhang afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 581e9e4536cSHong Zhang if (afill < 1.0) afill = 1.0; 5824222ddf1SHong Zhang C->info.mallocs = ndouble; 5834222ddf1SHong Zhang C->info.fill_ratio_given = fill; 5844222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 585e9e4536cSHong Zhang 586e9e4536cSHong Zhang #if defined(PETSC_USE_INFO) 587e9e4536cSHong Zhang if (ci[am]) { 5889566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 5899566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 590e9e4536cSHong Zhang } else { 5919566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Empty matrix product\n")); 592e9e4536cSHong Zhang } 593e9e4536cSHong Zhang #endif 5943ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 595e9e4536cSHong Zhang } 596e9e4536cSHong Zhang 597d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A, Mat B, PetscReal fill, Mat C) 598d71ae5a4SJacob Faibussowitsch { 5990ced3a2bSJed Brown Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 6000ced3a2bSJed Brown const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j; 6010ced3a2bSJed Brown PetscInt *ci, *cj, *bb; 6020ced3a2bSJed Brown PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 6030ced3a2bSJed Brown PetscReal afill; 6040ced3a2bSJed Brown PetscInt i, j, col, ndouble = 0; 6050298fd71SBarry Smith PetscFreeSpaceList free_space = NULL, current_space = NULL; 6060ced3a2bSJed Brown PetscHeap h; 6070ced3a2bSJed Brown 6080ced3a2bSJed Brown PetscFunctionBegin; 609cd093f1dSJed Brown /* Get ci and cj - by merging sorted rows using a heap */ 6100ced3a2bSJed Brown /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 6119566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(am + 2, &ci)); 6120ced3a2bSJed Brown ci[0] = 0; 6130ced3a2bSJed Brown 6140ced3a2bSJed Brown /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 6159566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 6160ced3a2bSJed Brown current_space = free_space; 6170ced3a2bSJed Brown 6189566063dSJacob Faibussowitsch PetscCall(PetscHeapCreate(a->rmax, &h)); 6199566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(a->rmax, &bb)); 6200ced3a2bSJed Brown 6210ced3a2bSJed Brown /* Determine ci and cj */ 6220ced3a2bSJed Brown for (i = 0; i < am; i++) { 6230ced3a2bSJed Brown const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 6240ced3a2bSJed Brown const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 6250ced3a2bSJed Brown ci[i + 1] = ci[i]; 6260ced3a2bSJed Brown /* Populate the min heap */ 6270ced3a2bSJed Brown for (j = 0; j < anzi; j++) { 6280ced3a2bSJed Brown bb[j] = bi[acol[j]]; /* bb points at the start of the row */ 6290ced3a2bSJed Brown if (bb[j] < bi[acol[j] + 1]) { /* Add if row is nonempty */ 6309566063dSJacob Faibussowitsch PetscCall(PetscHeapAdd(h, j, bj[bb[j]++])); 6310ced3a2bSJed Brown } 6320ced3a2bSJed Brown } 6330ced3a2bSJed Brown /* Pick off the min element, adding it to free space */ 6349566063dSJacob Faibussowitsch PetscCall(PetscHeapPop(h, &j, &col)); 6350ced3a2bSJed Brown while (j >= 0) { 6360ced3a2bSJed Brown if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */ 6379566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), ¤t_space)); 6380ced3a2bSJed Brown ndouble++; 6390ced3a2bSJed Brown } 6400ced3a2bSJed Brown *(current_space->array++) = col; 6410ced3a2bSJed Brown current_space->local_used++; 6420ced3a2bSJed Brown current_space->local_remaining--; 6430ced3a2bSJed Brown ci[i + 1]++; 6440ced3a2bSJed Brown 6450ced3a2bSJed Brown /* stash if anything else remains in this row of B */ 6469566063dSJacob Faibussowitsch if (bb[j] < bi[acol[j] + 1]) PetscCall(PetscHeapStash(h, j, bj[bb[j]++])); 6470ced3a2bSJed Brown while (1) { /* pop and stash any other rows of B that also had an entry in this column */ 6480ced3a2bSJed Brown PetscInt j2, col2; 6499566063dSJacob Faibussowitsch PetscCall(PetscHeapPeek(h, &j2, &col2)); 6500ced3a2bSJed Brown if (col2 != col) break; 6519566063dSJacob Faibussowitsch PetscCall(PetscHeapPop(h, &j2, &col2)); 6529566063dSJacob Faibussowitsch if (bb[j2] < bi[acol[j2] + 1]) PetscCall(PetscHeapStash(h, j2, bj[bb[j2]++])); 6530ced3a2bSJed Brown } 6540ced3a2bSJed Brown /* Put any stashed elements back into the min heap */ 6559566063dSJacob Faibussowitsch PetscCall(PetscHeapUnstash(h)); 6569566063dSJacob Faibussowitsch PetscCall(PetscHeapPop(h, &j, &col)); 6570ced3a2bSJed Brown } 6580ced3a2bSJed Brown } 6599566063dSJacob Faibussowitsch PetscCall(PetscFree(bb)); 6609566063dSJacob Faibussowitsch PetscCall(PetscHeapDestroy(&h)); 6610ced3a2bSJed Brown 6620ced3a2bSJed Brown /* Column indices are in the list of free space */ 6630ced3a2bSJed Brown /* Allocate space for cj, initialize cj, and */ 6640ced3a2bSJed Brown /* destroy list of free space and other temporary array(s) */ 6659566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(ci[am], &cj)); 6669566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 6670ced3a2bSJed Brown 6680ced3a2bSJed Brown /* put together the new symbolic matrix */ 6699566063dSJacob Faibussowitsch PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 6709566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(C, A, B)); 6710ced3a2bSJed Brown 6720ced3a2bSJed Brown /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 6730ced3a2bSJed Brown /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 6744222ddf1SHong Zhang c = (Mat_SeqAIJ *)(C->data); 6750ced3a2bSJed Brown c->free_a = PETSC_TRUE; 6760ced3a2bSJed Brown c->free_ij = PETSC_TRUE; 6770ced3a2bSJed Brown c->nonew = 0; 67826fbe8dcSKarl Rupp 6794222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 6800ced3a2bSJed Brown 6810ced3a2bSJed Brown /* set MatInfo */ 6820ced3a2bSJed Brown afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 6830ced3a2bSJed Brown if (afill < 1.0) afill = 1.0; 6844222ddf1SHong Zhang C->info.mallocs = ndouble; 6854222ddf1SHong Zhang C->info.fill_ratio_given = fill; 6864222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 6870ced3a2bSJed Brown 6880ced3a2bSJed Brown #if defined(PETSC_USE_INFO) 6890ced3a2bSJed Brown if (ci[am]) { 6909566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 6919566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 6920ced3a2bSJed Brown } else { 6939566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Empty matrix product\n")); 6940ced3a2bSJed Brown } 6950ced3a2bSJed Brown #endif 6963ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 6970ced3a2bSJed Brown } 698e9e4536cSHong Zhang 699d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A, Mat B, PetscReal fill, Mat C) 700d71ae5a4SJacob Faibussowitsch { 7018a07c6f1SJed Brown Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 7028a07c6f1SJed Brown const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j; 7038a07c6f1SJed Brown PetscInt *ci, *cj, *bb; 7048a07c6f1SJed Brown PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 7058a07c6f1SJed Brown PetscReal afill; 7068a07c6f1SJed Brown PetscInt i, j, col, ndouble = 0; 7070298fd71SBarry Smith PetscFreeSpaceList free_space = NULL, current_space = NULL; 7088a07c6f1SJed Brown PetscHeap h; 7098a07c6f1SJed Brown PetscBT bt; 7108a07c6f1SJed Brown 7118a07c6f1SJed Brown PetscFunctionBegin; 712cd093f1dSJed Brown /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */ 7138a07c6f1SJed Brown /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 7149566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(am + 2, &ci)); 7158a07c6f1SJed Brown ci[0] = 0; 7168a07c6f1SJed Brown 7178a07c6f1SJed Brown /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 7189566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 7192205254eSKarl Rupp 7208a07c6f1SJed Brown current_space = free_space; 7218a07c6f1SJed Brown 7229566063dSJacob Faibussowitsch PetscCall(PetscHeapCreate(a->rmax, &h)); 7239566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(a->rmax, &bb)); 7249566063dSJacob Faibussowitsch PetscCall(PetscBTCreate(bn, &bt)); 7258a07c6f1SJed Brown 7268a07c6f1SJed Brown /* Determine ci and cj */ 7278a07c6f1SJed Brown for (i = 0; i < am; i++) { 7288a07c6f1SJed Brown const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 7298a07c6f1SJed Brown const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 7308a07c6f1SJed Brown const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */ 7318a07c6f1SJed Brown ci[i + 1] = ci[i]; 7328a07c6f1SJed Brown /* Populate the min heap */ 7338a07c6f1SJed Brown for (j = 0; j < anzi; j++) { 7348a07c6f1SJed Brown PetscInt brow = acol[j]; 7358a07c6f1SJed Brown for (bb[j] = bi[brow]; bb[j] < bi[brow + 1]; bb[j]++) { 7368a07c6f1SJed Brown PetscInt bcol = bj[bb[j]]; 7378a07c6f1SJed Brown if (!PetscBTLookupSet(bt, bcol)) { /* new entry */ 7389566063dSJacob Faibussowitsch PetscCall(PetscHeapAdd(h, j, bcol)); 7398a07c6f1SJed Brown bb[j]++; 7408a07c6f1SJed Brown break; 7418a07c6f1SJed Brown } 7428a07c6f1SJed Brown } 7438a07c6f1SJed Brown } 7448a07c6f1SJed Brown /* Pick off the min element, adding it to free space */ 7459566063dSJacob Faibussowitsch PetscCall(PetscHeapPop(h, &j, &col)); 7468a07c6f1SJed Brown while (j >= 0) { 7478a07c6f1SJed Brown if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */ 7480298fd71SBarry Smith fptr = NULL; /* need PetscBTMemzero */ 7499566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), ¤t_space)); 7508a07c6f1SJed Brown ndouble++; 7518a07c6f1SJed Brown } 7528a07c6f1SJed Brown *(current_space->array++) = col; 7538a07c6f1SJed Brown current_space->local_used++; 7548a07c6f1SJed Brown current_space->local_remaining--; 7558a07c6f1SJed Brown ci[i + 1]++; 7568a07c6f1SJed Brown 7578a07c6f1SJed Brown /* stash if anything else remains in this row of B */ 7588a07c6f1SJed Brown for (; bb[j] < bi[acol[j] + 1]; bb[j]++) { 7598a07c6f1SJed Brown PetscInt bcol = bj[bb[j]]; 7608a07c6f1SJed Brown if (!PetscBTLookupSet(bt, bcol)) { /* new entry */ 7619566063dSJacob Faibussowitsch PetscCall(PetscHeapAdd(h, j, bcol)); 7628a07c6f1SJed Brown bb[j]++; 7638a07c6f1SJed Brown break; 7648a07c6f1SJed Brown } 7658a07c6f1SJed Brown } 7669566063dSJacob Faibussowitsch PetscCall(PetscHeapPop(h, &j, &col)); 7678a07c6f1SJed Brown } 7688a07c6f1SJed Brown if (fptr) { /* Clear the bits for this row */ 7699566063dSJacob Faibussowitsch for (; fptr < current_space->array; fptr++) PetscCall(PetscBTClear(bt, *fptr)); 7708a07c6f1SJed Brown } else { /* We reallocated so we don't remember (easily) how to clear only the bits we changed */ 7719566063dSJacob Faibussowitsch PetscCall(PetscBTMemzero(bn, bt)); 7728a07c6f1SJed Brown } 7738a07c6f1SJed Brown } 7749566063dSJacob Faibussowitsch PetscCall(PetscFree(bb)); 7759566063dSJacob Faibussowitsch PetscCall(PetscHeapDestroy(&h)); 7769566063dSJacob Faibussowitsch PetscCall(PetscBTDestroy(&bt)); 7778a07c6f1SJed Brown 7788a07c6f1SJed Brown /* Column indices are in the list of free space */ 7798a07c6f1SJed Brown /* Allocate space for cj, initialize cj, and */ 7808a07c6f1SJed Brown /* destroy list of free space and other temporary array(s) */ 7819566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(ci[am], &cj)); 7829566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 7838a07c6f1SJed Brown 7848a07c6f1SJed Brown /* put together the new symbolic matrix */ 7859566063dSJacob Faibussowitsch PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 7869566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(C, A, B)); 7878a07c6f1SJed Brown 7888a07c6f1SJed Brown /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 7898a07c6f1SJed Brown /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 7904222ddf1SHong Zhang c = (Mat_SeqAIJ *)(C->data); 7918a07c6f1SJed Brown c->free_a = PETSC_TRUE; 7928a07c6f1SJed Brown c->free_ij = PETSC_TRUE; 7938a07c6f1SJed Brown c->nonew = 0; 79426fbe8dcSKarl Rupp 7954222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 7968a07c6f1SJed Brown 7978a07c6f1SJed Brown /* set MatInfo */ 7988a07c6f1SJed Brown afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 7998a07c6f1SJed Brown if (afill < 1.0) afill = 1.0; 8004222ddf1SHong Zhang C->info.mallocs = ndouble; 8014222ddf1SHong Zhang C->info.fill_ratio_given = fill; 8024222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 8038a07c6f1SJed Brown 8048a07c6f1SJed Brown #if defined(PETSC_USE_INFO) 8058a07c6f1SJed Brown if (ci[am]) { 8069566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 8079566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 8088a07c6f1SJed Brown } else { 8099566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Empty matrix product\n")); 8108a07c6f1SJed Brown } 8118a07c6f1SJed Brown #endif 8123ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 8138a07c6f1SJed Brown } 8148a07c6f1SJed Brown 815d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A, Mat B, PetscReal fill, Mat C) 816d71ae5a4SJacob Faibussowitsch { 817d7ed1a4dSandi selinger Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 818d7ed1a4dSandi selinger const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j, *inputi, *inputj, *inputcol, *inputcol_L1; 819d7ed1a4dSandi selinger PetscInt *ci, *cj, *outputj, worki_L1[9], worki_L2[9]; 820d7ed1a4dSandi selinger PetscInt c_maxmem, a_maxrownnz = 0, a_rownnz; 821d7ed1a4dSandi selinger const PetscInt workcol[8] = {0, 1, 2, 3, 4, 5, 6, 7}; 822d7ed1a4dSandi selinger const PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 823d7ed1a4dSandi selinger const PetscInt *brow_ptr[8], *brow_end[8]; 824d7ed1a4dSandi selinger PetscInt window[8]; 825d7ed1a4dSandi selinger PetscInt window_min, old_window_min, ci_nnz, outputi_nnz = 0, L1_nrows, L2_nrows; 826d7ed1a4dSandi selinger PetscInt i, k, ndouble = 0, L1_rowsleft, rowsleft; 827d7ed1a4dSandi selinger PetscReal afill; 828f83700f2Sandi selinger PetscInt *workj_L1, *workj_L2, *workj_L3; 8297660c4dbSandi selinger PetscInt L1_nnz, L2_nnz; 830d7ed1a4dSandi selinger 831d7ed1a4dSandi selinger /* Step 1: Get upper bound on memory required for allocation. 832d7ed1a4dSandi selinger Because of the way virtual memory works, 833d7ed1a4dSandi selinger only the memory pages that are actually needed will be physically allocated. */ 834d7ed1a4dSandi selinger PetscFunctionBegin; 8359566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(am + 1, &ci)); 836d7ed1a4dSandi selinger for (i = 0; i < am; i++) { 837d7ed1a4dSandi selinger const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 838d7ed1a4dSandi selinger const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 839d7ed1a4dSandi selinger a_rownnz = 0; 840d7ed1a4dSandi selinger for (k = 0; k < anzi; ++k) { 841d7ed1a4dSandi selinger a_rownnz += bi[acol[k] + 1] - bi[acol[k]]; 842d7ed1a4dSandi selinger if (a_rownnz > bn) { 843d7ed1a4dSandi selinger a_rownnz = bn; 844d7ed1a4dSandi selinger break; 845d7ed1a4dSandi selinger } 846d7ed1a4dSandi selinger } 847d7ed1a4dSandi selinger a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz); 848d7ed1a4dSandi selinger } 849d7ed1a4dSandi selinger /* temporary work areas for merging rows */ 8509566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L1)); 8519566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L2)); 8529566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(a_maxrownnz, &workj_L3)); 853d7ed1a4dSandi selinger 8547660c4dbSandi selinger /* This should be enough for almost all matrices. If not, memory is reallocated later. */ 8557660c4dbSandi selinger c_maxmem = 8 * (ai[am] + bi[bm]); 856d7ed1a4dSandi selinger /* Step 2: Populate pattern for C */ 8579566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(c_maxmem, &cj)); 858d7ed1a4dSandi selinger 859d7ed1a4dSandi selinger ci_nnz = 0; 860d7ed1a4dSandi selinger ci[0] = 0; 861d7ed1a4dSandi selinger worki_L1[0] = 0; 862d7ed1a4dSandi selinger worki_L2[0] = 0; 863d7ed1a4dSandi selinger for (i = 0; i < am; i++) { 864d7ed1a4dSandi selinger const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 865d7ed1a4dSandi selinger const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 866d7ed1a4dSandi selinger rowsleft = anzi; 867d7ed1a4dSandi selinger inputcol_L1 = acol; 868d7ed1a4dSandi selinger L2_nnz = 0; 8697660c4dbSandi selinger L2_nrows = 1; /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1 */ 8707660c4dbSandi selinger worki_L2[1] = 0; 87108fe1b3cSKarl Rupp outputi_nnz = 0; 872d7ed1a4dSandi selinger 873d7ed1a4dSandi selinger /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem -> allocate more memory */ 874d7ed1a4dSandi selinger while (ci_nnz + a_maxrownnz > c_maxmem) { 875d7ed1a4dSandi selinger c_maxmem *= 2; 876d7ed1a4dSandi selinger ndouble++; 8779566063dSJacob Faibussowitsch PetscCall(PetscRealloc(sizeof(PetscInt) * c_maxmem, &cj)); 878d7ed1a4dSandi selinger } 879d7ed1a4dSandi selinger 880d7ed1a4dSandi selinger while (rowsleft) { 881d7ed1a4dSandi selinger L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */ 882d7ed1a4dSandi selinger L1_nrows = 0; 8837660c4dbSandi selinger L1_nnz = 0; 884d7ed1a4dSandi selinger inputcol = inputcol_L1; 885d7ed1a4dSandi selinger inputi = bi; 886d7ed1a4dSandi selinger inputj = bj; 887d7ed1a4dSandi selinger 888d7ed1a4dSandi selinger /* The following macro is used to specialize for small rows in A. 889d7ed1a4dSandi selinger This helps with compiler unrolling, improving performance substantially. 890f83700f2Sandi selinger Input: inputj inputi inputcol bn 891d7ed1a4dSandi selinger Output: outputj outputi_nnz */ 892d7ed1a4dSandi selinger #define MatMatMultSymbolic_RowMergeMacro(ANNZ) \ 893d7ed1a4dSandi selinger window_min = bn; \ 8947660c4dbSandi selinger outputi_nnz = 0; \ 8957660c4dbSandi selinger for (k = 0; k < ANNZ; ++k) { \ 896d7ed1a4dSandi selinger brow_ptr[k] = inputj + inputi[inputcol[k]]; \ 897d7ed1a4dSandi selinger brow_end[k] = inputj + inputi[inputcol[k] + 1]; \ 898d7ed1a4dSandi selinger window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \ 899d7ed1a4dSandi selinger window_min = PetscMin(window[k], window_min); \ 900d7ed1a4dSandi selinger } \ 901d7ed1a4dSandi selinger while (window_min < bn) { \ 902d7ed1a4dSandi selinger outputj[outputi_nnz++] = window_min; \ 903d7ed1a4dSandi selinger /* advance front and compute new minimum */ \ 904d7ed1a4dSandi selinger old_window_min = window_min; \ 905d7ed1a4dSandi selinger window_min = bn; \ 906d7ed1a4dSandi selinger for (k = 0; k < ANNZ; ++k) { \ 907d7ed1a4dSandi selinger if (window[k] == old_window_min) { \ 908d7ed1a4dSandi selinger brow_ptr[k]++; \ 909d7ed1a4dSandi selinger window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \ 910d7ed1a4dSandi selinger } \ 911d7ed1a4dSandi selinger window_min = PetscMin(window[k], window_min); \ 912d7ed1a4dSandi selinger } \ 913d7ed1a4dSandi selinger } 914d7ed1a4dSandi selinger 915d7ed1a4dSandi selinger /************** L E V E L 1 ***************/ 916d7ed1a4dSandi selinger /* Merge up to 8 rows of B to L1 work array*/ 917d7ed1a4dSandi selinger while (L1_rowsleft) { 9187660c4dbSandi selinger outputi_nnz = 0; 9197660c4dbSandi selinger if (anzi > 8) outputj = workj_L1 + L1_nnz; /* Level 1 rowmerge*/ 9207660c4dbSandi selinger else outputj = cj + ci_nnz; /* Merge directly to C */ 9217660c4dbSandi selinger 922d7ed1a4dSandi selinger switch (L1_rowsleft) { 9239371c9d4SSatish Balay case 1: 9249371c9d4SSatish Balay brow_ptr[0] = inputj + inputi[inputcol[0]]; 925d7ed1a4dSandi selinger brow_end[0] = inputj + inputi[inputcol[0] + 1]; 926d7ed1a4dSandi selinger for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 927d7ed1a4dSandi selinger inputcol += L1_rowsleft; 928d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 929d7ed1a4dSandi selinger L1_rowsleft = 0; 930d7ed1a4dSandi selinger break; 9319371c9d4SSatish Balay case 2: 9329371c9d4SSatish Balay MatMatMultSymbolic_RowMergeMacro(2); 933d7ed1a4dSandi selinger inputcol += L1_rowsleft; 934d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 935d7ed1a4dSandi selinger L1_rowsleft = 0; 936d7ed1a4dSandi selinger break; 9379371c9d4SSatish Balay case 3: 9389371c9d4SSatish Balay MatMatMultSymbolic_RowMergeMacro(3); 939d7ed1a4dSandi selinger inputcol += L1_rowsleft; 940d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 941d7ed1a4dSandi selinger L1_rowsleft = 0; 942d7ed1a4dSandi selinger break; 9439371c9d4SSatish Balay case 4: 9449371c9d4SSatish Balay MatMatMultSymbolic_RowMergeMacro(4); 945d7ed1a4dSandi selinger inputcol += L1_rowsleft; 946d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 947d7ed1a4dSandi selinger L1_rowsleft = 0; 948d7ed1a4dSandi selinger break; 9499371c9d4SSatish Balay case 5: 9509371c9d4SSatish Balay MatMatMultSymbolic_RowMergeMacro(5); 951d7ed1a4dSandi selinger inputcol += L1_rowsleft; 952d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 953d7ed1a4dSandi selinger L1_rowsleft = 0; 954d7ed1a4dSandi selinger break; 9559371c9d4SSatish Balay case 6: 9569371c9d4SSatish Balay MatMatMultSymbolic_RowMergeMacro(6); 957d7ed1a4dSandi selinger inputcol += L1_rowsleft; 958d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 959d7ed1a4dSandi selinger L1_rowsleft = 0; 960d7ed1a4dSandi selinger break; 9619371c9d4SSatish Balay case 7: 9629371c9d4SSatish Balay MatMatMultSymbolic_RowMergeMacro(7); 963d7ed1a4dSandi selinger inputcol += L1_rowsleft; 964d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 965d7ed1a4dSandi selinger L1_rowsleft = 0; 966d7ed1a4dSandi selinger break; 9679371c9d4SSatish Balay default: 9689371c9d4SSatish Balay MatMatMultSymbolic_RowMergeMacro(8); 969d7ed1a4dSandi selinger inputcol += 8; 970d7ed1a4dSandi selinger rowsleft -= 8; 971d7ed1a4dSandi selinger L1_rowsleft -= 8; 972d7ed1a4dSandi selinger break; 973d7ed1a4dSandi selinger } 974d7ed1a4dSandi selinger inputcol_L1 = inputcol; 9757660c4dbSandi selinger L1_nnz += outputi_nnz; 9767660c4dbSandi selinger worki_L1[++L1_nrows] = L1_nnz; 977d7ed1a4dSandi selinger } 978d7ed1a4dSandi selinger 979d7ed1a4dSandi selinger /********************** L E V E L 2 ************************/ 980d7ed1a4dSandi selinger /* Merge from L1 work array to either C or to L2 work array */ 981d7ed1a4dSandi selinger if (anzi > 8) { 982d7ed1a4dSandi selinger inputi = worki_L1; 983d7ed1a4dSandi selinger inputj = workj_L1; 984d7ed1a4dSandi selinger inputcol = workcol; 985d7ed1a4dSandi selinger outputi_nnz = 0; 986d7ed1a4dSandi selinger 987d7ed1a4dSandi selinger if (anzi <= 64) outputj = cj + ci_nnz; /* Merge from L1 work array to C */ 988d7ed1a4dSandi selinger else outputj = workj_L2 + L2_nnz; /* Merge from L1 work array to L2 work array */ 989d7ed1a4dSandi selinger 990d7ed1a4dSandi selinger switch (L1_nrows) { 9919371c9d4SSatish Balay case 1: 9929371c9d4SSatish Balay brow_ptr[0] = inputj + inputi[inputcol[0]]; 993d7ed1a4dSandi selinger brow_end[0] = inputj + inputi[inputcol[0] + 1]; 994d7ed1a4dSandi selinger for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 995d7ed1a4dSandi selinger break; 996d71ae5a4SJacob Faibussowitsch case 2: 997d71ae5a4SJacob Faibussowitsch MatMatMultSymbolic_RowMergeMacro(2); 998d71ae5a4SJacob Faibussowitsch break; 999d71ae5a4SJacob Faibussowitsch case 3: 1000d71ae5a4SJacob Faibussowitsch MatMatMultSymbolic_RowMergeMacro(3); 1001d71ae5a4SJacob Faibussowitsch break; 1002d71ae5a4SJacob Faibussowitsch case 4: 1003d71ae5a4SJacob Faibussowitsch MatMatMultSymbolic_RowMergeMacro(4); 1004d71ae5a4SJacob Faibussowitsch break; 1005d71ae5a4SJacob Faibussowitsch case 5: 1006d71ae5a4SJacob Faibussowitsch MatMatMultSymbolic_RowMergeMacro(5); 1007d71ae5a4SJacob Faibussowitsch break; 1008d71ae5a4SJacob Faibussowitsch case 6: 1009d71ae5a4SJacob Faibussowitsch MatMatMultSymbolic_RowMergeMacro(6); 1010d71ae5a4SJacob Faibussowitsch break; 1011d71ae5a4SJacob Faibussowitsch case 7: 1012d71ae5a4SJacob Faibussowitsch MatMatMultSymbolic_RowMergeMacro(7); 1013d71ae5a4SJacob Faibussowitsch break; 1014d71ae5a4SJacob Faibussowitsch case 8: 1015d71ae5a4SJacob Faibussowitsch MatMatMultSymbolic_RowMergeMacro(8); 1016d71ae5a4SJacob Faibussowitsch break; 1017d71ae5a4SJacob Faibussowitsch default: 1018d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L1 work array!"); 1019d7ed1a4dSandi selinger } 1020d7ed1a4dSandi selinger L2_nnz += outputi_nnz; 1021d7ed1a4dSandi selinger worki_L2[++L2_nrows] = L2_nnz; 1022d7ed1a4dSandi selinger 10237660c4dbSandi selinger /************************ L E V E L 3 **********************/ 10247660c4dbSandi selinger /* Merge from L2 work array to either C or to L2 work array */ 1025d7ed1a4dSandi selinger if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) { 1026d7ed1a4dSandi selinger inputi = worki_L2; 1027d7ed1a4dSandi selinger inputj = workj_L2; 1028d7ed1a4dSandi selinger inputcol = workcol; 1029d7ed1a4dSandi selinger outputi_nnz = 0; 1030f83700f2Sandi selinger if (rowsleft) outputj = workj_L3; 1031d7ed1a4dSandi selinger else outputj = cj + ci_nnz; 1032d7ed1a4dSandi selinger switch (L2_nrows) { 10339371c9d4SSatish Balay case 1: 10349371c9d4SSatish Balay brow_ptr[0] = inputj + inputi[inputcol[0]]; 1035d7ed1a4dSandi selinger brow_end[0] = inputj + inputi[inputcol[0] + 1]; 1036d7ed1a4dSandi selinger for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 1037d7ed1a4dSandi selinger break; 1038d71ae5a4SJacob Faibussowitsch case 2: 1039d71ae5a4SJacob Faibussowitsch MatMatMultSymbolic_RowMergeMacro(2); 1040d71ae5a4SJacob Faibussowitsch break; 1041d71ae5a4SJacob Faibussowitsch case 3: 1042d71ae5a4SJacob Faibussowitsch MatMatMultSymbolic_RowMergeMacro(3); 1043d71ae5a4SJacob Faibussowitsch break; 1044d71ae5a4SJacob Faibussowitsch case 4: 1045d71ae5a4SJacob Faibussowitsch MatMatMultSymbolic_RowMergeMacro(4); 1046d71ae5a4SJacob Faibussowitsch break; 1047d71ae5a4SJacob Faibussowitsch case 5: 1048d71ae5a4SJacob Faibussowitsch MatMatMultSymbolic_RowMergeMacro(5); 1049d71ae5a4SJacob Faibussowitsch break; 1050d71ae5a4SJacob Faibussowitsch case 6: 1051d71ae5a4SJacob Faibussowitsch MatMatMultSymbolic_RowMergeMacro(6); 1052d71ae5a4SJacob Faibussowitsch break; 1053d71ae5a4SJacob Faibussowitsch case 7: 1054d71ae5a4SJacob Faibussowitsch MatMatMultSymbolic_RowMergeMacro(7); 1055d71ae5a4SJacob Faibussowitsch break; 1056d71ae5a4SJacob Faibussowitsch case 8: 1057d71ae5a4SJacob Faibussowitsch MatMatMultSymbolic_RowMergeMacro(8); 1058d71ae5a4SJacob Faibussowitsch break; 1059d71ae5a4SJacob Faibussowitsch default: 1060d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L2 work array!"); 1061d7ed1a4dSandi selinger } 1062d7ed1a4dSandi selinger L2_nrows = 1; 10637660c4dbSandi selinger L2_nnz = outputi_nnz; 10647660c4dbSandi selinger worki_L2[1] = outputi_nnz; 10657660c4dbSandi selinger /* Copy to workj_L2 */ 1066d7ed1a4dSandi selinger if (rowsleft) { 10677660c4dbSandi selinger for (k = 0; k < outputi_nnz; ++k) workj_L2[k] = outputj[k]; 1068d7ed1a4dSandi selinger } 1069d7ed1a4dSandi selinger } 1070d7ed1a4dSandi selinger } 1071d7ed1a4dSandi selinger } /* while (rowsleft) */ 1072d7ed1a4dSandi selinger #undef MatMatMultSymbolic_RowMergeMacro 1073d7ed1a4dSandi selinger 1074d7ed1a4dSandi selinger /* terminate current row */ 1075d7ed1a4dSandi selinger ci_nnz += outputi_nnz; 1076d7ed1a4dSandi selinger ci[i + 1] = ci_nnz; 1077d7ed1a4dSandi selinger } 1078d7ed1a4dSandi selinger 1079d7ed1a4dSandi selinger /* Step 3: Create the new symbolic matrix */ 10809566063dSJacob Faibussowitsch PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 10819566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(C, A, B)); 1082d7ed1a4dSandi selinger 1083d7ed1a4dSandi selinger /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 1084d7ed1a4dSandi selinger /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 10854222ddf1SHong Zhang c = (Mat_SeqAIJ *)(C->data); 1086d7ed1a4dSandi selinger c->free_a = PETSC_TRUE; 1087d7ed1a4dSandi selinger c->free_ij = PETSC_TRUE; 1088d7ed1a4dSandi selinger c->nonew = 0; 1089d7ed1a4dSandi selinger 10904222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 1091d7ed1a4dSandi selinger 1092d7ed1a4dSandi selinger /* set MatInfo */ 1093d7ed1a4dSandi selinger afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 1094d7ed1a4dSandi selinger if (afill < 1.0) afill = 1.0; 10954222ddf1SHong Zhang C->info.mallocs = ndouble; 10964222ddf1SHong Zhang C->info.fill_ratio_given = fill; 10974222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 1098d7ed1a4dSandi selinger 1099d7ed1a4dSandi selinger #if defined(PETSC_USE_INFO) 1100d7ed1a4dSandi selinger if (ci[am]) { 11019566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 11029566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 1103d7ed1a4dSandi selinger } else { 11049566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Empty matrix product\n")); 1105d7ed1a4dSandi selinger } 1106d7ed1a4dSandi selinger #endif 1107d7ed1a4dSandi selinger 1108d7ed1a4dSandi selinger /* Step 4: Free temporary work areas */ 11099566063dSJacob Faibussowitsch PetscCall(PetscFree(workj_L1)); 11109566063dSJacob Faibussowitsch PetscCall(PetscFree(workj_L2)); 11119566063dSJacob Faibussowitsch PetscCall(PetscFree(workj_L3)); 11123ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1113d7ed1a4dSandi selinger } 1114d7ed1a4dSandi selinger 1115cd093f1dSJed Brown /* concatenate unique entries and then sort */ 1116d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, PetscReal fill, Mat C) 1117d71ae5a4SJacob Faibussowitsch { 1118cd093f1dSJed Brown Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 1119cd093f1dSJed Brown const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j; 11208c78258cSHong Zhang PetscInt *ci, *cj, bcol; 1121cd093f1dSJed Brown PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 1122cd093f1dSJed Brown PetscReal afill; 1123cd093f1dSJed Brown PetscInt i, j, ndouble = 0; 1124cd093f1dSJed Brown PetscSegBuffer seg, segrow; 1125cd093f1dSJed Brown char *seen; 1126cd093f1dSJed Brown 1127cd093f1dSJed Brown PetscFunctionBegin; 11289566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(am + 1, &ci)); 1129cd093f1dSJed Brown ci[0] = 0; 1130cd093f1dSJed Brown 1131cd093f1dSJed Brown /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 11329566063dSJacob Faibussowitsch PetscCall(PetscSegBufferCreate(sizeof(PetscInt), (PetscInt)(fill * (ai[am] + bi[bm])), &seg)); 11339566063dSJacob Faibussowitsch PetscCall(PetscSegBufferCreate(sizeof(PetscInt), 100, &segrow)); 11349566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(bn, &seen)); 1135cd093f1dSJed Brown 1136cd093f1dSJed Brown /* Determine ci and cj */ 1137cd093f1dSJed Brown for (i = 0; i < am; i++) { 1138cd093f1dSJed Brown const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 1139cd093f1dSJed Brown const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 1140cd093f1dSJed Brown PetscInt packlen = 0, *PETSC_RESTRICT crow; 11418c78258cSHong Zhang 1142cd093f1dSJed Brown /* Pack segrow */ 1143cd093f1dSJed Brown for (j = 0; j < anzi; j++) { 1144cd093f1dSJed Brown PetscInt brow = acol[j], bjstart = bi[brow], bjend = bi[brow + 1], k; 1145cd093f1dSJed Brown for (k = bjstart; k < bjend; k++) { 11468c78258cSHong Zhang bcol = bj[k]; 1147cd093f1dSJed Brown if (!seen[bcol]) { /* new entry */ 1148cd093f1dSJed Brown PetscInt *PETSC_RESTRICT slot; 11499566063dSJacob Faibussowitsch PetscCall(PetscSegBufferGetInts(segrow, 1, &slot)); 1150cd093f1dSJed Brown *slot = bcol; 1151cd093f1dSJed Brown seen[bcol] = 1; 1152cd093f1dSJed Brown packlen++; 1153cd093f1dSJed Brown } 1154cd093f1dSJed Brown } 1155cd093f1dSJed Brown } 11568c78258cSHong Zhang 11578c78258cSHong Zhang /* Check i-th diagonal entry */ 11588c78258cSHong Zhang if (C->force_diagonals && !seen[i]) { 11598c78258cSHong Zhang PetscInt *PETSC_RESTRICT slot; 11609566063dSJacob Faibussowitsch PetscCall(PetscSegBufferGetInts(segrow, 1, &slot)); 11618c78258cSHong Zhang *slot = i; 11628c78258cSHong Zhang seen[i] = 1; 11638c78258cSHong Zhang packlen++; 11648c78258cSHong Zhang } 11658c78258cSHong Zhang 11669566063dSJacob Faibussowitsch PetscCall(PetscSegBufferGetInts(seg, packlen, &crow)); 11679566063dSJacob Faibussowitsch PetscCall(PetscSegBufferExtractTo(segrow, crow)); 11689566063dSJacob Faibussowitsch PetscCall(PetscSortInt(packlen, crow)); 1169cd093f1dSJed Brown ci[i + 1] = ci[i] + packlen; 1170cd093f1dSJed Brown for (j = 0; j < packlen; j++) seen[crow[j]] = 0; 1171cd093f1dSJed Brown } 11729566063dSJacob Faibussowitsch PetscCall(PetscSegBufferDestroy(&segrow)); 11739566063dSJacob Faibussowitsch PetscCall(PetscFree(seen)); 1174cd093f1dSJed Brown 1175cd093f1dSJed Brown /* Column indices are in the segmented buffer */ 11769566063dSJacob Faibussowitsch PetscCall(PetscSegBufferExtractAlloc(seg, &cj)); 11779566063dSJacob Faibussowitsch PetscCall(PetscSegBufferDestroy(&seg)); 1178cd093f1dSJed Brown 1179cd093f1dSJed Brown /* put together the new symbolic matrix */ 11809566063dSJacob Faibussowitsch PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 11819566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(C, A, B)); 1182cd093f1dSJed Brown 1183cd093f1dSJed Brown /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 1184cd093f1dSJed Brown /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 11854222ddf1SHong Zhang c = (Mat_SeqAIJ *)(C->data); 1186cd093f1dSJed Brown c->free_a = PETSC_TRUE; 1187cd093f1dSJed Brown c->free_ij = PETSC_TRUE; 1188cd093f1dSJed Brown c->nonew = 0; 1189cd093f1dSJed Brown 11904222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 1191cd093f1dSJed Brown 1192cd093f1dSJed Brown /* set MatInfo */ 11932a09556fSStefano Zampini afill = (PetscReal)ci[am] / PetscMax(ai[am] + bi[bm], 1) + 1.e-5; 1194cd093f1dSJed Brown if (afill < 1.0) afill = 1.0; 11954222ddf1SHong Zhang C->info.mallocs = ndouble; 11964222ddf1SHong Zhang C->info.fill_ratio_given = fill; 11974222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 1198cd093f1dSJed Brown 1199cd093f1dSJed Brown #if defined(PETSC_USE_INFO) 1200cd093f1dSJed Brown if (ci[am]) { 12019566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 12029566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 1203cd093f1dSJed Brown } else { 12049566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Empty matrix product\n")); 1205cd093f1dSJed Brown } 1206cd093f1dSJed Brown #endif 12073ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1208cd093f1dSJed Brown } 1209cd093f1dSJed Brown 1210d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(void *data) 1211d71ae5a4SJacob Faibussowitsch { 12126718818eSStefano Zampini Mat_MatMatTransMult *abt = (Mat_MatMatTransMult *)data; 12132128a86cSHong Zhang 12142128a86cSHong Zhang PetscFunctionBegin; 12159566063dSJacob Faibussowitsch PetscCall(MatTransposeColoringDestroy(&abt->matcoloring)); 12169566063dSJacob Faibussowitsch PetscCall(MatDestroy(&abt->Bt_den)); 12179566063dSJacob Faibussowitsch PetscCall(MatDestroy(&abt->ABt_den)); 12189566063dSJacob Faibussowitsch PetscCall(PetscFree(abt)); 12193ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 12202128a86cSHong Zhang } 12212128a86cSHong Zhang 1222d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C) 1223d71ae5a4SJacob Faibussowitsch { 122481d82fe4SHong Zhang Mat Bt; 122540192850SHong Zhang Mat_MatMatTransMult *abt; 12264222ddf1SHong Zhang Mat_Product *product = C->product; 12276718818eSStefano Zampini char *alg; 1228d2853540SHong Zhang 122981d82fe4SHong Zhang PetscFunctionBegin; 123028b400f6SJacob Faibussowitsch PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 123128b400f6SJacob Faibussowitsch PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty"); 12326718818eSStefano Zampini 123381d82fe4SHong Zhang /* create symbolic Bt */ 12347fb60732SBarry Smith PetscCall(MatTransposeSymbolic(B, &Bt)); 12359566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizes(Bt, PetscAbs(A->cmap->bs), PetscAbs(B->cmap->bs))); 12369566063dSJacob Faibussowitsch PetscCall(MatSetType(Bt, ((PetscObject)A)->type_name)); 123781d82fe4SHong Zhang 123881d82fe4SHong Zhang /* get symbolic C=A*Bt */ 12399566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(product->alg, &alg)); 12409566063dSJacob Faibussowitsch PetscCall(MatProductSetAlgorithm(C, "sorted")); /* set algorithm for C = A*Bt */ 12419566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(A, Bt, fill, C)); 12429566063dSJacob Faibussowitsch PetscCall(MatProductSetAlgorithm(C, alg)); /* resume original algorithm for ABt product */ 12439566063dSJacob Faibussowitsch PetscCall(PetscFree(alg)); 124481d82fe4SHong Zhang 1245a5b23f4aSJose E. Roman /* create a supporting struct for reuse intermediate dense matrices with matcoloring */ 12469566063dSJacob Faibussowitsch PetscCall(PetscNew(&abt)); 12472128a86cSHong Zhang 12486718818eSStefano Zampini product->data = abt; 12496718818eSStefano Zampini product->destroy = MatDestroy_SeqAIJ_MatMatMultTrans; 12506718818eSStefano Zampini 12514222ddf1SHong Zhang C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ; 12522128a86cSHong Zhang 12534222ddf1SHong Zhang abt->usecoloring = PETSC_FALSE; 12549566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(product->alg, "color", &abt->usecoloring)); 125540192850SHong Zhang if (abt->usecoloring) { 1256b9af6bddSHong Zhang /* Create MatTransposeColoring from symbolic C=A*B^T */ 1257b9af6bddSHong Zhang MatTransposeColoring matcoloring; 1258335efc43SPeter Brune MatColoring coloring; 12592128a86cSHong Zhang ISColoring iscoloring; 12602128a86cSHong Zhang Mat Bt_dense, C_dense; 1261e8354b3eSHong Zhang 12624222ddf1SHong Zhang /* inode causes memory problem */ 12639566063dSJacob Faibussowitsch PetscCall(MatSetOption(C, MAT_USE_INODES, PETSC_FALSE)); 12644222ddf1SHong Zhang 12659566063dSJacob Faibussowitsch PetscCall(MatColoringCreate(C, &coloring)); 12669566063dSJacob Faibussowitsch PetscCall(MatColoringSetDistance(coloring, 2)); 12679566063dSJacob Faibussowitsch PetscCall(MatColoringSetType(coloring, MATCOLORINGSL)); 12689566063dSJacob Faibussowitsch PetscCall(MatColoringSetFromOptions(coloring)); 12699566063dSJacob Faibussowitsch PetscCall(MatColoringApply(coloring, &iscoloring)); 12709566063dSJacob Faibussowitsch PetscCall(MatColoringDestroy(&coloring)); 12719566063dSJacob Faibussowitsch PetscCall(MatTransposeColoringCreate(C, iscoloring, &matcoloring)); 12722205254eSKarl Rupp 127340192850SHong Zhang abt->matcoloring = matcoloring; 12742205254eSKarl Rupp 12759566063dSJacob Faibussowitsch PetscCall(ISColoringDestroy(&iscoloring)); 12762128a86cSHong Zhang 12772128a86cSHong Zhang /* Create Bt_dense and C_dense = A*Bt_dense */ 12789566063dSJacob Faibussowitsch PetscCall(MatCreate(PETSC_COMM_SELF, &Bt_dense)); 12799566063dSJacob Faibussowitsch PetscCall(MatSetSizes(Bt_dense, A->cmap->n, matcoloring->ncolors, A->cmap->n, matcoloring->ncolors)); 12809566063dSJacob Faibussowitsch PetscCall(MatSetType(Bt_dense, MATSEQDENSE)); 12819566063dSJacob Faibussowitsch PetscCall(MatSeqDenseSetPreallocation(Bt_dense, NULL)); 12822205254eSKarl Rupp 12832128a86cSHong Zhang Bt_dense->assembled = PETSC_TRUE; 128440192850SHong Zhang abt->Bt_den = Bt_dense; 12852128a86cSHong Zhang 12869566063dSJacob Faibussowitsch PetscCall(MatCreate(PETSC_COMM_SELF, &C_dense)); 12879566063dSJacob Faibussowitsch PetscCall(MatSetSizes(C_dense, A->rmap->n, matcoloring->ncolors, A->rmap->n, matcoloring->ncolors)); 12889566063dSJacob Faibussowitsch PetscCall(MatSetType(C_dense, MATSEQDENSE)); 12899566063dSJacob Faibussowitsch PetscCall(MatSeqDenseSetPreallocation(C_dense, NULL)); 12902205254eSKarl Rupp 12912128a86cSHong Zhang Bt_dense->assembled = PETSC_TRUE; 129240192850SHong Zhang abt->ABt_den = C_dense; 1293f94ccd6cSHong Zhang 1294f94ccd6cSHong Zhang #if defined(PETSC_USE_INFO) 12951ce71dffSSatish Balay { 12964222ddf1SHong Zhang Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 12979371c9d4SSatish Balay PetscCall(PetscInfo(C, "Use coloring of C=A*B^T; B^T: %" PetscInt_FMT " %" PetscInt_FMT ", Bt_dense: %" PetscInt_FMT ",%" PetscInt_FMT "; Cnz %" PetscInt_FMT " / (cm*ncolors %" PetscInt_FMT ") = %g\n", B->cmap->n, B->rmap->n, Bt_dense->rmap->n, 12989371c9d4SSatish Balay Bt_dense->cmap->n, c->nz, A->rmap->n * matcoloring->ncolors, (double)(((PetscReal)(c->nz)) / ((PetscReal)(A->rmap->n * matcoloring->ncolors))))); 12991ce71dffSSatish Balay } 1300f94ccd6cSHong Zhang #endif 13012128a86cSHong Zhang } 1302e99be685SHong Zhang /* clean up */ 13039566063dSJacob Faibussowitsch PetscCall(MatDestroy(&Bt)); 13043ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 13055df89d91SHong Zhang } 13065df89d91SHong Zhang 1307d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C) 1308d71ae5a4SJacob Faibussowitsch { 13095df89d91SHong Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data; 1310e2cac8caSJed Brown PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, anzi, bnzj, nexta, nextb, *acol, *bcol, brow; 131181d82fe4SHong Zhang PetscInt cm = C->rmap->n, *ci = c->i, *cj = c->j, i, j, cnzi, *ccol; 13125df89d91SHong Zhang PetscLogDouble flops = 0.0; 1313aa1aec99SHong Zhang MatScalar *aa = a->a, *aval, *ba = b->a, *bval, *ca, *cval; 13146718818eSStefano Zampini Mat_MatMatTransMult *abt; 13156718818eSStefano Zampini Mat_Product *product = C->product; 13165df89d91SHong Zhang 13175df89d91SHong Zhang PetscFunctionBegin; 131828b400f6SJacob Faibussowitsch PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 13196718818eSStefano Zampini abt = (Mat_MatMatTransMult *)product->data; 132028b400f6SJacob Faibussowitsch PetscCheck(abt, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 132158ed3ceaSHong Zhang /* clear old values in C */ 132258ed3ceaSHong Zhang if (!c->a) { 13239566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(ci[cm] + 1, &ca)); 132458ed3ceaSHong Zhang c->a = ca; 132558ed3ceaSHong Zhang c->free_a = PETSC_TRUE; 132658ed3ceaSHong Zhang } else { 132758ed3ceaSHong Zhang ca = c->a; 13289566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(ca, ci[cm] + 1)); 132958ed3ceaSHong Zhang } 133058ed3ceaSHong Zhang 133140192850SHong Zhang if (abt->usecoloring) { 133240192850SHong Zhang MatTransposeColoring matcoloring = abt->matcoloring; 133340192850SHong Zhang Mat Bt_dense, C_dense = abt->ABt_den; 1334c8db22f6SHong Zhang 1335b9af6bddSHong Zhang /* Get Bt_dense by Apply MatTransposeColoring to B */ 133640192850SHong Zhang Bt_dense = abt->Bt_den; 13379566063dSJacob Faibussowitsch PetscCall(MatTransColoringApplySpToDen(matcoloring, B, Bt_dense)); 1338c8db22f6SHong Zhang 1339c8db22f6SHong Zhang /* C_dense = A*Bt_dense */ 13409566063dSJacob Faibussowitsch PetscCall(MatMatMultNumeric_SeqAIJ_SeqDense(A, Bt_dense, C_dense)); 1341c8db22f6SHong Zhang 13422128a86cSHong Zhang /* Recover C from C_dense */ 13439566063dSJacob Faibussowitsch PetscCall(MatTransColoringApplyDenToSp(matcoloring, C_dense, C)); 13443ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1345c8db22f6SHong Zhang } 1346c8db22f6SHong Zhang 13471fa1209cSHong Zhang for (i = 0; i < cm; i++) { 134881d82fe4SHong Zhang anzi = ai[i + 1] - ai[i]; 134981d82fe4SHong Zhang acol = aj + ai[i]; 13506973769bSHong Zhang aval = aa + ai[i]; 13511fa1209cSHong Zhang cnzi = ci[i + 1] - ci[i]; 13521fa1209cSHong Zhang ccol = cj + ci[i]; 13536973769bSHong Zhang cval = ca + ci[i]; 13541fa1209cSHong Zhang for (j = 0; j < cnzi; j++) { 135581d82fe4SHong Zhang brow = ccol[j]; 135681d82fe4SHong Zhang bnzj = bi[brow + 1] - bi[brow]; 135781d82fe4SHong Zhang bcol = bj + bi[brow]; 13586973769bSHong Zhang bval = ba + bi[brow]; 13596973769bSHong Zhang 136081d82fe4SHong Zhang /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */ 13619371c9d4SSatish Balay nexta = 0; 13629371c9d4SSatish Balay nextb = 0; 136381d82fe4SHong Zhang while (nexta < anzi && nextb < bnzj) { 13647b6d5e96SMark Adams while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++; 136581d82fe4SHong Zhang if (nexta == anzi) break; 13667b6d5e96SMark Adams while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++; 136781d82fe4SHong Zhang if (nextb == bnzj) break; 136881d82fe4SHong Zhang if (acol[nexta] == bcol[nextb]) { 13696973769bSHong Zhang cval[j] += aval[nexta] * bval[nextb]; 13709371c9d4SSatish Balay nexta++; 13719371c9d4SSatish Balay nextb++; 137281d82fe4SHong Zhang flops += 2; 13731fa1209cSHong Zhang } 13741fa1209cSHong Zhang } 137581d82fe4SHong Zhang } 137681d82fe4SHong Zhang } 13779566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 13789566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 13799566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(flops)); 13803ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 13815df89d91SHong Zhang } 13825df89d91SHong Zhang 1383d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(void *data) 1384d71ae5a4SJacob Faibussowitsch { 13856718818eSStefano Zampini Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)data; 13866d373c3eSHong Zhang 13876d373c3eSHong Zhang PetscFunctionBegin; 13889566063dSJacob Faibussowitsch PetscCall(MatDestroy(&atb->At)); 13891baa6e33SBarry Smith if (atb->destroy) PetscCall((*atb->destroy)(atb->data)); 13909566063dSJacob Faibussowitsch PetscCall(PetscFree(atb)); 13913ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 13926d373c3eSHong Zhang } 13936d373c3eSHong Zhang 1394d71ae5a4SJacob Faibussowitsch PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C) 1395d71ae5a4SJacob Faibussowitsch { 1396089a957eSStefano Zampini Mat At = NULL; 13974222ddf1SHong Zhang Mat_Product *product = C->product; 1398089a957eSStefano Zampini PetscBool flg, def, square; 1399bc011b1eSHong Zhang 1400bc011b1eSHong Zhang PetscFunctionBegin; 1401089a957eSStefano Zampini MatCheckProduct(C, 4); 1402b94d7dedSBarry Smith square = (PetscBool)(A == B && A->symmetric == PETSC_BOOL3_TRUE); 14034222ddf1SHong Zhang /* outerproduct */ 14049566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(product->alg, "outerproduct", &flg)); 14054222ddf1SHong Zhang if (flg) { 1406bc011b1eSHong Zhang /* create symbolic At */ 1407089a957eSStefano Zampini if (!square) { 14087fb60732SBarry Smith PetscCall(MatTransposeSymbolic(A, &At)); 14099566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizes(At, PetscAbs(A->cmap->bs), PetscAbs(B->cmap->bs))); 14109566063dSJacob Faibussowitsch PetscCall(MatSetType(At, ((PetscObject)A)->type_name)); 1411089a957eSStefano Zampini } 1412bc011b1eSHong Zhang /* get symbolic C=At*B */ 14139566063dSJacob Faibussowitsch PetscCall(MatProductSetAlgorithm(C, "sorted")); 14149566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C)); 1415bc011b1eSHong Zhang 1416bc011b1eSHong Zhang /* clean up */ 141748a46eb9SPierre Jolivet if (!square) PetscCall(MatDestroy(&At)); 14186d373c3eSHong Zhang 14194222ddf1SHong Zhang C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */ 14209566063dSJacob Faibussowitsch PetscCall(MatProductSetAlgorithm(C, "outerproduct")); 14213ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 14224222ddf1SHong Zhang } 14234222ddf1SHong Zhang 14244222ddf1SHong Zhang /* matmatmult */ 14259566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(product->alg, "default", &def)); 14269566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(product->alg, "at*b", &flg)); 1427089a957eSStefano Zampini if (flg || def) { 14284222ddf1SHong Zhang Mat_MatTransMatMult *atb; 14294222ddf1SHong Zhang 143028b400f6SJacob Faibussowitsch PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty"); 14319566063dSJacob Faibussowitsch PetscCall(PetscNew(&atb)); 143248a46eb9SPierre Jolivet if (!square) PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &At)); 14339566063dSJacob Faibussowitsch PetscCall(MatProductSetAlgorithm(C, "sorted")); 14349566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C)); 14359566063dSJacob Faibussowitsch PetscCall(MatProductSetAlgorithm(C, "at*b")); 14366718818eSStefano Zampini product->data = atb; 14376718818eSStefano Zampini product->destroy = MatDestroy_SeqAIJ_MatTransMatMult; 14384222ddf1SHong Zhang atb->At = At; 14394222ddf1SHong Zhang 14404222ddf1SHong Zhang C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */ 14413ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 14424222ddf1SHong Zhang } 14434222ddf1SHong Zhang 14444222ddf1SHong Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported"); 1445bc011b1eSHong Zhang } 1446bc011b1eSHong Zhang 1447d71ae5a4SJacob Faibussowitsch PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C) 1448d71ae5a4SJacob Faibussowitsch { 14490fbc74f4SHong Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data; 1450d0f46423SBarry Smith PetscInt am = A->rmap->n, anzi, *ai = a->i, *aj = a->j, *bi = b->i, *bj, bnzi, nextb; 1451d0f46423SBarry Smith PetscInt cm = C->rmap->n, *ci = c->i, *cj = c->j, crow, *cjj, i, j, k; 1452d13dce4bSSatish Balay PetscLogDouble flops = 0.0; 1453aa1aec99SHong Zhang MatScalar *aa = a->a, *ba, *ca, *caj; 1454bc011b1eSHong Zhang 1455bc011b1eSHong Zhang PetscFunctionBegin; 1456aa1aec99SHong Zhang if (!c->a) { 14579566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(ci[cm] + 1, &ca)); 14582205254eSKarl Rupp 1459aa1aec99SHong Zhang c->a = ca; 1460aa1aec99SHong Zhang c->free_a = PETSC_TRUE; 1461aa1aec99SHong Zhang } else { 1462aa1aec99SHong Zhang ca = c->a; 14639566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(ca, ci[cm])); 1464aa1aec99SHong Zhang } 1465bc011b1eSHong Zhang 1466bc011b1eSHong Zhang /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */ 1467bc011b1eSHong Zhang for (i = 0; i < am; i++) { 1468bc011b1eSHong Zhang bj = b->j + bi[i]; 1469bc011b1eSHong Zhang ba = b->a + bi[i]; 1470bc011b1eSHong Zhang bnzi = bi[i + 1] - bi[i]; 1471bc011b1eSHong Zhang anzi = ai[i + 1] - ai[i]; 1472bc011b1eSHong Zhang for (j = 0; j < anzi; j++) { 1473bc011b1eSHong Zhang nextb = 0; 14740fbc74f4SHong Zhang crow = *aj++; 1475bc011b1eSHong Zhang cjj = cj + ci[crow]; 1476bc011b1eSHong Zhang caj = ca + ci[crow]; 1477bc011b1eSHong Zhang /* perform sparse axpy operation. Note cjj includes bj. */ 1478bc011b1eSHong Zhang for (k = 0; nextb < bnzi; k++) { 14790fbc74f4SHong Zhang if (cjj[k] == *(bj + nextb)) { /* ccol == bcol */ 14800fbc74f4SHong Zhang caj[k] += (*aa) * (*(ba + nextb)); 1481bc011b1eSHong Zhang nextb++; 1482bc011b1eSHong Zhang } 1483bc011b1eSHong Zhang } 1484bc011b1eSHong Zhang flops += 2 * bnzi; 14850fbc74f4SHong Zhang aa++; 1486bc011b1eSHong Zhang } 1487bc011b1eSHong Zhang } 1488bc011b1eSHong Zhang 1489bc011b1eSHong Zhang /* Assemble the final matrix and clean up */ 14909566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 14919566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 14929566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(flops)); 14933ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1494bc011b1eSHong Zhang } 14959123193aSHong Zhang 1496d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) 1497d71ae5a4SJacob Faibussowitsch { 14989123193aSHong Zhang PetscFunctionBegin; 14999566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C)); 15004222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense; 15013ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 15029123193aSHong Zhang } 15039123193aSHong Zhang 1504d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A, Mat B, Mat C, const PetscBool add) 1505d71ae5a4SJacob Faibussowitsch { 1506f32d5d43SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 15071ca9667aSStefano Zampini PetscScalar *c, r1, r2, r3, r4, *c1, *c2, *c3, *c4; 1508a4af7ca8SStefano Zampini const PetscScalar *aa, *b, *b1, *b2, *b3, *b4, *av; 1509daf57211SHong Zhang const PetscInt *aj; 151075f6d85dSStefano Zampini PetscInt cm = C->rmap->n, cn = B->cmap->n, bm, am = A->rmap->n; 151175f6d85dSStefano Zampini PetscInt clda; 151275f6d85dSStefano Zampini PetscInt am4, bm4, col, i, j, n; 15139123193aSHong Zhang 15149123193aSHong Zhang PetscFunctionBegin; 15153ba16761SJacob Faibussowitsch if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS); 15169566063dSJacob Faibussowitsch PetscCall(MatSeqAIJGetArrayRead(A, &av)); 151793aa15f2SStefano Zampini if (add) { 15189566063dSJacob Faibussowitsch PetscCall(MatDenseGetArray(C, &c)); 151993aa15f2SStefano Zampini } else { 15209566063dSJacob Faibussowitsch PetscCall(MatDenseGetArrayWrite(C, &c)); 152193aa15f2SStefano Zampini } 15229566063dSJacob Faibussowitsch PetscCall(MatDenseGetArrayRead(B, &b)); 15239566063dSJacob Faibussowitsch PetscCall(MatDenseGetLDA(B, &bm)); 15249566063dSJacob Faibussowitsch PetscCall(MatDenseGetLDA(C, &clda)); 152575f6d85dSStefano Zampini am4 = 4 * clda; 152675f6d85dSStefano Zampini bm4 = 4 * bm; 15279371c9d4SSatish Balay b1 = b; 15289371c9d4SSatish Balay b2 = b1 + bm; 15299371c9d4SSatish Balay b3 = b2 + bm; 15309371c9d4SSatish Balay b4 = b3 + bm; 15319371c9d4SSatish Balay c1 = c; 15329371c9d4SSatish Balay c2 = c1 + clda; 15339371c9d4SSatish Balay c3 = c2 + clda; 15349371c9d4SSatish Balay c4 = c3 + clda; 15351ca9667aSStefano Zampini for (col = 0; col < (cn / 4) * 4; col += 4) { /* over columns of C */ 15361ca9667aSStefano Zampini for (i = 0; i < am; i++) { /* over rows of A in those columns */ 1537f32d5d43SBarry Smith r1 = r2 = r3 = r4 = 0.0; 1538f32d5d43SBarry Smith n = a->i[i + 1] - a->i[i]; 1539f32d5d43SBarry Smith aj = a->j + a->i[i]; 1540a4af7ca8SStefano Zampini aa = av + a->i[i]; 1541f32d5d43SBarry Smith for (j = 0; j < n; j++) { 15421ca9667aSStefano Zampini const PetscScalar aatmp = aa[j]; 15431ca9667aSStefano Zampini const PetscInt ajtmp = aj[j]; 1544730858b9SHong Zhang r1 += aatmp * b1[ajtmp]; 1545730858b9SHong Zhang r2 += aatmp * b2[ajtmp]; 1546730858b9SHong Zhang r3 += aatmp * b3[ajtmp]; 1547730858b9SHong Zhang r4 += aatmp * b4[ajtmp]; 15489123193aSHong Zhang } 154993aa15f2SStefano Zampini if (add) { 155087753ddeSHong Zhang c1[i] += r1; 155187753ddeSHong Zhang c2[i] += r2; 155287753ddeSHong Zhang c3[i] += r3; 155387753ddeSHong Zhang c4[i] += r4; 155493aa15f2SStefano Zampini } else { 155593aa15f2SStefano Zampini c1[i] = r1; 155693aa15f2SStefano Zampini c2[i] = r2; 155793aa15f2SStefano Zampini c3[i] = r3; 155893aa15f2SStefano Zampini c4[i] = r4; 155993aa15f2SStefano Zampini } 1560f32d5d43SBarry Smith } 15619371c9d4SSatish Balay b1 += bm4; 15629371c9d4SSatish Balay b2 += bm4; 15639371c9d4SSatish Balay b3 += bm4; 15649371c9d4SSatish Balay b4 += bm4; 15659371c9d4SSatish Balay c1 += am4; 15669371c9d4SSatish Balay c2 += am4; 15679371c9d4SSatish Balay c3 += am4; 15689371c9d4SSatish Balay c4 += am4; 1569f32d5d43SBarry Smith } 157093aa15f2SStefano Zampini /* process remaining columns */ 157193aa15f2SStefano Zampini if (col != cn) { 157293aa15f2SStefano Zampini PetscInt rc = cn - col; 157393aa15f2SStefano Zampini 157493aa15f2SStefano Zampini if (rc == 1) { 157593aa15f2SStefano Zampini for (i = 0; i < am; i++) { 1576f32d5d43SBarry Smith r1 = 0.0; 1577f32d5d43SBarry Smith n = a->i[i + 1] - a->i[i]; 1578f32d5d43SBarry Smith aj = a->j + a->i[i]; 1579a4af7ca8SStefano Zampini aa = av + a->i[i]; 158093aa15f2SStefano Zampini for (j = 0; j < n; j++) r1 += aa[j] * b1[aj[j]]; 158193aa15f2SStefano Zampini if (add) c1[i] += r1; 158293aa15f2SStefano Zampini else c1[i] = r1; 158393aa15f2SStefano Zampini } 158493aa15f2SStefano Zampini } else if (rc == 2) { 158593aa15f2SStefano Zampini for (i = 0; i < am; i++) { 158693aa15f2SStefano Zampini r1 = r2 = 0.0; 158793aa15f2SStefano Zampini n = a->i[i + 1] - a->i[i]; 158893aa15f2SStefano Zampini aj = a->j + a->i[i]; 158993aa15f2SStefano Zampini aa = av + a->i[i]; 1590f32d5d43SBarry Smith for (j = 0; j < n; j++) { 159193aa15f2SStefano Zampini const PetscScalar aatmp = aa[j]; 159293aa15f2SStefano Zampini const PetscInt ajtmp = aj[j]; 159393aa15f2SStefano Zampini r1 += aatmp * b1[ajtmp]; 159493aa15f2SStefano Zampini r2 += aatmp * b2[ajtmp]; 1595f32d5d43SBarry Smith } 159693aa15f2SStefano Zampini if (add) { 159787753ddeSHong Zhang c1[i] += r1; 159893aa15f2SStefano Zampini c2[i] += r2; 159993aa15f2SStefano Zampini } else { 160093aa15f2SStefano Zampini c1[i] = r1; 160193aa15f2SStefano Zampini c2[i] = r2; 1602f32d5d43SBarry Smith } 160393aa15f2SStefano Zampini } 160493aa15f2SStefano Zampini } else { 160593aa15f2SStefano Zampini for (i = 0; i < am; i++) { 160693aa15f2SStefano Zampini r1 = r2 = r3 = 0.0; 160793aa15f2SStefano Zampini n = a->i[i + 1] - a->i[i]; 160893aa15f2SStefano Zampini aj = a->j + a->i[i]; 160993aa15f2SStefano Zampini aa = av + a->i[i]; 161093aa15f2SStefano Zampini for (j = 0; j < n; j++) { 161193aa15f2SStefano Zampini const PetscScalar aatmp = aa[j]; 161293aa15f2SStefano Zampini const PetscInt ajtmp = aj[j]; 161393aa15f2SStefano Zampini r1 += aatmp * b1[ajtmp]; 161493aa15f2SStefano Zampini r2 += aatmp * b2[ajtmp]; 161593aa15f2SStefano Zampini r3 += aatmp * b3[ajtmp]; 161693aa15f2SStefano Zampini } 161793aa15f2SStefano Zampini if (add) { 161893aa15f2SStefano Zampini c1[i] += r1; 161993aa15f2SStefano Zampini c2[i] += r2; 162093aa15f2SStefano Zampini c3[i] += r3; 162193aa15f2SStefano Zampini } else { 162293aa15f2SStefano Zampini c1[i] = r1; 162393aa15f2SStefano Zampini c2[i] = r2; 162493aa15f2SStefano Zampini c3[i] = r3; 162593aa15f2SStefano Zampini } 162693aa15f2SStefano Zampini } 162793aa15f2SStefano Zampini } 1628f32d5d43SBarry Smith } 16299566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(cn * (2.0 * a->nz))); 163093aa15f2SStefano Zampini if (add) { 16319566063dSJacob Faibussowitsch PetscCall(MatDenseRestoreArray(C, &c)); 163293aa15f2SStefano Zampini } else { 16339566063dSJacob Faibussowitsch PetscCall(MatDenseRestoreArrayWrite(C, &c)); 163493aa15f2SStefano Zampini } 16359566063dSJacob Faibussowitsch PetscCall(MatDenseRestoreArrayRead(B, &b)); 16369566063dSJacob Faibussowitsch PetscCall(MatSeqAIJRestoreArrayRead(A, &av)); 16373ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1638f32d5d43SBarry Smith } 1639f32d5d43SBarry Smith 1640d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A, Mat B, Mat C) 1641d71ae5a4SJacob Faibussowitsch { 1642f32d5d43SBarry Smith PetscFunctionBegin; 164308401ef6SPierre Jolivet PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n); 164408401ef6SPierre Jolivet PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n); 164508401ef6SPierre Jolivet PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n); 16464324174eSBarry Smith 16479566063dSJacob Faibussowitsch PetscCall(MatMatMultNumericAdd_SeqAIJ_SeqDense(A, B, C, PETSC_FALSE)); 16483ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 16499123193aSHong Zhang } 1650b1683b59SHong Zhang 1651d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C) 1652d71ae5a4SJacob Faibussowitsch { 16534222ddf1SHong Zhang PetscFunctionBegin; 16544222ddf1SHong Zhang C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense; 16554222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_AB; 16563ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 16574222ddf1SHong Zhang } 16584222ddf1SHong Zhang 16596718818eSStefano Zampini PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat, Mat, PetscReal, Mat); 16606718818eSStefano Zampini 1661d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C) 1662d71ae5a4SJacob Faibussowitsch { 16634222ddf1SHong Zhang PetscFunctionBegin; 16646718818eSStefano Zampini C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense; 16654222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_AtB; 16663ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 16676718818eSStefano Zampini } 16686718818eSStefano Zampini 1669d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C) 1670d71ae5a4SJacob Faibussowitsch { 16716718818eSStefano Zampini PetscFunctionBegin; 16726718818eSStefano Zampini C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense; 16736718818eSStefano Zampini C->ops->productsymbolic = MatProductSymbolic_ABt; 16743ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 16754222ddf1SHong Zhang } 16764222ddf1SHong Zhang 1677d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C) 1678d71ae5a4SJacob Faibussowitsch { 16794222ddf1SHong Zhang Mat_Product *product = C->product; 16804222ddf1SHong Zhang 16814222ddf1SHong Zhang PetscFunctionBegin; 16824222ddf1SHong Zhang switch (product->type) { 1683d71ae5a4SJacob Faibussowitsch case MATPRODUCT_AB: 1684d71ae5a4SJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C)); 1685d71ae5a4SJacob Faibussowitsch break; 1686d71ae5a4SJacob Faibussowitsch case MATPRODUCT_AtB: 1687d71ae5a4SJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C)); 1688d71ae5a4SJacob Faibussowitsch break; 1689d71ae5a4SJacob Faibussowitsch case MATPRODUCT_ABt: 1690d71ae5a4SJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C)); 1691d71ae5a4SJacob Faibussowitsch break; 1692d71ae5a4SJacob Faibussowitsch default: 1693d71ae5a4SJacob Faibussowitsch break; 16944222ddf1SHong Zhang } 16953ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 16964222ddf1SHong Zhang } 1697*2ef1f0ffSBarry Smith 1698d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C) 1699d71ae5a4SJacob Faibussowitsch { 17004222ddf1SHong Zhang Mat_Product *product = C->product; 17014222ddf1SHong Zhang Mat A = product->A; 17024222ddf1SHong Zhang PetscBool baij; 17034222ddf1SHong Zhang 17044222ddf1SHong Zhang PetscFunctionBegin; 17059566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQBAIJ, &baij)); 17064222ddf1SHong Zhang if (!baij) { /* A is seqsbaij */ 17074222ddf1SHong Zhang PetscBool sbaij; 17089566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQSBAIJ, &sbaij)); 170928b400f6SJacob Faibussowitsch PetscCheck(sbaij, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "Mat must be either seqbaij or seqsbaij format"); 17104222ddf1SHong Zhang 17114222ddf1SHong Zhang C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense; 17124222ddf1SHong Zhang } else { /* A is seqbaij */ 17134222ddf1SHong Zhang C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense; 17144222ddf1SHong Zhang } 17154222ddf1SHong Zhang 17164222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_AB; 17173ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 17184222ddf1SHong Zhang } 17194222ddf1SHong Zhang 1720d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C) 1721d71ae5a4SJacob Faibussowitsch { 17224222ddf1SHong Zhang Mat_Product *product = C->product; 17234222ddf1SHong Zhang 17244222ddf1SHong Zhang PetscFunctionBegin; 17256718818eSStefano Zampini MatCheckProduct(C, 1); 172628b400f6SJacob Faibussowitsch PetscCheck(product->A, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing A"); 1727b94d7dedSBarry Smith if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric == PETSC_BOOL3_TRUE)) PetscCall(MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C)); 17283ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 17294222ddf1SHong Zhang } 17306718818eSStefano Zampini 1731d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C) 1732d71ae5a4SJacob Faibussowitsch { 17334222ddf1SHong Zhang PetscFunctionBegin; 17344222ddf1SHong Zhang C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ; 17354222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_AB; 17363ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 17374222ddf1SHong Zhang } 17384222ddf1SHong Zhang 1739d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C) 1740d71ae5a4SJacob Faibussowitsch { 17414222ddf1SHong Zhang Mat_Product *product = C->product; 17424222ddf1SHong Zhang 17434222ddf1SHong Zhang PetscFunctionBegin; 174448a46eb9SPierre Jolivet if (product->type == MATPRODUCT_AB) PetscCall(MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C)); 17453ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 17464222ddf1SHong Zhang } 17474222ddf1SHong Zhang 1748d71ae5a4SJacob Faibussowitsch PetscErrorCode MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring, Mat B, Mat Btdense) 1749d71ae5a4SJacob Faibussowitsch { 17502f5f1f90SHong Zhang Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 17512f5f1f90SHong Zhang Mat_SeqDense *btdense = (Mat_SeqDense *)Btdense->data; 17522f5f1f90SHong Zhang PetscInt *bi = b->i, *bj = b->j; 17532f5f1f90SHong Zhang PetscInt m = Btdense->rmap->n, n = Btdense->cmap->n, j, k, l, col, anz, *btcol, brow, ncolumns; 17542f5f1f90SHong Zhang MatScalar *btval, *btval_den, *ba = b->a; 17552f5f1f90SHong Zhang PetscInt *columns = coloring->columns, *colorforcol = coloring->colorforcol, ncolors = coloring->ncolors; 1756c8db22f6SHong Zhang 1757c8db22f6SHong Zhang PetscFunctionBegin; 17582f5f1f90SHong Zhang btval_den = btdense->v; 17599566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(btval_den, m * n)); 17602f5f1f90SHong Zhang for (k = 0; k < ncolors; k++) { 17612f5f1f90SHong Zhang ncolumns = coloring->ncolumns[k]; 17622f5f1f90SHong Zhang for (l = 0; l < ncolumns; l++) { /* insert a row of B to a column of Btdense */ 1763d2853540SHong Zhang col = *(columns + colorforcol[k] + l); 17642f5f1f90SHong Zhang btcol = bj + bi[col]; 17652f5f1f90SHong Zhang btval = ba + bi[col]; 17662f5f1f90SHong Zhang anz = bi[col + 1] - bi[col]; 1767d2853540SHong Zhang for (j = 0; j < anz; j++) { 17682f5f1f90SHong Zhang brow = btcol[j]; 17692f5f1f90SHong Zhang btval_den[brow] = btval[j]; 1770c8db22f6SHong Zhang } 1771c8db22f6SHong Zhang } 17722f5f1f90SHong Zhang btval_den += m; 1773c8db22f6SHong Zhang } 17743ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1775c8db22f6SHong Zhang } 1776c8db22f6SHong Zhang 1777d71ae5a4SJacob Faibussowitsch PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring, Mat Cden, Mat Csp) 1778d71ae5a4SJacob Faibussowitsch { 1779b2d2b04fSHong Zhang Mat_SeqAIJ *csp = (Mat_SeqAIJ *)Csp->data; 17801683a169SBarry Smith const PetscScalar *ca_den, *ca_den_ptr; 17811683a169SBarry Smith PetscScalar *ca = csp->a; 1782f99a636bSHong Zhang PetscInt k, l, m = Cden->rmap->n, ncolors = matcoloring->ncolors; 1783e88777f2SHong Zhang PetscInt brows = matcoloring->brows, *den2sp = matcoloring->den2sp; 1784077f23c2SHong Zhang PetscInt nrows, *row, *idx; 1785077f23c2SHong Zhang PetscInt *rows = matcoloring->rows, *colorforrow = matcoloring->colorforrow; 17868972f759SHong Zhang 17878972f759SHong Zhang PetscFunctionBegin; 17889566063dSJacob Faibussowitsch PetscCall(MatDenseGetArrayRead(Cden, &ca_den)); 1789f99a636bSHong Zhang 1790077f23c2SHong Zhang if (brows > 0) { 1791077f23c2SHong Zhang PetscInt *lstart, row_end, row_start; 1792980ae229SHong Zhang lstart = matcoloring->lstart; 17939566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(lstart, ncolors)); 1794eeb4fd02SHong Zhang 1795077f23c2SHong Zhang row_end = brows; 1796eeb4fd02SHong Zhang if (row_end > m) row_end = m; 1797077f23c2SHong Zhang for (row_start = 0; row_start < m; row_start += brows) { /* loop over row blocks of Csp */ 1798077f23c2SHong Zhang ca_den_ptr = ca_den; 1799980ae229SHong Zhang for (k = 0; k < ncolors; k++) { /* loop over colors (columns of Cden) */ 1800eeb4fd02SHong Zhang nrows = matcoloring->nrows[k]; 1801eeb4fd02SHong Zhang row = rows + colorforrow[k]; 1802eeb4fd02SHong Zhang idx = den2sp + colorforrow[k]; 1803eeb4fd02SHong Zhang for (l = lstart[k]; l < nrows; l++) { 1804eeb4fd02SHong Zhang if (row[l] >= row_end) { 1805eeb4fd02SHong Zhang lstart[k] = l; 1806eeb4fd02SHong Zhang break; 1807eeb4fd02SHong Zhang } else { 1808077f23c2SHong Zhang ca[idx[l]] = ca_den_ptr[row[l]]; 1809eeb4fd02SHong Zhang } 1810eeb4fd02SHong Zhang } 1811077f23c2SHong Zhang ca_den_ptr += m; 1812eeb4fd02SHong Zhang } 1813077f23c2SHong Zhang row_end += brows; 1814eeb4fd02SHong Zhang if (row_end > m) row_end = m; 1815eeb4fd02SHong Zhang } 1816077f23c2SHong Zhang } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */ 1817077f23c2SHong Zhang ca_den_ptr = ca_den; 1818077f23c2SHong Zhang for (k = 0; k < ncolors; k++) { 1819077f23c2SHong Zhang nrows = matcoloring->nrows[k]; 1820077f23c2SHong Zhang row = rows + colorforrow[k]; 1821077f23c2SHong Zhang idx = den2sp + colorforrow[k]; 1822ad540459SPierre Jolivet for (l = 0; l < nrows; l++) ca[idx[l]] = ca_den_ptr[row[l]]; 1823077f23c2SHong Zhang ca_den_ptr += m; 1824077f23c2SHong Zhang } 1825f99a636bSHong Zhang } 1826f99a636bSHong Zhang 18279566063dSJacob Faibussowitsch PetscCall(MatDenseRestoreArrayRead(Cden, &ca_den)); 1828f99a636bSHong Zhang #if defined(PETSC_USE_INFO) 1829077f23c2SHong Zhang if (matcoloring->brows > 0) { 18309566063dSJacob Faibussowitsch PetscCall(PetscInfo(Csp, "Loop over %" PetscInt_FMT " row blocks for den2sp\n", brows)); 1831e88777f2SHong Zhang } else { 18329566063dSJacob Faibussowitsch PetscCall(PetscInfo(Csp, "Loop over colors/columns of Cden, inefficient for large sparse matrix product \n")); 1833e88777f2SHong Zhang } 1834f99a636bSHong Zhang #endif 18353ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 18368972f759SHong Zhang } 18378972f759SHong Zhang 1838d71ae5a4SJacob Faibussowitsch PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat, ISColoring iscoloring, MatTransposeColoring c) 1839d71ae5a4SJacob Faibussowitsch { 1840e88777f2SHong Zhang PetscInt i, n, nrows, Nbs, j, k, m, ncols, col, cm; 18411a83f524SJed Brown const PetscInt *is, *ci, *cj, *row_idx; 1842b28d80bdSHong Zhang PetscInt nis = iscoloring->n, *rowhit, bs = 1; 1843b1683b59SHong Zhang IS *isa; 1844b28d80bdSHong Zhang Mat_SeqAIJ *csp = (Mat_SeqAIJ *)mat->data; 1845e88777f2SHong Zhang PetscInt *colorforrow, *rows, *rows_i, *idxhit, *spidx, *den2sp, *den2sp_i; 1846e88777f2SHong Zhang PetscInt *colorforcol, *columns, *columns_i, brows; 1847e88777f2SHong Zhang PetscBool flg; 1848b1683b59SHong Zhang 1849b1683b59SHong Zhang PetscFunctionBegin; 18509566063dSJacob Faibussowitsch PetscCall(ISColoringGetIS(iscoloring, PETSC_USE_POINTER, PETSC_IGNORE, &isa)); 1851e99be685SHong Zhang 18527ecccc15SHong Zhang /* bs >1 is not being tested yet! */ 1853e88777f2SHong Zhang Nbs = mat->cmap->N / bs; 1854b1683b59SHong Zhang c->M = mat->rmap->N / bs; /* set total rows, columns and local rows */ 1855e88777f2SHong Zhang c->N = Nbs; 1856e88777f2SHong Zhang c->m = c->M; 1857b1683b59SHong Zhang c->rstart = 0; 1858e88777f2SHong Zhang c->brows = 100; 1859b1683b59SHong Zhang 1860b1683b59SHong Zhang c->ncolors = nis; 18619566063dSJacob Faibussowitsch PetscCall(PetscMalloc3(nis, &c->ncolumns, nis, &c->nrows, nis + 1, &colorforrow)); 18629566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(csp->nz + 1, &rows)); 18639566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(csp->nz + 1, &den2sp)); 1864e88777f2SHong Zhang 1865e88777f2SHong Zhang brows = c->brows; 18669566063dSJacob Faibussowitsch PetscCall(PetscOptionsGetInt(NULL, NULL, "-matden2sp_brows", &brows, &flg)); 1867e88777f2SHong Zhang if (flg) c->brows = brows; 186848a46eb9SPierre Jolivet if (brows > 0) PetscCall(PetscMalloc1(nis + 1, &c->lstart)); 18692205254eSKarl Rupp 1870d2853540SHong Zhang colorforrow[0] = 0; 1871d2853540SHong Zhang rows_i = rows; 1872f99a636bSHong Zhang den2sp_i = den2sp; 1873b1683b59SHong Zhang 18749566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(nis + 1, &colorforcol)); 18759566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(Nbs + 1, &columns)); 18762205254eSKarl Rupp 1877d2853540SHong Zhang colorforcol[0] = 0; 1878d2853540SHong Zhang columns_i = columns; 1879d2853540SHong Zhang 1880077f23c2SHong Zhang /* get column-wise storage of mat */ 18819566063dSJacob Faibussowitsch PetscCall(MatGetColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL)); 1882b1683b59SHong Zhang 1883b28d80bdSHong Zhang cm = c->m; 18849566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(cm + 1, &rowhit)); 18859566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(cm + 1, &idxhit)); 1886f99a636bSHong Zhang for (i = 0; i < nis; i++) { /* loop over color */ 18879566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(isa[i], &n)); 18889566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isa[i], &is)); 18892205254eSKarl Rupp 1890b1683b59SHong Zhang c->ncolumns[i] = n; 18911baa6e33SBarry Smith if (n) PetscCall(PetscArraycpy(columns_i, is, n)); 1892d2853540SHong Zhang colorforcol[i + 1] = colorforcol[i] + n; 1893d2853540SHong Zhang columns_i += n; 1894b1683b59SHong Zhang 1895b1683b59SHong Zhang /* fast, crude version requires O(N*N) work */ 18969566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(rowhit, cm)); 1897e99be685SHong Zhang 1898b7caf3d6SHong Zhang for (j = 0; j < n; j++) { /* loop over columns*/ 1899b1683b59SHong Zhang col = is[j]; 1900d2853540SHong Zhang row_idx = cj + ci[col]; 1901b1683b59SHong Zhang m = ci[col + 1] - ci[col]; 1902b7caf3d6SHong Zhang for (k = 0; k < m; k++) { /* loop over columns marking them in rowhit */ 1903e99be685SHong Zhang idxhit[*row_idx] = spidx[ci[col] + k]; 1904d2853540SHong Zhang rowhit[*row_idx++] = col + 1; 1905b1683b59SHong Zhang } 1906b1683b59SHong Zhang } 1907b1683b59SHong Zhang /* count the number of hits */ 1908b1683b59SHong Zhang nrows = 0; 1909e8354b3eSHong Zhang for (j = 0; j < cm; j++) { 1910b1683b59SHong Zhang if (rowhit[j]) nrows++; 1911b1683b59SHong Zhang } 1912b1683b59SHong Zhang c->nrows[i] = nrows; 1913d2853540SHong Zhang colorforrow[i + 1] = colorforrow[i] + nrows; 1914d2853540SHong Zhang 1915b1683b59SHong Zhang nrows = 0; 1916b7caf3d6SHong Zhang for (j = 0; j < cm; j++) { /* loop over rows */ 1917b1683b59SHong Zhang if (rowhit[j]) { 1918d2853540SHong Zhang rows_i[nrows] = j; 191912b89a43SHong Zhang den2sp_i[nrows] = idxhit[j]; 1920b1683b59SHong Zhang nrows++; 1921b1683b59SHong Zhang } 1922b1683b59SHong Zhang } 1923e88777f2SHong Zhang den2sp_i += nrows; 1924077f23c2SHong Zhang 19259566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isa[i], &is)); 1926f99a636bSHong Zhang rows_i += nrows; 1927b1683b59SHong Zhang } 19289566063dSJacob Faibussowitsch PetscCall(MatRestoreColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL)); 19299566063dSJacob Faibussowitsch PetscCall(PetscFree(rowhit)); 19309566063dSJacob Faibussowitsch PetscCall(ISColoringRestoreIS(iscoloring, PETSC_USE_POINTER, &isa)); 19312c71b3e2SJacob Faibussowitsch PetscCheck(csp->nz == colorforrow[nis], PETSC_COMM_SELF, PETSC_ERR_PLIB, "csp->nz %" PetscInt_FMT " != colorforrow[nis] %" PetscInt_FMT, csp->nz, colorforrow[nis]); 1932b28d80bdSHong Zhang 1933d2853540SHong Zhang c->colorforrow = colorforrow; 1934d2853540SHong Zhang c->rows = rows; 1935f99a636bSHong Zhang c->den2sp = den2sp; 1936d2853540SHong Zhang c->colorforcol = colorforcol; 1937d2853540SHong Zhang c->columns = columns; 19382205254eSKarl Rupp 19399566063dSJacob Faibussowitsch PetscCall(PetscFree(idxhit)); 19403ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1941b1683b59SHong Zhang } 1942d013fc79Sandi selinger 1943d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C) 1944d71ae5a4SJacob Faibussowitsch { 19454222ddf1SHong Zhang Mat_Product *product = C->product; 19464222ddf1SHong Zhang Mat A = product->A, B = product->B; 19474222ddf1SHong Zhang 1948df97dc6dSFande Kong PetscFunctionBegin; 19494222ddf1SHong Zhang if (C->ops->mattransposemultnumeric) { 19504222ddf1SHong Zhang /* Alg: "outerproduct" */ 19519566063dSJacob Faibussowitsch PetscCall((*C->ops->mattransposemultnumeric)(A, B, C)); 19524222ddf1SHong Zhang } else { 19534222ddf1SHong Zhang /* Alg: "matmatmult" -- C = At*B */ 19546718818eSStefano Zampini Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)product->data; 19554222ddf1SHong Zhang 195628b400f6SJacob Faibussowitsch PetscCheck(atb, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 19571cdffd5eSHong Zhang if (atb->At) { 19581cdffd5eSHong Zhang /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(); 19591cdffd5eSHong Zhang user may have called MatProductReplaceMats() to get this A=product->A */ 19601cdffd5eSHong Zhang PetscCall(MatTransposeSetPrecursor(A, atb->At)); 19617fb60732SBarry Smith PetscCall(MatTranspose(A, MAT_REUSE_MATRIX, &atb->At)); 19624222ddf1SHong Zhang } 19637fb60732SBarry Smith PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ(atb->At ? atb->At : A, B, C)); 19644222ddf1SHong Zhang } 19653ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1966df97dc6dSFande Kong } 1967df97dc6dSFande Kong 1968d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C) 1969d71ae5a4SJacob Faibussowitsch { 19704222ddf1SHong Zhang Mat_Product *product = C->product; 19714222ddf1SHong Zhang Mat A = product->A, B = product->B; 19724222ddf1SHong Zhang PetscReal fill = product->fill; 1973d013fc79Sandi selinger 1974d013fc79Sandi selinger PetscFunctionBegin; 19759566063dSJacob Faibussowitsch PetscCall(MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A, B, fill, C)); 19762869b61bSandi selinger 19774222ddf1SHong Zhang C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ; 19783ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 19792869b61bSandi selinger } 1980d013fc79Sandi selinger 1981d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C) 1982d71ae5a4SJacob Faibussowitsch { 19834222ddf1SHong Zhang Mat_Product *product = C->product; 19844222ddf1SHong Zhang PetscInt alg = 0; /* default algorithm */ 19854222ddf1SHong Zhang PetscBool flg = PETSC_FALSE; 19864222ddf1SHong Zhang #if !defined(PETSC_HAVE_HYPRE) 19874222ddf1SHong Zhang const char *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"}; 19884222ddf1SHong Zhang PetscInt nalg = 7; 19894222ddf1SHong Zhang #else 19904222ddf1SHong Zhang const char *algTypes[8] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge", "hypre"}; 19914222ddf1SHong Zhang PetscInt nalg = 8; 19924222ddf1SHong Zhang #endif 19934222ddf1SHong Zhang 19944222ddf1SHong Zhang PetscFunctionBegin; 19954222ddf1SHong Zhang /* Set default algorithm */ 19969566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(C->product->alg, "default", &flg)); 199748a46eb9SPierre Jolivet if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 1998d013fc79Sandi selinger 19994222ddf1SHong Zhang /* Get runtime option */ 20004222ddf1SHong Zhang if (product->api_user) { 2001d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMult", "Mat"); 20029566063dSJacob Faibussowitsch PetscCall(PetscOptionsEList("-matmatmult_via", "Algorithmic approach", "MatMatMult", algTypes, nalg, algTypes[0], &alg, &flg)); 2003d0609cedSBarry Smith PetscOptionsEnd(); 20044222ddf1SHong Zhang } else { 2005d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AB", "Mat"); 20069566063dSJacob Faibussowitsch PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AB", algTypes, nalg, algTypes[0], &alg, &flg)); 2007d0609cedSBarry Smith PetscOptionsEnd(); 2008d013fc79Sandi selinger } 200948a46eb9SPierre Jolivet if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2010d013fc79Sandi selinger 20114222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_AB; 20124222ddf1SHong Zhang C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ; 20133ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 20144222ddf1SHong Zhang } 2015d013fc79Sandi selinger 2016d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C) 2017d71ae5a4SJacob Faibussowitsch { 20184222ddf1SHong Zhang Mat_Product *product = C->product; 20194222ddf1SHong Zhang PetscInt alg = 0; /* default algorithm */ 20204222ddf1SHong Zhang PetscBool flg = PETSC_FALSE; 2021089a957eSStefano Zampini const char *algTypes[3] = {"default", "at*b", "outerproduct"}; 2022089a957eSStefano Zampini PetscInt nalg = 3; 2023d013fc79Sandi selinger 20244222ddf1SHong Zhang PetscFunctionBegin; 20254222ddf1SHong Zhang /* Get runtime option */ 20264222ddf1SHong Zhang if (product->api_user) { 2027d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatTransposeMatMult", "Mat"); 20289566063dSJacob Faibussowitsch PetscCall(PetscOptionsEList("-mattransposematmult_via", "Algorithmic approach", "MatTransposeMatMult", algTypes, nalg, algTypes[alg], &alg, &flg)); 2029d0609cedSBarry Smith PetscOptionsEnd(); 20304222ddf1SHong Zhang } else { 2031d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AtB", "Mat"); 20329566063dSJacob Faibussowitsch PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AtB", algTypes, nalg, algTypes[alg], &alg, &flg)); 2033d0609cedSBarry Smith PetscOptionsEnd(); 20344222ddf1SHong Zhang } 203548a46eb9SPierre Jolivet if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2036d013fc79Sandi selinger 20374222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ; 20383ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 20394222ddf1SHong Zhang } 20404222ddf1SHong Zhang 2041d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C) 2042d71ae5a4SJacob Faibussowitsch { 20434222ddf1SHong Zhang Mat_Product *product = C->product; 20444222ddf1SHong Zhang PetscInt alg = 0; /* default algorithm */ 20454222ddf1SHong Zhang PetscBool flg = PETSC_FALSE; 20464222ddf1SHong Zhang const char *algTypes[2] = {"default", "color"}; 20474222ddf1SHong Zhang PetscInt nalg = 2; 20484222ddf1SHong Zhang 20494222ddf1SHong Zhang PetscFunctionBegin; 20504222ddf1SHong Zhang /* Set default algorithm */ 20519566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(C->product->alg, "default", &flg)); 20524222ddf1SHong Zhang if (!flg) { 20534222ddf1SHong Zhang alg = 1; 20549566063dSJacob Faibussowitsch PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 20554222ddf1SHong Zhang } 20564222ddf1SHong Zhang 20574222ddf1SHong Zhang /* Get runtime option */ 20584222ddf1SHong Zhang if (product->api_user) { 2059d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatTransposeMult", "Mat"); 20609566063dSJacob Faibussowitsch PetscCall(PetscOptionsEList("-matmattransmult_via", "Algorithmic approach", "MatMatTransposeMult", algTypes, nalg, algTypes[alg], &alg, &flg)); 2061d0609cedSBarry Smith PetscOptionsEnd(); 20624222ddf1SHong Zhang } else { 2063d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABt", "Mat"); 20649566063dSJacob Faibussowitsch PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABt", algTypes, nalg, algTypes[alg], &alg, &flg)); 2065d0609cedSBarry Smith PetscOptionsEnd(); 20664222ddf1SHong Zhang } 206748a46eb9SPierre Jolivet if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 20684222ddf1SHong Zhang 20694222ddf1SHong Zhang C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ; 20704222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_ABt; 20713ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 20724222ddf1SHong Zhang } 20734222ddf1SHong Zhang 2074d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C) 2075d71ae5a4SJacob Faibussowitsch { 20764222ddf1SHong Zhang Mat_Product *product = C->product; 20774222ddf1SHong Zhang PetscBool flg = PETSC_FALSE; 20784222ddf1SHong Zhang PetscInt alg = 0; /* default algorithm -- alg=1 should be default!!! */ 20794222ddf1SHong Zhang #if !defined(PETSC_HAVE_HYPRE) 20804222ddf1SHong Zhang const char *algTypes[2] = {"scalable", "rap"}; 20814222ddf1SHong Zhang PetscInt nalg = 2; 20824222ddf1SHong Zhang #else 20834222ddf1SHong Zhang const char *algTypes[3] = {"scalable", "rap", "hypre"}; 20844222ddf1SHong Zhang PetscInt nalg = 3; 20854222ddf1SHong Zhang #endif 20864222ddf1SHong Zhang 20874222ddf1SHong Zhang PetscFunctionBegin; 20884222ddf1SHong Zhang /* Set default algorithm */ 20899566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(product->alg, "default", &flg)); 209048a46eb9SPierre Jolivet if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 20914222ddf1SHong Zhang 20924222ddf1SHong Zhang /* Get runtime option */ 20934222ddf1SHong Zhang if (product->api_user) { 2094d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatPtAP", "Mat"); 20959566063dSJacob Faibussowitsch PetscCall(PetscOptionsEList("-matptap_via", "Algorithmic approach", "MatPtAP", algTypes, nalg, algTypes[0], &alg, &flg)); 2096d0609cedSBarry Smith PetscOptionsEnd(); 20974222ddf1SHong Zhang } else { 2098d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_PtAP", "Mat"); 20999566063dSJacob Faibussowitsch PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_PtAP", algTypes, nalg, algTypes[0], &alg, &flg)); 2100d0609cedSBarry Smith PetscOptionsEnd(); 21014222ddf1SHong Zhang } 210248a46eb9SPierre Jolivet if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 21034222ddf1SHong Zhang 21044222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ; 21053ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 21064222ddf1SHong Zhang } 21074222ddf1SHong Zhang 2108d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C) 2109d71ae5a4SJacob Faibussowitsch { 21104222ddf1SHong Zhang Mat_Product *product = C->product; 21114222ddf1SHong Zhang PetscBool flg = PETSC_FALSE; 21124222ddf1SHong Zhang PetscInt alg = 0; /* default algorithm */ 21134222ddf1SHong Zhang const char *algTypes[3] = {"r*a*rt", "r*art", "coloring_rart"}; 21144222ddf1SHong Zhang PetscInt nalg = 3; 21154222ddf1SHong Zhang 21164222ddf1SHong Zhang PetscFunctionBegin; 21174222ddf1SHong Zhang /* Set default algorithm */ 21189566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(product->alg, "default", &flg)); 211948a46eb9SPierre Jolivet if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 21204222ddf1SHong Zhang 21214222ddf1SHong Zhang /* Get runtime option */ 21224222ddf1SHong Zhang if (product->api_user) { 2123d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatRARt", "Mat"); 21249566063dSJacob Faibussowitsch PetscCall(PetscOptionsEList("-matrart_via", "Algorithmic approach", "MatRARt", algTypes, nalg, algTypes[0], &alg, &flg)); 2125d0609cedSBarry Smith PetscOptionsEnd(); 21264222ddf1SHong Zhang } else { 2127d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_RARt", "Mat"); 21289566063dSJacob Faibussowitsch PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_RARt", algTypes, nalg, algTypes[0], &alg, &flg)); 2129d0609cedSBarry Smith PetscOptionsEnd(); 21304222ddf1SHong Zhang } 213148a46eb9SPierre Jolivet if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 21324222ddf1SHong Zhang 21334222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ; 21343ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 21354222ddf1SHong Zhang } 21364222ddf1SHong Zhang 21374222ddf1SHong Zhang /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */ 2138d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C) 2139d71ae5a4SJacob Faibussowitsch { 21404222ddf1SHong Zhang Mat_Product *product = C->product; 21414222ddf1SHong Zhang PetscInt alg = 0; /* default algorithm */ 21424222ddf1SHong Zhang PetscBool flg = PETSC_FALSE; 21434222ddf1SHong Zhang const char *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"}; 21444222ddf1SHong Zhang PetscInt nalg = 7; 21454222ddf1SHong Zhang 21464222ddf1SHong Zhang PetscFunctionBegin; 21474222ddf1SHong Zhang /* Set default algorithm */ 21489566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(product->alg, "default", &flg)); 214948a46eb9SPierre Jolivet if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 21504222ddf1SHong Zhang 21514222ddf1SHong Zhang /* Get runtime option */ 21524222ddf1SHong Zhang if (product->api_user) { 2153d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMatMult", "Mat"); 21549566063dSJacob Faibussowitsch PetscCall(PetscOptionsEList("-matmatmatmult_via", "Algorithmic approach", "MatMatMatMult", algTypes, nalg, algTypes[alg], &alg, &flg)); 2155d0609cedSBarry Smith PetscOptionsEnd(); 21564222ddf1SHong Zhang } else { 2157d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABC", "Mat"); 21589566063dSJacob Faibussowitsch PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABC", algTypes, nalg, algTypes[alg], &alg, &flg)); 2159d0609cedSBarry Smith PetscOptionsEnd(); 21604222ddf1SHong Zhang } 216148a46eb9SPierre Jolivet if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 21624222ddf1SHong Zhang 21634222ddf1SHong Zhang C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ; 21644222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_ABC; 21653ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 21664222ddf1SHong Zhang } 21674222ddf1SHong Zhang 2168d71ae5a4SJacob Faibussowitsch PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C) 2169d71ae5a4SJacob Faibussowitsch { 21704222ddf1SHong Zhang Mat_Product *product = C->product; 21714222ddf1SHong Zhang 21724222ddf1SHong Zhang PetscFunctionBegin; 21734222ddf1SHong Zhang switch (product->type) { 2174d71ae5a4SJacob Faibussowitsch case MATPRODUCT_AB: 2175d71ae5a4SJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ_AB(C)); 2176d71ae5a4SJacob Faibussowitsch break; 2177d71ae5a4SJacob Faibussowitsch case MATPRODUCT_AtB: 2178d71ae5a4SJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ_AtB(C)); 2179d71ae5a4SJacob Faibussowitsch break; 2180d71ae5a4SJacob Faibussowitsch case MATPRODUCT_ABt: 2181d71ae5a4SJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ_ABt(C)); 2182d71ae5a4SJacob Faibussowitsch break; 2183d71ae5a4SJacob Faibussowitsch case MATPRODUCT_PtAP: 2184d71ae5a4SJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ_PtAP(C)); 2185d71ae5a4SJacob Faibussowitsch break; 2186d71ae5a4SJacob Faibussowitsch case MATPRODUCT_RARt: 2187d71ae5a4SJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ_RARt(C)); 2188d71ae5a4SJacob Faibussowitsch break; 2189d71ae5a4SJacob Faibussowitsch case MATPRODUCT_ABC: 2190d71ae5a4SJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ_ABC(C)); 2191d71ae5a4SJacob Faibussowitsch break; 2192d71ae5a4SJacob Faibussowitsch default: 2193d71ae5a4SJacob Faibussowitsch break; 21944222ddf1SHong Zhang } 21953ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2196d013fc79Sandi selinger } 2197