1be1d678aSKris Buschelman 2d50806bdSBarry Smith /* 32499ec78SHong Zhang Defines matrix-matrix product routines for pairs of SeqAIJ matrices 4d50806bdSBarry Smith C = A * B 5d50806bdSBarry Smith */ 6d50806bdSBarry Smith 7c6db04a5SJed Brown #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 8c6db04a5SJed Brown #include <../src/mat/utils/freespace.h> 9c6db04a5SJed Brown #include <petscbt.h> 10af0996ceSBarry Smith #include <petsc/private/isimpl.h> 1107475bc1SBarry Smith #include <../src/mat/impls/dense/seq/dense.h> 127bab7c10SHong Zhang 13d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C) 14d71ae5a4SJacob Faibussowitsch { 15df97dc6dSFande Kong PetscFunctionBegin; 16dbbe0bcdSBarry Smith if (C->ops->matmultnumeric) PetscCall((*C->ops->matmultnumeric)(A, B, C)); 17dbbe0bcdSBarry Smith else PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A, B, C)); 18df97dc6dSFande Kong PetscFunctionReturn(0); 19df97dc6dSFande Kong } 20df97dc6dSFande Kong 214222ddf1SHong Zhang /* Modified from MatCreateSeqAIJWithArrays() */ 22d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt i[], PetscInt j[], PetscScalar a[], MatType mtype, Mat mat) 23d71ae5a4SJacob Faibussowitsch { 244222ddf1SHong Zhang PetscInt ii; 254222ddf1SHong Zhang Mat_SeqAIJ *aij; 26cbc6b225SStefano Zampini PetscBool isseqaij, osingle, ofree_a, ofree_ij; 275c66b693SKris Buschelman 285c66b693SKris Buschelman PetscFunctionBegin; 29aed4548fSBarry Smith PetscCheck(m <= 0 || !i[0], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0"); 309566063dSJacob Faibussowitsch PetscCall(MatSetSizes(mat, m, n, m, n)); 314222ddf1SHong Zhang 32e4e71118SStefano Zampini if (!mtype) { 339566063dSJacob Faibussowitsch PetscCall(PetscObjectBaseTypeCompare((PetscObject)mat, MATSEQAIJ, &isseqaij)); 349566063dSJacob Faibussowitsch if (!isseqaij) PetscCall(MatSetType(mat, MATSEQAIJ)); 35e4e71118SStefano Zampini } else { 369566063dSJacob Faibussowitsch PetscCall(MatSetType(mat, mtype)); 37e4e71118SStefano Zampini } 38cbc6b225SStefano Zampini 394222ddf1SHong Zhang aij = (Mat_SeqAIJ *)(mat)->data; 40cbc6b225SStefano Zampini osingle = aij->singlemalloc; 41cbc6b225SStefano Zampini ofree_a = aij->free_a; 42cbc6b225SStefano Zampini ofree_ij = aij->free_ij; 43cbc6b225SStefano Zampini /* changes the free flags */ 449566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(mat, MAT_SKIP_ALLOCATION, NULL)); 45cbc6b225SStefano Zampini 469566063dSJacob Faibussowitsch PetscCall(PetscFree(aij->ilen)); 479566063dSJacob Faibussowitsch PetscCall(PetscFree(aij->imax)); 489566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &aij->imax)); 499566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &aij->ilen)); 50cbc6b225SStefano Zampini for (ii = 0, aij->nonzerorowcnt = 0, aij->rmax = 0; ii < m; ii++) { 51cbc6b225SStefano Zampini const PetscInt rnz = i[ii + 1] - i[ii]; 52cbc6b225SStefano Zampini aij->nonzerorowcnt += !!rnz; 53cbc6b225SStefano Zampini aij->rmax = PetscMax(aij->rmax, rnz); 54cbc6b225SStefano Zampini aij->ilen[ii] = aij->imax[ii] = i[ii + 1] - i[ii]; 55cbc6b225SStefano Zampini } 56cbc6b225SStefano Zampini aij->maxnz = i[m]; 57cbc6b225SStefano Zampini aij->nz = i[m]; 584222ddf1SHong Zhang 59cbc6b225SStefano Zampini if (osingle) { 609566063dSJacob Faibussowitsch PetscCall(PetscFree3(aij->a, aij->j, aij->i)); 61cbc6b225SStefano Zampini } else { 629566063dSJacob Faibussowitsch if (ofree_a) PetscCall(PetscFree(aij->a)); 639566063dSJacob Faibussowitsch if (ofree_ij) PetscCall(PetscFree(aij->j)); 649566063dSJacob Faibussowitsch if (ofree_ij) PetscCall(PetscFree(aij->i)); 65cbc6b225SStefano Zampini } 664222ddf1SHong Zhang aij->i = i; 674222ddf1SHong Zhang aij->j = j; 684222ddf1SHong Zhang aij->a = a; 694222ddf1SHong Zhang aij->nonew = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */ 70cbc6b225SStefano Zampini /* default to not retain ownership */ 71cbc6b225SStefano Zampini aij->singlemalloc = PETSC_FALSE; 724222ddf1SHong Zhang aij->free_a = PETSC_FALSE; 734222ddf1SHong Zhang aij->free_ij = PETSC_FALSE; 749566063dSJacob Faibussowitsch PetscCall(MatCheckCompressedRow(mat, aij->nonzerorowcnt, &aij->compressedrow, aij->i, m, 0.6)); 755c66b693SKris Buschelman PetscFunctionReturn(0); 765c66b693SKris Buschelman } 771c24bd37SHong Zhang 78d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C) 79d71ae5a4SJacob Faibussowitsch { 804222ddf1SHong Zhang Mat_Product *product = C->product; 814222ddf1SHong Zhang MatProductAlgorithm alg; 824222ddf1SHong Zhang PetscBool flg; 834222ddf1SHong Zhang 844222ddf1SHong Zhang PetscFunctionBegin; 854222ddf1SHong Zhang if (product) { 864222ddf1SHong Zhang alg = product->alg; 874222ddf1SHong Zhang } else { 884222ddf1SHong Zhang alg = "sorted"; 894222ddf1SHong Zhang } 904222ddf1SHong Zhang /* sorted */ 919566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(alg, "sorted", &flg)); 924222ddf1SHong Zhang if (flg) { 939566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A, B, fill, C)); 944222ddf1SHong Zhang PetscFunctionReturn(0); 954222ddf1SHong Zhang } 964222ddf1SHong Zhang 974222ddf1SHong Zhang /* scalable */ 989566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(alg, "scalable", &flg)); 994222ddf1SHong Zhang if (flg) { 1009566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A, B, fill, C)); 1014222ddf1SHong Zhang PetscFunctionReturn(0); 1024222ddf1SHong Zhang } 1034222ddf1SHong Zhang 1044222ddf1SHong Zhang /* scalable_fast */ 1059566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(alg, "scalable_fast", &flg)); 1064222ddf1SHong Zhang if (flg) { 1079566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A, B, fill, C)); 1084222ddf1SHong Zhang PetscFunctionReturn(0); 1094222ddf1SHong Zhang } 1104222ddf1SHong Zhang 1114222ddf1SHong Zhang /* heap */ 1129566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(alg, "heap", &flg)); 1134222ddf1SHong Zhang if (flg) { 1149566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A, B, fill, C)); 1154222ddf1SHong Zhang PetscFunctionReturn(0); 1164222ddf1SHong Zhang } 1174222ddf1SHong Zhang 1184222ddf1SHong Zhang /* btheap */ 1199566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(alg, "btheap", &flg)); 1204222ddf1SHong Zhang if (flg) { 1219566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A, B, fill, C)); 1224222ddf1SHong Zhang PetscFunctionReturn(0); 1234222ddf1SHong Zhang } 1244222ddf1SHong Zhang 1254222ddf1SHong Zhang /* llcondensed */ 1269566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(alg, "llcondensed", &flg)); 1274222ddf1SHong Zhang if (flg) { 1289566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A, B, fill, C)); 1294222ddf1SHong Zhang PetscFunctionReturn(0); 1304222ddf1SHong Zhang } 1314222ddf1SHong Zhang 1324222ddf1SHong Zhang /* rowmerge */ 1339566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(alg, "rowmerge", &flg)); 1344222ddf1SHong Zhang if (flg) { 1359566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A, B, fill, C)); 1364222ddf1SHong Zhang PetscFunctionReturn(0); 1374222ddf1SHong Zhang } 1384222ddf1SHong Zhang 1394222ddf1SHong Zhang #if defined(PETSC_HAVE_HYPRE) 1409566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(alg, "hypre", &flg)); 1414222ddf1SHong Zhang if (flg) { 1429566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A, B, fill, C)); 1434222ddf1SHong Zhang PetscFunctionReturn(0); 1444222ddf1SHong Zhang } 1454222ddf1SHong Zhang #endif 1464222ddf1SHong Zhang 1474222ddf1SHong Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported"); 1484222ddf1SHong Zhang } 1494222ddf1SHong Zhang 150d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A, Mat B, PetscReal fill, Mat C) 151d71ae5a4SJacob Faibussowitsch { 152b561aa9dSHong Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 153971236abSHong Zhang PetscInt *ai = a->i, *bi = b->i, *ci, *cj; 154c1ba5575SJed Brown PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 155b561aa9dSHong Zhang PetscReal afill; 156eca6b86aSHong Zhang PetscInt i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax; 157*eec179cfSJacob Faibussowitsch PetscHMapI ta; 158fb908031SHong Zhang PetscBT lnkbt; 1590298fd71SBarry Smith PetscFreeSpaceList free_space = NULL, current_space = NULL; 160b561aa9dSHong Zhang 161b561aa9dSHong Zhang PetscFunctionBegin; 162bd958071SHong Zhang /* Get ci and cj */ 163bd958071SHong Zhang /*---------------*/ 164fb908031SHong Zhang /* Allocate ci array, arrays for fill computation and */ 165fb908031SHong Zhang /* free space for accumulating nonzero column info */ 1669566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(am + 2, &ci)); 167fb908031SHong Zhang ci[0] = 0; 168fb908031SHong Zhang 169fb908031SHong Zhang /* create and initialize a linked list */ 170*eec179cfSJacob Faibussowitsch PetscCall(PetscHMapICreateWithSize(bn, &ta)); 171c373ccc6SHong Zhang MatRowMergeMax_SeqAIJ(b, bm, ta); 172*eec179cfSJacob Faibussowitsch PetscCall(PetscHMapIGetSize(ta, &Crmax)); 173*eec179cfSJacob Faibussowitsch PetscCall(PetscHMapIDestroy(&ta)); 174eca6b86aSHong Zhang 1759566063dSJacob Faibussowitsch PetscCall(PetscLLCondensedCreate(Crmax, bn, &lnk, &lnkbt)); 176fb908031SHong Zhang 177fb908031SHong Zhang /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 1789566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 1792205254eSKarl Rupp 180fb908031SHong Zhang current_space = free_space; 181fb908031SHong Zhang 182fb908031SHong Zhang /* Determine ci and cj */ 183fb908031SHong Zhang for (i = 0; i < am; i++) { 184fb908031SHong Zhang anzi = ai[i + 1] - ai[i]; 185fb908031SHong Zhang aj = a->j + ai[i]; 186fb908031SHong Zhang for (j = 0; j < anzi; j++) { 187fb908031SHong Zhang brow = aj[j]; 188fb908031SHong Zhang bnzj = bi[brow + 1] - bi[brow]; 189fb908031SHong Zhang bj = b->j + bi[brow]; 190fb908031SHong Zhang /* add non-zero cols of B into the sorted linked list lnk */ 1919566063dSJacob Faibussowitsch PetscCall(PetscLLCondensedAddSorted(bnzj, bj, lnk, lnkbt)); 192fb908031SHong Zhang } 1938c78258cSHong Zhang /* add possible missing diagonal entry */ 19448a46eb9SPierre Jolivet if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted(1, &i, lnk, lnkbt)); 195fb908031SHong Zhang cnzi = lnk[0]; 196fb908031SHong Zhang 197fb908031SHong Zhang /* If free space is not available, make more free space */ 198fb908031SHong Zhang /* Double the amount of total space in the list */ 199fb908031SHong Zhang if (current_space->local_remaining < cnzi) { 2009566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), ¤t_space)); 201fb908031SHong Zhang ndouble++; 202fb908031SHong Zhang } 203fb908031SHong Zhang 204fb908031SHong Zhang /* Copy data into free space, then initialize lnk */ 2059566063dSJacob Faibussowitsch PetscCall(PetscLLCondensedClean(bn, cnzi, current_space->array, lnk, lnkbt)); 2062205254eSKarl Rupp 207fb908031SHong Zhang current_space->array += cnzi; 208fb908031SHong Zhang current_space->local_used += cnzi; 209fb908031SHong Zhang current_space->local_remaining -= cnzi; 2102205254eSKarl Rupp 211fb908031SHong Zhang ci[i + 1] = ci[i] + cnzi; 212fb908031SHong Zhang } 213fb908031SHong Zhang 214fb908031SHong Zhang /* Column indices are in the list of free space */ 215fb908031SHong Zhang /* Allocate space for cj, initialize cj, and */ 216fb908031SHong Zhang /* destroy list of free space and other temporary array(s) */ 2179566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(ci[am] + 1, &cj)); 2189566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 2199566063dSJacob Faibussowitsch PetscCall(PetscLLCondensedDestroy(lnk, lnkbt)); 220b561aa9dSHong Zhang 22126be0446SHong Zhang /* put together the new symbolic matrix */ 2229566063dSJacob Faibussowitsch PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 2239566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(C, A, B)); 22458c24d83SHong Zhang 22558c24d83SHong Zhang /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 22658c24d83SHong Zhang /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 2274222ddf1SHong Zhang c = (Mat_SeqAIJ *)(C->data); 228aa1aec99SHong Zhang c->free_a = PETSC_FALSE; 229e6b907acSBarry Smith c->free_ij = PETSC_TRUE; 23058c24d83SHong Zhang c->nonew = 0; 2314222ddf1SHong Zhang 2324222ddf1SHong Zhang /* fast, needs non-scalable O(bn) array 'abdense' */ 2334222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 2340b7e3e3dSHong Zhang 2357212da7cSHong Zhang /* set MatInfo */ 2367212da7cSHong Zhang afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 237f2b054eeSHong Zhang if (afill < 1.0) afill = 1.0; 2384222ddf1SHong Zhang C->info.mallocs = ndouble; 2394222ddf1SHong Zhang C->info.fill_ratio_given = fill; 2404222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 2417212da7cSHong Zhang 2427212da7cSHong Zhang #if defined(PETSC_USE_INFO) 2437212da7cSHong Zhang if (ci[am]) { 2449566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 2459566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 246f2b054eeSHong Zhang } else { 2479566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Empty matrix product\n")); 248be0fcf8dSHong Zhang } 249f2b054eeSHong Zhang #endif 25058c24d83SHong Zhang PetscFunctionReturn(0); 25158c24d83SHong Zhang } 252d50806bdSBarry Smith 253d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, Mat C) 254d71ae5a4SJacob Faibussowitsch { 255d13dce4bSSatish Balay PetscLogDouble flops = 0.0; 256d50806bdSBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 257d50806bdSBarry Smith Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 258d50806bdSBarry Smith Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 25938baddfdSBarry Smith PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j; 260c58ca2e3SHong Zhang PetscInt am = A->rmap->n, cm = C->rmap->n; 261519eb980SHong Zhang PetscInt i, j, k, anzi, bnzi, cnzi, brow; 2622e5835c6SStefano Zampini PetscScalar *ca, valtmp; 263aa1aec99SHong Zhang PetscScalar *ab_dense; 2646718818eSStefano Zampini PetscContainer cab_dense; 2652e5835c6SStefano Zampini const PetscScalar *aa, *ba, *baj; 266d50806bdSBarry Smith 267d50806bdSBarry Smith PetscFunctionBegin; 2689566063dSJacob Faibussowitsch PetscCall(MatSeqAIJGetArrayRead(A, &aa)); 2699566063dSJacob Faibussowitsch PetscCall(MatSeqAIJGetArrayRead(B, &ba)); 270aa1aec99SHong Zhang if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */ 2719566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(ci[cm] + 1, &ca)); 272aa1aec99SHong Zhang c->a = ca; 273aa1aec99SHong Zhang c->free_a = PETSC_TRUE; 2746718818eSStefano Zampini } else ca = c->a; 2756718818eSStefano Zampini 2766718818eSStefano Zampini /* TODO this should be done in the symbolic phase */ 2776718818eSStefano Zampini /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers 2786718818eSStefano Zampini that is hard to eradicate) */ 2799566063dSJacob Faibussowitsch PetscCall(PetscObjectQuery((PetscObject)C, "__PETSc__ab_dense", (PetscObject *)&cab_dense)); 2806718818eSStefano Zampini if (!cab_dense) { 2819566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(B->cmap->N, &ab_dense)); 2829566063dSJacob Faibussowitsch PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &cab_dense)); 2839566063dSJacob Faibussowitsch PetscCall(PetscContainerSetPointer(cab_dense, ab_dense)); 2849566063dSJacob Faibussowitsch PetscCall(PetscContainerSetUserDestroy(cab_dense, PetscContainerUserDestroyDefault)); 2859566063dSJacob Faibussowitsch PetscCall(PetscObjectCompose((PetscObject)C, "__PETSc__ab_dense", (PetscObject)cab_dense)); 2869566063dSJacob Faibussowitsch PetscCall(PetscObjectDereference((PetscObject)cab_dense)); 287d90d86d0SMatthew G. Knepley } 2889566063dSJacob Faibussowitsch PetscCall(PetscContainerGetPointer(cab_dense, (void **)&ab_dense)); 2899566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(ab_dense, B->cmap->N)); 290aa1aec99SHong Zhang 291c124e916SHong Zhang /* clean old values in C */ 2929566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(ca, ci[cm])); 293d50806bdSBarry Smith /* Traverse A row-wise. */ 294d50806bdSBarry Smith /* Build the ith row in C by summing over nonzero columns in A, */ 295d50806bdSBarry Smith /* the rows of B corresponding to nonzeros of A. */ 296d50806bdSBarry Smith for (i = 0; i < am; i++) { 297d50806bdSBarry Smith anzi = ai[i + 1] - ai[i]; 298d50806bdSBarry Smith for (j = 0; j < anzi; j++) { 299519eb980SHong Zhang brow = aj[j]; 300d50806bdSBarry Smith bnzi = bi[brow + 1] - bi[brow]; 301d50806bdSBarry Smith bjj = bj + bi[brow]; 302d50806bdSBarry Smith baj = ba + bi[brow]; 303519eb980SHong Zhang /* perform dense axpy */ 30436ec6d2dSHong Zhang valtmp = aa[j]; 305ad540459SPierre Jolivet for (k = 0; k < bnzi; k++) ab_dense[bjj[k]] += valtmp * baj[k]; 306519eb980SHong Zhang flops += 2 * bnzi; 307519eb980SHong Zhang } 3089371c9d4SSatish Balay aj += anzi; 3099371c9d4SSatish Balay aa += anzi; 310c58ca2e3SHong Zhang 311c58ca2e3SHong Zhang cnzi = ci[i + 1] - ci[i]; 312519eb980SHong Zhang for (k = 0; k < cnzi; k++) { 313519eb980SHong Zhang ca[k] += ab_dense[cj[k]]; 314519eb980SHong Zhang ab_dense[cj[k]] = 0.0; /* zero ab_dense */ 315519eb980SHong Zhang } 316519eb980SHong Zhang flops += cnzi; 3179371c9d4SSatish Balay cj += cnzi; 3189371c9d4SSatish Balay ca += cnzi; 319519eb980SHong Zhang } 3202e5835c6SStefano Zampini #if defined(PETSC_HAVE_DEVICE) 3212e5835c6SStefano Zampini if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU; 3222e5835c6SStefano Zampini #endif 3239566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 3249566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 3259566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(flops)); 3269566063dSJacob Faibussowitsch PetscCall(MatSeqAIJRestoreArrayRead(A, &aa)); 3279566063dSJacob Faibussowitsch PetscCall(MatSeqAIJRestoreArrayRead(B, &ba)); 328c58ca2e3SHong Zhang PetscFunctionReturn(0); 329c58ca2e3SHong Zhang } 330c58ca2e3SHong Zhang 331d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, Mat C) 332d71ae5a4SJacob Faibussowitsch { 333c58ca2e3SHong Zhang PetscLogDouble flops = 0.0; 334c58ca2e3SHong Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 335c58ca2e3SHong Zhang Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 336c58ca2e3SHong Zhang Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 337c58ca2e3SHong Zhang PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j; 338c58ca2e3SHong Zhang PetscInt am = A->rmap->N, cm = C->rmap->N; 339c58ca2e3SHong Zhang PetscInt i, j, k, anzi, bnzi, cnzi, brow; 3402e5835c6SStefano Zampini PetscScalar *ca = c->a, valtmp; 3412e5835c6SStefano Zampini const PetscScalar *aa, *ba, *baj; 342c58ca2e3SHong Zhang PetscInt nextb; 343c58ca2e3SHong Zhang 344c58ca2e3SHong Zhang PetscFunctionBegin; 3459566063dSJacob Faibussowitsch PetscCall(MatSeqAIJGetArrayRead(A, &aa)); 3469566063dSJacob Faibussowitsch PetscCall(MatSeqAIJGetArrayRead(B, &ba)); 347cf742fccSHong Zhang if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */ 3489566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(ci[cm] + 1, &ca)); 349cf742fccSHong Zhang c->a = ca; 350cf742fccSHong Zhang c->free_a = PETSC_TRUE; 351cf742fccSHong Zhang } 352cf742fccSHong Zhang 353c58ca2e3SHong Zhang /* clean old values in C */ 3549566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(ca, ci[cm])); 355c58ca2e3SHong Zhang /* Traverse A row-wise. */ 356c58ca2e3SHong Zhang /* Build the ith row in C by summing over nonzero columns in A, */ 357c58ca2e3SHong Zhang /* the rows of B corresponding to nonzeros of A. */ 358519eb980SHong Zhang for (i = 0; i < am; i++) { 359519eb980SHong Zhang anzi = ai[i + 1] - ai[i]; 360519eb980SHong Zhang cnzi = ci[i + 1] - ci[i]; 361519eb980SHong Zhang for (j = 0; j < anzi; j++) { 362519eb980SHong Zhang brow = aj[j]; 363519eb980SHong Zhang bnzi = bi[brow + 1] - bi[brow]; 364519eb980SHong Zhang bjj = bj + bi[brow]; 365519eb980SHong Zhang baj = ba + bi[brow]; 366519eb980SHong Zhang /* perform sparse axpy */ 36736ec6d2dSHong Zhang valtmp = aa[j]; 368c124e916SHong Zhang nextb = 0; 369c124e916SHong Zhang for (k = 0; nextb < bnzi; k++) { 370c124e916SHong Zhang if (cj[k] == bjj[nextb]) { /* ccol == bcol */ 37136ec6d2dSHong Zhang ca[k] += valtmp * baj[nextb++]; 372c124e916SHong Zhang } 373d50806bdSBarry Smith } 374d50806bdSBarry Smith flops += 2 * bnzi; 375d50806bdSBarry Smith } 3769371c9d4SSatish Balay aj += anzi; 3779371c9d4SSatish Balay aa += anzi; 3789371c9d4SSatish Balay cj += cnzi; 3799371c9d4SSatish Balay ca += cnzi; 380d50806bdSBarry Smith } 3812e5835c6SStefano Zampini #if defined(PETSC_HAVE_DEVICE) 3822e5835c6SStefano Zampini if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU; 3832e5835c6SStefano Zampini #endif 3849566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 3859566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 3869566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(flops)); 3879566063dSJacob Faibussowitsch PetscCall(MatSeqAIJRestoreArrayRead(A, &aa)); 3889566063dSJacob Faibussowitsch PetscCall(MatSeqAIJRestoreArrayRead(B, &ba)); 389d50806bdSBarry Smith PetscFunctionReturn(0); 390d50806bdSBarry Smith } 391bc011b1eSHong Zhang 392d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A, Mat B, PetscReal fill, Mat C) 393d71ae5a4SJacob Faibussowitsch { 39425296bd5SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 39525296bd5SBarry Smith PetscInt *ai = a->i, *bi = b->i, *ci, *cj; 3963c50cad2SHong Zhang PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 39725296bd5SBarry Smith MatScalar *ca; 39825296bd5SBarry Smith PetscReal afill; 399eca6b86aSHong Zhang PetscInt i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax; 400*eec179cfSJacob Faibussowitsch PetscHMapI ta; 4010298fd71SBarry Smith PetscFreeSpaceList free_space = NULL, current_space = NULL; 40225296bd5SBarry Smith 40325296bd5SBarry Smith PetscFunctionBegin; 4043c50cad2SHong Zhang /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */ 4053c50cad2SHong Zhang /*-----------------------------------------------------------------------------------------*/ 4063c50cad2SHong Zhang /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 4079566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(am + 2, &ci)); 4083c50cad2SHong Zhang ci[0] = 0; 4093c50cad2SHong Zhang 4103c50cad2SHong Zhang /* create and initialize a linked list */ 411*eec179cfSJacob Faibussowitsch PetscCall(PetscHMapICreateWithSize(bn, &ta)); 412c373ccc6SHong Zhang MatRowMergeMax_SeqAIJ(b, bm, ta); 413*eec179cfSJacob Faibussowitsch PetscCall(PetscHMapIGetSize(ta, &Crmax)); 414*eec179cfSJacob Faibussowitsch PetscCall(PetscHMapIDestroy(&ta)); 415eca6b86aSHong Zhang 4169566063dSJacob Faibussowitsch PetscCall(PetscLLCondensedCreate_fast(Crmax, &lnk)); 4173c50cad2SHong Zhang 4183c50cad2SHong Zhang /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 4199566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 4203c50cad2SHong Zhang current_space = free_space; 4213c50cad2SHong Zhang 4223c50cad2SHong Zhang /* Determine ci and cj */ 4233c50cad2SHong Zhang for (i = 0; i < am; i++) { 4243c50cad2SHong Zhang anzi = ai[i + 1] - ai[i]; 4253c50cad2SHong Zhang aj = a->j + ai[i]; 4263c50cad2SHong Zhang for (j = 0; j < anzi; j++) { 4273c50cad2SHong Zhang brow = aj[j]; 4283c50cad2SHong Zhang bnzj = bi[brow + 1] - bi[brow]; 4293c50cad2SHong Zhang bj = b->j + bi[brow]; 4303c50cad2SHong Zhang /* add non-zero cols of B into the sorted linked list lnk */ 4319566063dSJacob Faibussowitsch PetscCall(PetscLLCondensedAddSorted_fast(bnzj, bj, lnk)); 4323c50cad2SHong Zhang } 4338c78258cSHong Zhang /* add possible missing diagonal entry */ 43448a46eb9SPierre Jolivet if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_fast(1, &i, lnk)); 4353c50cad2SHong Zhang cnzi = lnk[1]; 4363c50cad2SHong Zhang 4373c50cad2SHong Zhang /* If free space is not available, make more free space */ 4383c50cad2SHong Zhang /* Double the amount of total space in the list */ 4393c50cad2SHong Zhang if (current_space->local_remaining < cnzi) { 4409566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), ¤t_space)); 4413c50cad2SHong Zhang ndouble++; 4423c50cad2SHong Zhang } 4433c50cad2SHong Zhang 4443c50cad2SHong Zhang /* Copy data into free space, then initialize lnk */ 4459566063dSJacob Faibussowitsch PetscCall(PetscLLCondensedClean_fast(cnzi, current_space->array, lnk)); 4462205254eSKarl Rupp 4473c50cad2SHong Zhang current_space->array += cnzi; 4483c50cad2SHong Zhang current_space->local_used += cnzi; 4493c50cad2SHong Zhang current_space->local_remaining -= cnzi; 4502205254eSKarl Rupp 4513c50cad2SHong Zhang ci[i + 1] = ci[i] + cnzi; 4523c50cad2SHong Zhang } 4533c50cad2SHong Zhang 4543c50cad2SHong Zhang /* Column indices are in the list of free space */ 4553c50cad2SHong Zhang /* Allocate space for cj, initialize cj, and */ 4563c50cad2SHong Zhang /* destroy list of free space and other temporary array(s) */ 4579566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(ci[am] + 1, &cj)); 4589566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 4599566063dSJacob Faibussowitsch PetscCall(PetscLLCondensedDestroy_fast(lnk)); 46025296bd5SBarry Smith 46125296bd5SBarry Smith /* Allocate space for ca */ 4629566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(ci[am] + 1, &ca)); 46325296bd5SBarry Smith 46425296bd5SBarry Smith /* put together the new symbolic matrix */ 4659566063dSJacob Faibussowitsch PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C)); 4669566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(C, A, B)); 46725296bd5SBarry Smith 46825296bd5SBarry Smith /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 46925296bd5SBarry Smith /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 4704222ddf1SHong Zhang c = (Mat_SeqAIJ *)(C->data); 47125296bd5SBarry Smith c->free_a = PETSC_TRUE; 47225296bd5SBarry Smith c->free_ij = PETSC_TRUE; 47325296bd5SBarry Smith c->nonew = 0; 4742205254eSKarl Rupp 4754222ddf1SHong Zhang /* slower, less memory */ 4764222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; 47725296bd5SBarry Smith 47825296bd5SBarry Smith /* set MatInfo */ 47925296bd5SBarry Smith afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 48025296bd5SBarry Smith if (afill < 1.0) afill = 1.0; 4814222ddf1SHong Zhang C->info.mallocs = ndouble; 4824222ddf1SHong Zhang C->info.fill_ratio_given = fill; 4834222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 48425296bd5SBarry Smith 48525296bd5SBarry Smith #if defined(PETSC_USE_INFO) 48625296bd5SBarry Smith if (ci[am]) { 4879566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 4889566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 48925296bd5SBarry Smith } else { 4909566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Empty matrix product\n")); 49125296bd5SBarry Smith } 49225296bd5SBarry Smith #endif 49325296bd5SBarry Smith PetscFunctionReturn(0); 49425296bd5SBarry Smith } 49525296bd5SBarry Smith 496d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, PetscReal fill, Mat C) 497d71ae5a4SJacob Faibussowitsch { 498e9e4536cSHong Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 499bf9555e6SHong Zhang PetscInt *ai = a->i, *bi = b->i, *ci, *cj; 50025c41797SHong Zhang PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 501e9e4536cSHong Zhang MatScalar *ca; 502e9e4536cSHong Zhang PetscReal afill; 503eca6b86aSHong Zhang PetscInt i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax; 504*eec179cfSJacob Faibussowitsch PetscHMapI ta; 5050298fd71SBarry Smith PetscFreeSpaceList free_space = NULL, current_space = NULL; 506e9e4536cSHong Zhang 507e9e4536cSHong Zhang PetscFunctionBegin; 508bd958071SHong Zhang /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */ 509bd958071SHong Zhang /*---------------------------------------------------------------------------------------------*/ 510bd958071SHong Zhang /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 5119566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(am + 2, &ci)); 512bd958071SHong Zhang ci[0] = 0; 513bd958071SHong Zhang 514bd958071SHong Zhang /* create and initialize a linked list */ 515*eec179cfSJacob Faibussowitsch PetscCall(PetscHMapICreateWithSize(bn, &ta)); 516c373ccc6SHong Zhang MatRowMergeMax_SeqAIJ(b, bm, ta); 517*eec179cfSJacob Faibussowitsch PetscCall(PetscHMapIGetSize(ta, &Crmax)); 518*eec179cfSJacob Faibussowitsch PetscCall(PetscHMapIDestroy(&ta)); 5199566063dSJacob Faibussowitsch PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk)); 520bd958071SHong Zhang 521bd958071SHong Zhang /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 5229566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 523bd958071SHong Zhang current_space = free_space; 524bd958071SHong Zhang 525bd958071SHong Zhang /* Determine ci and cj */ 526bd958071SHong Zhang for (i = 0; i < am; i++) { 527bd958071SHong Zhang anzi = ai[i + 1] - ai[i]; 528bd958071SHong Zhang aj = a->j + ai[i]; 529bd958071SHong Zhang for (j = 0; j < anzi; j++) { 530bd958071SHong Zhang brow = aj[j]; 531bd958071SHong Zhang bnzj = bi[brow + 1] - bi[brow]; 532bd958071SHong Zhang bj = b->j + bi[brow]; 533bd958071SHong Zhang /* add non-zero cols of B into the sorted linked list lnk */ 5349566063dSJacob Faibussowitsch PetscCall(PetscLLCondensedAddSorted_Scalable(bnzj, bj, lnk)); 535bd958071SHong Zhang } 5368c78258cSHong Zhang /* add possible missing diagonal entry */ 53748a46eb9SPierre Jolivet if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_Scalable(1, &i, lnk)); 5388c78258cSHong Zhang 539bd958071SHong Zhang cnzi = lnk[0]; 540bd958071SHong Zhang 541bd958071SHong Zhang /* If free space is not available, make more free space */ 542bd958071SHong Zhang /* Double the amount of total space in the list */ 543bd958071SHong Zhang if (current_space->local_remaining < cnzi) { 5449566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), ¤t_space)); 545bd958071SHong Zhang ndouble++; 546bd958071SHong Zhang } 547bd958071SHong Zhang 548bd958071SHong Zhang /* Copy data into free space, then initialize lnk */ 5499566063dSJacob Faibussowitsch PetscCall(PetscLLCondensedClean_Scalable(cnzi, current_space->array, lnk)); 5502205254eSKarl Rupp 551bd958071SHong Zhang current_space->array += cnzi; 552bd958071SHong Zhang current_space->local_used += cnzi; 553bd958071SHong Zhang current_space->local_remaining -= cnzi; 5542205254eSKarl Rupp 555bd958071SHong Zhang ci[i + 1] = ci[i] + cnzi; 556bd958071SHong Zhang } 557bd958071SHong Zhang 558bd958071SHong Zhang /* Column indices are in the list of free space */ 559bd958071SHong Zhang /* Allocate space for cj, initialize cj, and */ 560bd958071SHong Zhang /* destroy list of free space and other temporary array(s) */ 5619566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(ci[am] + 1, &cj)); 5629566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 5639566063dSJacob Faibussowitsch PetscCall(PetscLLCondensedDestroy_Scalable(lnk)); 564e9e4536cSHong Zhang 565e9e4536cSHong Zhang /* Allocate space for ca */ 566bd958071SHong Zhang /*-----------------------*/ 5679566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(ci[am] + 1, &ca)); 568e9e4536cSHong Zhang 569e9e4536cSHong Zhang /* put together the new symbolic matrix */ 5709566063dSJacob Faibussowitsch PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C)); 5719566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(C, A, B)); 572e9e4536cSHong Zhang 573e9e4536cSHong Zhang /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 574e9e4536cSHong Zhang /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 5754222ddf1SHong Zhang c = (Mat_SeqAIJ *)(C->data); 576e9e4536cSHong Zhang c->free_a = PETSC_TRUE; 577e9e4536cSHong Zhang c->free_ij = PETSC_TRUE; 578e9e4536cSHong Zhang c->nonew = 0; 5792205254eSKarl Rupp 5804222ddf1SHong Zhang /* slower, less memory */ 5814222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; 582e9e4536cSHong Zhang 583e9e4536cSHong Zhang /* set MatInfo */ 584e9e4536cSHong Zhang afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 585e9e4536cSHong Zhang if (afill < 1.0) afill = 1.0; 5864222ddf1SHong Zhang C->info.mallocs = ndouble; 5874222ddf1SHong Zhang C->info.fill_ratio_given = fill; 5884222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 589e9e4536cSHong Zhang 590e9e4536cSHong Zhang #if defined(PETSC_USE_INFO) 591e9e4536cSHong Zhang if (ci[am]) { 5929566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 5939566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 594e9e4536cSHong Zhang } else { 5959566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Empty matrix product\n")); 596e9e4536cSHong Zhang } 597e9e4536cSHong Zhang #endif 598e9e4536cSHong Zhang PetscFunctionReturn(0); 599e9e4536cSHong Zhang } 600e9e4536cSHong Zhang 601d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A, Mat B, PetscReal fill, Mat C) 602d71ae5a4SJacob Faibussowitsch { 6030ced3a2bSJed Brown Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 6040ced3a2bSJed Brown const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j; 6050ced3a2bSJed Brown PetscInt *ci, *cj, *bb; 6060ced3a2bSJed Brown PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 6070ced3a2bSJed Brown PetscReal afill; 6080ced3a2bSJed Brown PetscInt i, j, col, ndouble = 0; 6090298fd71SBarry Smith PetscFreeSpaceList free_space = NULL, current_space = NULL; 6100ced3a2bSJed Brown PetscHeap h; 6110ced3a2bSJed Brown 6120ced3a2bSJed Brown PetscFunctionBegin; 613cd093f1dSJed Brown /* Get ci and cj - by merging sorted rows using a heap */ 6140ced3a2bSJed Brown /*---------------------------------------------------------------------------------------------*/ 6150ced3a2bSJed Brown /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 6169566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(am + 2, &ci)); 6170ced3a2bSJed Brown ci[0] = 0; 6180ced3a2bSJed Brown 6190ced3a2bSJed Brown /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 6209566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 6210ced3a2bSJed Brown current_space = free_space; 6220ced3a2bSJed Brown 6239566063dSJacob Faibussowitsch PetscCall(PetscHeapCreate(a->rmax, &h)); 6249566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(a->rmax, &bb)); 6250ced3a2bSJed Brown 6260ced3a2bSJed Brown /* Determine ci and cj */ 6270ced3a2bSJed Brown for (i = 0; i < am; i++) { 6280ced3a2bSJed Brown const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 6290ced3a2bSJed Brown const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 6300ced3a2bSJed Brown ci[i + 1] = ci[i]; 6310ced3a2bSJed Brown /* Populate the min heap */ 6320ced3a2bSJed Brown for (j = 0; j < anzi; j++) { 6330ced3a2bSJed Brown bb[j] = bi[acol[j]]; /* bb points at the start of the row */ 6340ced3a2bSJed Brown if (bb[j] < bi[acol[j] + 1]) { /* Add if row is nonempty */ 6359566063dSJacob Faibussowitsch PetscCall(PetscHeapAdd(h, j, bj[bb[j]++])); 6360ced3a2bSJed Brown } 6370ced3a2bSJed Brown } 6380ced3a2bSJed Brown /* Pick off the min element, adding it to free space */ 6399566063dSJacob Faibussowitsch PetscCall(PetscHeapPop(h, &j, &col)); 6400ced3a2bSJed Brown while (j >= 0) { 6410ced3a2bSJed Brown if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */ 6429566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), ¤t_space)); 6430ced3a2bSJed Brown ndouble++; 6440ced3a2bSJed Brown } 6450ced3a2bSJed Brown *(current_space->array++) = col; 6460ced3a2bSJed Brown current_space->local_used++; 6470ced3a2bSJed Brown current_space->local_remaining--; 6480ced3a2bSJed Brown ci[i + 1]++; 6490ced3a2bSJed Brown 6500ced3a2bSJed Brown /* stash if anything else remains in this row of B */ 6519566063dSJacob Faibussowitsch if (bb[j] < bi[acol[j] + 1]) PetscCall(PetscHeapStash(h, j, bj[bb[j]++])); 6520ced3a2bSJed Brown while (1) { /* pop and stash any other rows of B that also had an entry in this column */ 6530ced3a2bSJed Brown PetscInt j2, col2; 6549566063dSJacob Faibussowitsch PetscCall(PetscHeapPeek(h, &j2, &col2)); 6550ced3a2bSJed Brown if (col2 != col) break; 6569566063dSJacob Faibussowitsch PetscCall(PetscHeapPop(h, &j2, &col2)); 6579566063dSJacob Faibussowitsch if (bb[j2] < bi[acol[j2] + 1]) PetscCall(PetscHeapStash(h, j2, bj[bb[j2]++])); 6580ced3a2bSJed Brown } 6590ced3a2bSJed Brown /* Put any stashed elements back into the min heap */ 6609566063dSJacob Faibussowitsch PetscCall(PetscHeapUnstash(h)); 6619566063dSJacob Faibussowitsch PetscCall(PetscHeapPop(h, &j, &col)); 6620ced3a2bSJed Brown } 6630ced3a2bSJed Brown } 6649566063dSJacob Faibussowitsch PetscCall(PetscFree(bb)); 6659566063dSJacob Faibussowitsch PetscCall(PetscHeapDestroy(&h)); 6660ced3a2bSJed Brown 6670ced3a2bSJed Brown /* Column indices are in the list of free space */ 6680ced3a2bSJed Brown /* Allocate space for cj, initialize cj, and */ 6690ced3a2bSJed Brown /* destroy list of free space and other temporary array(s) */ 6709566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(ci[am], &cj)); 6719566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 6720ced3a2bSJed Brown 6730ced3a2bSJed Brown /* put together the new symbolic matrix */ 6749566063dSJacob Faibussowitsch PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 6759566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(C, A, B)); 6760ced3a2bSJed Brown 6770ced3a2bSJed Brown /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 6780ced3a2bSJed Brown /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 6794222ddf1SHong Zhang c = (Mat_SeqAIJ *)(C->data); 6800ced3a2bSJed Brown c->free_a = PETSC_TRUE; 6810ced3a2bSJed Brown c->free_ij = PETSC_TRUE; 6820ced3a2bSJed Brown c->nonew = 0; 68326fbe8dcSKarl Rupp 6844222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 6850ced3a2bSJed Brown 6860ced3a2bSJed Brown /* set MatInfo */ 6870ced3a2bSJed Brown afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 6880ced3a2bSJed Brown if (afill < 1.0) afill = 1.0; 6894222ddf1SHong Zhang C->info.mallocs = ndouble; 6904222ddf1SHong Zhang C->info.fill_ratio_given = fill; 6914222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 6920ced3a2bSJed Brown 6930ced3a2bSJed Brown #if defined(PETSC_USE_INFO) 6940ced3a2bSJed Brown if (ci[am]) { 6959566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 6969566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 6970ced3a2bSJed Brown } else { 6989566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Empty matrix product\n")); 6990ced3a2bSJed Brown } 7000ced3a2bSJed Brown #endif 7010ced3a2bSJed Brown PetscFunctionReturn(0); 7020ced3a2bSJed Brown } 703e9e4536cSHong Zhang 704d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A, Mat B, PetscReal fill, Mat C) 705d71ae5a4SJacob Faibussowitsch { 7068a07c6f1SJed Brown Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 7078a07c6f1SJed Brown const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j; 7088a07c6f1SJed Brown PetscInt *ci, *cj, *bb; 7098a07c6f1SJed Brown PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 7108a07c6f1SJed Brown PetscReal afill; 7118a07c6f1SJed Brown PetscInt i, j, col, ndouble = 0; 7120298fd71SBarry Smith PetscFreeSpaceList free_space = NULL, current_space = NULL; 7138a07c6f1SJed Brown PetscHeap h; 7148a07c6f1SJed Brown PetscBT bt; 7158a07c6f1SJed Brown 7168a07c6f1SJed Brown PetscFunctionBegin; 717cd093f1dSJed Brown /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */ 7188a07c6f1SJed Brown /*---------------------------------------------------------------------------------------------*/ 7198a07c6f1SJed Brown /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 7209566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(am + 2, &ci)); 7218a07c6f1SJed Brown ci[0] = 0; 7228a07c6f1SJed Brown 7238a07c6f1SJed Brown /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 7249566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 7252205254eSKarl Rupp 7268a07c6f1SJed Brown current_space = free_space; 7278a07c6f1SJed Brown 7289566063dSJacob Faibussowitsch PetscCall(PetscHeapCreate(a->rmax, &h)); 7299566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(a->rmax, &bb)); 7309566063dSJacob Faibussowitsch PetscCall(PetscBTCreate(bn, &bt)); 7318a07c6f1SJed Brown 7328a07c6f1SJed Brown /* Determine ci and cj */ 7338a07c6f1SJed Brown for (i = 0; i < am; i++) { 7348a07c6f1SJed Brown const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 7358a07c6f1SJed Brown const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 7368a07c6f1SJed Brown const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */ 7378a07c6f1SJed Brown ci[i + 1] = ci[i]; 7388a07c6f1SJed Brown /* Populate the min heap */ 7398a07c6f1SJed Brown for (j = 0; j < anzi; j++) { 7408a07c6f1SJed Brown PetscInt brow = acol[j]; 7418a07c6f1SJed Brown for (bb[j] = bi[brow]; bb[j] < bi[brow + 1]; bb[j]++) { 7428a07c6f1SJed Brown PetscInt bcol = bj[bb[j]]; 7438a07c6f1SJed Brown if (!PetscBTLookupSet(bt, bcol)) { /* new entry */ 7449566063dSJacob Faibussowitsch PetscCall(PetscHeapAdd(h, j, bcol)); 7458a07c6f1SJed Brown bb[j]++; 7468a07c6f1SJed Brown break; 7478a07c6f1SJed Brown } 7488a07c6f1SJed Brown } 7498a07c6f1SJed Brown } 7508a07c6f1SJed Brown /* Pick off the min element, adding it to free space */ 7519566063dSJacob Faibussowitsch PetscCall(PetscHeapPop(h, &j, &col)); 7528a07c6f1SJed Brown while (j >= 0) { 7538a07c6f1SJed Brown if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */ 7540298fd71SBarry Smith fptr = NULL; /* need PetscBTMemzero */ 7559566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), ¤t_space)); 7568a07c6f1SJed Brown ndouble++; 7578a07c6f1SJed Brown } 7588a07c6f1SJed Brown *(current_space->array++) = col; 7598a07c6f1SJed Brown current_space->local_used++; 7608a07c6f1SJed Brown current_space->local_remaining--; 7618a07c6f1SJed Brown ci[i + 1]++; 7628a07c6f1SJed Brown 7638a07c6f1SJed Brown /* stash if anything else remains in this row of B */ 7648a07c6f1SJed Brown for (; bb[j] < bi[acol[j] + 1]; bb[j]++) { 7658a07c6f1SJed Brown PetscInt bcol = bj[bb[j]]; 7668a07c6f1SJed Brown if (!PetscBTLookupSet(bt, bcol)) { /* new entry */ 7679566063dSJacob Faibussowitsch PetscCall(PetscHeapAdd(h, j, bcol)); 7688a07c6f1SJed Brown bb[j]++; 7698a07c6f1SJed Brown break; 7708a07c6f1SJed Brown } 7718a07c6f1SJed Brown } 7729566063dSJacob Faibussowitsch PetscCall(PetscHeapPop(h, &j, &col)); 7738a07c6f1SJed Brown } 7748a07c6f1SJed Brown if (fptr) { /* Clear the bits for this row */ 7759566063dSJacob Faibussowitsch for (; fptr < current_space->array; fptr++) PetscCall(PetscBTClear(bt, *fptr)); 7768a07c6f1SJed Brown } else { /* We reallocated so we don't remember (easily) how to clear only the bits we changed */ 7779566063dSJacob Faibussowitsch PetscCall(PetscBTMemzero(bn, bt)); 7788a07c6f1SJed Brown } 7798a07c6f1SJed Brown } 7809566063dSJacob Faibussowitsch PetscCall(PetscFree(bb)); 7819566063dSJacob Faibussowitsch PetscCall(PetscHeapDestroy(&h)); 7829566063dSJacob Faibussowitsch PetscCall(PetscBTDestroy(&bt)); 7838a07c6f1SJed Brown 7848a07c6f1SJed Brown /* Column indices are in the list of free space */ 7858a07c6f1SJed Brown /* Allocate space for cj, initialize cj, and */ 7868a07c6f1SJed Brown /* destroy list of free space and other temporary array(s) */ 7879566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(ci[am], &cj)); 7889566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 7898a07c6f1SJed Brown 7908a07c6f1SJed Brown /* put together the new symbolic matrix */ 7919566063dSJacob Faibussowitsch PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 7929566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(C, A, B)); 7938a07c6f1SJed Brown 7948a07c6f1SJed Brown /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 7958a07c6f1SJed Brown /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 7964222ddf1SHong Zhang c = (Mat_SeqAIJ *)(C->data); 7978a07c6f1SJed Brown c->free_a = PETSC_TRUE; 7988a07c6f1SJed Brown c->free_ij = PETSC_TRUE; 7998a07c6f1SJed Brown c->nonew = 0; 80026fbe8dcSKarl Rupp 8014222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 8028a07c6f1SJed Brown 8038a07c6f1SJed Brown /* set MatInfo */ 8048a07c6f1SJed Brown afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 8058a07c6f1SJed Brown if (afill < 1.0) afill = 1.0; 8064222ddf1SHong Zhang C->info.mallocs = ndouble; 8074222ddf1SHong Zhang C->info.fill_ratio_given = fill; 8084222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 8098a07c6f1SJed Brown 8108a07c6f1SJed Brown #if defined(PETSC_USE_INFO) 8118a07c6f1SJed Brown if (ci[am]) { 8129566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 8139566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 8148a07c6f1SJed Brown } else { 8159566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Empty matrix product\n")); 8168a07c6f1SJed Brown } 8178a07c6f1SJed Brown #endif 8188a07c6f1SJed Brown PetscFunctionReturn(0); 8198a07c6f1SJed Brown } 8208a07c6f1SJed Brown 821d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A, Mat B, PetscReal fill, Mat C) 822d71ae5a4SJacob Faibussowitsch { 823d7ed1a4dSandi selinger Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 824d7ed1a4dSandi selinger const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j, *inputi, *inputj, *inputcol, *inputcol_L1; 825d7ed1a4dSandi selinger PetscInt *ci, *cj, *outputj, worki_L1[9], worki_L2[9]; 826d7ed1a4dSandi selinger PetscInt c_maxmem, a_maxrownnz = 0, a_rownnz; 827d7ed1a4dSandi selinger const PetscInt workcol[8] = {0, 1, 2, 3, 4, 5, 6, 7}; 828d7ed1a4dSandi selinger const PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 829d7ed1a4dSandi selinger const PetscInt *brow_ptr[8], *brow_end[8]; 830d7ed1a4dSandi selinger PetscInt window[8]; 831d7ed1a4dSandi selinger PetscInt window_min, old_window_min, ci_nnz, outputi_nnz = 0, L1_nrows, L2_nrows; 832d7ed1a4dSandi selinger PetscInt i, k, ndouble = 0, L1_rowsleft, rowsleft; 833d7ed1a4dSandi selinger PetscReal afill; 834f83700f2Sandi selinger PetscInt *workj_L1, *workj_L2, *workj_L3; 8357660c4dbSandi selinger PetscInt L1_nnz, L2_nnz; 836d7ed1a4dSandi selinger 837d7ed1a4dSandi selinger /* Step 1: Get upper bound on memory required for allocation. 838d7ed1a4dSandi selinger Because of the way virtual memory works, 839d7ed1a4dSandi selinger only the memory pages that are actually needed will be physically allocated. */ 840d7ed1a4dSandi selinger PetscFunctionBegin; 8419566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(am + 1, &ci)); 842d7ed1a4dSandi selinger for (i = 0; i < am; i++) { 843d7ed1a4dSandi selinger const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 844d7ed1a4dSandi selinger const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 845d7ed1a4dSandi selinger a_rownnz = 0; 846d7ed1a4dSandi selinger for (k = 0; k < anzi; ++k) { 847d7ed1a4dSandi selinger a_rownnz += bi[acol[k] + 1] - bi[acol[k]]; 848d7ed1a4dSandi selinger if (a_rownnz > bn) { 849d7ed1a4dSandi selinger a_rownnz = bn; 850d7ed1a4dSandi selinger break; 851d7ed1a4dSandi selinger } 852d7ed1a4dSandi selinger } 853d7ed1a4dSandi selinger a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz); 854d7ed1a4dSandi selinger } 855d7ed1a4dSandi selinger /* temporary work areas for merging rows */ 8569566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L1)); 8579566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L2)); 8589566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(a_maxrownnz, &workj_L3)); 859d7ed1a4dSandi selinger 8607660c4dbSandi selinger /* This should be enough for almost all matrices. If not, memory is reallocated later. */ 8617660c4dbSandi selinger c_maxmem = 8 * (ai[am] + bi[bm]); 862d7ed1a4dSandi selinger /* Step 2: Populate pattern for C */ 8639566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(c_maxmem, &cj)); 864d7ed1a4dSandi selinger 865d7ed1a4dSandi selinger ci_nnz = 0; 866d7ed1a4dSandi selinger ci[0] = 0; 867d7ed1a4dSandi selinger worki_L1[0] = 0; 868d7ed1a4dSandi selinger worki_L2[0] = 0; 869d7ed1a4dSandi selinger for (i = 0; i < am; i++) { 870d7ed1a4dSandi selinger const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 871d7ed1a4dSandi selinger const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 872d7ed1a4dSandi selinger rowsleft = anzi; 873d7ed1a4dSandi selinger inputcol_L1 = acol; 874d7ed1a4dSandi selinger L2_nnz = 0; 8757660c4dbSandi selinger L2_nrows = 1; /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1 */ 8767660c4dbSandi selinger worki_L2[1] = 0; 87708fe1b3cSKarl Rupp outputi_nnz = 0; 878d7ed1a4dSandi selinger 879d7ed1a4dSandi selinger /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem -> allocate more memory */ 880d7ed1a4dSandi selinger while (ci_nnz + a_maxrownnz > c_maxmem) { 881d7ed1a4dSandi selinger c_maxmem *= 2; 882d7ed1a4dSandi selinger ndouble++; 8839566063dSJacob Faibussowitsch PetscCall(PetscRealloc(sizeof(PetscInt) * c_maxmem, &cj)); 884d7ed1a4dSandi selinger } 885d7ed1a4dSandi selinger 886d7ed1a4dSandi selinger while (rowsleft) { 887d7ed1a4dSandi selinger L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */ 888d7ed1a4dSandi selinger L1_nrows = 0; 8897660c4dbSandi selinger L1_nnz = 0; 890d7ed1a4dSandi selinger inputcol = inputcol_L1; 891d7ed1a4dSandi selinger inputi = bi; 892d7ed1a4dSandi selinger inputj = bj; 893d7ed1a4dSandi selinger 894d7ed1a4dSandi selinger /* The following macro is used to specialize for small rows in A. 895d7ed1a4dSandi selinger This helps with compiler unrolling, improving performance substantially. 896f83700f2Sandi selinger Input: inputj inputi inputcol bn 897d7ed1a4dSandi selinger Output: outputj outputi_nnz */ 898d7ed1a4dSandi selinger #define MatMatMultSymbolic_RowMergeMacro(ANNZ) \ 899d7ed1a4dSandi selinger window_min = bn; \ 9007660c4dbSandi selinger outputi_nnz = 0; \ 9017660c4dbSandi selinger for (k = 0; k < ANNZ; ++k) { \ 902d7ed1a4dSandi selinger brow_ptr[k] = inputj + inputi[inputcol[k]]; \ 903d7ed1a4dSandi selinger brow_end[k] = inputj + inputi[inputcol[k] + 1]; \ 904d7ed1a4dSandi selinger window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \ 905d7ed1a4dSandi selinger window_min = PetscMin(window[k], window_min); \ 906d7ed1a4dSandi selinger } \ 907d7ed1a4dSandi selinger while (window_min < bn) { \ 908d7ed1a4dSandi selinger outputj[outputi_nnz++] = window_min; \ 909d7ed1a4dSandi selinger /* advance front and compute new minimum */ \ 910d7ed1a4dSandi selinger old_window_min = window_min; \ 911d7ed1a4dSandi selinger window_min = bn; \ 912d7ed1a4dSandi selinger for (k = 0; k < ANNZ; ++k) { \ 913d7ed1a4dSandi selinger if (window[k] == old_window_min) { \ 914d7ed1a4dSandi selinger brow_ptr[k]++; \ 915d7ed1a4dSandi selinger window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \ 916d7ed1a4dSandi selinger } \ 917d7ed1a4dSandi selinger window_min = PetscMin(window[k], window_min); \ 918d7ed1a4dSandi selinger } \ 919d7ed1a4dSandi selinger } 920d7ed1a4dSandi selinger 921d7ed1a4dSandi selinger /************** L E V E L 1 ***************/ 922d7ed1a4dSandi selinger /* Merge up to 8 rows of B to L1 work array*/ 923d7ed1a4dSandi selinger while (L1_rowsleft) { 9247660c4dbSandi selinger outputi_nnz = 0; 9257660c4dbSandi selinger if (anzi > 8) outputj = workj_L1 + L1_nnz; /* Level 1 rowmerge*/ 9267660c4dbSandi selinger else outputj = cj + ci_nnz; /* Merge directly to C */ 9277660c4dbSandi selinger 928d7ed1a4dSandi selinger switch (L1_rowsleft) { 9299371c9d4SSatish Balay case 1: 9309371c9d4SSatish Balay brow_ptr[0] = inputj + inputi[inputcol[0]]; 931d7ed1a4dSandi selinger brow_end[0] = inputj + inputi[inputcol[0] + 1]; 932d7ed1a4dSandi selinger for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 933d7ed1a4dSandi selinger inputcol += L1_rowsleft; 934d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 935d7ed1a4dSandi selinger L1_rowsleft = 0; 936d7ed1a4dSandi selinger break; 9379371c9d4SSatish Balay case 2: 9389371c9d4SSatish Balay MatMatMultSymbolic_RowMergeMacro(2); 939d7ed1a4dSandi selinger inputcol += L1_rowsleft; 940d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 941d7ed1a4dSandi selinger L1_rowsleft = 0; 942d7ed1a4dSandi selinger break; 9439371c9d4SSatish Balay case 3: 9449371c9d4SSatish Balay MatMatMultSymbolic_RowMergeMacro(3); 945d7ed1a4dSandi selinger inputcol += L1_rowsleft; 946d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 947d7ed1a4dSandi selinger L1_rowsleft = 0; 948d7ed1a4dSandi selinger break; 9499371c9d4SSatish Balay case 4: 9509371c9d4SSatish Balay MatMatMultSymbolic_RowMergeMacro(4); 951d7ed1a4dSandi selinger inputcol += L1_rowsleft; 952d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 953d7ed1a4dSandi selinger L1_rowsleft = 0; 954d7ed1a4dSandi selinger break; 9559371c9d4SSatish Balay case 5: 9569371c9d4SSatish Balay MatMatMultSymbolic_RowMergeMacro(5); 957d7ed1a4dSandi selinger inputcol += L1_rowsleft; 958d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 959d7ed1a4dSandi selinger L1_rowsleft = 0; 960d7ed1a4dSandi selinger break; 9619371c9d4SSatish Balay case 6: 9629371c9d4SSatish Balay MatMatMultSymbolic_RowMergeMacro(6); 963d7ed1a4dSandi selinger inputcol += L1_rowsleft; 964d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 965d7ed1a4dSandi selinger L1_rowsleft = 0; 966d7ed1a4dSandi selinger break; 9679371c9d4SSatish Balay case 7: 9689371c9d4SSatish Balay MatMatMultSymbolic_RowMergeMacro(7); 969d7ed1a4dSandi selinger inputcol += L1_rowsleft; 970d7ed1a4dSandi selinger rowsleft -= L1_rowsleft; 971d7ed1a4dSandi selinger L1_rowsleft = 0; 972d7ed1a4dSandi selinger break; 9739371c9d4SSatish Balay default: 9749371c9d4SSatish Balay MatMatMultSymbolic_RowMergeMacro(8); 975d7ed1a4dSandi selinger inputcol += 8; 976d7ed1a4dSandi selinger rowsleft -= 8; 977d7ed1a4dSandi selinger L1_rowsleft -= 8; 978d7ed1a4dSandi selinger break; 979d7ed1a4dSandi selinger } 980d7ed1a4dSandi selinger inputcol_L1 = inputcol; 9817660c4dbSandi selinger L1_nnz += outputi_nnz; 9827660c4dbSandi selinger worki_L1[++L1_nrows] = L1_nnz; 983d7ed1a4dSandi selinger } 984d7ed1a4dSandi selinger 985d7ed1a4dSandi selinger /********************** L E V E L 2 ************************/ 986d7ed1a4dSandi selinger /* Merge from L1 work array to either C or to L2 work array */ 987d7ed1a4dSandi selinger if (anzi > 8) { 988d7ed1a4dSandi selinger inputi = worki_L1; 989d7ed1a4dSandi selinger inputj = workj_L1; 990d7ed1a4dSandi selinger inputcol = workcol; 991d7ed1a4dSandi selinger outputi_nnz = 0; 992d7ed1a4dSandi selinger 993d7ed1a4dSandi selinger if (anzi <= 64) outputj = cj + ci_nnz; /* Merge from L1 work array to C */ 994d7ed1a4dSandi selinger else outputj = workj_L2 + L2_nnz; /* Merge from L1 work array to L2 work array */ 995d7ed1a4dSandi selinger 996d7ed1a4dSandi selinger switch (L1_nrows) { 9979371c9d4SSatish Balay case 1: 9989371c9d4SSatish Balay brow_ptr[0] = inputj + inputi[inputcol[0]]; 999d7ed1a4dSandi selinger brow_end[0] = inputj + inputi[inputcol[0] + 1]; 1000d7ed1a4dSandi selinger for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 1001d7ed1a4dSandi selinger break; 1002d71ae5a4SJacob Faibussowitsch case 2: 1003d71ae5a4SJacob Faibussowitsch MatMatMultSymbolic_RowMergeMacro(2); 1004d71ae5a4SJacob Faibussowitsch break; 1005d71ae5a4SJacob Faibussowitsch case 3: 1006d71ae5a4SJacob Faibussowitsch MatMatMultSymbolic_RowMergeMacro(3); 1007d71ae5a4SJacob Faibussowitsch break; 1008d71ae5a4SJacob Faibussowitsch case 4: 1009d71ae5a4SJacob Faibussowitsch MatMatMultSymbolic_RowMergeMacro(4); 1010d71ae5a4SJacob Faibussowitsch break; 1011d71ae5a4SJacob Faibussowitsch case 5: 1012d71ae5a4SJacob Faibussowitsch MatMatMultSymbolic_RowMergeMacro(5); 1013d71ae5a4SJacob Faibussowitsch break; 1014d71ae5a4SJacob Faibussowitsch case 6: 1015d71ae5a4SJacob Faibussowitsch MatMatMultSymbolic_RowMergeMacro(6); 1016d71ae5a4SJacob Faibussowitsch break; 1017d71ae5a4SJacob Faibussowitsch case 7: 1018d71ae5a4SJacob Faibussowitsch MatMatMultSymbolic_RowMergeMacro(7); 1019d71ae5a4SJacob Faibussowitsch break; 1020d71ae5a4SJacob Faibussowitsch case 8: 1021d71ae5a4SJacob Faibussowitsch MatMatMultSymbolic_RowMergeMacro(8); 1022d71ae5a4SJacob Faibussowitsch break; 1023d71ae5a4SJacob Faibussowitsch default: 1024d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L1 work array!"); 1025d7ed1a4dSandi selinger } 1026d7ed1a4dSandi selinger L2_nnz += outputi_nnz; 1027d7ed1a4dSandi selinger worki_L2[++L2_nrows] = L2_nnz; 1028d7ed1a4dSandi selinger 10297660c4dbSandi selinger /************************ L E V E L 3 **********************/ 10307660c4dbSandi selinger /* Merge from L2 work array to either C or to L2 work array */ 1031d7ed1a4dSandi selinger if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) { 1032d7ed1a4dSandi selinger inputi = worki_L2; 1033d7ed1a4dSandi selinger inputj = workj_L2; 1034d7ed1a4dSandi selinger inputcol = workcol; 1035d7ed1a4dSandi selinger outputi_nnz = 0; 1036f83700f2Sandi selinger if (rowsleft) outputj = workj_L3; 1037d7ed1a4dSandi selinger else outputj = cj + ci_nnz; 1038d7ed1a4dSandi selinger switch (L2_nrows) { 10399371c9d4SSatish Balay case 1: 10409371c9d4SSatish Balay brow_ptr[0] = inputj + inputi[inputcol[0]]; 1041d7ed1a4dSandi selinger brow_end[0] = inputj + inputi[inputcol[0] + 1]; 1042d7ed1a4dSandi selinger for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 1043d7ed1a4dSandi selinger break; 1044d71ae5a4SJacob Faibussowitsch case 2: 1045d71ae5a4SJacob Faibussowitsch MatMatMultSymbolic_RowMergeMacro(2); 1046d71ae5a4SJacob Faibussowitsch break; 1047d71ae5a4SJacob Faibussowitsch case 3: 1048d71ae5a4SJacob Faibussowitsch MatMatMultSymbolic_RowMergeMacro(3); 1049d71ae5a4SJacob Faibussowitsch break; 1050d71ae5a4SJacob Faibussowitsch case 4: 1051d71ae5a4SJacob Faibussowitsch MatMatMultSymbolic_RowMergeMacro(4); 1052d71ae5a4SJacob Faibussowitsch break; 1053d71ae5a4SJacob Faibussowitsch case 5: 1054d71ae5a4SJacob Faibussowitsch MatMatMultSymbolic_RowMergeMacro(5); 1055d71ae5a4SJacob Faibussowitsch break; 1056d71ae5a4SJacob Faibussowitsch case 6: 1057d71ae5a4SJacob Faibussowitsch MatMatMultSymbolic_RowMergeMacro(6); 1058d71ae5a4SJacob Faibussowitsch break; 1059d71ae5a4SJacob Faibussowitsch case 7: 1060d71ae5a4SJacob Faibussowitsch MatMatMultSymbolic_RowMergeMacro(7); 1061d71ae5a4SJacob Faibussowitsch break; 1062d71ae5a4SJacob Faibussowitsch case 8: 1063d71ae5a4SJacob Faibussowitsch MatMatMultSymbolic_RowMergeMacro(8); 1064d71ae5a4SJacob Faibussowitsch break; 1065d71ae5a4SJacob Faibussowitsch default: 1066d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L2 work array!"); 1067d7ed1a4dSandi selinger } 1068d7ed1a4dSandi selinger L2_nrows = 1; 10697660c4dbSandi selinger L2_nnz = outputi_nnz; 10707660c4dbSandi selinger worki_L2[1] = outputi_nnz; 10717660c4dbSandi selinger /* Copy to workj_L2 */ 1072d7ed1a4dSandi selinger if (rowsleft) { 10737660c4dbSandi selinger for (k = 0; k < outputi_nnz; ++k) workj_L2[k] = outputj[k]; 1074d7ed1a4dSandi selinger } 1075d7ed1a4dSandi selinger } 1076d7ed1a4dSandi selinger } 1077d7ed1a4dSandi selinger } /* while (rowsleft) */ 1078d7ed1a4dSandi selinger #undef MatMatMultSymbolic_RowMergeMacro 1079d7ed1a4dSandi selinger 1080d7ed1a4dSandi selinger /* terminate current row */ 1081d7ed1a4dSandi selinger ci_nnz += outputi_nnz; 1082d7ed1a4dSandi selinger ci[i + 1] = ci_nnz; 1083d7ed1a4dSandi selinger } 1084d7ed1a4dSandi selinger 1085d7ed1a4dSandi selinger /* Step 3: Create the new symbolic matrix */ 10869566063dSJacob Faibussowitsch PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 10879566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(C, A, B)); 1088d7ed1a4dSandi selinger 1089d7ed1a4dSandi selinger /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 1090d7ed1a4dSandi selinger /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 10914222ddf1SHong Zhang c = (Mat_SeqAIJ *)(C->data); 1092d7ed1a4dSandi selinger c->free_a = PETSC_TRUE; 1093d7ed1a4dSandi selinger c->free_ij = PETSC_TRUE; 1094d7ed1a4dSandi selinger c->nonew = 0; 1095d7ed1a4dSandi selinger 10964222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 1097d7ed1a4dSandi selinger 1098d7ed1a4dSandi selinger /* set MatInfo */ 1099d7ed1a4dSandi selinger afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 1100d7ed1a4dSandi selinger if (afill < 1.0) afill = 1.0; 11014222ddf1SHong Zhang C->info.mallocs = ndouble; 11024222ddf1SHong Zhang C->info.fill_ratio_given = fill; 11034222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 1104d7ed1a4dSandi selinger 1105d7ed1a4dSandi selinger #if defined(PETSC_USE_INFO) 1106d7ed1a4dSandi selinger if (ci[am]) { 11079566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 11089566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 1109d7ed1a4dSandi selinger } else { 11109566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Empty matrix product\n")); 1111d7ed1a4dSandi selinger } 1112d7ed1a4dSandi selinger #endif 1113d7ed1a4dSandi selinger 1114d7ed1a4dSandi selinger /* Step 4: Free temporary work areas */ 11159566063dSJacob Faibussowitsch PetscCall(PetscFree(workj_L1)); 11169566063dSJacob Faibussowitsch PetscCall(PetscFree(workj_L2)); 11179566063dSJacob Faibussowitsch PetscCall(PetscFree(workj_L3)); 1118d7ed1a4dSandi selinger PetscFunctionReturn(0); 1119d7ed1a4dSandi selinger } 1120d7ed1a4dSandi selinger 1121cd093f1dSJed Brown /* concatenate unique entries and then sort */ 1122d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, PetscReal fill, Mat C) 1123d71ae5a4SJacob Faibussowitsch { 1124cd093f1dSJed Brown Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 1125cd093f1dSJed Brown const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j; 11268c78258cSHong Zhang PetscInt *ci, *cj, bcol; 1127cd093f1dSJed Brown PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 1128cd093f1dSJed Brown PetscReal afill; 1129cd093f1dSJed Brown PetscInt i, j, ndouble = 0; 1130cd093f1dSJed Brown PetscSegBuffer seg, segrow; 1131cd093f1dSJed Brown char *seen; 1132cd093f1dSJed Brown 1133cd093f1dSJed Brown PetscFunctionBegin; 11349566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(am + 1, &ci)); 1135cd093f1dSJed Brown ci[0] = 0; 1136cd093f1dSJed Brown 1137cd093f1dSJed Brown /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 11389566063dSJacob Faibussowitsch PetscCall(PetscSegBufferCreate(sizeof(PetscInt), (PetscInt)(fill * (ai[am] + bi[bm])), &seg)); 11399566063dSJacob Faibussowitsch PetscCall(PetscSegBufferCreate(sizeof(PetscInt), 100, &segrow)); 11409566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(bn, &seen)); 1141cd093f1dSJed Brown 1142cd093f1dSJed Brown /* Determine ci and cj */ 1143cd093f1dSJed Brown for (i = 0; i < am; i++) { 1144cd093f1dSJed Brown const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 1145cd093f1dSJed Brown const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 1146cd093f1dSJed Brown PetscInt packlen = 0, *PETSC_RESTRICT crow; 11478c78258cSHong Zhang 1148cd093f1dSJed Brown /* Pack segrow */ 1149cd093f1dSJed Brown for (j = 0; j < anzi; j++) { 1150cd093f1dSJed Brown PetscInt brow = acol[j], bjstart = bi[brow], bjend = bi[brow + 1], k; 1151cd093f1dSJed Brown for (k = bjstart; k < bjend; k++) { 11528c78258cSHong Zhang bcol = bj[k]; 1153cd093f1dSJed Brown if (!seen[bcol]) { /* new entry */ 1154cd093f1dSJed Brown PetscInt *PETSC_RESTRICT slot; 11559566063dSJacob Faibussowitsch PetscCall(PetscSegBufferGetInts(segrow, 1, &slot)); 1156cd093f1dSJed Brown *slot = bcol; 1157cd093f1dSJed Brown seen[bcol] = 1; 1158cd093f1dSJed Brown packlen++; 1159cd093f1dSJed Brown } 1160cd093f1dSJed Brown } 1161cd093f1dSJed Brown } 11628c78258cSHong Zhang 11638c78258cSHong Zhang /* Check i-th diagonal entry */ 11648c78258cSHong Zhang if (C->force_diagonals && !seen[i]) { 11658c78258cSHong Zhang PetscInt *PETSC_RESTRICT slot; 11669566063dSJacob Faibussowitsch PetscCall(PetscSegBufferGetInts(segrow, 1, &slot)); 11678c78258cSHong Zhang *slot = i; 11688c78258cSHong Zhang seen[i] = 1; 11698c78258cSHong Zhang packlen++; 11708c78258cSHong Zhang } 11718c78258cSHong Zhang 11729566063dSJacob Faibussowitsch PetscCall(PetscSegBufferGetInts(seg, packlen, &crow)); 11739566063dSJacob Faibussowitsch PetscCall(PetscSegBufferExtractTo(segrow, crow)); 11749566063dSJacob Faibussowitsch PetscCall(PetscSortInt(packlen, crow)); 1175cd093f1dSJed Brown ci[i + 1] = ci[i] + packlen; 1176cd093f1dSJed Brown for (j = 0; j < packlen; j++) seen[crow[j]] = 0; 1177cd093f1dSJed Brown } 11789566063dSJacob Faibussowitsch PetscCall(PetscSegBufferDestroy(&segrow)); 11799566063dSJacob Faibussowitsch PetscCall(PetscFree(seen)); 1180cd093f1dSJed Brown 1181cd093f1dSJed Brown /* Column indices are in the segmented buffer */ 11829566063dSJacob Faibussowitsch PetscCall(PetscSegBufferExtractAlloc(seg, &cj)); 11839566063dSJacob Faibussowitsch PetscCall(PetscSegBufferDestroy(&seg)); 1184cd093f1dSJed Brown 1185cd093f1dSJed Brown /* put together the new symbolic matrix */ 11869566063dSJacob Faibussowitsch PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 11879566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(C, A, B)); 1188cd093f1dSJed Brown 1189cd093f1dSJed Brown /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 1190cd093f1dSJed Brown /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 11914222ddf1SHong Zhang c = (Mat_SeqAIJ *)(C->data); 1192cd093f1dSJed Brown c->free_a = PETSC_TRUE; 1193cd093f1dSJed Brown c->free_ij = PETSC_TRUE; 1194cd093f1dSJed Brown c->nonew = 0; 1195cd093f1dSJed Brown 11964222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 1197cd093f1dSJed Brown 1198cd093f1dSJed Brown /* set MatInfo */ 11992a09556fSStefano Zampini afill = (PetscReal)ci[am] / PetscMax(ai[am] + bi[bm], 1) + 1.e-5; 1200cd093f1dSJed Brown if (afill < 1.0) afill = 1.0; 12014222ddf1SHong Zhang C->info.mallocs = ndouble; 12024222ddf1SHong Zhang C->info.fill_ratio_given = fill; 12034222ddf1SHong Zhang C->info.fill_ratio_needed = afill; 1204cd093f1dSJed Brown 1205cd093f1dSJed Brown #if defined(PETSC_USE_INFO) 1206cd093f1dSJed Brown if (ci[am]) { 12079566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 12089566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 1209cd093f1dSJed Brown } else { 12109566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Empty matrix product\n")); 1211cd093f1dSJed Brown } 1212cd093f1dSJed Brown #endif 1213cd093f1dSJed Brown PetscFunctionReturn(0); 1214cd093f1dSJed Brown } 1215cd093f1dSJed Brown 1216d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(void *data) 1217d71ae5a4SJacob Faibussowitsch { 12186718818eSStefano Zampini Mat_MatMatTransMult *abt = (Mat_MatMatTransMult *)data; 12192128a86cSHong Zhang 12202128a86cSHong Zhang PetscFunctionBegin; 12219566063dSJacob Faibussowitsch PetscCall(MatTransposeColoringDestroy(&abt->matcoloring)); 12229566063dSJacob Faibussowitsch PetscCall(MatDestroy(&abt->Bt_den)); 12239566063dSJacob Faibussowitsch PetscCall(MatDestroy(&abt->ABt_den)); 12249566063dSJacob Faibussowitsch PetscCall(PetscFree(abt)); 12252128a86cSHong Zhang PetscFunctionReturn(0); 12262128a86cSHong Zhang } 12272128a86cSHong Zhang 1228d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C) 1229d71ae5a4SJacob Faibussowitsch { 123081d82fe4SHong Zhang Mat Bt; 123140192850SHong Zhang Mat_MatMatTransMult *abt; 12324222ddf1SHong Zhang Mat_Product *product = C->product; 12336718818eSStefano Zampini char *alg; 1234d2853540SHong Zhang 123581d82fe4SHong Zhang PetscFunctionBegin; 123628b400f6SJacob Faibussowitsch PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 123728b400f6SJacob Faibussowitsch PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty"); 12386718818eSStefano Zampini 123981d82fe4SHong Zhang /* create symbolic Bt */ 12407fb60732SBarry Smith PetscCall(MatTransposeSymbolic(B, &Bt)); 12419566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizes(Bt, PetscAbs(A->cmap->bs), PetscAbs(B->cmap->bs))); 12429566063dSJacob Faibussowitsch PetscCall(MatSetType(Bt, ((PetscObject)A)->type_name)); 124381d82fe4SHong Zhang 124481d82fe4SHong Zhang /* get symbolic C=A*Bt */ 12459566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(product->alg, &alg)); 12469566063dSJacob Faibussowitsch PetscCall(MatProductSetAlgorithm(C, "sorted")); /* set algorithm for C = A*Bt */ 12479566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(A, Bt, fill, C)); 12489566063dSJacob Faibussowitsch PetscCall(MatProductSetAlgorithm(C, alg)); /* resume original algorithm for ABt product */ 12499566063dSJacob Faibussowitsch PetscCall(PetscFree(alg)); 125081d82fe4SHong Zhang 1251a5b23f4aSJose E. Roman /* create a supporting struct for reuse intermediate dense matrices with matcoloring */ 12529566063dSJacob Faibussowitsch PetscCall(PetscNew(&abt)); 12532128a86cSHong Zhang 12546718818eSStefano Zampini product->data = abt; 12556718818eSStefano Zampini product->destroy = MatDestroy_SeqAIJ_MatMatMultTrans; 12566718818eSStefano Zampini 12574222ddf1SHong Zhang C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ; 12582128a86cSHong Zhang 12594222ddf1SHong Zhang abt->usecoloring = PETSC_FALSE; 12609566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(product->alg, "color", &abt->usecoloring)); 126140192850SHong Zhang if (abt->usecoloring) { 1262b9af6bddSHong Zhang /* Create MatTransposeColoring from symbolic C=A*B^T */ 1263b9af6bddSHong Zhang MatTransposeColoring matcoloring; 1264335efc43SPeter Brune MatColoring coloring; 12652128a86cSHong Zhang ISColoring iscoloring; 12662128a86cSHong Zhang Mat Bt_dense, C_dense; 1267e8354b3eSHong Zhang 12684222ddf1SHong Zhang /* inode causes memory problem */ 12699566063dSJacob Faibussowitsch PetscCall(MatSetOption(C, MAT_USE_INODES, PETSC_FALSE)); 12704222ddf1SHong Zhang 12719566063dSJacob Faibussowitsch PetscCall(MatColoringCreate(C, &coloring)); 12729566063dSJacob Faibussowitsch PetscCall(MatColoringSetDistance(coloring, 2)); 12739566063dSJacob Faibussowitsch PetscCall(MatColoringSetType(coloring, MATCOLORINGSL)); 12749566063dSJacob Faibussowitsch PetscCall(MatColoringSetFromOptions(coloring)); 12759566063dSJacob Faibussowitsch PetscCall(MatColoringApply(coloring, &iscoloring)); 12769566063dSJacob Faibussowitsch PetscCall(MatColoringDestroy(&coloring)); 12779566063dSJacob Faibussowitsch PetscCall(MatTransposeColoringCreate(C, iscoloring, &matcoloring)); 12782205254eSKarl Rupp 127940192850SHong Zhang abt->matcoloring = matcoloring; 12802205254eSKarl Rupp 12819566063dSJacob Faibussowitsch PetscCall(ISColoringDestroy(&iscoloring)); 12822128a86cSHong Zhang 12832128a86cSHong Zhang /* Create Bt_dense and C_dense = A*Bt_dense */ 12849566063dSJacob Faibussowitsch PetscCall(MatCreate(PETSC_COMM_SELF, &Bt_dense)); 12859566063dSJacob Faibussowitsch PetscCall(MatSetSizes(Bt_dense, A->cmap->n, matcoloring->ncolors, A->cmap->n, matcoloring->ncolors)); 12869566063dSJacob Faibussowitsch PetscCall(MatSetType(Bt_dense, MATSEQDENSE)); 12879566063dSJacob Faibussowitsch PetscCall(MatSeqDenseSetPreallocation(Bt_dense, NULL)); 12882205254eSKarl Rupp 12892128a86cSHong Zhang Bt_dense->assembled = PETSC_TRUE; 129040192850SHong Zhang abt->Bt_den = Bt_dense; 12912128a86cSHong Zhang 12929566063dSJacob Faibussowitsch PetscCall(MatCreate(PETSC_COMM_SELF, &C_dense)); 12939566063dSJacob Faibussowitsch PetscCall(MatSetSizes(C_dense, A->rmap->n, matcoloring->ncolors, A->rmap->n, matcoloring->ncolors)); 12949566063dSJacob Faibussowitsch PetscCall(MatSetType(C_dense, MATSEQDENSE)); 12959566063dSJacob Faibussowitsch PetscCall(MatSeqDenseSetPreallocation(C_dense, NULL)); 12962205254eSKarl Rupp 12972128a86cSHong Zhang Bt_dense->assembled = PETSC_TRUE; 129840192850SHong Zhang abt->ABt_den = C_dense; 1299f94ccd6cSHong Zhang 1300f94ccd6cSHong Zhang #if defined(PETSC_USE_INFO) 13011ce71dffSSatish Balay { 13024222ddf1SHong Zhang Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 13039371c9d4SSatish Balay PetscCall(PetscInfo(C, "Use coloring of C=A*B^T; B^T: %" PetscInt_FMT " %" PetscInt_FMT ", Bt_dense: %" PetscInt_FMT ",%" PetscInt_FMT "; Cnz %" PetscInt_FMT " / (cm*ncolors %" PetscInt_FMT ") = %g\n", B->cmap->n, B->rmap->n, Bt_dense->rmap->n, 13049371c9d4SSatish Balay Bt_dense->cmap->n, c->nz, A->rmap->n * matcoloring->ncolors, (double)(((PetscReal)(c->nz)) / ((PetscReal)(A->rmap->n * matcoloring->ncolors))))); 13051ce71dffSSatish Balay } 1306f94ccd6cSHong Zhang #endif 13072128a86cSHong Zhang } 1308e99be685SHong Zhang /* clean up */ 13099566063dSJacob Faibussowitsch PetscCall(MatDestroy(&Bt)); 13105df89d91SHong Zhang PetscFunctionReturn(0); 13115df89d91SHong Zhang } 13125df89d91SHong Zhang 1313d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C) 1314d71ae5a4SJacob Faibussowitsch { 13155df89d91SHong Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data; 1316e2cac8caSJed Brown PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, anzi, bnzj, nexta, nextb, *acol, *bcol, brow; 131781d82fe4SHong Zhang PetscInt cm = C->rmap->n, *ci = c->i, *cj = c->j, i, j, cnzi, *ccol; 13185df89d91SHong Zhang PetscLogDouble flops = 0.0; 1319aa1aec99SHong Zhang MatScalar *aa = a->a, *aval, *ba = b->a, *bval, *ca, *cval; 13206718818eSStefano Zampini Mat_MatMatTransMult *abt; 13216718818eSStefano Zampini Mat_Product *product = C->product; 13225df89d91SHong Zhang 13235df89d91SHong Zhang PetscFunctionBegin; 132428b400f6SJacob Faibussowitsch PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 13256718818eSStefano Zampini abt = (Mat_MatMatTransMult *)product->data; 132628b400f6SJacob Faibussowitsch PetscCheck(abt, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 132758ed3ceaSHong Zhang /* clear old values in C */ 132858ed3ceaSHong Zhang if (!c->a) { 13299566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(ci[cm] + 1, &ca)); 133058ed3ceaSHong Zhang c->a = ca; 133158ed3ceaSHong Zhang c->free_a = PETSC_TRUE; 133258ed3ceaSHong Zhang } else { 133358ed3ceaSHong Zhang ca = c->a; 13349566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(ca, ci[cm] + 1)); 133558ed3ceaSHong Zhang } 133658ed3ceaSHong Zhang 133740192850SHong Zhang if (abt->usecoloring) { 133840192850SHong Zhang MatTransposeColoring matcoloring = abt->matcoloring; 133940192850SHong Zhang Mat Bt_dense, C_dense = abt->ABt_den; 1340c8db22f6SHong Zhang 1341b9af6bddSHong Zhang /* Get Bt_dense by Apply MatTransposeColoring to B */ 134240192850SHong Zhang Bt_dense = abt->Bt_den; 13439566063dSJacob Faibussowitsch PetscCall(MatTransColoringApplySpToDen(matcoloring, B, Bt_dense)); 1344c8db22f6SHong Zhang 1345c8db22f6SHong Zhang /* C_dense = A*Bt_dense */ 13469566063dSJacob Faibussowitsch PetscCall(MatMatMultNumeric_SeqAIJ_SeqDense(A, Bt_dense, C_dense)); 1347c8db22f6SHong Zhang 13482128a86cSHong Zhang /* Recover C from C_dense */ 13499566063dSJacob Faibussowitsch PetscCall(MatTransColoringApplyDenToSp(matcoloring, C_dense, C)); 1350c8db22f6SHong Zhang PetscFunctionReturn(0); 1351c8db22f6SHong Zhang } 1352c8db22f6SHong Zhang 13531fa1209cSHong Zhang for (i = 0; i < cm; i++) { 135481d82fe4SHong Zhang anzi = ai[i + 1] - ai[i]; 135581d82fe4SHong Zhang acol = aj + ai[i]; 13566973769bSHong Zhang aval = aa + ai[i]; 13571fa1209cSHong Zhang cnzi = ci[i + 1] - ci[i]; 13581fa1209cSHong Zhang ccol = cj + ci[i]; 13596973769bSHong Zhang cval = ca + ci[i]; 13601fa1209cSHong Zhang for (j = 0; j < cnzi; j++) { 136181d82fe4SHong Zhang brow = ccol[j]; 136281d82fe4SHong Zhang bnzj = bi[brow + 1] - bi[brow]; 136381d82fe4SHong Zhang bcol = bj + bi[brow]; 13646973769bSHong Zhang bval = ba + bi[brow]; 13656973769bSHong Zhang 136681d82fe4SHong Zhang /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */ 13679371c9d4SSatish Balay nexta = 0; 13689371c9d4SSatish Balay nextb = 0; 136981d82fe4SHong Zhang while (nexta < anzi && nextb < bnzj) { 13707b6d5e96SMark Adams while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++; 137181d82fe4SHong Zhang if (nexta == anzi) break; 13727b6d5e96SMark Adams while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++; 137381d82fe4SHong Zhang if (nextb == bnzj) break; 137481d82fe4SHong Zhang if (acol[nexta] == bcol[nextb]) { 13756973769bSHong Zhang cval[j] += aval[nexta] * bval[nextb]; 13769371c9d4SSatish Balay nexta++; 13779371c9d4SSatish Balay nextb++; 137881d82fe4SHong Zhang flops += 2; 13791fa1209cSHong Zhang } 13801fa1209cSHong Zhang } 138181d82fe4SHong Zhang } 138281d82fe4SHong Zhang } 13839566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 13849566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 13859566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(flops)); 13865df89d91SHong Zhang PetscFunctionReturn(0); 13875df89d91SHong Zhang } 13885df89d91SHong Zhang 1389d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(void *data) 1390d71ae5a4SJacob Faibussowitsch { 13916718818eSStefano Zampini Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)data; 13926d373c3eSHong Zhang 13936d373c3eSHong Zhang PetscFunctionBegin; 13949566063dSJacob Faibussowitsch PetscCall(MatDestroy(&atb->At)); 13951baa6e33SBarry Smith if (atb->destroy) PetscCall((*atb->destroy)(atb->data)); 13969566063dSJacob Faibussowitsch PetscCall(PetscFree(atb)); 13976d373c3eSHong Zhang PetscFunctionReturn(0); 13986d373c3eSHong Zhang } 13996d373c3eSHong Zhang 1400d71ae5a4SJacob Faibussowitsch PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C) 1401d71ae5a4SJacob Faibussowitsch { 1402089a957eSStefano Zampini Mat At = NULL; 14034222ddf1SHong Zhang Mat_Product *product = C->product; 1404089a957eSStefano Zampini PetscBool flg, def, square; 1405bc011b1eSHong Zhang 1406bc011b1eSHong Zhang PetscFunctionBegin; 1407089a957eSStefano Zampini MatCheckProduct(C, 4); 1408b94d7dedSBarry Smith square = (PetscBool)(A == B && A->symmetric == PETSC_BOOL3_TRUE); 14094222ddf1SHong Zhang /* outerproduct */ 14109566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(product->alg, "outerproduct", &flg)); 14114222ddf1SHong Zhang if (flg) { 1412bc011b1eSHong Zhang /* create symbolic At */ 1413089a957eSStefano Zampini if (!square) { 14147fb60732SBarry Smith PetscCall(MatTransposeSymbolic(A, &At)); 14159566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizes(At, PetscAbs(A->cmap->bs), PetscAbs(B->cmap->bs))); 14169566063dSJacob Faibussowitsch PetscCall(MatSetType(At, ((PetscObject)A)->type_name)); 1417089a957eSStefano Zampini } 1418bc011b1eSHong Zhang /* get symbolic C=At*B */ 14199566063dSJacob Faibussowitsch PetscCall(MatProductSetAlgorithm(C, "sorted")); 14209566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C)); 1421bc011b1eSHong Zhang 1422bc011b1eSHong Zhang /* clean up */ 142348a46eb9SPierre Jolivet if (!square) PetscCall(MatDestroy(&At)); 14246d373c3eSHong Zhang 14254222ddf1SHong Zhang C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */ 14269566063dSJacob Faibussowitsch PetscCall(MatProductSetAlgorithm(C, "outerproduct")); 14274222ddf1SHong Zhang PetscFunctionReturn(0); 14284222ddf1SHong Zhang } 14294222ddf1SHong Zhang 14304222ddf1SHong Zhang /* matmatmult */ 14319566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(product->alg, "default", &def)); 14329566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(product->alg, "at*b", &flg)); 1433089a957eSStefano Zampini if (flg || def) { 14344222ddf1SHong Zhang Mat_MatTransMatMult *atb; 14354222ddf1SHong Zhang 143628b400f6SJacob Faibussowitsch PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty"); 14379566063dSJacob Faibussowitsch PetscCall(PetscNew(&atb)); 143848a46eb9SPierre Jolivet if (!square) PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &At)); 14399566063dSJacob Faibussowitsch PetscCall(MatProductSetAlgorithm(C, "sorted")); 14409566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C)); 14419566063dSJacob Faibussowitsch PetscCall(MatProductSetAlgorithm(C, "at*b")); 14426718818eSStefano Zampini product->data = atb; 14436718818eSStefano Zampini product->destroy = MatDestroy_SeqAIJ_MatTransMatMult; 14444222ddf1SHong Zhang atb->At = At; 14454222ddf1SHong Zhang 14464222ddf1SHong Zhang C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */ 14474222ddf1SHong Zhang PetscFunctionReturn(0); 14484222ddf1SHong Zhang } 14494222ddf1SHong Zhang 14504222ddf1SHong Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported"); 1451bc011b1eSHong Zhang } 1452bc011b1eSHong Zhang 1453d71ae5a4SJacob Faibussowitsch PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C) 1454d71ae5a4SJacob Faibussowitsch { 14550fbc74f4SHong Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data; 1456d0f46423SBarry Smith PetscInt am = A->rmap->n, anzi, *ai = a->i, *aj = a->j, *bi = b->i, *bj, bnzi, nextb; 1457d0f46423SBarry Smith PetscInt cm = C->rmap->n, *ci = c->i, *cj = c->j, crow, *cjj, i, j, k; 1458d13dce4bSSatish Balay PetscLogDouble flops = 0.0; 1459aa1aec99SHong Zhang MatScalar *aa = a->a, *ba, *ca, *caj; 1460bc011b1eSHong Zhang 1461bc011b1eSHong Zhang PetscFunctionBegin; 1462aa1aec99SHong Zhang if (!c->a) { 14639566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(ci[cm] + 1, &ca)); 14642205254eSKarl Rupp 1465aa1aec99SHong Zhang c->a = ca; 1466aa1aec99SHong Zhang c->free_a = PETSC_TRUE; 1467aa1aec99SHong Zhang } else { 1468aa1aec99SHong Zhang ca = c->a; 14699566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(ca, ci[cm])); 1470aa1aec99SHong Zhang } 1471bc011b1eSHong Zhang 1472bc011b1eSHong Zhang /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */ 1473bc011b1eSHong Zhang for (i = 0; i < am; i++) { 1474bc011b1eSHong Zhang bj = b->j + bi[i]; 1475bc011b1eSHong Zhang ba = b->a + bi[i]; 1476bc011b1eSHong Zhang bnzi = bi[i + 1] - bi[i]; 1477bc011b1eSHong Zhang anzi = ai[i + 1] - ai[i]; 1478bc011b1eSHong Zhang for (j = 0; j < anzi; j++) { 1479bc011b1eSHong Zhang nextb = 0; 14800fbc74f4SHong Zhang crow = *aj++; 1481bc011b1eSHong Zhang cjj = cj + ci[crow]; 1482bc011b1eSHong Zhang caj = ca + ci[crow]; 1483bc011b1eSHong Zhang /* perform sparse axpy operation. Note cjj includes bj. */ 1484bc011b1eSHong Zhang for (k = 0; nextb < bnzi; k++) { 14850fbc74f4SHong Zhang if (cjj[k] == *(bj + nextb)) { /* ccol == bcol */ 14860fbc74f4SHong Zhang caj[k] += (*aa) * (*(ba + nextb)); 1487bc011b1eSHong Zhang nextb++; 1488bc011b1eSHong Zhang } 1489bc011b1eSHong Zhang } 1490bc011b1eSHong Zhang flops += 2 * bnzi; 14910fbc74f4SHong Zhang aa++; 1492bc011b1eSHong Zhang } 1493bc011b1eSHong Zhang } 1494bc011b1eSHong Zhang 1495bc011b1eSHong Zhang /* Assemble the final matrix and clean up */ 14969566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 14979566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 14989566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(flops)); 1499bc011b1eSHong Zhang PetscFunctionReturn(0); 1500bc011b1eSHong Zhang } 15019123193aSHong Zhang 1502d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) 1503d71ae5a4SJacob Faibussowitsch { 15049123193aSHong Zhang PetscFunctionBegin; 15059566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C)); 15064222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense; 15079123193aSHong Zhang PetscFunctionReturn(0); 15089123193aSHong Zhang } 15099123193aSHong Zhang 1510d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A, Mat B, Mat C, const PetscBool add) 1511d71ae5a4SJacob Faibussowitsch { 1512f32d5d43SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 15131ca9667aSStefano Zampini PetscScalar *c, r1, r2, r3, r4, *c1, *c2, *c3, *c4; 1514a4af7ca8SStefano Zampini const PetscScalar *aa, *b, *b1, *b2, *b3, *b4, *av; 1515daf57211SHong Zhang const PetscInt *aj; 151675f6d85dSStefano Zampini PetscInt cm = C->rmap->n, cn = B->cmap->n, bm, am = A->rmap->n; 151775f6d85dSStefano Zampini PetscInt clda; 151875f6d85dSStefano Zampini PetscInt am4, bm4, col, i, j, n; 15199123193aSHong Zhang 15209123193aSHong Zhang PetscFunctionBegin; 1521f32d5d43SBarry Smith if (!cm || !cn) PetscFunctionReturn(0); 15229566063dSJacob Faibussowitsch PetscCall(MatSeqAIJGetArrayRead(A, &av)); 152393aa15f2SStefano Zampini if (add) { 15249566063dSJacob Faibussowitsch PetscCall(MatDenseGetArray(C, &c)); 152593aa15f2SStefano Zampini } else { 15269566063dSJacob Faibussowitsch PetscCall(MatDenseGetArrayWrite(C, &c)); 152793aa15f2SStefano Zampini } 15289566063dSJacob Faibussowitsch PetscCall(MatDenseGetArrayRead(B, &b)); 15299566063dSJacob Faibussowitsch PetscCall(MatDenseGetLDA(B, &bm)); 15309566063dSJacob Faibussowitsch PetscCall(MatDenseGetLDA(C, &clda)); 153175f6d85dSStefano Zampini am4 = 4 * clda; 153275f6d85dSStefano Zampini bm4 = 4 * bm; 15339371c9d4SSatish Balay b1 = b; 15349371c9d4SSatish Balay b2 = b1 + bm; 15359371c9d4SSatish Balay b3 = b2 + bm; 15369371c9d4SSatish Balay b4 = b3 + bm; 15379371c9d4SSatish Balay c1 = c; 15389371c9d4SSatish Balay c2 = c1 + clda; 15399371c9d4SSatish Balay c3 = c2 + clda; 15409371c9d4SSatish Balay c4 = c3 + clda; 15411ca9667aSStefano Zampini for (col = 0; col < (cn / 4) * 4; col += 4) { /* over columns of C */ 15421ca9667aSStefano Zampini for (i = 0; i < am; i++) { /* over rows of A in those columns */ 1543f32d5d43SBarry Smith r1 = r2 = r3 = r4 = 0.0; 1544f32d5d43SBarry Smith n = a->i[i + 1] - a->i[i]; 1545f32d5d43SBarry Smith aj = a->j + a->i[i]; 1546a4af7ca8SStefano Zampini aa = av + a->i[i]; 1547f32d5d43SBarry Smith for (j = 0; j < n; j++) { 15481ca9667aSStefano Zampini const PetscScalar aatmp = aa[j]; 15491ca9667aSStefano Zampini const PetscInt ajtmp = aj[j]; 1550730858b9SHong Zhang r1 += aatmp * b1[ajtmp]; 1551730858b9SHong Zhang r2 += aatmp * b2[ajtmp]; 1552730858b9SHong Zhang r3 += aatmp * b3[ajtmp]; 1553730858b9SHong Zhang r4 += aatmp * b4[ajtmp]; 15549123193aSHong Zhang } 155593aa15f2SStefano Zampini if (add) { 155687753ddeSHong Zhang c1[i] += r1; 155787753ddeSHong Zhang c2[i] += r2; 155887753ddeSHong Zhang c3[i] += r3; 155987753ddeSHong Zhang c4[i] += r4; 156093aa15f2SStefano Zampini } else { 156193aa15f2SStefano Zampini c1[i] = r1; 156293aa15f2SStefano Zampini c2[i] = r2; 156393aa15f2SStefano Zampini c3[i] = r3; 156493aa15f2SStefano Zampini c4[i] = r4; 156593aa15f2SStefano Zampini } 1566f32d5d43SBarry Smith } 15679371c9d4SSatish Balay b1 += bm4; 15689371c9d4SSatish Balay b2 += bm4; 15699371c9d4SSatish Balay b3 += bm4; 15709371c9d4SSatish Balay b4 += bm4; 15719371c9d4SSatish Balay c1 += am4; 15729371c9d4SSatish Balay c2 += am4; 15739371c9d4SSatish Balay c3 += am4; 15749371c9d4SSatish Balay c4 += am4; 1575f32d5d43SBarry Smith } 157693aa15f2SStefano Zampini /* process remaining columns */ 157793aa15f2SStefano Zampini if (col != cn) { 157893aa15f2SStefano Zampini PetscInt rc = cn - col; 157993aa15f2SStefano Zampini 158093aa15f2SStefano Zampini if (rc == 1) { 158193aa15f2SStefano Zampini for (i = 0; i < am; i++) { 1582f32d5d43SBarry Smith r1 = 0.0; 1583f32d5d43SBarry Smith n = a->i[i + 1] - a->i[i]; 1584f32d5d43SBarry Smith aj = a->j + a->i[i]; 1585a4af7ca8SStefano Zampini aa = av + a->i[i]; 158693aa15f2SStefano Zampini for (j = 0; j < n; j++) r1 += aa[j] * b1[aj[j]]; 158793aa15f2SStefano Zampini if (add) c1[i] += r1; 158893aa15f2SStefano Zampini else c1[i] = r1; 158993aa15f2SStefano Zampini } 159093aa15f2SStefano Zampini } else if (rc == 2) { 159193aa15f2SStefano Zampini for (i = 0; i < am; i++) { 159293aa15f2SStefano Zampini r1 = r2 = 0.0; 159393aa15f2SStefano Zampini n = a->i[i + 1] - a->i[i]; 159493aa15f2SStefano Zampini aj = a->j + a->i[i]; 159593aa15f2SStefano Zampini aa = av + a->i[i]; 1596f32d5d43SBarry Smith for (j = 0; j < n; j++) { 159793aa15f2SStefano Zampini const PetscScalar aatmp = aa[j]; 159893aa15f2SStefano Zampini const PetscInt ajtmp = aj[j]; 159993aa15f2SStefano Zampini r1 += aatmp * b1[ajtmp]; 160093aa15f2SStefano Zampini r2 += aatmp * b2[ajtmp]; 1601f32d5d43SBarry Smith } 160293aa15f2SStefano Zampini if (add) { 160387753ddeSHong Zhang c1[i] += r1; 160493aa15f2SStefano Zampini c2[i] += r2; 160593aa15f2SStefano Zampini } else { 160693aa15f2SStefano Zampini c1[i] = r1; 160793aa15f2SStefano Zampini c2[i] = r2; 1608f32d5d43SBarry Smith } 160993aa15f2SStefano Zampini } 161093aa15f2SStefano Zampini } else { 161193aa15f2SStefano Zampini for (i = 0; i < am; i++) { 161293aa15f2SStefano Zampini r1 = r2 = r3 = 0.0; 161393aa15f2SStefano Zampini n = a->i[i + 1] - a->i[i]; 161493aa15f2SStefano Zampini aj = a->j + a->i[i]; 161593aa15f2SStefano Zampini aa = av + a->i[i]; 161693aa15f2SStefano Zampini for (j = 0; j < n; j++) { 161793aa15f2SStefano Zampini const PetscScalar aatmp = aa[j]; 161893aa15f2SStefano Zampini const PetscInt ajtmp = aj[j]; 161993aa15f2SStefano Zampini r1 += aatmp * b1[ajtmp]; 162093aa15f2SStefano Zampini r2 += aatmp * b2[ajtmp]; 162193aa15f2SStefano Zampini r3 += aatmp * b3[ajtmp]; 162293aa15f2SStefano Zampini } 162393aa15f2SStefano Zampini if (add) { 162493aa15f2SStefano Zampini c1[i] += r1; 162593aa15f2SStefano Zampini c2[i] += r2; 162693aa15f2SStefano Zampini c3[i] += r3; 162793aa15f2SStefano Zampini } else { 162893aa15f2SStefano Zampini c1[i] = r1; 162993aa15f2SStefano Zampini c2[i] = r2; 163093aa15f2SStefano Zampini c3[i] = r3; 163193aa15f2SStefano Zampini } 163293aa15f2SStefano Zampini } 163393aa15f2SStefano Zampini } 1634f32d5d43SBarry Smith } 16359566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(cn * (2.0 * a->nz))); 163693aa15f2SStefano Zampini if (add) { 16379566063dSJacob Faibussowitsch PetscCall(MatDenseRestoreArray(C, &c)); 163893aa15f2SStefano Zampini } else { 16399566063dSJacob Faibussowitsch PetscCall(MatDenseRestoreArrayWrite(C, &c)); 164093aa15f2SStefano Zampini } 16419566063dSJacob Faibussowitsch PetscCall(MatDenseRestoreArrayRead(B, &b)); 16429566063dSJacob Faibussowitsch PetscCall(MatSeqAIJRestoreArrayRead(A, &av)); 1643f32d5d43SBarry Smith PetscFunctionReturn(0); 1644f32d5d43SBarry Smith } 1645f32d5d43SBarry Smith 1646d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A, Mat B, Mat C) 1647d71ae5a4SJacob Faibussowitsch { 1648f32d5d43SBarry Smith PetscFunctionBegin; 164908401ef6SPierre Jolivet PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n); 165008401ef6SPierre Jolivet PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n); 165108401ef6SPierre Jolivet PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n); 16524324174eSBarry Smith 16539566063dSJacob Faibussowitsch PetscCall(MatMatMultNumericAdd_SeqAIJ_SeqDense(A, B, C, PETSC_FALSE)); 16549123193aSHong Zhang PetscFunctionReturn(0); 16559123193aSHong Zhang } 1656b1683b59SHong Zhang 16574222ddf1SHong Zhang /* ------------------------------------------------------- */ 1658d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C) 1659d71ae5a4SJacob Faibussowitsch { 16604222ddf1SHong Zhang PetscFunctionBegin; 16614222ddf1SHong Zhang C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense; 16624222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_AB; 16634222ddf1SHong Zhang PetscFunctionReturn(0); 16644222ddf1SHong Zhang } 16654222ddf1SHong Zhang 16666718818eSStefano Zampini PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat, Mat, PetscReal, Mat); 16676718818eSStefano Zampini 1668d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C) 1669d71ae5a4SJacob Faibussowitsch { 16704222ddf1SHong Zhang PetscFunctionBegin; 16716718818eSStefano Zampini C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense; 16724222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_AtB; 16736718818eSStefano Zampini PetscFunctionReturn(0); 16746718818eSStefano Zampini } 16756718818eSStefano Zampini 1676d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C) 1677d71ae5a4SJacob Faibussowitsch { 16786718818eSStefano Zampini PetscFunctionBegin; 16796718818eSStefano Zampini C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense; 16806718818eSStefano Zampini C->ops->productsymbolic = MatProductSymbolic_ABt; 16814222ddf1SHong Zhang PetscFunctionReturn(0); 16824222ddf1SHong Zhang } 16834222ddf1SHong Zhang 1684d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C) 1685d71ae5a4SJacob Faibussowitsch { 16864222ddf1SHong Zhang Mat_Product *product = C->product; 16874222ddf1SHong Zhang 16884222ddf1SHong Zhang PetscFunctionBegin; 16894222ddf1SHong Zhang switch (product->type) { 1690d71ae5a4SJacob Faibussowitsch case MATPRODUCT_AB: 1691d71ae5a4SJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C)); 1692d71ae5a4SJacob Faibussowitsch break; 1693d71ae5a4SJacob Faibussowitsch case MATPRODUCT_AtB: 1694d71ae5a4SJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C)); 1695d71ae5a4SJacob Faibussowitsch break; 1696d71ae5a4SJacob Faibussowitsch case MATPRODUCT_ABt: 1697d71ae5a4SJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C)); 1698d71ae5a4SJacob Faibussowitsch break; 1699d71ae5a4SJacob Faibussowitsch default: 1700d71ae5a4SJacob Faibussowitsch break; 17014222ddf1SHong Zhang } 17024222ddf1SHong Zhang PetscFunctionReturn(0); 17034222ddf1SHong Zhang } 17044222ddf1SHong Zhang /* ------------------------------------------------------- */ 1705d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C) 1706d71ae5a4SJacob Faibussowitsch { 17074222ddf1SHong Zhang Mat_Product *product = C->product; 17084222ddf1SHong Zhang Mat A = product->A; 17094222ddf1SHong Zhang PetscBool baij; 17104222ddf1SHong Zhang 17114222ddf1SHong Zhang PetscFunctionBegin; 17129566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQBAIJ, &baij)); 17134222ddf1SHong Zhang if (!baij) { /* A is seqsbaij */ 17144222ddf1SHong Zhang PetscBool sbaij; 17159566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQSBAIJ, &sbaij)); 171628b400f6SJacob Faibussowitsch PetscCheck(sbaij, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "Mat must be either seqbaij or seqsbaij format"); 17174222ddf1SHong Zhang 17184222ddf1SHong Zhang C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense; 17194222ddf1SHong Zhang } else { /* A is seqbaij */ 17204222ddf1SHong Zhang C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense; 17214222ddf1SHong Zhang } 17224222ddf1SHong Zhang 17234222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_AB; 17244222ddf1SHong Zhang PetscFunctionReturn(0); 17254222ddf1SHong Zhang } 17264222ddf1SHong Zhang 1727d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C) 1728d71ae5a4SJacob Faibussowitsch { 17294222ddf1SHong Zhang Mat_Product *product = C->product; 17304222ddf1SHong Zhang 17314222ddf1SHong Zhang PetscFunctionBegin; 17326718818eSStefano Zampini MatCheckProduct(C, 1); 173328b400f6SJacob Faibussowitsch PetscCheck(product->A, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing A"); 1734b94d7dedSBarry Smith if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric == PETSC_BOOL3_TRUE)) PetscCall(MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C)); 17354222ddf1SHong Zhang PetscFunctionReturn(0); 17364222ddf1SHong Zhang } 17376718818eSStefano Zampini 17384222ddf1SHong Zhang /* ------------------------------------------------------- */ 1739d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C) 1740d71ae5a4SJacob Faibussowitsch { 17414222ddf1SHong Zhang PetscFunctionBegin; 17424222ddf1SHong Zhang C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ; 17434222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_AB; 17444222ddf1SHong Zhang PetscFunctionReturn(0); 17454222ddf1SHong Zhang } 17464222ddf1SHong Zhang 1747d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C) 1748d71ae5a4SJacob Faibussowitsch { 17494222ddf1SHong Zhang Mat_Product *product = C->product; 17504222ddf1SHong Zhang 17514222ddf1SHong Zhang PetscFunctionBegin; 175248a46eb9SPierre Jolivet if (product->type == MATPRODUCT_AB) PetscCall(MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C)); 17534222ddf1SHong Zhang PetscFunctionReturn(0); 17544222ddf1SHong Zhang } 17554222ddf1SHong Zhang /* ------------------------------------------------------- */ 17564222ddf1SHong Zhang 1757d71ae5a4SJacob Faibussowitsch PetscErrorCode MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring, Mat B, Mat Btdense) 1758d71ae5a4SJacob Faibussowitsch { 17592f5f1f90SHong Zhang Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 17602f5f1f90SHong Zhang Mat_SeqDense *btdense = (Mat_SeqDense *)Btdense->data; 17612f5f1f90SHong Zhang PetscInt *bi = b->i, *bj = b->j; 17622f5f1f90SHong Zhang PetscInt m = Btdense->rmap->n, n = Btdense->cmap->n, j, k, l, col, anz, *btcol, brow, ncolumns; 17632f5f1f90SHong Zhang MatScalar *btval, *btval_den, *ba = b->a; 17642f5f1f90SHong Zhang PetscInt *columns = coloring->columns, *colorforcol = coloring->colorforcol, ncolors = coloring->ncolors; 1765c8db22f6SHong Zhang 1766c8db22f6SHong Zhang PetscFunctionBegin; 17672f5f1f90SHong Zhang btval_den = btdense->v; 17689566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(btval_den, m * n)); 17692f5f1f90SHong Zhang for (k = 0; k < ncolors; k++) { 17702f5f1f90SHong Zhang ncolumns = coloring->ncolumns[k]; 17712f5f1f90SHong Zhang for (l = 0; l < ncolumns; l++) { /* insert a row of B to a column of Btdense */ 1772d2853540SHong Zhang col = *(columns + colorforcol[k] + l); 17732f5f1f90SHong Zhang btcol = bj + bi[col]; 17742f5f1f90SHong Zhang btval = ba + bi[col]; 17752f5f1f90SHong Zhang anz = bi[col + 1] - bi[col]; 1776d2853540SHong Zhang for (j = 0; j < anz; j++) { 17772f5f1f90SHong Zhang brow = btcol[j]; 17782f5f1f90SHong Zhang btval_den[brow] = btval[j]; 1779c8db22f6SHong Zhang } 1780c8db22f6SHong Zhang } 17812f5f1f90SHong Zhang btval_den += m; 1782c8db22f6SHong Zhang } 1783c8db22f6SHong Zhang PetscFunctionReturn(0); 1784c8db22f6SHong Zhang } 1785c8db22f6SHong Zhang 1786d71ae5a4SJacob Faibussowitsch PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring, Mat Cden, Mat Csp) 1787d71ae5a4SJacob Faibussowitsch { 1788b2d2b04fSHong Zhang Mat_SeqAIJ *csp = (Mat_SeqAIJ *)Csp->data; 17891683a169SBarry Smith const PetscScalar *ca_den, *ca_den_ptr; 17901683a169SBarry Smith PetscScalar *ca = csp->a; 1791f99a636bSHong Zhang PetscInt k, l, m = Cden->rmap->n, ncolors = matcoloring->ncolors; 1792e88777f2SHong Zhang PetscInt brows = matcoloring->brows, *den2sp = matcoloring->den2sp; 1793077f23c2SHong Zhang PetscInt nrows, *row, *idx; 1794077f23c2SHong Zhang PetscInt *rows = matcoloring->rows, *colorforrow = matcoloring->colorforrow; 17958972f759SHong Zhang 17968972f759SHong Zhang PetscFunctionBegin; 17979566063dSJacob Faibussowitsch PetscCall(MatDenseGetArrayRead(Cden, &ca_den)); 1798f99a636bSHong Zhang 1799077f23c2SHong Zhang if (brows > 0) { 1800077f23c2SHong Zhang PetscInt *lstart, row_end, row_start; 1801980ae229SHong Zhang lstart = matcoloring->lstart; 18029566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(lstart, ncolors)); 1803eeb4fd02SHong Zhang 1804077f23c2SHong Zhang row_end = brows; 1805eeb4fd02SHong Zhang if (row_end > m) row_end = m; 1806077f23c2SHong Zhang for (row_start = 0; row_start < m; row_start += brows) { /* loop over row blocks of Csp */ 1807077f23c2SHong Zhang ca_den_ptr = ca_den; 1808980ae229SHong Zhang for (k = 0; k < ncolors; k++) { /* loop over colors (columns of Cden) */ 1809eeb4fd02SHong Zhang nrows = matcoloring->nrows[k]; 1810eeb4fd02SHong Zhang row = rows + colorforrow[k]; 1811eeb4fd02SHong Zhang idx = den2sp + colorforrow[k]; 1812eeb4fd02SHong Zhang for (l = lstart[k]; l < nrows; l++) { 1813eeb4fd02SHong Zhang if (row[l] >= row_end) { 1814eeb4fd02SHong Zhang lstart[k] = l; 1815eeb4fd02SHong Zhang break; 1816eeb4fd02SHong Zhang } else { 1817077f23c2SHong Zhang ca[idx[l]] = ca_den_ptr[row[l]]; 1818eeb4fd02SHong Zhang } 1819eeb4fd02SHong Zhang } 1820077f23c2SHong Zhang ca_den_ptr += m; 1821eeb4fd02SHong Zhang } 1822077f23c2SHong Zhang row_end += brows; 1823eeb4fd02SHong Zhang if (row_end > m) row_end = m; 1824eeb4fd02SHong Zhang } 1825077f23c2SHong Zhang } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */ 1826077f23c2SHong Zhang ca_den_ptr = ca_den; 1827077f23c2SHong Zhang for (k = 0; k < ncolors; k++) { 1828077f23c2SHong Zhang nrows = matcoloring->nrows[k]; 1829077f23c2SHong Zhang row = rows + colorforrow[k]; 1830077f23c2SHong Zhang idx = den2sp + colorforrow[k]; 1831ad540459SPierre Jolivet for (l = 0; l < nrows; l++) ca[idx[l]] = ca_den_ptr[row[l]]; 1832077f23c2SHong Zhang ca_den_ptr += m; 1833077f23c2SHong Zhang } 1834f99a636bSHong Zhang } 1835f99a636bSHong Zhang 18369566063dSJacob Faibussowitsch PetscCall(MatDenseRestoreArrayRead(Cden, &ca_den)); 1837f99a636bSHong Zhang #if defined(PETSC_USE_INFO) 1838077f23c2SHong Zhang if (matcoloring->brows > 0) { 18399566063dSJacob Faibussowitsch PetscCall(PetscInfo(Csp, "Loop over %" PetscInt_FMT " row blocks for den2sp\n", brows)); 1840e88777f2SHong Zhang } else { 18419566063dSJacob Faibussowitsch PetscCall(PetscInfo(Csp, "Loop over colors/columns of Cden, inefficient for large sparse matrix product \n")); 1842e88777f2SHong Zhang } 1843f99a636bSHong Zhang #endif 18448972f759SHong Zhang PetscFunctionReturn(0); 18458972f759SHong Zhang } 18468972f759SHong Zhang 1847d71ae5a4SJacob Faibussowitsch PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat, ISColoring iscoloring, MatTransposeColoring c) 1848d71ae5a4SJacob Faibussowitsch { 1849e88777f2SHong Zhang PetscInt i, n, nrows, Nbs, j, k, m, ncols, col, cm; 18501a83f524SJed Brown const PetscInt *is, *ci, *cj, *row_idx; 1851b28d80bdSHong Zhang PetscInt nis = iscoloring->n, *rowhit, bs = 1; 1852b1683b59SHong Zhang IS *isa; 1853b28d80bdSHong Zhang Mat_SeqAIJ *csp = (Mat_SeqAIJ *)mat->data; 1854e88777f2SHong Zhang PetscInt *colorforrow, *rows, *rows_i, *idxhit, *spidx, *den2sp, *den2sp_i; 1855e88777f2SHong Zhang PetscInt *colorforcol, *columns, *columns_i, brows; 1856e88777f2SHong Zhang PetscBool flg; 1857b1683b59SHong Zhang 1858b1683b59SHong Zhang PetscFunctionBegin; 18599566063dSJacob Faibussowitsch PetscCall(ISColoringGetIS(iscoloring, PETSC_USE_POINTER, PETSC_IGNORE, &isa)); 1860e99be685SHong Zhang 18617ecccc15SHong Zhang /* bs >1 is not being tested yet! */ 1862e88777f2SHong Zhang Nbs = mat->cmap->N / bs; 1863b1683b59SHong Zhang c->M = mat->rmap->N / bs; /* set total rows, columns and local rows */ 1864e88777f2SHong Zhang c->N = Nbs; 1865e88777f2SHong Zhang c->m = c->M; 1866b1683b59SHong Zhang c->rstart = 0; 1867e88777f2SHong Zhang c->brows = 100; 1868b1683b59SHong Zhang 1869b1683b59SHong Zhang c->ncolors = nis; 18709566063dSJacob Faibussowitsch PetscCall(PetscMalloc3(nis, &c->ncolumns, nis, &c->nrows, nis + 1, &colorforrow)); 18719566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(csp->nz + 1, &rows)); 18729566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(csp->nz + 1, &den2sp)); 1873e88777f2SHong Zhang 1874e88777f2SHong Zhang brows = c->brows; 18759566063dSJacob Faibussowitsch PetscCall(PetscOptionsGetInt(NULL, NULL, "-matden2sp_brows", &brows, &flg)); 1876e88777f2SHong Zhang if (flg) c->brows = brows; 187748a46eb9SPierre Jolivet if (brows > 0) PetscCall(PetscMalloc1(nis + 1, &c->lstart)); 18782205254eSKarl Rupp 1879d2853540SHong Zhang colorforrow[0] = 0; 1880d2853540SHong Zhang rows_i = rows; 1881f99a636bSHong Zhang den2sp_i = den2sp; 1882b1683b59SHong Zhang 18839566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(nis + 1, &colorforcol)); 18849566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(Nbs + 1, &columns)); 18852205254eSKarl Rupp 1886d2853540SHong Zhang colorforcol[0] = 0; 1887d2853540SHong Zhang columns_i = columns; 1888d2853540SHong Zhang 1889077f23c2SHong Zhang /* get column-wise storage of mat */ 18909566063dSJacob Faibussowitsch PetscCall(MatGetColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL)); 1891b1683b59SHong Zhang 1892b28d80bdSHong Zhang cm = c->m; 18939566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(cm + 1, &rowhit)); 18949566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(cm + 1, &idxhit)); 1895f99a636bSHong Zhang for (i = 0; i < nis; i++) { /* loop over color */ 18969566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(isa[i], &n)); 18979566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isa[i], &is)); 18982205254eSKarl Rupp 1899b1683b59SHong Zhang c->ncolumns[i] = n; 19001baa6e33SBarry Smith if (n) PetscCall(PetscArraycpy(columns_i, is, n)); 1901d2853540SHong Zhang colorforcol[i + 1] = colorforcol[i] + n; 1902d2853540SHong Zhang columns_i += n; 1903b1683b59SHong Zhang 1904b1683b59SHong Zhang /* fast, crude version requires O(N*N) work */ 19059566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(rowhit, cm)); 1906e99be685SHong Zhang 1907b7caf3d6SHong Zhang for (j = 0; j < n; j++) { /* loop over columns*/ 1908b1683b59SHong Zhang col = is[j]; 1909d2853540SHong Zhang row_idx = cj + ci[col]; 1910b1683b59SHong Zhang m = ci[col + 1] - ci[col]; 1911b7caf3d6SHong Zhang for (k = 0; k < m; k++) { /* loop over columns marking them in rowhit */ 1912e99be685SHong Zhang idxhit[*row_idx] = spidx[ci[col] + k]; 1913d2853540SHong Zhang rowhit[*row_idx++] = col + 1; 1914b1683b59SHong Zhang } 1915b1683b59SHong Zhang } 1916b1683b59SHong Zhang /* count the number of hits */ 1917b1683b59SHong Zhang nrows = 0; 1918e8354b3eSHong Zhang for (j = 0; j < cm; j++) { 1919b1683b59SHong Zhang if (rowhit[j]) nrows++; 1920b1683b59SHong Zhang } 1921b1683b59SHong Zhang c->nrows[i] = nrows; 1922d2853540SHong Zhang colorforrow[i + 1] = colorforrow[i] + nrows; 1923d2853540SHong Zhang 1924b1683b59SHong Zhang nrows = 0; 1925b7caf3d6SHong Zhang for (j = 0; j < cm; j++) { /* loop over rows */ 1926b1683b59SHong Zhang if (rowhit[j]) { 1927d2853540SHong Zhang rows_i[nrows] = j; 192812b89a43SHong Zhang den2sp_i[nrows] = idxhit[j]; 1929b1683b59SHong Zhang nrows++; 1930b1683b59SHong Zhang } 1931b1683b59SHong Zhang } 1932e88777f2SHong Zhang den2sp_i += nrows; 1933077f23c2SHong Zhang 19349566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isa[i], &is)); 1935f99a636bSHong Zhang rows_i += nrows; 1936b1683b59SHong Zhang } 19379566063dSJacob Faibussowitsch PetscCall(MatRestoreColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL)); 19389566063dSJacob Faibussowitsch PetscCall(PetscFree(rowhit)); 19399566063dSJacob Faibussowitsch PetscCall(ISColoringRestoreIS(iscoloring, PETSC_USE_POINTER, &isa)); 19402c71b3e2SJacob Faibussowitsch PetscCheck(csp->nz == colorforrow[nis], PETSC_COMM_SELF, PETSC_ERR_PLIB, "csp->nz %" PetscInt_FMT " != colorforrow[nis] %" PetscInt_FMT, csp->nz, colorforrow[nis]); 1941b28d80bdSHong Zhang 1942d2853540SHong Zhang c->colorforrow = colorforrow; 1943d2853540SHong Zhang c->rows = rows; 1944f99a636bSHong Zhang c->den2sp = den2sp; 1945d2853540SHong Zhang c->colorforcol = colorforcol; 1946d2853540SHong Zhang c->columns = columns; 19472205254eSKarl Rupp 19489566063dSJacob Faibussowitsch PetscCall(PetscFree(idxhit)); 1949b1683b59SHong Zhang PetscFunctionReturn(0); 1950b1683b59SHong Zhang } 1951d013fc79Sandi selinger 19524222ddf1SHong Zhang /* --------------------------------------------------------------- */ 1953d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C) 1954d71ae5a4SJacob Faibussowitsch { 19554222ddf1SHong Zhang Mat_Product *product = C->product; 19564222ddf1SHong Zhang Mat A = product->A, B = product->B; 19574222ddf1SHong Zhang 1958df97dc6dSFande Kong PetscFunctionBegin; 19594222ddf1SHong Zhang if (C->ops->mattransposemultnumeric) { 19604222ddf1SHong Zhang /* Alg: "outerproduct" */ 19619566063dSJacob Faibussowitsch PetscCall((*C->ops->mattransposemultnumeric)(A, B, C)); 19624222ddf1SHong Zhang } else { 19634222ddf1SHong Zhang /* Alg: "matmatmult" -- C = At*B */ 19646718818eSStefano Zampini Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)product->data; 19654222ddf1SHong Zhang 196628b400f6SJacob Faibussowitsch PetscCheck(atb, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 19671cdffd5eSHong Zhang if (atb->At) { 19681cdffd5eSHong Zhang /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(); 19691cdffd5eSHong Zhang user may have called MatProductReplaceMats() to get this A=product->A */ 19701cdffd5eSHong Zhang PetscCall(MatTransposeSetPrecursor(A, atb->At)); 19717fb60732SBarry Smith PetscCall(MatTranspose(A, MAT_REUSE_MATRIX, &atb->At)); 19724222ddf1SHong Zhang } 19737fb60732SBarry Smith PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ(atb->At ? atb->At : A, B, C)); 19744222ddf1SHong Zhang } 1975df97dc6dSFande Kong PetscFunctionReturn(0); 1976df97dc6dSFande Kong } 1977df97dc6dSFande Kong 1978d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C) 1979d71ae5a4SJacob Faibussowitsch { 19804222ddf1SHong Zhang Mat_Product *product = C->product; 19814222ddf1SHong Zhang Mat A = product->A, B = product->B; 19824222ddf1SHong Zhang PetscReal fill = product->fill; 1983d013fc79Sandi selinger 1984d013fc79Sandi selinger PetscFunctionBegin; 19859566063dSJacob Faibussowitsch PetscCall(MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A, B, fill, C)); 19862869b61bSandi selinger 19874222ddf1SHong Zhang C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ; 19884222ddf1SHong Zhang PetscFunctionReturn(0); 19892869b61bSandi selinger } 1990d013fc79Sandi selinger 19914222ddf1SHong Zhang /* --------------------------------------------------------------- */ 1992d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C) 1993d71ae5a4SJacob Faibussowitsch { 19944222ddf1SHong Zhang Mat_Product *product = C->product; 19954222ddf1SHong Zhang PetscInt alg = 0; /* default algorithm */ 19964222ddf1SHong Zhang PetscBool flg = PETSC_FALSE; 19974222ddf1SHong Zhang #if !defined(PETSC_HAVE_HYPRE) 19984222ddf1SHong Zhang const char *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"}; 19994222ddf1SHong Zhang PetscInt nalg = 7; 20004222ddf1SHong Zhang #else 20014222ddf1SHong Zhang const char *algTypes[8] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge", "hypre"}; 20024222ddf1SHong Zhang PetscInt nalg = 8; 20034222ddf1SHong Zhang #endif 20044222ddf1SHong Zhang 20054222ddf1SHong Zhang PetscFunctionBegin; 20064222ddf1SHong Zhang /* Set default algorithm */ 20079566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(C->product->alg, "default", &flg)); 200848a46eb9SPierre Jolivet if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2009d013fc79Sandi selinger 20104222ddf1SHong Zhang /* Get runtime option */ 20114222ddf1SHong Zhang if (product->api_user) { 2012d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMult", "Mat"); 20139566063dSJacob Faibussowitsch PetscCall(PetscOptionsEList("-matmatmult_via", "Algorithmic approach", "MatMatMult", algTypes, nalg, algTypes[0], &alg, &flg)); 2014d0609cedSBarry Smith PetscOptionsEnd(); 20154222ddf1SHong Zhang } else { 2016d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AB", "Mat"); 20179566063dSJacob Faibussowitsch PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AB", algTypes, nalg, algTypes[0], &alg, &flg)); 2018d0609cedSBarry Smith PetscOptionsEnd(); 2019d013fc79Sandi selinger } 202048a46eb9SPierre Jolivet if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2021d013fc79Sandi selinger 20224222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_AB; 20234222ddf1SHong Zhang C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ; 20244222ddf1SHong Zhang PetscFunctionReturn(0); 20254222ddf1SHong Zhang } 2026d013fc79Sandi selinger 2027d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C) 2028d71ae5a4SJacob Faibussowitsch { 20294222ddf1SHong Zhang Mat_Product *product = C->product; 20304222ddf1SHong Zhang PetscInt alg = 0; /* default algorithm */ 20314222ddf1SHong Zhang PetscBool flg = PETSC_FALSE; 2032089a957eSStefano Zampini const char *algTypes[3] = {"default", "at*b", "outerproduct"}; 2033089a957eSStefano Zampini PetscInt nalg = 3; 2034d013fc79Sandi selinger 20354222ddf1SHong Zhang PetscFunctionBegin; 20364222ddf1SHong Zhang /* Get runtime option */ 20374222ddf1SHong Zhang if (product->api_user) { 2038d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatTransposeMatMult", "Mat"); 20399566063dSJacob Faibussowitsch PetscCall(PetscOptionsEList("-mattransposematmult_via", "Algorithmic approach", "MatTransposeMatMult", algTypes, nalg, algTypes[alg], &alg, &flg)); 2040d0609cedSBarry Smith PetscOptionsEnd(); 20414222ddf1SHong Zhang } else { 2042d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AtB", "Mat"); 20439566063dSJacob Faibussowitsch PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AtB", algTypes, nalg, algTypes[alg], &alg, &flg)); 2044d0609cedSBarry Smith PetscOptionsEnd(); 20454222ddf1SHong Zhang } 204648a46eb9SPierre Jolivet if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2047d013fc79Sandi selinger 20484222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ; 20494222ddf1SHong Zhang PetscFunctionReturn(0); 20504222ddf1SHong Zhang } 20514222ddf1SHong Zhang 2052d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C) 2053d71ae5a4SJacob Faibussowitsch { 20544222ddf1SHong Zhang Mat_Product *product = C->product; 20554222ddf1SHong Zhang PetscInt alg = 0; /* default algorithm */ 20564222ddf1SHong Zhang PetscBool flg = PETSC_FALSE; 20574222ddf1SHong Zhang const char *algTypes[2] = {"default", "color"}; 20584222ddf1SHong Zhang PetscInt nalg = 2; 20594222ddf1SHong Zhang 20604222ddf1SHong Zhang PetscFunctionBegin; 20614222ddf1SHong Zhang /* Set default algorithm */ 20629566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(C->product->alg, "default", &flg)); 20634222ddf1SHong Zhang if (!flg) { 20644222ddf1SHong Zhang alg = 1; 20659566063dSJacob Faibussowitsch PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 20664222ddf1SHong Zhang } 20674222ddf1SHong Zhang 20684222ddf1SHong Zhang /* Get runtime option */ 20694222ddf1SHong Zhang if (product->api_user) { 2070d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatTransposeMult", "Mat"); 20719566063dSJacob Faibussowitsch PetscCall(PetscOptionsEList("-matmattransmult_via", "Algorithmic approach", "MatMatTransposeMult", algTypes, nalg, algTypes[alg], &alg, &flg)); 2072d0609cedSBarry Smith PetscOptionsEnd(); 20734222ddf1SHong Zhang } else { 2074d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABt", "Mat"); 20759566063dSJacob Faibussowitsch PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABt", algTypes, nalg, algTypes[alg], &alg, &flg)); 2076d0609cedSBarry Smith PetscOptionsEnd(); 20774222ddf1SHong Zhang } 207848a46eb9SPierre Jolivet if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 20794222ddf1SHong Zhang 20804222ddf1SHong Zhang C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ; 20814222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_ABt; 20824222ddf1SHong Zhang PetscFunctionReturn(0); 20834222ddf1SHong Zhang } 20844222ddf1SHong Zhang 2085d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C) 2086d71ae5a4SJacob Faibussowitsch { 20874222ddf1SHong Zhang Mat_Product *product = C->product; 20884222ddf1SHong Zhang PetscBool flg = PETSC_FALSE; 20894222ddf1SHong Zhang PetscInt alg = 0; /* default algorithm -- alg=1 should be default!!! */ 20904222ddf1SHong Zhang #if !defined(PETSC_HAVE_HYPRE) 20914222ddf1SHong Zhang const char *algTypes[2] = {"scalable", "rap"}; 20924222ddf1SHong Zhang PetscInt nalg = 2; 20934222ddf1SHong Zhang #else 20944222ddf1SHong Zhang const char *algTypes[3] = {"scalable", "rap", "hypre"}; 20954222ddf1SHong Zhang PetscInt nalg = 3; 20964222ddf1SHong Zhang #endif 20974222ddf1SHong Zhang 20984222ddf1SHong Zhang PetscFunctionBegin; 20994222ddf1SHong Zhang /* Set default algorithm */ 21009566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(product->alg, "default", &flg)); 210148a46eb9SPierre Jolivet if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 21024222ddf1SHong Zhang 21034222ddf1SHong Zhang /* Get runtime option */ 21044222ddf1SHong Zhang if (product->api_user) { 2105d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatPtAP", "Mat"); 21069566063dSJacob Faibussowitsch PetscCall(PetscOptionsEList("-matptap_via", "Algorithmic approach", "MatPtAP", algTypes, nalg, algTypes[0], &alg, &flg)); 2107d0609cedSBarry Smith PetscOptionsEnd(); 21084222ddf1SHong Zhang } else { 2109d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_PtAP", "Mat"); 21109566063dSJacob Faibussowitsch PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_PtAP", algTypes, nalg, algTypes[0], &alg, &flg)); 2111d0609cedSBarry Smith PetscOptionsEnd(); 21124222ddf1SHong Zhang } 211348a46eb9SPierre Jolivet if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 21144222ddf1SHong Zhang 21154222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ; 21164222ddf1SHong Zhang PetscFunctionReturn(0); 21174222ddf1SHong Zhang } 21184222ddf1SHong Zhang 2119d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C) 2120d71ae5a4SJacob Faibussowitsch { 21214222ddf1SHong Zhang Mat_Product *product = C->product; 21224222ddf1SHong Zhang PetscBool flg = PETSC_FALSE; 21234222ddf1SHong Zhang PetscInt alg = 0; /* default algorithm */ 21244222ddf1SHong Zhang const char *algTypes[3] = {"r*a*rt", "r*art", "coloring_rart"}; 21254222ddf1SHong Zhang PetscInt nalg = 3; 21264222ddf1SHong Zhang 21274222ddf1SHong Zhang PetscFunctionBegin; 21284222ddf1SHong Zhang /* Set default algorithm */ 21299566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(product->alg, "default", &flg)); 213048a46eb9SPierre Jolivet if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 21314222ddf1SHong Zhang 21324222ddf1SHong Zhang /* Get runtime option */ 21334222ddf1SHong Zhang if (product->api_user) { 2134d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatRARt", "Mat"); 21359566063dSJacob Faibussowitsch PetscCall(PetscOptionsEList("-matrart_via", "Algorithmic approach", "MatRARt", algTypes, nalg, algTypes[0], &alg, &flg)); 2136d0609cedSBarry Smith PetscOptionsEnd(); 21374222ddf1SHong Zhang } else { 2138d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_RARt", "Mat"); 21399566063dSJacob Faibussowitsch PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_RARt", algTypes, nalg, algTypes[0], &alg, &flg)); 2140d0609cedSBarry Smith PetscOptionsEnd(); 21414222ddf1SHong Zhang } 214248a46eb9SPierre Jolivet if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 21434222ddf1SHong Zhang 21444222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ; 21454222ddf1SHong Zhang PetscFunctionReturn(0); 21464222ddf1SHong Zhang } 21474222ddf1SHong Zhang 21484222ddf1SHong Zhang /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */ 2149d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C) 2150d71ae5a4SJacob Faibussowitsch { 21514222ddf1SHong Zhang Mat_Product *product = C->product; 21524222ddf1SHong Zhang PetscInt alg = 0; /* default algorithm */ 21534222ddf1SHong Zhang PetscBool flg = PETSC_FALSE; 21544222ddf1SHong Zhang const char *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"}; 21554222ddf1SHong Zhang PetscInt nalg = 7; 21564222ddf1SHong Zhang 21574222ddf1SHong Zhang PetscFunctionBegin; 21584222ddf1SHong Zhang /* Set default algorithm */ 21599566063dSJacob Faibussowitsch PetscCall(PetscStrcmp(product->alg, "default", &flg)); 216048a46eb9SPierre Jolivet if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 21614222ddf1SHong Zhang 21624222ddf1SHong Zhang /* Get runtime option */ 21634222ddf1SHong Zhang if (product->api_user) { 2164d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMatMult", "Mat"); 21659566063dSJacob Faibussowitsch PetscCall(PetscOptionsEList("-matmatmatmult_via", "Algorithmic approach", "MatMatMatMult", algTypes, nalg, algTypes[alg], &alg, &flg)); 2166d0609cedSBarry Smith PetscOptionsEnd(); 21674222ddf1SHong Zhang } else { 2168d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABC", "Mat"); 21699566063dSJacob Faibussowitsch PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABC", algTypes, nalg, algTypes[alg], &alg, &flg)); 2170d0609cedSBarry Smith PetscOptionsEnd(); 21714222ddf1SHong Zhang } 217248a46eb9SPierre Jolivet if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 21734222ddf1SHong Zhang 21744222ddf1SHong Zhang C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ; 21754222ddf1SHong Zhang C->ops->productsymbolic = MatProductSymbolic_ABC; 21764222ddf1SHong Zhang PetscFunctionReturn(0); 21774222ddf1SHong Zhang } 21784222ddf1SHong Zhang 2179d71ae5a4SJacob Faibussowitsch PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C) 2180d71ae5a4SJacob Faibussowitsch { 21814222ddf1SHong Zhang Mat_Product *product = C->product; 21824222ddf1SHong Zhang 21834222ddf1SHong Zhang PetscFunctionBegin; 21844222ddf1SHong Zhang switch (product->type) { 2185d71ae5a4SJacob Faibussowitsch case MATPRODUCT_AB: 2186d71ae5a4SJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ_AB(C)); 2187d71ae5a4SJacob Faibussowitsch break; 2188d71ae5a4SJacob Faibussowitsch case MATPRODUCT_AtB: 2189d71ae5a4SJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ_AtB(C)); 2190d71ae5a4SJacob Faibussowitsch break; 2191d71ae5a4SJacob Faibussowitsch case MATPRODUCT_ABt: 2192d71ae5a4SJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ_ABt(C)); 2193d71ae5a4SJacob Faibussowitsch break; 2194d71ae5a4SJacob Faibussowitsch case MATPRODUCT_PtAP: 2195d71ae5a4SJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ_PtAP(C)); 2196d71ae5a4SJacob Faibussowitsch break; 2197d71ae5a4SJacob Faibussowitsch case MATPRODUCT_RARt: 2198d71ae5a4SJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ_RARt(C)); 2199d71ae5a4SJacob Faibussowitsch break; 2200d71ae5a4SJacob Faibussowitsch case MATPRODUCT_ABC: 2201d71ae5a4SJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ_ABC(C)); 2202d71ae5a4SJacob Faibussowitsch break; 2203d71ae5a4SJacob Faibussowitsch default: 2204d71ae5a4SJacob Faibussowitsch break; 22054222ddf1SHong Zhang } 2206d013fc79Sandi selinger PetscFunctionReturn(0); 2207d013fc79Sandi selinger } 2208