149b5e25fSSatish Balay 2c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h> 3c2916339SPierre Jolivet #include <../src/mat/impls/dense/seq/dense.h> 4c2916339SPierre Jolivet #include <../src/mat/impls/sbaij/seq/sbaij.h> 5af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h> 6c6db04a5SJed Brown #include <petscbt.h> 7c6db04a5SJed Brown #include <petscblaslapack.h> 849b5e25fSSatish Balay 913f74950SBarry Smith PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 1049b5e25fSSatish Balay { 115eee224dSHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 125d0c19d7SBarry Smith PetscInt brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2; 135d0c19d7SBarry Smith const PetscInt *idx; 14db41cccfSHong Zhang PetscBT table_out,table_in; 15d94109b8SHong Zhang 16d94109b8SHong Zhang PetscFunctionBegin; 1708401ef6SPierre Jolivet PetscCheck(ov >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 18d94109b8SHong Zhang mbs = a->mbs; 19d94109b8SHong Zhang ai = a->i; 20d94109b8SHong Zhang aj = a->j; 21d0f46423SBarry Smith bs = A->rmap->bs; 229566063dSJacob Faibussowitsch PetscCall(PetscBTCreate(mbs,&table_out)); 239566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(mbs+1,&nidx)); 249566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(A->rmap->N+1,&nidx2)); 259566063dSJacob Faibussowitsch PetscCall(PetscBTCreate(mbs,&table_in)); 26d94109b8SHong Zhang 27d94109b8SHong Zhang for (i=0; i<is_max; i++) { /* for each is */ 28d94109b8SHong Zhang isz = 0; 299566063dSJacob Faibussowitsch PetscCall(PetscBTMemzero(mbs,table_out)); 30d94109b8SHong Zhang 31d94109b8SHong Zhang /* Extract the indices, assume there can be duplicate entries */ 329566063dSJacob Faibussowitsch PetscCall(ISGetIndices(is[i],&idx)); 339566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(is[i],&n)); 34d94109b8SHong Zhang 35db41cccfSHong Zhang /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */ 36dbe03f88SHong Zhang bcol_max = 0; 37d94109b8SHong Zhang for (j=0; j<n; ++j) { 38d94109b8SHong Zhang brow = idx[j]/bs; /* convert the indices into block indices */ 3908401ef6SPierre Jolivet PetscCheck(brow < mbs,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 40db41cccfSHong Zhang if (!PetscBTLookupSet(table_out,brow)) { 41dbe03f88SHong Zhang nidx[isz++] = brow; 42dbe03f88SHong Zhang if (bcol_max < brow) bcol_max = brow; 43dbe03f88SHong Zhang } 44d94109b8SHong Zhang } 459566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(is[i],&idx)); 469566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is[i])); 47d94109b8SHong Zhang 48d94109b8SHong Zhang k = 0; 49d94109b8SHong Zhang for (j=0; j<ov; j++) { /* for each overlap */ 50db41cccfSHong Zhang /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 519566063dSJacob Faibussowitsch PetscCall(PetscBTMemzero(mbs,table_in)); 529566063dSJacob Faibussowitsch for (l=k; l<isz; l++) PetscCall(PetscBTSet(table_in,nidx[l])); 53d94109b8SHong Zhang 54d94109b8SHong Zhang n = isz; /* length of the updated is[i] */ 55d94109b8SHong Zhang for (brow=0; brow<mbs; brow++) { 56d94109b8SHong Zhang start = ai[brow]; end = ai[brow+1]; 57db41cccfSHong Zhang if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */ 58d94109b8SHong Zhang for (l = start; l<end; l++) { 59d94109b8SHong Zhang bcol = aj[l]; 60d7b97159SHong Zhang if (!PetscBTLookupSet(table_out,bcol)) { 61d7b97159SHong Zhang nidx[isz++] = bcol; 62d7b97159SHong Zhang if (bcol_max < bcol) bcol_max = bcol; 63d7b97159SHong Zhang } 64d94109b8SHong Zhang } 65d94109b8SHong Zhang k++; 66d94109b8SHong Zhang if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 67d94109b8SHong Zhang } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 68d94109b8SHong Zhang for (l = start; l<end; l++) { 69d94109b8SHong Zhang bcol = aj[l]; 70dbe03f88SHong Zhang if (bcol > bcol_max) break; 71db41cccfSHong Zhang if (PetscBTLookup(table_in,bcol)) { 7226fbe8dcSKarl Rupp if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow; 73d94109b8SHong Zhang break; /* for l = start; l<end ; l++) */ 74d94109b8SHong Zhang } 75d94109b8SHong Zhang } 76d94109b8SHong Zhang } 77d94109b8SHong Zhang } 78d94109b8SHong Zhang } /* for each overlap */ 79d94109b8SHong Zhang 80d94109b8SHong Zhang /* expand the Index Set */ 81d94109b8SHong Zhang for (j=0; j<isz; j++) { 8226fbe8dcSKarl Rupp for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k; 83d94109b8SHong Zhang } 849566063dSJacob Faibussowitsch PetscCall(ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i)); 85d94109b8SHong Zhang } 869566063dSJacob Faibussowitsch PetscCall(PetscBTDestroy(&table_out)); 879566063dSJacob Faibussowitsch PetscCall(PetscFree(nidx)); 889566063dSJacob Faibussowitsch PetscCall(PetscFree(nidx2)); 899566063dSJacob Faibussowitsch PetscCall(PetscBTDestroy(&table_in)); 905eee224dSHong Zhang PetscFunctionReturn(0); 9149b5e25fSSatish Balay } 9249b5e25fSSatish Balay 93847daeccSHong Zhang /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc. 94847daeccSHong Zhang Zero some ops' to avoid invalid usse */ 95847daeccSHong Zhang PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq) 9649b5e25fSSatish Balay { 9749b5e25fSSatish Balay PetscFunctionBegin; 989566063dSJacob Faibussowitsch PetscCall(MatSetOption(Bseq,MAT_SYMMETRIC,PETSC_FALSE)); 99f4259b30SLisandro Dalcin Bseq->ops->mult = NULL; 100f4259b30SLisandro Dalcin Bseq->ops->multadd = NULL; 101f4259b30SLisandro Dalcin Bseq->ops->multtranspose = NULL; 102f4259b30SLisandro Dalcin Bseq->ops->multtransposeadd = NULL; 103f4259b30SLisandro Dalcin Bseq->ops->lufactor = NULL; 104f4259b30SLisandro Dalcin Bseq->ops->choleskyfactor = NULL; 105f4259b30SLisandro Dalcin Bseq->ops->lufactorsymbolic = NULL; 106f4259b30SLisandro Dalcin Bseq->ops->choleskyfactorsymbolic = NULL; 107f4259b30SLisandro Dalcin Bseq->ops->getinertia = NULL; 10849b5e25fSSatish Balay PetscFunctionReturn(0); 10949b5e25fSSatish Balay } 11049b5e25fSSatish Balay 1117dae84e0SHong Zhang /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */ 1127dae84e0SHong Zhang PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 113e50a8114SHong Zhang { 114e50a8114SHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c; 115e50a8114SHong Zhang PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 116e50a8114SHong Zhang PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 117e50a8114SHong Zhang const PetscInt *irow,*icol; 118e50a8114SHong Zhang PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 119e50a8114SHong Zhang PetscInt *aj = a->j,*ai = a->i; 120e50a8114SHong Zhang MatScalar *mat_a; 121e50a8114SHong Zhang Mat C; 122e50a8114SHong Zhang PetscBool flag; 123e50a8114SHong Zhang 124e50a8114SHong Zhang PetscFunctionBegin; 125e50a8114SHong Zhang 1269566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow,&irow)); 1279566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol,&icol)); 1289566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(isrow,&nrows)); 1299566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(iscol,&ncols)); 130e50a8114SHong Zhang 1319566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(1+oldcols,&smap)); 132e50a8114SHong Zhang ssmap = smap; 1339566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(1+nrows,&lens)); 134e50a8114SHong Zhang for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 135e50a8114SHong Zhang /* determine lens of each row */ 136e50a8114SHong Zhang for (i=0; i<nrows; i++) { 137e50a8114SHong Zhang kstart = ai[irow[i]]; 138e50a8114SHong Zhang kend = kstart + a->ilen[irow[i]]; 139e50a8114SHong Zhang lens[i] = 0; 140e50a8114SHong Zhang for (k=kstart; k<kend; k++) { 141e50a8114SHong Zhang if (ssmap[aj[k]]) lens[i]++; 142e50a8114SHong Zhang } 143e50a8114SHong Zhang } 144e50a8114SHong Zhang /* Create and fill new matrix */ 145e50a8114SHong Zhang if (scall == MAT_REUSE_MATRIX) { 146e50a8114SHong Zhang c = (Mat_SeqSBAIJ*)((*B)->data); 147e50a8114SHong Zhang 148aed4548fSBarry Smith PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 1499566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(c->ilen,lens,c->mbs,&flag)); 15028b400f6SJacob Faibussowitsch PetscCheck(flag,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 1519566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(c->ilen,c->mbs)); 152e50a8114SHong Zhang C = *B; 153e50a8114SHong Zhang } else { 1549566063dSJacob Faibussowitsch PetscCall(MatCreate(PetscObjectComm((PetscObject)A),&C)); 1559566063dSJacob Faibussowitsch PetscCall(MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE)); 1569566063dSJacob Faibussowitsch PetscCall(MatSetType(C,((PetscObject)A)->type_name)); 1579566063dSJacob Faibussowitsch PetscCall(MatSeqSBAIJSetPreallocation(C,bs,0,lens)); 158e50a8114SHong Zhang } 159e50a8114SHong Zhang c = (Mat_SeqSBAIJ*)(C->data); 160e50a8114SHong Zhang for (i=0; i<nrows; i++) { 161e50a8114SHong Zhang row = irow[i]; 162e50a8114SHong Zhang kstart = ai[row]; 163e50a8114SHong Zhang kend = kstart + a->ilen[row]; 164e50a8114SHong Zhang mat_i = c->i[i]; 165e50a8114SHong Zhang mat_j = c->j + mat_i; 166e50a8114SHong Zhang mat_a = c->a + mat_i*bs2; 167e50a8114SHong Zhang mat_ilen = c->ilen + i; 168e50a8114SHong Zhang for (k=kstart; k<kend; k++) { 169e50a8114SHong Zhang if ((tcol=ssmap[a->j[k]])) { 170e50a8114SHong Zhang *mat_j++ = tcol - 1; 1719566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(mat_a,a->a+k*bs2,bs2)); 172e50a8114SHong Zhang mat_a += bs2; 173e50a8114SHong Zhang (*mat_ilen)++; 174e50a8114SHong Zhang } 175e50a8114SHong Zhang } 176e50a8114SHong Zhang } 177e50a8114SHong Zhang /* sort */ 178e50a8114SHong Zhang { 179e50a8114SHong Zhang MatScalar *work; 180e50a8114SHong Zhang 1819566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(bs2,&work)); 182e50a8114SHong Zhang for (i=0; i<nrows; i++) { 183e50a8114SHong Zhang PetscInt ilen; 184e50a8114SHong Zhang mat_i = c->i[i]; 185e50a8114SHong Zhang mat_j = c->j + mat_i; 186e50a8114SHong Zhang mat_a = c->a + mat_i*bs2; 187e50a8114SHong Zhang ilen = c->ilen[i]; 1889566063dSJacob Faibussowitsch PetscCall(PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work)); 189e50a8114SHong Zhang } 1909566063dSJacob Faibussowitsch PetscCall(PetscFree(work)); 191e50a8114SHong Zhang } 192e50a8114SHong Zhang 193e50a8114SHong Zhang /* Free work space */ 1949566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol,&icol)); 1959566063dSJacob Faibussowitsch PetscCall(PetscFree(smap)); 1969566063dSJacob Faibussowitsch PetscCall(PetscFree(lens)); 1979566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY)); 1989566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY)); 199e50a8114SHong Zhang 2009566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow,&irow)); 201e50a8114SHong Zhang *B = C; 202e50a8114SHong Zhang PetscFunctionReturn(0); 203e50a8114SHong Zhang } 204e50a8114SHong Zhang 2057dae84e0SHong Zhang PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 20649b5e25fSSatish Balay { 207e50a8114SHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 208e50a8114SHong Zhang IS is1,is2; 209e50a8114SHong Zhang PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs; 210e50a8114SHong Zhang const PetscInt *irow,*icol; 21149b5e25fSSatish Balay 21249b5e25fSSatish Balay PetscFunctionBegin; 2139566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow,&irow)); 2149566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol,&icol)); 2159566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(isrow,&nrows)); 2169566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(iscol,&ncols)); 217e50a8114SHong Zhang 218e50a8114SHong Zhang /* Verify if the indices corespond to each element in a block 219e50a8114SHong Zhang and form the IS with compressed IS */ 220e50a8114SHong Zhang maxmnbs = PetscMax(a->mbs,a->nbs); 2219566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary)); 2229566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(vary,a->mbs)); 223e50a8114SHong Zhang for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 224e50a8114SHong Zhang for (i=0; i<a->mbs; i++) { 225aed4548fSBarry Smith PetscCheck(vary[i] == 0 || vary[i] == bs,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 226e50a8114SHong Zhang } 227e50a8114SHong Zhang count = 0; 228e50a8114SHong Zhang for (i=0; i<nrows; i++) { 229e50a8114SHong Zhang PetscInt j = irow[i] / bs; 230e50a8114SHong Zhang if ((vary[j]--)==bs) iary[count++] = j; 231e50a8114SHong Zhang } 2329566063dSJacob Faibussowitsch PetscCall(ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1)); 233e50a8114SHong Zhang 2349566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(vary,a->nbs)); 235e50a8114SHong Zhang for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 236e50a8114SHong Zhang for (i=0; i<a->nbs; i++) { 237aed4548fSBarry Smith PetscCheck(vary[i] == 0 || vary[i] == bs,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc"); 238e50a8114SHong Zhang } 239e50a8114SHong Zhang count = 0; 240e50a8114SHong Zhang for (i=0; i<ncols; i++) { 241e50a8114SHong Zhang PetscInt j = icol[i] / bs; 242e50a8114SHong Zhang if ((vary[j]--)==bs) iary[count++] = j; 243e50a8114SHong Zhang } 2449566063dSJacob Faibussowitsch PetscCall(ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2)); 2459566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow,&irow)); 2469566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol,&icol)); 2479566063dSJacob Faibussowitsch PetscCall(PetscFree2(vary,iary)); 248e50a8114SHong Zhang 2499566063dSJacob Faibussowitsch PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A,is1,is2,scall,B)); 2509566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is1)); 2519566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is2)); 252847daeccSHong Zhang 2538f46ffcaSHong Zhang if (isrow != iscol) { 2548f46ffcaSHong Zhang PetscBool isequal; 2559566063dSJacob Faibussowitsch PetscCall(ISEqual(isrow,iscol,&isequal)); 256847daeccSHong Zhang if (!isequal) { 2579566063dSJacob Faibussowitsch PetscCall(MatSeqSBAIJZeroOps_Private(*B)); 2588f46ffcaSHong Zhang } 25949b5e25fSSatish Balay } 26049b5e25fSSatish Balay PetscFunctionReturn(0); 26149b5e25fSSatish Balay } 26249b5e25fSSatish Balay 2637dae84e0SHong Zhang PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 26449b5e25fSSatish Balay { 26513f74950SBarry Smith PetscInt i; 26649b5e25fSSatish Balay 26749b5e25fSSatish Balay PetscFunctionBegin; 268e50a8114SHong Zhang if (scall == MAT_INITIAL_MATRIX) { 2699566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(n+1,B)); 270afebec48SHong Zhang } 271e50a8114SHong Zhang 272e50a8114SHong Zhang for (i=0; i<n; i++) { 2739566063dSJacob Faibussowitsch PetscCall(MatCreateSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i])); 27449b5e25fSSatish Balay } 27549b5e25fSSatish Balay PetscFunctionReturn(0); 27649b5e25fSSatish Balay } 27749b5e25fSSatish Balay 27849b5e25fSSatish Balay /* -------------------------------------------------------*/ 27949b5e25fSSatish Balay /* Should check that shapes of vectors and matrices match */ 28049b5e25fSSatish Balay /* -------------------------------------------------------*/ 28149b5e25fSSatish Balay 282dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz) 28349b5e25fSSatish Balay { 28449b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 285d9ca1df4SBarry Smith PetscScalar *z,x1,x2,zero=0.0; 286d9ca1df4SBarry Smith const PetscScalar *x,*xb; 287d9ca1df4SBarry Smith const MatScalar *v; 288d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 289d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 29098c9bda7SSatish Balay PetscInt nonzerorow=0; 29149b5e25fSSatish Balay 29249b5e25fSSatish Balay PetscFunctionBegin; 2939566063dSJacob Faibussowitsch PetscCall(VecSet(zz,zero)); 294c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 2959566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx,&x)); 2969566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz,&z)); 29749b5e25fSSatish Balay 29849b5e25fSSatish Balay v = a->a; 29949b5e25fSSatish Balay xb = x; 30049b5e25fSSatish Balay 30149b5e25fSSatish Balay for (i=0; i<mbs; i++) { 30249b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 30349b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; 30449b5e25fSSatish Balay ib = aj + *ai; 305831a3094SHong Zhang jmin = 0; 30698c9bda7SSatish Balay nonzerorow += (n>0); 3077fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 30849b5e25fSSatish Balay z[2*i] += v[0]*x1 + v[2]*x2; 30949b5e25fSSatish Balay z[2*i+1] += v[2]*x1 + v[3]*x2; 310831a3094SHong Zhang v += 4; jmin++; 3117fbae186SHong Zhang } 312444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 313444d8c10SJed Brown PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 314831a3094SHong Zhang for (j=jmin; j<n; j++) { 31549b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 31649b5e25fSSatish Balay cval = ib[j]*2; 31749b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2; 31849b5e25fSSatish Balay z[cval+1] += v[2]*x1 + v[3]*x2; 31949b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 32049b5e25fSSatish Balay z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 32149b5e25fSSatish Balay z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 32249b5e25fSSatish Balay v += 4; 32349b5e25fSSatish Balay } 32449b5e25fSSatish Balay xb +=2; ai++; 32549b5e25fSSatish Balay } 32649b5e25fSSatish Balay 3279566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx,&x)); 3289566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz,&z)); 3299566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow)); 33049b5e25fSSatish Balay PetscFunctionReturn(0); 33149b5e25fSSatish Balay } 33249b5e25fSSatish Balay 333dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz) 33449b5e25fSSatish Balay { 33549b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 336d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,zero=0.0; 337d9ca1df4SBarry Smith const PetscScalar *x,*xb; 338d9ca1df4SBarry Smith const MatScalar *v; 339d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 340d9ca1df4SBarry Smith const PetscInt *aj = a->j,*ai = a->i,*ib; 34198c9bda7SSatish Balay PetscInt nonzerorow=0; 34249b5e25fSSatish Balay 34349b5e25fSSatish Balay PetscFunctionBegin; 3449566063dSJacob Faibussowitsch PetscCall(VecSet(zz,zero)); 345c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 3469566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx,&x)); 3479566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz,&z)); 34849b5e25fSSatish Balay 34949b5e25fSSatish Balay v = a->a; 35049b5e25fSSatish Balay xb = x; 35149b5e25fSSatish Balay 35249b5e25fSSatish Balay for (i=0; i<mbs; i++) { 35349b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 35449b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 35549b5e25fSSatish Balay ib = aj + *ai; 356831a3094SHong Zhang jmin = 0; 35798c9bda7SSatish Balay nonzerorow += (n>0); 3587fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 35949b5e25fSSatish Balay z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 36049b5e25fSSatish Balay z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 36149b5e25fSSatish Balay z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 362831a3094SHong Zhang v += 9; jmin++; 3637fbae186SHong Zhang } 364444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 365444d8c10SJed Brown PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 366831a3094SHong Zhang for (j=jmin; j<n; j++) { 36749b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 36849b5e25fSSatish Balay cval = ib[j]*3; 36949b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 37049b5e25fSSatish Balay z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 37149b5e25fSSatish Balay z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 37249b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 37349b5e25fSSatish Balay z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 37449b5e25fSSatish Balay z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 37549b5e25fSSatish Balay z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 37649b5e25fSSatish Balay v += 9; 37749b5e25fSSatish Balay } 37849b5e25fSSatish Balay xb +=3; ai++; 37949b5e25fSSatish Balay } 38049b5e25fSSatish Balay 3819566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx,&x)); 3829566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz,&z)); 3839566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow)); 38449b5e25fSSatish Balay PetscFunctionReturn(0); 38549b5e25fSSatish Balay } 38649b5e25fSSatish Balay 387dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz) 38849b5e25fSSatish Balay { 38949b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 390d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,zero=0.0; 391d9ca1df4SBarry Smith const PetscScalar *x,*xb; 392d9ca1df4SBarry Smith const MatScalar *v; 393d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 394d9ca1df4SBarry Smith const PetscInt *aj = a->j,*ai = a->i,*ib; 39598c9bda7SSatish Balay PetscInt nonzerorow = 0; 39649b5e25fSSatish Balay 39749b5e25fSSatish Balay PetscFunctionBegin; 3989566063dSJacob Faibussowitsch PetscCall(VecSet(zz,zero)); 399c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 4009566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx,&x)); 4019566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz,&z)); 40249b5e25fSSatish Balay 40349b5e25fSSatish Balay v = a->a; 40449b5e25fSSatish Balay xb = x; 40549b5e25fSSatish Balay 40649b5e25fSSatish Balay for (i=0; i<mbs; i++) { 40749b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 40849b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 40949b5e25fSSatish Balay ib = aj + *ai; 410831a3094SHong Zhang jmin = 0; 41198c9bda7SSatish Balay nonzerorow += (n>0); 4127fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 41349b5e25fSSatish Balay z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 41449b5e25fSSatish Balay z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 41549b5e25fSSatish Balay z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 41649b5e25fSSatish Balay z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 417831a3094SHong Zhang v += 16; jmin++; 4187fbae186SHong Zhang } 419444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 420444d8c10SJed Brown PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 421831a3094SHong Zhang for (j=jmin; j<n; j++) { 42249b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 42349b5e25fSSatish Balay cval = ib[j]*4; 42449b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 42549b5e25fSSatish Balay z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 42649b5e25fSSatish Balay z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 42749b5e25fSSatish Balay z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 42849b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 42949b5e25fSSatish Balay z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 43049b5e25fSSatish Balay z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 43149b5e25fSSatish Balay z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 43249b5e25fSSatish Balay z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 43349b5e25fSSatish Balay v += 16; 43449b5e25fSSatish Balay } 43549b5e25fSSatish Balay xb +=4; ai++; 43649b5e25fSSatish Balay } 43749b5e25fSSatish Balay 4389566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx,&x)); 4399566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz,&z)); 4409566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow)); 44149b5e25fSSatish Balay PetscFunctionReturn(0); 44249b5e25fSSatish Balay } 44349b5e25fSSatish Balay 444dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz) 44549b5e25fSSatish Balay { 44649b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 447d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,x5,zero=0.0; 448d9ca1df4SBarry Smith const PetscScalar *x,*xb; 449d9ca1df4SBarry Smith const MatScalar *v; 450d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 451d9ca1df4SBarry Smith const PetscInt *aj = a->j,*ai = a->i,*ib; 45298c9bda7SSatish Balay PetscInt nonzerorow=0; 45349b5e25fSSatish Balay 45449b5e25fSSatish Balay PetscFunctionBegin; 4559566063dSJacob Faibussowitsch PetscCall(VecSet(zz,zero)); 456c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 4579566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx,&x)); 4589566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz,&z)); 45949b5e25fSSatish Balay 46049b5e25fSSatish Balay v = a->a; 46149b5e25fSSatish Balay xb = x; 46249b5e25fSSatish Balay 46349b5e25fSSatish Balay for (i=0; i<mbs; i++) { 46449b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 46549b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 46649b5e25fSSatish Balay ib = aj + *ai; 467831a3094SHong Zhang jmin = 0; 46898c9bda7SSatish Balay nonzerorow += (n>0); 4697fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 47049b5e25fSSatish Balay z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 47149b5e25fSSatish Balay z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 47249b5e25fSSatish Balay z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 47349b5e25fSSatish Balay z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 47449b5e25fSSatish Balay z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 475831a3094SHong Zhang v += 25; jmin++; 4767fbae186SHong Zhang } 477444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 478444d8c10SJed Brown PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 479831a3094SHong Zhang for (j=jmin; j<n; j++) { 48049b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 48149b5e25fSSatish Balay cval = ib[j]*5; 48249b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 48349b5e25fSSatish Balay z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 48449b5e25fSSatish Balay z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 48549b5e25fSSatish Balay z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 48649b5e25fSSatish Balay z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 48749b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 48849b5e25fSSatish Balay z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 48949b5e25fSSatish Balay z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 49049b5e25fSSatish Balay z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 49149b5e25fSSatish Balay z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 49249b5e25fSSatish Balay z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 49349b5e25fSSatish Balay v += 25; 49449b5e25fSSatish Balay } 49549b5e25fSSatish Balay xb +=5; ai++; 49649b5e25fSSatish Balay } 49749b5e25fSSatish Balay 4989566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx,&x)); 4999566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz,&z)); 5009566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow)); 50149b5e25fSSatish Balay PetscFunctionReturn(0); 50249b5e25fSSatish Balay } 50349b5e25fSSatish Balay 504dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz) 50549b5e25fSSatish Balay { 50649b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 507d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,x5,x6,zero=0.0; 508d9ca1df4SBarry Smith const PetscScalar *x,*xb; 509d9ca1df4SBarry Smith const MatScalar *v; 510d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 511d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 51298c9bda7SSatish Balay PetscInt nonzerorow=0; 51349b5e25fSSatish Balay 51449b5e25fSSatish Balay PetscFunctionBegin; 5159566063dSJacob Faibussowitsch PetscCall(VecSet(zz,zero)); 516c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 5179566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx,&x)); 5189566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz,&z)); 51949b5e25fSSatish Balay 52049b5e25fSSatish Balay v = a->a; 52149b5e25fSSatish Balay xb = x; 52249b5e25fSSatish Balay 52349b5e25fSSatish Balay for (i=0; i<mbs; i++) { 52449b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 52549b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 52649b5e25fSSatish Balay ib = aj + *ai; 527831a3094SHong Zhang jmin = 0; 52898c9bda7SSatish Balay nonzerorow += (n>0); 5297fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 53049b5e25fSSatish Balay z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 53149b5e25fSSatish Balay z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 53249b5e25fSSatish Balay z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 53349b5e25fSSatish Balay z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 53449b5e25fSSatish Balay z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 53549b5e25fSSatish Balay z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 536831a3094SHong Zhang v += 36; jmin++; 5377fbae186SHong Zhang } 538444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 539444d8c10SJed Brown PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 540831a3094SHong Zhang for (j=jmin; j<n; j++) { 54149b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 54249b5e25fSSatish Balay cval = ib[j]*6; 54349b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 54449b5e25fSSatish Balay z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 54549b5e25fSSatish Balay z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 54649b5e25fSSatish Balay z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 54749b5e25fSSatish Balay z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 54849b5e25fSSatish Balay z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 54949b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 55049b5e25fSSatish Balay z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 55149b5e25fSSatish Balay z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 55249b5e25fSSatish Balay z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 55349b5e25fSSatish Balay z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 55449b5e25fSSatish Balay z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 55549b5e25fSSatish Balay z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 55649b5e25fSSatish Balay v += 36; 55749b5e25fSSatish Balay } 55849b5e25fSSatish Balay xb +=6; ai++; 55949b5e25fSSatish Balay } 56049b5e25fSSatish Balay 5619566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx,&x)); 5629566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz,&z)); 5639566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow)); 56449b5e25fSSatish Balay PetscFunctionReturn(0); 56549b5e25fSSatish Balay } 566c2916339SPierre Jolivet 567dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz) 56849b5e25fSSatish Balay { 56949b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 570d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0; 571d9ca1df4SBarry Smith const PetscScalar *x,*xb; 572d9ca1df4SBarry Smith const MatScalar *v; 573d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 574d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 57598c9bda7SSatish Balay PetscInt nonzerorow=0; 57649b5e25fSSatish Balay 57749b5e25fSSatish Balay PetscFunctionBegin; 5789566063dSJacob Faibussowitsch PetscCall(VecSet(zz,zero)); 579c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 5809566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx,&x)); 5819566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz,&z)); 58249b5e25fSSatish Balay 58349b5e25fSSatish Balay v = a->a; 58449b5e25fSSatish Balay xb = x; 58549b5e25fSSatish Balay 58649b5e25fSSatish Balay for (i=0; i<mbs; i++) { 58749b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 58849b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 58949b5e25fSSatish Balay ib = aj + *ai; 590831a3094SHong Zhang jmin = 0; 59198c9bda7SSatish Balay nonzerorow += (n>0); 5927fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 59349b5e25fSSatish Balay z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 59449b5e25fSSatish Balay z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 59549b5e25fSSatish Balay z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 59649b5e25fSSatish Balay z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 59749b5e25fSSatish Balay z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 59849b5e25fSSatish Balay z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 59949b5e25fSSatish Balay z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 600831a3094SHong Zhang v += 49; jmin++; 6017fbae186SHong Zhang } 602444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 603444d8c10SJed Brown PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 604831a3094SHong Zhang for (j=jmin; j<n; j++) { 60549b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 60649b5e25fSSatish Balay cval = ib[j]*7; 60749b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 60849b5e25fSSatish Balay z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 60949b5e25fSSatish Balay z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 61049b5e25fSSatish Balay z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 61149b5e25fSSatish Balay z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 61249b5e25fSSatish Balay z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 61349b5e25fSSatish Balay z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 61449b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 61549b5e25fSSatish Balay z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 61649b5e25fSSatish Balay z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 61749b5e25fSSatish Balay z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 61849b5e25fSSatish Balay z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 61949b5e25fSSatish Balay z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 62049b5e25fSSatish Balay z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 62149b5e25fSSatish Balay z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 62249b5e25fSSatish Balay v += 49; 62349b5e25fSSatish Balay } 62449b5e25fSSatish Balay xb +=7; ai++; 62549b5e25fSSatish Balay } 6269566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx,&x)); 6279566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz,&z)); 6289566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow)); 62949b5e25fSSatish Balay PetscFunctionReturn(0); 63049b5e25fSSatish Balay } 63149b5e25fSSatish Balay 63249b5e25fSSatish Balay /* 63349b5e25fSSatish Balay This will not work with MatScalar == float because it calls the BLAS 63449b5e25fSSatish Balay */ 635dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz) 63649b5e25fSSatish Balay { 63749b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 638d9ca1df4SBarry Smith PetscScalar *z,*z_ptr,*zb,*work,*workt,zero=0.0; 639d9ca1df4SBarry Smith const PetscScalar *x,*x_ptr,*xb; 640d9ca1df4SBarry Smith const MatScalar *v; 641d9ca1df4SBarry Smith PetscInt mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 642d9ca1df4SBarry Smith const PetscInt *idx,*aj,*ii; 64398c9bda7SSatish Balay PetscInt nonzerorow=0; 64449b5e25fSSatish Balay 64549b5e25fSSatish Balay PetscFunctionBegin; 6469566063dSJacob Faibussowitsch PetscCall(VecSet(zz,zero)); 647c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 6489566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx,&x)); 6499566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz,&z)); 65059689ffbSStefano Zampini 65159689ffbSStefano Zampini x_ptr = x; 65259689ffbSStefano Zampini z_ptr = z; 65349b5e25fSSatish Balay 65449b5e25fSSatish Balay aj = a->j; 65549b5e25fSSatish Balay v = a->a; 65649b5e25fSSatish Balay ii = a->i; 65749b5e25fSSatish Balay 65849b5e25fSSatish Balay if (!a->mult_work) { 6599566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(A->rmap->N+1,&a->mult_work)); 66049b5e25fSSatish Balay } 66149b5e25fSSatish Balay work = a->mult_work; 66249b5e25fSSatish Balay 66349b5e25fSSatish Balay for (i=0; i<mbs; i++) { 66449b5e25fSSatish Balay n = ii[1] - ii[0]; ncols = n*bs; 66549b5e25fSSatish Balay workt = work; idx=aj+ii[0]; 66698c9bda7SSatish Balay nonzerorow += (n>0); 66749b5e25fSSatish Balay 66849b5e25fSSatish Balay /* upper triangular part */ 66949b5e25fSSatish Balay for (j=0; j<n; j++) { 67049b5e25fSSatish Balay xb = x_ptr + bs*(*idx++); 67149b5e25fSSatish Balay for (k=0; k<bs; k++) workt[k] = xb[k]; 67249b5e25fSSatish Balay workt += bs; 67349b5e25fSSatish Balay } 67449b5e25fSSatish Balay /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 67596b95a6bSBarry Smith PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 67649b5e25fSSatish Balay 67749b5e25fSSatish Balay /* strict lower triangular part */ 678831a3094SHong Zhang idx = aj+ii[0]; 67959689ffbSStefano Zampini if (n && *idx == i) { 68096b9376eSHong Zhang ncols -= bs; v += bs2; idx++; n--; 681831a3094SHong Zhang } 68296b9376eSHong Zhang 68349b5e25fSSatish Balay if (ncols > 0) { 68449b5e25fSSatish Balay workt = work; 6859566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(workt,ncols)); 68696b95a6bSBarry Smith PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 687831a3094SHong Zhang for (j=0; j<n; j++) { 688831a3094SHong Zhang zb = z_ptr + bs*(*idx++); 68949b5e25fSSatish Balay for (k=0; k<bs; k++) zb[k] += workt[k]; 69049b5e25fSSatish Balay workt += bs; 69149b5e25fSSatish Balay } 69249b5e25fSSatish Balay } 69349b5e25fSSatish Balay x += bs; v += n*bs2; z += bs; ii++; 69449b5e25fSSatish Balay } 69549b5e25fSSatish Balay 6969566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx,&x)); 6979566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz,&z)); 6989566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow)); 69949b5e25fSSatish Balay PetscFunctionReturn(0); 70049b5e25fSSatish Balay } 70149b5e25fSSatish Balay 702dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 70349b5e25fSSatish Balay { 70449b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 705d9ca1df4SBarry Smith PetscScalar *z,x1; 706d9ca1df4SBarry Smith const PetscScalar *x,*xb; 707d9ca1df4SBarry Smith const MatScalar *v; 708d9ca1df4SBarry Smith PetscInt mbs =a->mbs,i,n,cval,j,jmin; 709d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 71098c9bda7SSatish Balay PetscInt nonzerorow=0; 711eb1ec7c1SStefano Zampini #if defined(PETSC_USE_COMPLEX) 712*b94d7dedSBarry Smith const int aconj = A->hermitian == PETSC_BOOL3_TRUE; 713eb1ec7c1SStefano Zampini #else 714eb1ec7c1SStefano Zampini const int aconj = 0; 715eb1ec7c1SStefano Zampini #endif 71649b5e25fSSatish Balay 71749b5e25fSSatish Balay PetscFunctionBegin; 7189566063dSJacob Faibussowitsch PetscCall(VecCopy(yy,zz)); 719c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 7209566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx,&x)); 7219566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz,&z)); 72249b5e25fSSatish Balay v = a->a; 72349b5e25fSSatish Balay xb = x; 72449b5e25fSSatish Balay 72549b5e25fSSatish Balay for (i=0; i<mbs; i++) { 72649b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th row of A */ 72749b5e25fSSatish Balay x1 = xb[0]; 72849b5e25fSSatish Balay ib = aj + *ai; 729831a3094SHong Zhang jmin = 0; 73098c9bda7SSatish Balay nonzerorow += (n>0); 7313d9ade75SStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 732831a3094SHong Zhang z[i] += *v++ * x[*ib++]; jmin++; 733831a3094SHong Zhang } 734eb1ec7c1SStefano Zampini if (aconj) { 735eb1ec7c1SStefano Zampini for (j=jmin; j<n; j++) { 736eb1ec7c1SStefano Zampini cval = *ib; 737eb1ec7c1SStefano Zampini z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x */ 738eb1ec7c1SStefano Zampini z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 739eb1ec7c1SStefano Zampini } 740eb1ec7c1SStefano Zampini } else { 741831a3094SHong Zhang for (j=jmin; j<n; j++) { 74249b5e25fSSatish Balay cval = *ib; 74349b5e25fSSatish Balay z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 74449b5e25fSSatish Balay z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 74549b5e25fSSatish Balay } 746eb1ec7c1SStefano Zampini } 74749b5e25fSSatish Balay xb++; ai++; 74849b5e25fSSatish Balay } 74949b5e25fSSatish Balay 7509566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx,&x)); 7519566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz,&z)); 75249b5e25fSSatish Balay 7539566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow))); 75449b5e25fSSatish Balay PetscFunctionReturn(0); 75549b5e25fSSatish Balay } 75649b5e25fSSatish Balay 757dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 75849b5e25fSSatish Balay { 75949b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 760d9ca1df4SBarry Smith PetscScalar *z,x1,x2; 761d9ca1df4SBarry Smith const PetscScalar *x,*xb; 762d9ca1df4SBarry Smith const MatScalar *v; 763d9ca1df4SBarry Smith PetscInt mbs =a->mbs,i,n,cval,j,jmin; 764d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 76598c9bda7SSatish Balay PetscInt nonzerorow=0; 76649b5e25fSSatish Balay 76749b5e25fSSatish Balay PetscFunctionBegin; 7689566063dSJacob Faibussowitsch PetscCall(VecCopy(yy,zz)); 769c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 7709566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx,&x)); 7719566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz,&z)); 77249b5e25fSSatish Balay 77349b5e25fSSatish Balay v = a->a; 77449b5e25fSSatish Balay xb = x; 77549b5e25fSSatish Balay 77649b5e25fSSatish Balay for (i=0; i<mbs; i++) { 77749b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 77849b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; 77949b5e25fSSatish Balay ib = aj + *ai; 780831a3094SHong Zhang jmin = 0; 78198c9bda7SSatish Balay nonzerorow += (n>0); 78259689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 78349b5e25fSSatish Balay z[2*i] += v[0]*x1 + v[2]*x2; 78449b5e25fSSatish Balay z[2*i+1] += v[2]*x1 + v[3]*x2; 785831a3094SHong Zhang v += 4; jmin++; 7867fbae186SHong Zhang } 787444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 788444d8c10SJed Brown PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 789831a3094SHong Zhang for (j=jmin; j<n; j++) { 79049b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 79149b5e25fSSatish Balay cval = ib[j]*2; 79249b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2; 79349b5e25fSSatish Balay z[cval+1] += v[2]*x1 + v[3]*x2; 79449b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 79549b5e25fSSatish Balay z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 79649b5e25fSSatish Balay z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 79749b5e25fSSatish Balay v += 4; 79849b5e25fSSatish Balay } 79949b5e25fSSatish Balay xb +=2; ai++; 80049b5e25fSSatish Balay } 8019566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx,&x)); 8029566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz,&z)); 80349b5e25fSSatish Balay 8049566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow))); 80549b5e25fSSatish Balay PetscFunctionReturn(0); 80649b5e25fSSatish Balay } 80749b5e25fSSatish Balay 808dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 80949b5e25fSSatish Balay { 81049b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 811d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3; 812d9ca1df4SBarry Smith const PetscScalar *x,*xb; 813d9ca1df4SBarry Smith const MatScalar *v; 814d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 815d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 81698c9bda7SSatish Balay PetscInt nonzerorow=0; 81749b5e25fSSatish Balay 81849b5e25fSSatish Balay PetscFunctionBegin; 8199566063dSJacob Faibussowitsch PetscCall(VecCopy(yy,zz)); 820c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 8219566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx,&x)); 8229566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz,&z)); 82349b5e25fSSatish Balay 82449b5e25fSSatish Balay v = a->a; 82549b5e25fSSatish Balay xb = x; 82649b5e25fSSatish Balay 82749b5e25fSSatish Balay for (i=0; i<mbs; i++) { 82849b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 82949b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 83049b5e25fSSatish Balay ib = aj + *ai; 831831a3094SHong Zhang jmin = 0; 83298c9bda7SSatish Balay nonzerorow += (n>0); 83359689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 83449b5e25fSSatish Balay z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 83549b5e25fSSatish Balay z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 83649b5e25fSSatish Balay z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 837831a3094SHong Zhang v += 9; jmin++; 8387fbae186SHong Zhang } 839444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 840444d8c10SJed Brown PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 841831a3094SHong Zhang for (j=jmin; j<n; j++) { 84249b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 84349b5e25fSSatish Balay cval = ib[j]*3; 84449b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 84549b5e25fSSatish Balay z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 84649b5e25fSSatish Balay z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 84749b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 84849b5e25fSSatish Balay z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 84949b5e25fSSatish Balay z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 85049b5e25fSSatish Balay z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 85149b5e25fSSatish Balay v += 9; 85249b5e25fSSatish Balay } 85349b5e25fSSatish Balay xb +=3; ai++; 85449b5e25fSSatish Balay } 85549b5e25fSSatish Balay 8569566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx,&x)); 8579566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz,&z)); 85849b5e25fSSatish Balay 8599566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow))); 86049b5e25fSSatish Balay PetscFunctionReturn(0); 86149b5e25fSSatish Balay } 86249b5e25fSSatish Balay 863dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 86449b5e25fSSatish Balay { 86549b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 866d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4; 867d9ca1df4SBarry Smith const PetscScalar *x,*xb; 868d9ca1df4SBarry Smith const MatScalar *v; 869d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 870d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 87198c9bda7SSatish Balay PetscInt nonzerorow=0; 87249b5e25fSSatish Balay 87349b5e25fSSatish Balay PetscFunctionBegin; 8749566063dSJacob Faibussowitsch PetscCall(VecCopy(yy,zz)); 875c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 8769566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx,&x)); 8779566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz,&z)); 87849b5e25fSSatish Balay 87949b5e25fSSatish Balay v = a->a; 88049b5e25fSSatish Balay xb = x; 88149b5e25fSSatish Balay 88249b5e25fSSatish Balay for (i=0; i<mbs; i++) { 88349b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 88449b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 88549b5e25fSSatish Balay ib = aj + *ai; 886831a3094SHong Zhang jmin = 0; 88798c9bda7SSatish Balay nonzerorow += (n>0); 88859689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 88949b5e25fSSatish Balay z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 89049b5e25fSSatish Balay z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 89149b5e25fSSatish Balay z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 89249b5e25fSSatish Balay z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 893831a3094SHong Zhang v += 16; jmin++; 8947fbae186SHong Zhang } 895444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 896444d8c10SJed Brown PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 897831a3094SHong Zhang for (j=jmin; j<n; j++) { 89849b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 89949b5e25fSSatish Balay cval = ib[j]*4; 90049b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 90149b5e25fSSatish Balay z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 90249b5e25fSSatish Balay z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 90349b5e25fSSatish Balay z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 90449b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 90549b5e25fSSatish Balay z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 90649b5e25fSSatish Balay z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 90749b5e25fSSatish Balay z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 90849b5e25fSSatish Balay z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 90949b5e25fSSatish Balay v += 16; 91049b5e25fSSatish Balay } 91149b5e25fSSatish Balay xb +=4; ai++; 91249b5e25fSSatish Balay } 91349b5e25fSSatish Balay 9149566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx,&x)); 9159566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz,&z)); 91649b5e25fSSatish Balay 9179566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow))); 91849b5e25fSSatish Balay PetscFunctionReturn(0); 91949b5e25fSSatish Balay } 92049b5e25fSSatish Balay 921dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 92249b5e25fSSatish Balay { 92349b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 924d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,x5; 925d9ca1df4SBarry Smith const PetscScalar *x,*xb; 926d9ca1df4SBarry Smith const MatScalar *v; 927d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 928d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 92998c9bda7SSatish Balay PetscInt nonzerorow=0; 93049b5e25fSSatish Balay 93149b5e25fSSatish Balay PetscFunctionBegin; 9329566063dSJacob Faibussowitsch PetscCall(VecCopy(yy,zz)); 933c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 9349566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx,&x)); 9359566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz,&z)); 93649b5e25fSSatish Balay 93749b5e25fSSatish Balay v = a->a; 93849b5e25fSSatish Balay xb = x; 93949b5e25fSSatish Balay 94049b5e25fSSatish Balay for (i=0; i<mbs; i++) { 94149b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 94249b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 94349b5e25fSSatish Balay ib = aj + *ai; 944831a3094SHong Zhang jmin = 0; 94598c9bda7SSatish Balay nonzerorow += (n>0); 94659689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 94749b5e25fSSatish Balay z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 94849b5e25fSSatish Balay z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 94949b5e25fSSatish Balay z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 95049b5e25fSSatish Balay z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 95149b5e25fSSatish Balay z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 952831a3094SHong Zhang v += 25; jmin++; 9537fbae186SHong Zhang } 954444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 955444d8c10SJed Brown PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 956831a3094SHong Zhang for (j=jmin; j<n; j++) { 95749b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 95849b5e25fSSatish Balay cval = ib[j]*5; 95949b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 96049b5e25fSSatish Balay z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 96149b5e25fSSatish Balay z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 96249b5e25fSSatish Balay z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 96349b5e25fSSatish Balay z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 96449b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 96549b5e25fSSatish Balay z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 96649b5e25fSSatish Balay z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 96749b5e25fSSatish Balay z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 96849b5e25fSSatish Balay z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 96949b5e25fSSatish Balay z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 97049b5e25fSSatish Balay v += 25; 97149b5e25fSSatish Balay } 97249b5e25fSSatish Balay xb +=5; ai++; 97349b5e25fSSatish Balay } 97449b5e25fSSatish Balay 9759566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx,&x)); 9769566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz,&z)); 97749b5e25fSSatish Balay 9789566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow))); 97949b5e25fSSatish Balay PetscFunctionReturn(0); 98049b5e25fSSatish Balay } 981c2916339SPierre Jolivet 982dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 98349b5e25fSSatish Balay { 98449b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 985d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,x5,x6; 986d9ca1df4SBarry Smith const PetscScalar *x,*xb; 987d9ca1df4SBarry Smith const MatScalar *v; 988d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 989d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 99098c9bda7SSatish Balay PetscInt nonzerorow=0; 99149b5e25fSSatish Balay 99249b5e25fSSatish Balay PetscFunctionBegin; 9939566063dSJacob Faibussowitsch PetscCall(VecCopy(yy,zz)); 994c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 9959566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx,&x)); 9969566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz,&z)); 99749b5e25fSSatish Balay 99849b5e25fSSatish Balay v = a->a; 99949b5e25fSSatish Balay xb = x; 100049b5e25fSSatish Balay 100149b5e25fSSatish Balay for (i=0; i<mbs; i++) { 100249b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 100349b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 100449b5e25fSSatish Balay ib = aj + *ai; 1005831a3094SHong Zhang jmin = 0; 100698c9bda7SSatish Balay nonzerorow += (n>0); 100759689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 100849b5e25fSSatish Balay z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 100949b5e25fSSatish Balay z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 101049b5e25fSSatish Balay z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 101149b5e25fSSatish Balay z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 101249b5e25fSSatish Balay z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 101349b5e25fSSatish Balay z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 1014831a3094SHong Zhang v += 36; jmin++; 10157fbae186SHong Zhang } 1016444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1017444d8c10SJed Brown PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1018831a3094SHong Zhang for (j=jmin; j<n; j++) { 101949b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 102049b5e25fSSatish Balay cval = ib[j]*6; 102149b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 102249b5e25fSSatish Balay z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 102349b5e25fSSatish Balay z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 102449b5e25fSSatish Balay z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 102549b5e25fSSatish Balay z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 102649b5e25fSSatish Balay z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 102749b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 102849b5e25fSSatish Balay z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 102949b5e25fSSatish Balay z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 103049b5e25fSSatish Balay z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 103149b5e25fSSatish Balay z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 103249b5e25fSSatish Balay z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 103349b5e25fSSatish Balay z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 103449b5e25fSSatish Balay v += 36; 103549b5e25fSSatish Balay } 103649b5e25fSSatish Balay xb +=6; ai++; 103749b5e25fSSatish Balay } 103849b5e25fSSatish Balay 10399566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx,&x)); 10409566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz,&z)); 104149b5e25fSSatish Balay 10429566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow))); 104349b5e25fSSatish Balay PetscFunctionReturn(0); 104449b5e25fSSatish Balay } 104549b5e25fSSatish Balay 1046dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 104749b5e25fSSatish Balay { 104849b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1049d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,x5,x6,x7; 1050d9ca1df4SBarry Smith const PetscScalar *x,*xb; 1051d9ca1df4SBarry Smith const MatScalar *v; 1052d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 1053d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 105498c9bda7SSatish Balay PetscInt nonzerorow=0; 105549b5e25fSSatish Balay 105649b5e25fSSatish Balay PetscFunctionBegin; 10579566063dSJacob Faibussowitsch PetscCall(VecCopy(yy,zz)); 1058c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 10599566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx,&x)); 10609566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz,&z)); 106149b5e25fSSatish Balay 106249b5e25fSSatish Balay v = a->a; 106349b5e25fSSatish Balay xb = x; 106449b5e25fSSatish Balay 106549b5e25fSSatish Balay for (i=0; i<mbs; i++) { 106649b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 106749b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 106849b5e25fSSatish Balay ib = aj + *ai; 1069831a3094SHong Zhang jmin = 0; 107098c9bda7SSatish Balay nonzerorow += (n>0); 107159689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 107249b5e25fSSatish Balay z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 107349b5e25fSSatish Balay z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 107449b5e25fSSatish Balay z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 107549b5e25fSSatish Balay z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 107649b5e25fSSatish Balay z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 107749b5e25fSSatish Balay z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 107849b5e25fSSatish Balay z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1079831a3094SHong Zhang v += 49; jmin++; 10807fbae186SHong Zhang } 1081444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1082444d8c10SJed Brown PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1083831a3094SHong Zhang for (j=jmin; j<n; j++) { 108449b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 108549b5e25fSSatish Balay cval = ib[j]*7; 108649b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 108749b5e25fSSatish Balay z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 108849b5e25fSSatish Balay z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 108949b5e25fSSatish Balay z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 109049b5e25fSSatish Balay z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 109149b5e25fSSatish Balay z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 109249b5e25fSSatish Balay z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 109349b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 109449b5e25fSSatish Balay z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 109549b5e25fSSatish Balay z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 109649b5e25fSSatish Balay z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 109749b5e25fSSatish Balay z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 109849b5e25fSSatish Balay z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 109949b5e25fSSatish Balay z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 110049b5e25fSSatish Balay z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 110149b5e25fSSatish Balay v += 49; 110249b5e25fSSatish Balay } 110349b5e25fSSatish Balay xb +=7; ai++; 110449b5e25fSSatish Balay } 110549b5e25fSSatish Balay 11069566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx,&x)); 11079566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz,&z)); 110849b5e25fSSatish Balay 11099566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow))); 111049b5e25fSSatish Balay PetscFunctionReturn(0); 111149b5e25fSSatish Balay } 111249b5e25fSSatish Balay 1113dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 111449b5e25fSSatish Balay { 111549b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1116f4259b30SLisandro Dalcin PetscScalar *z,*z_ptr=NULL,*zb,*work,*workt; 1117d9ca1df4SBarry Smith const PetscScalar *x,*x_ptr,*xb; 1118d9ca1df4SBarry Smith const MatScalar *v; 1119d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 1120d9ca1df4SBarry Smith const PetscInt *idx,*aj,*ii; 112198c9bda7SSatish Balay PetscInt nonzerorow=0; 112249b5e25fSSatish Balay 112349b5e25fSSatish Balay PetscFunctionBegin; 11249566063dSJacob Faibussowitsch PetscCall(VecCopy(yy,zz)); 1125c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 11269566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx,&x)); x_ptr=x; 11279566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz,&z)); z_ptr=z; 112849b5e25fSSatish Balay 112949b5e25fSSatish Balay aj = a->j; 113049b5e25fSSatish Balay v = a->a; 113149b5e25fSSatish Balay ii = a->i; 113249b5e25fSSatish Balay 113349b5e25fSSatish Balay if (!a->mult_work) { 11349566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(A->rmap->n+1,&a->mult_work)); 113549b5e25fSSatish Balay } 113649b5e25fSSatish Balay work = a->mult_work; 113749b5e25fSSatish Balay 113849b5e25fSSatish Balay for (i=0; i<mbs; i++) { 113949b5e25fSSatish Balay n = ii[1] - ii[0]; ncols = n*bs; 114049b5e25fSSatish Balay workt = work; idx=aj+ii[0]; 114198c9bda7SSatish Balay nonzerorow += (n>0); 114249b5e25fSSatish Balay 114349b5e25fSSatish Balay /* upper triangular part */ 114449b5e25fSSatish Balay for (j=0; j<n; j++) { 114549b5e25fSSatish Balay xb = x_ptr + bs*(*idx++); 114649b5e25fSSatish Balay for (k=0; k<bs; k++) workt[k] = xb[k]; 114749b5e25fSSatish Balay workt += bs; 114849b5e25fSSatish Balay } 114949b5e25fSSatish Balay /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 115096b95a6bSBarry Smith PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 115149b5e25fSSatish Balay 115249b5e25fSSatish Balay /* strict lower triangular part */ 1153831a3094SHong Zhang idx = aj+ii[0]; 115459689ffbSStefano Zampini if (n && *idx == i) { 115596b9376eSHong Zhang ncols -= bs; v += bs2; idx++; n--; 1156831a3094SHong Zhang } 115749b5e25fSSatish Balay if (ncols > 0) { 115849b5e25fSSatish Balay workt = work; 11599566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(workt,ncols)); 116096b95a6bSBarry Smith PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 1161831a3094SHong Zhang for (j=0; j<n; j++) { 1162831a3094SHong Zhang zb = z_ptr + bs*(*idx++); 116349b5e25fSSatish Balay for (k=0; k<bs; k++) zb[k] += workt[k]; 116449b5e25fSSatish Balay workt += bs; 116549b5e25fSSatish Balay } 116649b5e25fSSatish Balay } 116749b5e25fSSatish Balay 116849b5e25fSSatish Balay x += bs; v += n*bs2; z += bs; ii++; 116949b5e25fSSatish Balay } 117049b5e25fSSatish Balay 11719566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx,&x)); 11729566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz,&z)); 117349b5e25fSSatish Balay 11749566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow))); 117549b5e25fSSatish Balay PetscFunctionReturn(0); 117649b5e25fSSatish Balay } 117749b5e25fSSatish Balay 1178f4df32b1SMatthew Knepley PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha) 117949b5e25fSSatish Balay { 118049b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data; 1181f4df32b1SMatthew Knepley PetscScalar oalpha = alpha; 1182c5df96a5SBarry Smith PetscBLASInt one = 1,totalnz; 118349b5e25fSSatish Balay 118449b5e25fSSatish Balay PetscFunctionBegin; 11859566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(a->bs2*a->nz,&totalnz)); 11868b83055fSJed Brown PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one)); 11879566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(totalnz)); 118849b5e25fSSatish Balay PetscFunctionReturn(0); 118949b5e25fSSatish Balay } 119049b5e25fSSatish Balay 1191dfbe8321SBarry Smith PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm) 119249b5e25fSSatish Balay { 119349b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1194d9ca1df4SBarry Smith const MatScalar *v = a->a; 119549b5e25fSSatish Balay PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1196d9ca1df4SBarry Smith PetscInt i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il; 1197d9ca1df4SBarry Smith const PetscInt *aj=a->j,*col; 119849b5e25fSSatish Balay 119949b5e25fSSatish Balay PetscFunctionBegin; 1200c40ae873SPierre Jolivet if (!a->nz) { 1201c40ae873SPierre Jolivet *norm = 0.0; 1202c40ae873SPierre Jolivet PetscFunctionReturn(0); 1203c40ae873SPierre Jolivet } 120449b5e25fSSatish Balay if (type == NORM_FROBENIUS) { 120549b5e25fSSatish Balay for (k=0; k<mbs; k++) { 120659689ffbSStefano Zampini jmin = a->i[k]; 120759689ffbSStefano Zampini jmax = a->i[k+1]; 1208831a3094SHong Zhang col = aj + jmin; 120959689ffbSStefano Zampini if (jmax-jmin > 0 && *col == k) { /* diagonal block */ 121049b5e25fSSatish Balay for (i=0; i<bs2; i++) { 121149b5e25fSSatish Balay sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++; 121249b5e25fSSatish Balay } 1213831a3094SHong Zhang jmin++; 1214831a3094SHong Zhang } 1215831a3094SHong Zhang for (j=jmin; j<jmax; j++) { /* off-diagonal blocks */ 121649b5e25fSSatish Balay for (i=0; i<bs2; i++) { 121749b5e25fSSatish Balay sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++; 121849b5e25fSSatish Balay } 121949b5e25fSSatish Balay } 122049b5e25fSSatish Balay } 12218f1a2a5eSBarry Smith *norm = PetscSqrtReal(sum_diag + 2*sum_off); 12229566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0*bs2*a->nz)); 12230b8dc8d2SHong Zhang } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 12249566063dSJacob Faibussowitsch PetscCall(PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl)); 12250b8dc8d2SHong Zhang for (i=0; i<mbs; i++) jl[i] = mbs; 12260b8dc8d2SHong Zhang il[0] = 0; 122749b5e25fSSatish Balay 122849b5e25fSSatish Balay *norm = 0.0; 122949b5e25fSSatish Balay for (k=0; k<mbs; k++) { /* k_th block row */ 123049b5e25fSSatish Balay for (j=0; j<bs; j++) sum[j]=0.0; 123149b5e25fSSatish Balay /*-- col sum --*/ 123249b5e25fSSatish Balay i = jl[k]; /* first |A(i,k)| to be added */ 1233831a3094SHong Zhang /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1234831a3094SHong Zhang at step k */ 123549b5e25fSSatish Balay while (i<mbs) { 123649b5e25fSSatish Balay nexti = jl[i]; /* next block row to be added */ 123749b5e25fSSatish Balay ik = il[i]; /* block index of A(i,k) in the array a */ 123849b5e25fSSatish Balay for (j=0; j<bs; j++) { 123949b5e25fSSatish Balay v = a->a + ik*bs2 + j*bs; 124049b5e25fSSatish Balay for (k1=0; k1<bs; k1++) { 124149b5e25fSSatish Balay sum[j] += PetscAbsScalar(*v); v++; 124249b5e25fSSatish Balay } 124349b5e25fSSatish Balay } 124449b5e25fSSatish Balay /* update il, jl */ 1245831a3094SHong Zhang jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1246831a3094SHong Zhang jmax = a->i[i+1]; 124749b5e25fSSatish Balay if (jmin < jmax) { 124849b5e25fSSatish Balay il[i] = jmin; 124949b5e25fSSatish Balay j = a->j[jmin]; 125049b5e25fSSatish Balay jl[i] = jl[j]; jl[j]=i; 125149b5e25fSSatish Balay } 125249b5e25fSSatish Balay i = nexti; 125349b5e25fSSatish Balay } 125449b5e25fSSatish Balay /*-- row sum --*/ 125559689ffbSStefano Zampini jmin = a->i[k]; 125659689ffbSStefano Zampini jmax = a->i[k+1]; 125749b5e25fSSatish Balay for (i=jmin; i<jmax; i++) { 125849b5e25fSSatish Balay for (j=0; j<bs; j++) { 125949b5e25fSSatish Balay v = a->a + i*bs2 + j; 126049b5e25fSSatish Balay for (k1=0; k1<bs; k1++) { 12610b8dc8d2SHong Zhang sum[j] += PetscAbsScalar(*v); v += bs; 126249b5e25fSSatish Balay } 126349b5e25fSSatish Balay } 126449b5e25fSSatish Balay } 126549b5e25fSSatish Balay /* add k_th block row to il, jl */ 1266831a3094SHong Zhang col = aj+jmin; 126759689ffbSStefano Zampini if (jmax - jmin > 0 && *col == k) jmin++; 126849b5e25fSSatish Balay if (jmin < jmax) { 126949b5e25fSSatish Balay il[k] = jmin; 12700b8dc8d2SHong Zhang j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k; 127149b5e25fSSatish Balay } 127249b5e25fSSatish Balay for (j=0; j<bs; j++) { 127349b5e25fSSatish Balay if (sum[j] > *norm) *norm = sum[j]; 127449b5e25fSSatish Balay } 127549b5e25fSSatish Balay } 12769566063dSJacob Faibussowitsch PetscCall(PetscFree3(sum,il,jl)); 12779566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(PetscMax(mbs*a->nz-1,0))); 1278f23aa3ddSBarry Smith } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 127949b5e25fSSatish Balay PetscFunctionReturn(0); 128049b5e25fSSatish Balay } 128149b5e25fSSatish Balay 1282ace3abfcSBarry Smith PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg) 128349b5e25fSSatish Balay { 128449b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data; 128549b5e25fSSatish Balay 128649b5e25fSSatish Balay PetscFunctionBegin; 128749b5e25fSSatish Balay /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1288d0f46423SBarry Smith if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1289ef511fbeSHong Zhang *flg = PETSC_FALSE; 1290ef511fbeSHong Zhang PetscFunctionReturn(0); 129149b5e25fSSatish Balay } 129249b5e25fSSatish Balay 129349b5e25fSSatish Balay /* if the a->i are the same */ 12949566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(a->i,b->i,a->mbs+1,flg)); 129526fbe8dcSKarl Rupp if (!*flg) PetscFunctionReturn(0); 129649b5e25fSSatish Balay 129749b5e25fSSatish Balay /* if a->j are the same */ 12989566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(a->j,b->j,a->nz,flg)); 129926fbe8dcSKarl Rupp if (!*flg) PetscFunctionReturn(0); 130026fbe8dcSKarl Rupp 130149b5e25fSSatish Balay /* if a->a are the same */ 13029566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs),flg)); 1303935af2e7SHong Zhang PetscFunctionReturn(0); 130449b5e25fSSatish Balay } 130549b5e25fSSatish Balay 1306dfbe8321SBarry Smith PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v) 130749b5e25fSSatish Balay { 130849b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1309d9ca1df4SBarry Smith PetscInt i,j,k,row,bs,ambs,bs2; 1310d9ca1df4SBarry Smith const PetscInt *ai,*aj; 131187828ca2SBarry Smith PetscScalar *x,zero = 0.0; 1312d9ca1df4SBarry Smith const MatScalar *aa,*aa_j; 131349b5e25fSSatish Balay 131449b5e25fSSatish Balay PetscFunctionBegin; 1315d0f46423SBarry Smith bs = A->rmap->bs; 1316aed4548fSBarry Smith PetscCheck(!A->factortype || bs <= 1,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1"); 131782799104SHong Zhang 131849b5e25fSSatish Balay aa = a->a; 13198a0c6efdSHong Zhang ambs = a->mbs; 13208a0c6efdSHong Zhang 13218a0c6efdSHong Zhang if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) { 13228a0c6efdSHong Zhang PetscInt *diag=a->diag; 13238a0c6efdSHong Zhang aa = a->a; 13248a0c6efdSHong Zhang ambs = a->mbs; 13259566063dSJacob Faibussowitsch PetscCall(VecGetArray(v,&x)); 13268a0c6efdSHong Zhang for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]]; 13279566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(v,&x)); 13288a0c6efdSHong Zhang PetscFunctionReturn(0); 13298a0c6efdSHong Zhang } 13308a0c6efdSHong Zhang 133149b5e25fSSatish Balay ai = a->i; 133249b5e25fSSatish Balay aj = a->j; 133349b5e25fSSatish Balay bs2 = a->bs2; 13349566063dSJacob Faibussowitsch PetscCall(VecSet(v,zero)); 1335c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 13369566063dSJacob Faibussowitsch PetscCall(VecGetArray(v,&x)); 133749b5e25fSSatish Balay for (i=0; i<ambs; i++) { 133849b5e25fSSatish Balay j = ai[i]; 133949b5e25fSSatish Balay if (aj[j] == i) { /* if this is a diagonal element */ 134049b5e25fSSatish Balay row = i*bs; 134149b5e25fSSatish Balay aa_j = aa + j*bs2; 134249b5e25fSSatish Balay for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 134349b5e25fSSatish Balay } 134449b5e25fSSatish Balay } 13459566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(v,&x)); 134649b5e25fSSatish Balay PetscFunctionReturn(0); 134749b5e25fSSatish Balay } 134849b5e25fSSatish Balay 1349dfbe8321SBarry Smith PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr) 135049b5e25fSSatish Balay { 135149b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1352d9ca1df4SBarry Smith PetscScalar x; 1353d9ca1df4SBarry Smith const PetscScalar *l,*li,*ri; 135449b5e25fSSatish Balay MatScalar *aa,*v; 1355fff8e43fSBarry Smith PetscInt i,j,k,lm,M,m,mbs,tmp,bs,bs2; 1356fff8e43fSBarry Smith const PetscInt *ai,*aj; 1357ace3abfcSBarry Smith PetscBool flg; 135849b5e25fSSatish Balay 135949b5e25fSSatish Balay PetscFunctionBegin; 1360b3bf805bSHong Zhang if (ll != rr) { 13619566063dSJacob Faibussowitsch PetscCall(VecEqual(ll,rr,&flg)); 136228b400f6SJacob Faibussowitsch PetscCheck(flg,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same"); 1363b3bf805bSHong Zhang } 1364b3bf805bSHong Zhang if (!ll) PetscFunctionReturn(0); 136549b5e25fSSatish Balay ai = a->i; 136649b5e25fSSatish Balay aj = a->j; 136749b5e25fSSatish Balay aa = a->a; 1368d0f46423SBarry Smith m = A->rmap->N; 1369d0f46423SBarry Smith bs = A->rmap->bs; 137049b5e25fSSatish Balay mbs = a->mbs; 137149b5e25fSSatish Balay bs2 = a->bs2; 137249b5e25fSSatish Balay 13739566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(ll,&l)); 13749566063dSJacob Faibussowitsch PetscCall(VecGetLocalSize(ll,&lm)); 137508401ef6SPierre Jolivet PetscCheck(lm == m,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 137649b5e25fSSatish Balay for (i=0; i<mbs; i++) { /* for each block row */ 137749b5e25fSSatish Balay M = ai[i+1] - ai[i]; 137849b5e25fSSatish Balay li = l + i*bs; 137949b5e25fSSatish Balay v = aa + bs2*ai[i]; 138049b5e25fSSatish Balay for (j=0; j<M; j++) { /* for each block */ 138149b5e25fSSatish Balay ri = l + bs*aj[ai[i]+j]; 13825e90f9d9SHong Zhang for (k=0; k<bs; k++) { 138349b5e25fSSatish Balay x = ri[k]; 138449b5e25fSSatish Balay for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x; 138549b5e25fSSatish Balay } 138649b5e25fSSatish Balay } 138749b5e25fSSatish Balay } 13889566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(ll,&l)); 13899566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0*a->nz)); 139049b5e25fSSatish Balay PetscFunctionReturn(0); 139149b5e25fSSatish Balay } 139249b5e25fSSatish Balay 1393dfbe8321SBarry Smith PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info) 139449b5e25fSSatish Balay { 139549b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 139649b5e25fSSatish Balay 139749b5e25fSSatish Balay PetscFunctionBegin; 139849b5e25fSSatish Balay info->block_size = a->bs2; 1399ceed8ce5SJed Brown info->nz_allocated = a->bs2*a->maxnz; /*num. of nonzeros in upper triangular part */ 14006c6c5352SBarry Smith info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */ 14013966268fSBarry Smith info->nz_unneeded = info->nz_allocated - info->nz_used; 140249b5e25fSSatish Balay info->assemblies = A->num_ass; 14038e58a170SBarry Smith info->mallocs = A->info.mallocs; 14047adad957SLisandro Dalcin info->memory = ((PetscObject)A)->mem; 1405d5f3da31SBarry Smith if (A->factortype) { 140649b5e25fSSatish Balay info->fill_ratio_given = A->info.fill_ratio_given; 140749b5e25fSSatish Balay info->fill_ratio_needed = A->info.fill_ratio_needed; 140849b5e25fSSatish Balay info->factor_mallocs = A->info.factor_mallocs; 140949b5e25fSSatish Balay } else { 141049b5e25fSSatish Balay info->fill_ratio_given = 0; 141149b5e25fSSatish Balay info->fill_ratio_needed = 0; 141249b5e25fSSatish Balay info->factor_mallocs = 0; 141349b5e25fSSatish Balay } 141449b5e25fSSatish Balay PetscFunctionReturn(0); 141549b5e25fSSatish Balay } 141649b5e25fSSatish Balay 1417dfbe8321SBarry Smith PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 141849b5e25fSSatish Balay { 141949b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 142049b5e25fSSatish Balay 142149b5e25fSSatish Balay PetscFunctionBegin; 14229566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(a->a,a->bs2*a->i[a->mbs])); 142349b5e25fSSatish Balay PetscFunctionReturn(0); 142449b5e25fSSatish Balay } 1425dc354874SHong Zhang 1426985db425SBarry Smith PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[]) 1427dc354874SHong Zhang { 1428dc354874SHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1429d9ca1df4SBarry Smith PetscInt i,j,n,row,col,bs,mbs; 1430d9ca1df4SBarry Smith const PetscInt *ai,*aj; 1431c3fca9a7SHong Zhang PetscReal atmp; 1432d9ca1df4SBarry Smith const MatScalar *aa; 1433985db425SBarry Smith PetscScalar *x; 143413f74950SBarry Smith PetscInt ncols,brow,bcol,krow,kcol; 1435dc354874SHong Zhang 1436dc354874SHong Zhang PetscFunctionBegin; 143728b400f6SJacob Faibussowitsch PetscCheck(!idx,PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov"); 143828b400f6SJacob Faibussowitsch PetscCheck(!A->factortype,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1439d0f46423SBarry Smith bs = A->rmap->bs; 1440dc354874SHong Zhang aa = a->a; 1441dc354874SHong Zhang ai = a->i; 1442dc354874SHong Zhang aj = a->j; 144344117c81SHong Zhang mbs = a->mbs; 1444dc354874SHong Zhang 14459566063dSJacob Faibussowitsch PetscCall(VecSet(v,0.0)); 14469566063dSJacob Faibussowitsch PetscCall(VecGetArray(v,&x)); 14479566063dSJacob Faibussowitsch PetscCall(VecGetLocalSize(v,&n)); 144808401ef6SPierre Jolivet PetscCheck(n == A->rmap->N,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 144944117c81SHong Zhang for (i=0; i<mbs; i++) { 1450d0f6400bSHong Zhang ncols = ai[1] - ai[0]; ai++; 1451d0f6400bSHong Zhang brow = bs*i; 145244117c81SHong Zhang for (j=0; j<ncols; j++) { 1453d0f6400bSHong Zhang bcol = bs*(*aj); 145444117c81SHong Zhang for (kcol=0; kcol<bs; kcol++) { 1455d0f6400bSHong Zhang col = bcol + kcol; /* col index */ 145644117c81SHong Zhang for (krow=0; krow<bs; krow++) { 1457d0f6400bSHong Zhang atmp = PetscAbsScalar(*aa); aa++; 1458d0f6400bSHong Zhang row = brow + krow; /* row index */ 1459c3fca9a7SHong Zhang if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1460c3fca9a7SHong Zhang if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 146144117c81SHong Zhang } 146244117c81SHong Zhang } 1463d0f6400bSHong Zhang aj++; 1464dc354874SHong Zhang } 1465dc354874SHong Zhang } 14669566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(v,&x)); 1467dc354874SHong Zhang PetscFunctionReturn(0); 1468dc354874SHong Zhang } 1469c2916339SPierre Jolivet 14704222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C) 1471c2916339SPierre Jolivet { 1472c2916339SPierre Jolivet PetscFunctionBegin; 14739566063dSJacob Faibussowitsch PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C)); 14744222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense; 1475c2916339SPierre Jolivet PetscFunctionReturn(0); 1476c2916339SPierre Jolivet } 1477c2916339SPierre Jolivet 1478c2916339SPierre Jolivet PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 1479c2916339SPierre Jolivet { 1480c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1481c2916339SPierre Jolivet PetscScalar *z = c; 1482c2916339SPierre Jolivet const PetscScalar *xb; 1483c2916339SPierre Jolivet PetscScalar x1; 1484c2916339SPierre Jolivet const MatScalar *v = a->a,*vv; 1485c2916339SPierre Jolivet PetscInt mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k; 14863c2e41e1SStefano Zampini #if defined(PETSC_USE_COMPLEX) 1487*b94d7dedSBarry Smith const int aconj = A->hermitian == PETSC_BOOL3_TRUE; 14883c2e41e1SStefano Zampini #else 14893c2e41e1SStefano Zampini const int aconj = 0; 14903c2e41e1SStefano Zampini #endif 1491c2916339SPierre Jolivet 1492c2916339SPierre Jolivet PetscFunctionBegin; 1493c2916339SPierre Jolivet for (i=0; i<mbs; i++) { 1494c2916339SPierre Jolivet n = ii[1] - ii[0]; ii++; 1495c2916339SPierre Jolivet PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1496c2916339SPierre Jolivet PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1497c2916339SPierre Jolivet jj = idx; 1498c2916339SPierre Jolivet vv = v; 1499c2916339SPierre Jolivet for (k=0; k<cn; k++) { 1500c2916339SPierre Jolivet idx = jj; 1501c2916339SPierre Jolivet v = vv; 1502c2916339SPierre Jolivet for (j=0; j<n; j++) { 1503c2916339SPierre Jolivet xb = b + (*idx); x1 = xb[0+k*bm]; 1504c2916339SPierre Jolivet z[0+k*cm] += v[0]*x1; 15053c2e41e1SStefano Zampini if (*idx != i) c[(*idx)+k*cm] += (aconj ? PetscConj(v[0]) : v[0])*b[i+k*bm]; 1506c2916339SPierre Jolivet v += 1; 1507c2916339SPierre Jolivet ++idx; 1508c2916339SPierre Jolivet } 1509c2916339SPierre Jolivet } 1510c2916339SPierre Jolivet z += 1; 1511c2916339SPierre Jolivet } 1512c2916339SPierre Jolivet PetscFunctionReturn(0); 1513c2916339SPierre Jolivet } 1514c2916339SPierre Jolivet 1515c2916339SPierre Jolivet PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 1516c2916339SPierre Jolivet { 1517c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1518c2916339SPierre Jolivet PetscScalar *z = c; 1519c2916339SPierre Jolivet const PetscScalar *xb; 1520c2916339SPierre Jolivet PetscScalar x1,x2; 1521c2916339SPierre Jolivet const MatScalar *v = a->a,*vv; 1522c2916339SPierre Jolivet PetscInt mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k; 1523c2916339SPierre Jolivet 1524c2916339SPierre Jolivet PetscFunctionBegin; 1525c2916339SPierre Jolivet for (i=0; i<mbs; i++) { 1526c2916339SPierre Jolivet n = ii[1] - ii[0]; ii++; 1527c2916339SPierre Jolivet PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1528c2916339SPierre Jolivet PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1529c2916339SPierre Jolivet jj = idx; 1530c2916339SPierre Jolivet vv = v; 1531c2916339SPierre Jolivet for (k=0; k<cn; k++) { 1532c2916339SPierre Jolivet idx = jj; 1533c2916339SPierre Jolivet v = vv; 1534c2916339SPierre Jolivet for (j=0; j<n; j++) { 1535c2916339SPierre Jolivet xb = b + 2*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; 1536c2916339SPierre Jolivet z[0+k*cm] += v[0]*x1 + v[2]*x2; 1537c2916339SPierre Jolivet z[1+k*cm] += v[1]*x1 + v[3]*x2; 1538c2916339SPierre Jolivet if (*idx != i) { 1539c2916339SPierre Jolivet c[2*(*idx)+0+k*cm] += v[0]*b[2*i+k*bm] + v[1]*b[2*i+1+k*bm]; 1540c2916339SPierre Jolivet c[2*(*idx)+1+k*cm] += v[2]*b[2*i+k*bm] + v[3]*b[2*i+1+k*bm]; 1541c2916339SPierre Jolivet } 1542c2916339SPierre Jolivet v += 4; 1543c2916339SPierre Jolivet ++idx; 1544c2916339SPierre Jolivet } 1545c2916339SPierre Jolivet } 1546c2916339SPierre Jolivet z += 2; 1547c2916339SPierre Jolivet } 1548c2916339SPierre Jolivet PetscFunctionReturn(0); 1549c2916339SPierre Jolivet } 1550c2916339SPierre Jolivet 1551c2916339SPierre Jolivet PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 1552c2916339SPierre Jolivet { 1553c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1554c2916339SPierre Jolivet PetscScalar *z = c; 1555c2916339SPierre Jolivet const PetscScalar *xb; 1556c2916339SPierre Jolivet PetscScalar x1,x2,x3; 1557c2916339SPierre Jolivet const MatScalar *v = a->a,*vv; 1558c2916339SPierre Jolivet PetscInt mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k; 1559c2916339SPierre Jolivet 1560c2916339SPierre Jolivet PetscFunctionBegin; 1561c2916339SPierre Jolivet for (i=0; i<mbs; i++) { 1562c2916339SPierre Jolivet n = ii[1] - ii[0]; ii++; 1563c2916339SPierre Jolivet PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1564c2916339SPierre Jolivet PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1565c2916339SPierre Jolivet jj = idx; 1566c2916339SPierre Jolivet vv = v; 1567c2916339SPierre Jolivet for (k=0; k<cn; k++) { 1568c2916339SPierre Jolivet idx = jj; 1569c2916339SPierre Jolivet v = vv; 1570c2916339SPierre Jolivet for (j=0; j<n; j++) { 1571c2916339SPierre Jolivet xb = b + 3*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; 1572c2916339SPierre Jolivet z[0+k*cm] += v[0]*x1 + v[3]*x2 + v[6]*x3; 1573c2916339SPierre Jolivet z[1+k*cm] += v[1]*x1 + v[4]*x2 + v[7]*x3; 1574c2916339SPierre Jolivet z[2+k*cm] += v[2]*x1 + v[5]*x2 + v[8]*x3; 1575c2916339SPierre Jolivet if (*idx != i) { 1576c2916339SPierre Jolivet c[3*(*idx)+0+k*cm] += v[0]*b[3*i+k*bm] + v[3]*b[3*i+1+k*bm] + v[6]*b[3*i+2+k*bm]; 1577c2916339SPierre Jolivet c[3*(*idx)+1+k*cm] += v[1]*b[3*i+k*bm] + v[4]*b[3*i+1+k*bm] + v[7]*b[3*i+2+k*bm]; 1578c2916339SPierre Jolivet c[3*(*idx)+2+k*cm] += v[2]*b[3*i+k*bm] + v[5]*b[3*i+1+k*bm] + v[8]*b[3*i+2+k*bm]; 1579c2916339SPierre Jolivet } 1580c2916339SPierre Jolivet v += 9; 1581c2916339SPierre Jolivet ++idx; 1582c2916339SPierre Jolivet } 1583c2916339SPierre Jolivet } 1584c2916339SPierre Jolivet z += 3; 1585c2916339SPierre Jolivet } 1586c2916339SPierre Jolivet PetscFunctionReturn(0); 1587c2916339SPierre Jolivet } 1588c2916339SPierre Jolivet 1589c2916339SPierre Jolivet PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 1590c2916339SPierre Jolivet { 1591c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1592c2916339SPierre Jolivet PetscScalar *z = c; 1593c2916339SPierre Jolivet const PetscScalar *xb; 1594c2916339SPierre Jolivet PetscScalar x1,x2,x3,x4; 1595c2916339SPierre Jolivet const MatScalar *v = a->a,*vv; 1596c2916339SPierre Jolivet PetscInt mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k; 1597c2916339SPierre Jolivet 1598c2916339SPierre Jolivet PetscFunctionBegin; 1599c2916339SPierre Jolivet for (i=0; i<mbs; i++) { 1600c2916339SPierre Jolivet n = ii[1] - ii[0]; ii++; 1601c2916339SPierre Jolivet PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1602c2916339SPierre Jolivet PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1603c2916339SPierre Jolivet jj = idx; 1604c2916339SPierre Jolivet vv = v; 1605c2916339SPierre Jolivet for (k=0; k<cn; k++) { 1606c2916339SPierre Jolivet idx = jj; 1607c2916339SPierre Jolivet v = vv; 1608c2916339SPierre Jolivet for (j=0; j<n; j++) { 1609c2916339SPierre Jolivet xb = b + 4*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; 1610c2916339SPierre Jolivet z[0+k*cm] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1611c2916339SPierre Jolivet z[1+k*cm] += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1612c2916339SPierre Jolivet z[2+k*cm] += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1613c2916339SPierre Jolivet z[3+k*cm] += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1614c2916339SPierre Jolivet if (*idx != i) { 1615c2916339SPierre Jolivet c[4*(*idx)+0+k*cm] += v[0]*b[4*i+k*bm] + v[4]*b[4*i+1+k*bm] + v[8]*b[4*i+2+k*bm] + v[12]*b[4*i+3+k*bm]; 1616c2916339SPierre Jolivet c[4*(*idx)+1+k*cm] += v[1]*b[4*i+k*bm] + v[5]*b[4*i+1+k*bm] + v[9]*b[4*i+2+k*bm] + v[13]*b[4*i+3+k*bm]; 1617c2916339SPierre Jolivet c[4*(*idx)+2+k*cm] += v[2]*b[4*i+k*bm] + v[6]*b[4*i+1+k*bm] + v[10]*b[4*i+2+k*bm] + v[14]*b[4*i+3+k*bm]; 1618c2916339SPierre Jolivet c[4*(*idx)+3+k*cm] += v[3]*b[4*i+k*bm] + v[7]*b[4*i+1+k*bm] + v[11]*b[4*i+2+k*bm] + v[15]*b[4*i+3+k*bm]; 1619c2916339SPierre Jolivet } 1620c2916339SPierre Jolivet v += 16; 1621c2916339SPierre Jolivet ++idx; 1622c2916339SPierre Jolivet } 1623c2916339SPierre Jolivet } 1624c2916339SPierre Jolivet z += 4; 1625c2916339SPierre Jolivet } 1626c2916339SPierre Jolivet PetscFunctionReturn(0); 1627c2916339SPierre Jolivet } 1628c2916339SPierre Jolivet 1629c2916339SPierre Jolivet PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 1630c2916339SPierre Jolivet { 1631c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1632c2916339SPierre Jolivet PetscScalar *z = c; 1633c2916339SPierre Jolivet const PetscScalar *xb; 1634c2916339SPierre Jolivet PetscScalar x1,x2,x3,x4,x5; 1635c2916339SPierre Jolivet const MatScalar *v = a->a,*vv; 1636c2916339SPierre Jolivet PetscInt mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k; 1637c2916339SPierre Jolivet 1638c2916339SPierre Jolivet PetscFunctionBegin; 1639c2916339SPierre Jolivet for (i=0; i<mbs; i++) { 1640c2916339SPierre Jolivet n = ii[1] - ii[0]; ii++; 1641c2916339SPierre Jolivet PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1642c2916339SPierre Jolivet PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1643c2916339SPierre Jolivet jj = idx; 1644c2916339SPierre Jolivet vv = v; 1645c2916339SPierre Jolivet for (k=0; k<cn; k++) { 1646c2916339SPierre Jolivet idx = jj; 1647c2916339SPierre Jolivet v = vv; 1648c2916339SPierre Jolivet for (j=0; j<n; j++) { 1649c2916339SPierre Jolivet xb = b + 5*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; x5 = xb[4+k*cm]; 1650c2916339SPierre Jolivet z[0+k*cm] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1651c2916339SPierre Jolivet z[1+k*cm] += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1652c2916339SPierre Jolivet z[2+k*cm] += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1653c2916339SPierre Jolivet z[3+k*cm] += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1654c2916339SPierre Jolivet z[4+k*cm] += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1655c2916339SPierre Jolivet if (*idx != i) { 1656c2916339SPierre Jolivet c[5*(*idx)+0+k*cm] += v[0]*b[5*i+k*bm] + v[5]*b[5*i+1+k*bm] + v[10]*b[5*i+2+k*bm] + v[15]*b[5*i+3+k*bm] + v[20]*b[5*i+4+k*bm]; 1657c2916339SPierre Jolivet c[5*(*idx)+1+k*cm] += v[1]*b[5*i+k*bm] + v[6]*b[5*i+1+k*bm] + v[11]*b[5*i+2+k*bm] + v[16]*b[5*i+3+k*bm] + v[21]*b[5*i+4+k*bm]; 1658c2916339SPierre Jolivet c[5*(*idx)+2+k*cm] += v[2]*b[5*i+k*bm] + v[7]*b[5*i+1+k*bm] + v[12]*b[5*i+2+k*bm] + v[17]*b[5*i+3+k*bm] + v[22]*b[5*i+4+k*bm]; 1659c2916339SPierre Jolivet c[5*(*idx)+3+k*cm] += v[3]*b[5*i+k*bm] + v[8]*b[5*i+1+k*bm] + v[13]*b[5*i+2+k*bm] + v[18]*b[5*i+3+k*bm] + v[23]*b[5*i+4+k*bm]; 1660c2916339SPierre Jolivet c[5*(*idx)+4+k*cm] += v[4]*b[5*i+k*bm] + v[9]*b[5*i+1+k*bm] + v[14]*b[5*i+2+k*bm] + v[19]*b[5*i+3+k*bm] + v[24]*b[5*i+4+k*bm]; 1661c2916339SPierre Jolivet } 1662c2916339SPierre Jolivet v += 25; 1663c2916339SPierre Jolivet ++idx; 1664c2916339SPierre Jolivet } 1665c2916339SPierre Jolivet } 1666c2916339SPierre Jolivet z += 5; 1667c2916339SPierre Jolivet } 1668c2916339SPierre Jolivet PetscFunctionReturn(0); 1669c2916339SPierre Jolivet } 1670c2916339SPierre Jolivet 1671c2916339SPierre Jolivet PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A,Mat B,Mat C) 1672c2916339SPierre Jolivet { 1673c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1674c2916339SPierre Jolivet Mat_SeqDense *bd = (Mat_SeqDense*)B->data; 1675281439baSStefano Zampini Mat_SeqDense *cd = (Mat_SeqDense*)C->data; 1676c2916339SPierre Jolivet PetscInt cm=cd->lda,cn=B->cmap->n,bm=bd->lda; 1677c2916339SPierre Jolivet PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 1678c2916339SPierre Jolivet PetscBLASInt bbs,bcn,bbm,bcm; 1679f4259b30SLisandro Dalcin PetscScalar *z = NULL; 1680c2916339SPierre Jolivet PetscScalar *c,*b; 1681c2916339SPierre Jolivet const MatScalar *v; 1682c2916339SPierre Jolivet const PetscInt *idx,*ii; 1683c2916339SPierre Jolivet PetscScalar _DOne=1.0; 1684c2916339SPierre Jolivet 1685c2916339SPierre Jolivet PetscFunctionBegin; 1686c2916339SPierre Jolivet if (!cm || !cn) PetscFunctionReturn(0); 168708401ef6SPierre Jolivet PetscCheck(B->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT,A->cmap->n,B->rmap->n); 168808401ef6SPierre Jolivet PetscCheck(A->rmap->n == C->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT,C->rmap->n,A->rmap->n); 168908401ef6SPierre Jolivet PetscCheck(B->cmap->n == C->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT,B->cmap->n,C->cmap->n); 1690c2916339SPierre Jolivet b = bd->v; 16919566063dSJacob Faibussowitsch PetscCall(MatZeroEntries(C)); 16929566063dSJacob Faibussowitsch PetscCall(MatDenseGetArray(C,&c)); 1693c2916339SPierre Jolivet switch (bs) { 1694c2916339SPierre Jolivet case 1: 16959566063dSJacob Faibussowitsch PetscCall(MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn)); 1696c2916339SPierre Jolivet break; 1697c2916339SPierre Jolivet case 2: 16989566063dSJacob Faibussowitsch PetscCall(MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn)); 1699c2916339SPierre Jolivet break; 1700c2916339SPierre Jolivet case 3: 17019566063dSJacob Faibussowitsch PetscCall(MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn)); 1702c2916339SPierre Jolivet break; 1703c2916339SPierre Jolivet case 4: 17049566063dSJacob Faibussowitsch PetscCall(MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn)); 1705c2916339SPierre Jolivet break; 1706c2916339SPierre Jolivet case 5: 17079566063dSJacob Faibussowitsch PetscCall(MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn)); 1708c2916339SPierre Jolivet break; 1709c2916339SPierre Jolivet default: /* block sizes larger than 5 by 5 are handled by BLAS */ 17109566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(bs,&bbs)); 17119566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(cn,&bcn)); 17129566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(bm,&bbm)); 17139566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(cm,&bcm)); 1714c2916339SPierre Jolivet idx = a->j; 1715c2916339SPierre Jolivet v = a->a; 1716c2916339SPierre Jolivet mbs = a->mbs; 1717c2916339SPierre Jolivet ii = a->i; 1718c2916339SPierre Jolivet z = c; 1719c2916339SPierre Jolivet for (i=0; i<mbs; i++) { 1720c2916339SPierre Jolivet n = ii[1] - ii[0]; ii++; 1721c2916339SPierre Jolivet for (j=0; j<n; j++) { 17226718818eSStefano Zampini if (*idx != i) PetscStackCallBLAS("BLASgemm",BLASgemm_("T","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*i,&bbm,&_DOne,c+bs*(*idx),&bcm)); 1723c2916339SPierre Jolivet PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm)); 1724c2916339SPierre Jolivet v += bs2; 1725c2916339SPierre Jolivet } 1726c2916339SPierre Jolivet z += bs; 1727c2916339SPierre Jolivet } 1728c2916339SPierre Jolivet } 17299566063dSJacob Faibussowitsch PetscCall(MatDenseRestoreArray(C,&c)); 17309566063dSJacob Faibussowitsch PetscCall(PetscLogFlops((2.0*(a->nz*2.0 - a->nonzerorowcnt)*bs2 - a->nonzerorowcnt)*cn)); 1731c2916339SPierre Jolivet PetscFunctionReturn(0); 1732c2916339SPierre Jolivet } 1733