149b5e25fSSatish Balay 2c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h> 3af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h> 4c6db04a5SJed Brown #include <petscbt.h> 5c6db04a5SJed Brown #include <../src/mat/impls/sbaij/seq/sbaij.h> 6c6db04a5SJed Brown #include <petscblaslapack.h> 749b5e25fSSatish Balay 84a2ae208SSatish Balay #undef __FUNCT__ 94a2ae208SSatish Balay #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ" 1013f74950SBarry Smith PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 1149b5e25fSSatish Balay { 125eee224dSHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 136849ba73SBarry Smith PetscErrorCode ierr; 145d0c19d7SBarry Smith PetscInt brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2; 155d0c19d7SBarry Smith const PetscInt *idx; 16db41cccfSHong Zhang PetscBT table_out,table_in; 17d94109b8SHong Zhang 18d94109b8SHong Zhang PetscFunctionBegin; 19e32f2f54SBarry Smith if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 20d94109b8SHong Zhang mbs = a->mbs; 21d94109b8SHong Zhang ai = a->i; 22d94109b8SHong Zhang aj = a->j; 23d0f46423SBarry Smith bs = A->rmap->bs; 2453b8de81SBarry Smith ierr = PetscBTCreate(mbs,&table_out);CHKERRQ(ierr); 25854ce69bSBarry Smith ierr = PetscMalloc1(mbs+1,&nidx);CHKERRQ(ierr); 26854ce69bSBarry Smith ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr); 2753b8de81SBarry Smith ierr = PetscBTCreate(mbs,&table_in);CHKERRQ(ierr); 28d94109b8SHong Zhang 29d94109b8SHong Zhang for (i=0; i<is_max; i++) { /* for each is */ 30d94109b8SHong Zhang isz = 0; 31db41cccfSHong Zhang ierr = PetscBTMemzero(mbs,table_out);CHKERRQ(ierr); 32d94109b8SHong Zhang 33d94109b8SHong Zhang /* Extract the indices, assume there can be duplicate entries */ 34d94109b8SHong Zhang ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 35d94109b8SHong Zhang ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 36d94109b8SHong Zhang 37db41cccfSHong Zhang /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */ 38dbe03f88SHong Zhang bcol_max = 0; 39d94109b8SHong Zhang for (j=0; j<n; ++j) { 40d94109b8SHong Zhang brow = idx[j]/bs; /* convert the indices into block indices */ 41e32f2f54SBarry Smith if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 42db41cccfSHong Zhang if (!PetscBTLookupSet(table_out,brow)) { 43dbe03f88SHong Zhang nidx[isz++] = brow; 44dbe03f88SHong Zhang if (bcol_max < brow) bcol_max = brow; 45dbe03f88SHong Zhang } 46d94109b8SHong Zhang } 47d94109b8SHong Zhang ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 486bf464f9SBarry Smith ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 49d94109b8SHong Zhang 50d94109b8SHong Zhang k = 0; 51d94109b8SHong Zhang for (j=0; j<ov; j++) { /* for each overlap */ 52db41cccfSHong Zhang /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 53db41cccfSHong Zhang ierr = PetscBTMemzero(mbs,table_in);CHKERRQ(ierr); 54db41cccfSHong Zhang for (l=k; l<isz; l++) { ierr = PetscBTSet(table_in,nidx[l]);CHKERRQ(ierr); } 55d94109b8SHong Zhang 56d94109b8SHong Zhang n = isz; /* length of the updated is[i] */ 57d94109b8SHong Zhang for (brow=0; brow<mbs; brow++) { 58d94109b8SHong Zhang start = ai[brow]; end = ai[brow+1]; 59db41cccfSHong Zhang if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */ 60d94109b8SHong Zhang for (l = start; l<end; l++) { 61d94109b8SHong Zhang bcol = aj[l]; 62d7b97159SHong Zhang if (!PetscBTLookupSet(table_out,bcol)) { 63d7b97159SHong Zhang nidx[isz++] = bcol; 64d7b97159SHong Zhang if (bcol_max < bcol) bcol_max = bcol; 65d7b97159SHong Zhang } 66d94109b8SHong Zhang } 67d94109b8SHong Zhang k++; 68d94109b8SHong Zhang if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 69d94109b8SHong Zhang } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 70d94109b8SHong Zhang for (l = start; l<end; l++) { 71d94109b8SHong Zhang bcol = aj[l]; 72dbe03f88SHong Zhang if (bcol > bcol_max) break; 73db41cccfSHong Zhang if (PetscBTLookup(table_in,bcol)) { 7426fbe8dcSKarl Rupp if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow; 75d94109b8SHong Zhang break; /* for l = start; l<end ; l++) */ 76d94109b8SHong Zhang } 77d94109b8SHong Zhang } 78d94109b8SHong Zhang } 79d94109b8SHong Zhang } 80d94109b8SHong Zhang } /* for each overlap */ 81d94109b8SHong Zhang 82d94109b8SHong Zhang /* expand the Index Set */ 83d94109b8SHong Zhang for (j=0; j<isz; j++) { 8426fbe8dcSKarl Rupp for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k; 85d94109b8SHong Zhang } 8670b3c8c7SBarry Smith ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 87d94109b8SHong Zhang } 8894bacf5dSBarry Smith ierr = PetscBTDestroy(&table_out);CHKERRQ(ierr); 89d94109b8SHong Zhang ierr = PetscFree(nidx);CHKERRQ(ierr); 90d94109b8SHong Zhang ierr = PetscFree(nidx2);CHKERRQ(ierr); 9194bacf5dSBarry Smith ierr = PetscBTDestroy(&table_in);CHKERRQ(ierr); 925eee224dSHong Zhang PetscFunctionReturn(0); 9349b5e25fSSatish Balay } 9449b5e25fSSatish Balay 95*847daeccSHong Zhang /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc. 96*847daeccSHong Zhang Zero some ops' to avoid invalid usse */ 974a2ae208SSatish Balay #undef __FUNCT__ 98*847daeccSHong Zhang #define __FUNCT__ "MatSeqSBAIJZeroOps_Private" 99*847daeccSHong Zhang PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq) 10049b5e25fSSatish Balay { 1016849ba73SBarry Smith PetscErrorCode ierr; 10249b5e25fSSatish Balay 10349b5e25fSSatish Balay PetscFunctionBegin; 104*847daeccSHong Zhang ierr = MatSetOption(Bseq,MAT_SYMMETRIC,PETSC_FALSE);CHKERRQ(ierr); 105*847daeccSHong Zhang Bseq->ops->mult = 0; 106*847daeccSHong Zhang Bseq->ops->multadd = 0; 107*847daeccSHong Zhang Bseq->ops->multtranspose = 0; 108*847daeccSHong Zhang Bseq->ops->multtransposeadd = 0; 109*847daeccSHong Zhang Bseq->ops->lufactor = 0; 110*847daeccSHong Zhang Bseq->ops->choleskyfactor = 0; 111*847daeccSHong Zhang Bseq->ops->lufactorsymbolic = 0; 112*847daeccSHong Zhang Bseq->ops->choleskyfactorsymbolic = 0; 113*847daeccSHong Zhang Bseq->ops->getinertia = 0; 11449b5e25fSSatish Balay PetscFunctionReturn(0); 11549b5e25fSSatish Balay } 11649b5e25fSSatish Balay 1174a2ae208SSatish Balay #undef __FUNCT__ 1184a2ae208SSatish Balay #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ" 1194aa3045dSJed Brown PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 12049b5e25fSSatish Balay { 1216849ba73SBarry Smith PetscErrorCode ierr; 12249b5e25fSSatish Balay 12349b5e25fSSatish Balay PetscFunctionBegin; 124*847daeccSHong Zhang ierr = MatGetSubMatrix_SeqBAIJ(A,isrow,iscol,scall,B);CHKERRQ(ierr); 125*847daeccSHong Zhang 1268f46ffcaSHong Zhang if (isrow != iscol) { 1278f46ffcaSHong Zhang PetscBool isequal; 1288f46ffcaSHong Zhang ierr = ISEqual(isrow,iscol,&isequal);CHKERRQ(ierr); 129*847daeccSHong Zhang if (!isequal) { 130*847daeccSHong Zhang ierr = MatSeqSBAIJZeroOps_Private(*B);CHKERRQ(ierr); 1318f46ffcaSHong Zhang } 13249b5e25fSSatish Balay } 13349b5e25fSSatish Balay PetscFunctionReturn(0); 13449b5e25fSSatish Balay } 13549b5e25fSSatish Balay 1364a2ae208SSatish Balay #undef __FUNCT__ 1374a2ae208SSatish Balay #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ" 13813f74950SBarry Smith PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 13949b5e25fSSatish Balay { 1406849ba73SBarry Smith PetscErrorCode ierr; 14113f74950SBarry Smith PetscInt i; 142afebec48SHong Zhang PetscBool flg; 14349b5e25fSSatish Balay 14449b5e25fSSatish Balay PetscFunctionBegin; 145afebec48SHong Zhang ierr = MatGetSubMatrices_SeqBAIJ(A,n,irow,icol,scall,B);CHKERRQ(ierr); 14649b5e25fSSatish Balay for (i=0; i<n; i++) { 147afebec48SHong Zhang ierr = ISEqual(irow[i],icol[i],&flg);CHKERRQ(ierr); 148afebec48SHong Zhang if (!flg) { 149*847daeccSHong Zhang ierr = MatSeqSBAIJZeroOps_Private(*B[i]);CHKERRQ(ierr); 150afebec48SHong Zhang } 15149b5e25fSSatish Balay } 15249b5e25fSSatish Balay PetscFunctionReturn(0); 15349b5e25fSSatish Balay } 15449b5e25fSSatish Balay 15549b5e25fSSatish Balay /* -------------------------------------------------------*/ 15649b5e25fSSatish Balay /* Should check that shapes of vectors and matrices match */ 15749b5e25fSSatish Balay /* -------------------------------------------------------*/ 15849b5e25fSSatish Balay 1594a2ae208SSatish Balay #undef __FUNCT__ 1604a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_2" 161dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz) 16249b5e25fSSatish Balay { 16349b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 164d9ca1df4SBarry Smith PetscScalar *z,x1,x2,zero=0.0; 165d9ca1df4SBarry Smith const PetscScalar *x,*xb; 166d9ca1df4SBarry Smith const MatScalar *v; 1676849ba73SBarry Smith PetscErrorCode ierr; 168d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 169d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 17098c9bda7SSatish Balay PetscInt nonzerorow=0; 17149b5e25fSSatish Balay 17249b5e25fSSatish Balay PetscFunctionBegin; 1732dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 174d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1751ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 17649b5e25fSSatish Balay 17749b5e25fSSatish Balay v = a->a; 17849b5e25fSSatish Balay xb = x; 17949b5e25fSSatish Balay 18049b5e25fSSatish Balay for (i=0; i<mbs; i++) { 18149b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 18249b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; 18349b5e25fSSatish Balay ib = aj + *ai; 184831a3094SHong Zhang jmin = 0; 18598c9bda7SSatish Balay nonzerorow += (n>0); 1867fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 18749b5e25fSSatish Balay z[2*i] += v[0]*x1 + v[2]*x2; 18849b5e25fSSatish Balay z[2*i+1] += v[2]*x1 + v[3]*x2; 189831a3094SHong Zhang v += 4; jmin++; 1907fbae186SHong Zhang } 191444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 192444d8c10SJed Brown PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 193831a3094SHong Zhang for (j=jmin; j<n; j++) { 19449b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 19549b5e25fSSatish Balay cval = ib[j]*2; 19649b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2; 19749b5e25fSSatish Balay z[cval+1] += v[2]*x1 + v[3]*x2; 19849b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 19949b5e25fSSatish Balay z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 20049b5e25fSSatish Balay z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 20149b5e25fSSatish Balay v += 4; 20249b5e25fSSatish Balay } 20349b5e25fSSatish Balay xb +=2; ai++; 20449b5e25fSSatish Balay } 20549b5e25fSSatish Balay 206d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2071ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 208dc0b31edSSatish Balay ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 20949b5e25fSSatish Balay PetscFunctionReturn(0); 21049b5e25fSSatish Balay } 21149b5e25fSSatish Balay 2124a2ae208SSatish Balay #undef __FUNCT__ 2134a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_3" 214dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz) 21549b5e25fSSatish Balay { 21649b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 217d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,zero=0.0; 218d9ca1df4SBarry Smith const PetscScalar *x,*xb; 219d9ca1df4SBarry Smith const MatScalar *v; 2206849ba73SBarry Smith PetscErrorCode ierr; 221d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 222d9ca1df4SBarry Smith const PetscInt *aj = a->j,*ai = a->i,*ib; 22398c9bda7SSatish Balay PetscInt nonzerorow=0; 22449b5e25fSSatish Balay 22549b5e25fSSatish Balay PetscFunctionBegin; 2262dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 227d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2281ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 22949b5e25fSSatish Balay 23049b5e25fSSatish Balay v = a->a; 23149b5e25fSSatish Balay xb = x; 23249b5e25fSSatish Balay 23349b5e25fSSatish Balay for (i=0; i<mbs; i++) { 23449b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 23549b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 23649b5e25fSSatish Balay ib = aj + *ai; 237831a3094SHong Zhang jmin = 0; 23898c9bda7SSatish Balay nonzerorow += (n>0); 2397fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 24049b5e25fSSatish Balay z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 24149b5e25fSSatish Balay z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 24249b5e25fSSatish Balay z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 243831a3094SHong Zhang v += 9; jmin++; 2447fbae186SHong Zhang } 245444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 246444d8c10SJed Brown PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 247831a3094SHong Zhang for (j=jmin; j<n; j++) { 24849b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 24949b5e25fSSatish Balay cval = ib[j]*3; 25049b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 25149b5e25fSSatish Balay z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 25249b5e25fSSatish Balay z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 25349b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 25449b5e25fSSatish Balay z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 25549b5e25fSSatish Balay z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 25649b5e25fSSatish Balay z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 25749b5e25fSSatish Balay v += 9; 25849b5e25fSSatish Balay } 25949b5e25fSSatish Balay xb +=3; ai++; 26049b5e25fSSatish Balay } 26149b5e25fSSatish Balay 262d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2631ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 264dc0b31edSSatish Balay ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 26549b5e25fSSatish Balay PetscFunctionReturn(0); 26649b5e25fSSatish Balay } 26749b5e25fSSatish Balay 2684a2ae208SSatish Balay #undef __FUNCT__ 2694a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_4" 270dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz) 27149b5e25fSSatish Balay { 27249b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 273d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,zero=0.0; 274d9ca1df4SBarry Smith const PetscScalar *x,*xb; 275d9ca1df4SBarry Smith const MatScalar *v; 2766849ba73SBarry Smith PetscErrorCode ierr; 277d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 278d9ca1df4SBarry Smith const PetscInt *aj = a->j,*ai = a->i,*ib; 27998c9bda7SSatish Balay PetscInt nonzerorow = 0; 28049b5e25fSSatish Balay 28149b5e25fSSatish Balay PetscFunctionBegin; 2822dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 283d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2841ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 28549b5e25fSSatish Balay 28649b5e25fSSatish Balay v = a->a; 28749b5e25fSSatish Balay xb = x; 28849b5e25fSSatish Balay 28949b5e25fSSatish Balay for (i=0; i<mbs; i++) { 29049b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 29149b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 29249b5e25fSSatish Balay ib = aj + *ai; 293831a3094SHong Zhang jmin = 0; 29498c9bda7SSatish Balay nonzerorow += (n>0); 2957fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 29649b5e25fSSatish Balay z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 29749b5e25fSSatish Balay z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 29849b5e25fSSatish Balay z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 29949b5e25fSSatish Balay z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 300831a3094SHong Zhang v += 16; jmin++; 3017fbae186SHong Zhang } 302444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 303444d8c10SJed Brown PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 304831a3094SHong Zhang for (j=jmin; j<n; j++) { 30549b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 30649b5e25fSSatish Balay cval = ib[j]*4; 30749b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 30849b5e25fSSatish Balay z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 30949b5e25fSSatish Balay z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 31049b5e25fSSatish Balay z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 31149b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 31249b5e25fSSatish Balay z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 31349b5e25fSSatish Balay z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 31449b5e25fSSatish Balay z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 31549b5e25fSSatish Balay z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 31649b5e25fSSatish Balay v += 16; 31749b5e25fSSatish Balay } 31849b5e25fSSatish Balay xb +=4; ai++; 31949b5e25fSSatish Balay } 32049b5e25fSSatish Balay 321d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 3221ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 323dc0b31edSSatish Balay ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 32449b5e25fSSatish Balay PetscFunctionReturn(0); 32549b5e25fSSatish Balay } 32649b5e25fSSatish Balay 3274a2ae208SSatish Balay #undef __FUNCT__ 3284a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_5" 329dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz) 33049b5e25fSSatish Balay { 33149b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 332d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,x5,zero=0.0; 333d9ca1df4SBarry Smith const PetscScalar *x,*xb; 334d9ca1df4SBarry Smith const MatScalar *v; 3356849ba73SBarry Smith PetscErrorCode ierr; 336d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 337d9ca1df4SBarry Smith const PetscInt *aj = a->j,*ai = a->i,*ib; 33898c9bda7SSatish Balay PetscInt nonzerorow=0; 33949b5e25fSSatish Balay 34049b5e25fSSatish Balay PetscFunctionBegin; 3412dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 342d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 3431ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 34449b5e25fSSatish Balay 34549b5e25fSSatish Balay v = a->a; 34649b5e25fSSatish Balay xb = x; 34749b5e25fSSatish Balay 34849b5e25fSSatish Balay for (i=0; i<mbs; i++) { 34949b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 35049b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 35149b5e25fSSatish Balay ib = aj + *ai; 352831a3094SHong Zhang jmin = 0; 35398c9bda7SSatish Balay nonzerorow += (n>0); 3547fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 35549b5e25fSSatish Balay z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 35649b5e25fSSatish Balay z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 35749b5e25fSSatish Balay z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 35849b5e25fSSatish Balay z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 35949b5e25fSSatish Balay z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 360831a3094SHong Zhang v += 25; jmin++; 3617fbae186SHong Zhang } 362444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 363444d8c10SJed Brown PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 364831a3094SHong Zhang for (j=jmin; j<n; j++) { 36549b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 36649b5e25fSSatish Balay cval = ib[j]*5; 36749b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 36849b5e25fSSatish Balay z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 36949b5e25fSSatish Balay z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 37049b5e25fSSatish Balay z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 37149b5e25fSSatish Balay z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 37249b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 37349b5e25fSSatish Balay z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 37449b5e25fSSatish Balay z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 37549b5e25fSSatish Balay z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 37649b5e25fSSatish Balay z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 37749b5e25fSSatish Balay z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 37849b5e25fSSatish Balay v += 25; 37949b5e25fSSatish Balay } 38049b5e25fSSatish Balay xb +=5; ai++; 38149b5e25fSSatish Balay } 38249b5e25fSSatish Balay 383d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 3841ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 385dc0b31edSSatish Balay ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 38649b5e25fSSatish Balay PetscFunctionReturn(0); 38749b5e25fSSatish Balay } 38849b5e25fSSatish Balay 38949b5e25fSSatish Balay 3904a2ae208SSatish Balay #undef __FUNCT__ 3914a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_6" 392dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz) 39349b5e25fSSatish Balay { 39449b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 395d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,x5,x6,zero=0.0; 396d9ca1df4SBarry Smith const PetscScalar *x,*xb; 397d9ca1df4SBarry Smith const MatScalar *v; 3986849ba73SBarry Smith PetscErrorCode ierr; 399d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 400d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 40198c9bda7SSatish Balay PetscInt nonzerorow=0; 40249b5e25fSSatish Balay 40349b5e25fSSatish Balay PetscFunctionBegin; 4042dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 405d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 4061ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 40749b5e25fSSatish Balay 40849b5e25fSSatish Balay v = a->a; 40949b5e25fSSatish Balay xb = x; 41049b5e25fSSatish Balay 41149b5e25fSSatish Balay for (i=0; i<mbs; i++) { 41249b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 41349b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 41449b5e25fSSatish Balay ib = aj + *ai; 415831a3094SHong Zhang jmin = 0; 41698c9bda7SSatish Balay nonzerorow += (n>0); 4177fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 41849b5e25fSSatish Balay z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 41949b5e25fSSatish Balay z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 42049b5e25fSSatish Balay z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 42149b5e25fSSatish Balay z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 42249b5e25fSSatish Balay z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 42349b5e25fSSatish Balay z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 424831a3094SHong Zhang v += 36; jmin++; 4257fbae186SHong Zhang } 426444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 427444d8c10SJed Brown PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 428831a3094SHong Zhang for (j=jmin; j<n; j++) { 42949b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 43049b5e25fSSatish Balay cval = ib[j]*6; 43149b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 43249b5e25fSSatish Balay z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 43349b5e25fSSatish Balay z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 43449b5e25fSSatish Balay z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 43549b5e25fSSatish Balay z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 43649b5e25fSSatish Balay z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 43749b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 43849b5e25fSSatish Balay z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 43949b5e25fSSatish Balay z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 44049b5e25fSSatish Balay z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 44149b5e25fSSatish Balay z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 44249b5e25fSSatish Balay z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 44349b5e25fSSatish Balay z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 44449b5e25fSSatish Balay v += 36; 44549b5e25fSSatish Balay } 44649b5e25fSSatish Balay xb +=6; ai++; 44749b5e25fSSatish Balay } 44849b5e25fSSatish Balay 449d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 4501ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 451dc0b31edSSatish Balay ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 45249b5e25fSSatish Balay PetscFunctionReturn(0); 45349b5e25fSSatish Balay } 4544a2ae208SSatish Balay #undef __FUNCT__ 4554a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_7" 456dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz) 45749b5e25fSSatish Balay { 45849b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 459d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0; 460d9ca1df4SBarry Smith const PetscScalar *x,*xb; 461d9ca1df4SBarry Smith const MatScalar *v; 4626849ba73SBarry Smith PetscErrorCode ierr; 463d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 464d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 46598c9bda7SSatish Balay PetscInt nonzerorow=0; 46649b5e25fSSatish Balay 46749b5e25fSSatish Balay PetscFunctionBegin; 4682dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 469d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 4701ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 47149b5e25fSSatish Balay 47249b5e25fSSatish Balay v = a->a; 47349b5e25fSSatish Balay xb = x; 47449b5e25fSSatish Balay 47549b5e25fSSatish Balay for (i=0; i<mbs; i++) { 47649b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 47749b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 47849b5e25fSSatish Balay ib = aj + *ai; 479831a3094SHong Zhang jmin = 0; 48098c9bda7SSatish Balay nonzerorow += (n>0); 4817fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 48249b5e25fSSatish Balay z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 48349b5e25fSSatish Balay z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 48449b5e25fSSatish Balay z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 48549b5e25fSSatish Balay z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 48649b5e25fSSatish Balay z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 48749b5e25fSSatish Balay z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 48849b5e25fSSatish Balay z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 489831a3094SHong Zhang v += 49; jmin++; 4907fbae186SHong Zhang } 491444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 492444d8c10SJed Brown PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 493831a3094SHong Zhang for (j=jmin; j<n; j++) { 49449b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 49549b5e25fSSatish Balay cval = ib[j]*7; 49649b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 49749b5e25fSSatish Balay z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 49849b5e25fSSatish Balay z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 49949b5e25fSSatish Balay z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 50049b5e25fSSatish Balay z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 50149b5e25fSSatish Balay z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 50249b5e25fSSatish Balay z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 50349b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 50449b5e25fSSatish Balay z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 50549b5e25fSSatish Balay z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 50649b5e25fSSatish Balay z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 50749b5e25fSSatish Balay z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 50849b5e25fSSatish Balay z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 50949b5e25fSSatish Balay z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 51049b5e25fSSatish Balay z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 51149b5e25fSSatish Balay v += 49; 51249b5e25fSSatish Balay } 51349b5e25fSSatish Balay xb +=7; ai++; 51449b5e25fSSatish Balay } 515d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 5161ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 517dc0b31edSSatish Balay ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 51849b5e25fSSatish Balay PetscFunctionReturn(0); 51949b5e25fSSatish Balay } 52049b5e25fSSatish Balay 52149b5e25fSSatish Balay /* 52249b5e25fSSatish Balay This will not work with MatScalar == float because it calls the BLAS 52349b5e25fSSatish Balay */ 5244a2ae208SSatish Balay #undef __FUNCT__ 5254a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_N" 526dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz) 52749b5e25fSSatish Balay { 52849b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 529d9ca1df4SBarry Smith PetscScalar *z,*z_ptr,*zb,*work,*workt,zero=0.0; 530d9ca1df4SBarry Smith const PetscScalar *x,*x_ptr,*xb; 531d9ca1df4SBarry Smith const MatScalar *v; 532dfbe8321SBarry Smith PetscErrorCode ierr; 533d9ca1df4SBarry Smith PetscInt mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 534d9ca1df4SBarry Smith const PetscInt *idx,*aj,*ii; 53598c9bda7SSatish Balay PetscInt nonzerorow=0; 53649b5e25fSSatish Balay 53749b5e25fSSatish Balay PetscFunctionBegin; 5382dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 539d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);x_ptr = x; 5401ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 54149b5e25fSSatish Balay 54249b5e25fSSatish Balay aj = a->j; 54349b5e25fSSatish Balay v = a->a; 54449b5e25fSSatish Balay ii = a->i; 54549b5e25fSSatish Balay 54649b5e25fSSatish Balay if (!a->mult_work) { 547854ce69bSBarry Smith ierr = PetscMalloc1(A->rmap->N+1,&a->mult_work);CHKERRQ(ierr); 54849b5e25fSSatish Balay } 54949b5e25fSSatish Balay work = a->mult_work; 55049b5e25fSSatish Balay 55149b5e25fSSatish Balay for (i=0; i<mbs; i++) { 55249b5e25fSSatish Balay n = ii[1] - ii[0]; ncols = n*bs; 55349b5e25fSSatish Balay workt = work; idx=aj+ii[0]; 55498c9bda7SSatish Balay nonzerorow += (n>0); 55549b5e25fSSatish Balay 55649b5e25fSSatish Balay /* upper triangular part */ 55749b5e25fSSatish Balay for (j=0; j<n; j++) { 55849b5e25fSSatish Balay xb = x_ptr + bs*(*idx++); 55949b5e25fSSatish Balay for (k=0; k<bs; k++) workt[k] = xb[k]; 56049b5e25fSSatish Balay workt += bs; 56149b5e25fSSatish Balay } 56249b5e25fSSatish Balay /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 56396b95a6bSBarry Smith PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 56449b5e25fSSatish Balay 56549b5e25fSSatish Balay /* strict lower triangular part */ 566831a3094SHong Zhang idx = aj+ii[0]; 567831a3094SHong Zhang if (*idx == i) { 56896b9376eSHong Zhang ncols -= bs; v += bs2; idx++; n--; 569831a3094SHong Zhang } 57096b9376eSHong Zhang 57149b5e25fSSatish Balay if (ncols > 0) { 57249b5e25fSSatish Balay workt = work; 57387828ca2SBarry Smith ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 57496b95a6bSBarry Smith PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 575831a3094SHong Zhang for (j=0; j<n; j++) { 576831a3094SHong Zhang zb = z_ptr + bs*(*idx++); 57749b5e25fSSatish Balay for (k=0; k<bs; k++) zb[k] += workt[k]; 57849b5e25fSSatish Balay workt += bs; 57949b5e25fSSatish Balay } 58049b5e25fSSatish Balay } 58149b5e25fSSatish Balay x += bs; v += n*bs2; z += bs; ii++; 58249b5e25fSSatish Balay } 58349b5e25fSSatish Balay 584d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 5851ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 586dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr); 58749b5e25fSSatish Balay PetscFunctionReturn(0); 58849b5e25fSSatish Balay } 58949b5e25fSSatish Balay 5904a2ae208SSatish Balay #undef __FUNCT__ 5914a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1" 592dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 59349b5e25fSSatish Balay { 59449b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 595d9ca1df4SBarry Smith PetscScalar *z,x1; 596d9ca1df4SBarry Smith const PetscScalar *x,*xb; 597d9ca1df4SBarry Smith const MatScalar *v; 5986849ba73SBarry Smith PetscErrorCode ierr; 599d9ca1df4SBarry Smith PetscInt mbs =a->mbs,i,n,cval,j,jmin; 600d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 60198c9bda7SSatish Balay PetscInt nonzerorow=0; 60249b5e25fSSatish Balay 60349b5e25fSSatish Balay PetscFunctionBegin; 604343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 605d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 6061ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 60749b5e25fSSatish Balay v = a->a; 60849b5e25fSSatish Balay xb = x; 60949b5e25fSSatish Balay 61049b5e25fSSatish Balay for (i=0; i<mbs; i++) { 61149b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th row of A */ 61249b5e25fSSatish Balay x1 = xb[0]; 61349b5e25fSSatish Balay ib = aj + *ai; 614831a3094SHong Zhang jmin = 0; 61598c9bda7SSatish Balay nonzerorow += (n>0); 616831a3094SHong Zhang if (*ib == i) { /* (diag of A)*x */ 617831a3094SHong Zhang z[i] += *v++ * x[*ib++]; jmin++; 618831a3094SHong Zhang } 619831a3094SHong Zhang for (j=jmin; j<n; j++) { 62049b5e25fSSatish Balay cval = *ib; 62149b5e25fSSatish Balay z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 62249b5e25fSSatish Balay z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 62349b5e25fSSatish Balay } 62449b5e25fSSatish Balay xb++; ai++; 62549b5e25fSSatish Balay } 62649b5e25fSSatish Balay 627d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 628bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 62949b5e25fSSatish Balay 630dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 63149b5e25fSSatish Balay PetscFunctionReturn(0); 63249b5e25fSSatish Balay } 63349b5e25fSSatish Balay 6344a2ae208SSatish Balay #undef __FUNCT__ 6354a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2" 636dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 63749b5e25fSSatish Balay { 63849b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 639d9ca1df4SBarry Smith PetscScalar *z,x1,x2; 640d9ca1df4SBarry Smith const PetscScalar *x,*xb; 641d9ca1df4SBarry Smith const MatScalar *v; 6426849ba73SBarry Smith PetscErrorCode ierr; 643d9ca1df4SBarry Smith PetscInt mbs =a->mbs,i,n,cval,j,jmin; 644d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 64598c9bda7SSatish Balay PetscInt nonzerorow=0; 64649b5e25fSSatish Balay 64749b5e25fSSatish Balay PetscFunctionBegin; 648343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 649d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 6501ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 65149b5e25fSSatish Balay 65249b5e25fSSatish Balay v = a->a; 65349b5e25fSSatish Balay xb = x; 65449b5e25fSSatish Balay 65549b5e25fSSatish Balay for (i=0; i<mbs; i++) { 65649b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 65749b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; 65849b5e25fSSatish Balay ib = aj + *ai; 659831a3094SHong Zhang jmin = 0; 66098c9bda7SSatish Balay nonzerorow += (n>0); 6617fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 66249b5e25fSSatish Balay z[2*i] += v[0]*x1 + v[2]*x2; 66349b5e25fSSatish Balay z[2*i+1] += v[2]*x1 + v[3]*x2; 664831a3094SHong Zhang v += 4; jmin++; 6657fbae186SHong Zhang } 666444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 667444d8c10SJed Brown PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 668831a3094SHong Zhang for (j=jmin; j<n; j++) { 66949b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 67049b5e25fSSatish Balay cval = ib[j]*2; 67149b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2; 67249b5e25fSSatish Balay z[cval+1] += v[2]*x1 + v[3]*x2; 67349b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 67449b5e25fSSatish Balay z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 67549b5e25fSSatish Balay z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 67649b5e25fSSatish Balay v += 4; 67749b5e25fSSatish Balay } 67849b5e25fSSatish Balay xb +=2; ai++; 67949b5e25fSSatish Balay } 680d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 681bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 68249b5e25fSSatish Balay 683dc0b31edSSatish Balay ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 68449b5e25fSSatish Balay PetscFunctionReturn(0); 68549b5e25fSSatish Balay } 68649b5e25fSSatish Balay 6874a2ae208SSatish Balay #undef __FUNCT__ 6884a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3" 689dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 69049b5e25fSSatish Balay { 69149b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 692d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3; 693d9ca1df4SBarry Smith const PetscScalar *x,*xb; 694d9ca1df4SBarry Smith const MatScalar *v; 6956849ba73SBarry Smith PetscErrorCode ierr; 696d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 697d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 69898c9bda7SSatish Balay PetscInt nonzerorow=0; 69949b5e25fSSatish Balay 70049b5e25fSSatish Balay PetscFunctionBegin; 701343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 702d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 7031ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 70449b5e25fSSatish Balay 70549b5e25fSSatish Balay v = a->a; 70649b5e25fSSatish Balay xb = x; 70749b5e25fSSatish Balay 70849b5e25fSSatish Balay for (i=0; i<mbs; i++) { 70949b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 71049b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 71149b5e25fSSatish Balay ib = aj + *ai; 712831a3094SHong Zhang jmin = 0; 71398c9bda7SSatish Balay nonzerorow += (n>0); 7147fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 71549b5e25fSSatish Balay z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 71649b5e25fSSatish Balay z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 71749b5e25fSSatish Balay z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 718831a3094SHong Zhang v += 9; jmin++; 7197fbae186SHong Zhang } 720444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 721444d8c10SJed Brown PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 722831a3094SHong Zhang for (j=jmin; j<n; j++) { 72349b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 72449b5e25fSSatish Balay cval = ib[j]*3; 72549b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 72649b5e25fSSatish Balay z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 72749b5e25fSSatish Balay z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 72849b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 72949b5e25fSSatish Balay z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 73049b5e25fSSatish Balay z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 73149b5e25fSSatish Balay z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 73249b5e25fSSatish Balay v += 9; 73349b5e25fSSatish Balay } 73449b5e25fSSatish Balay xb +=3; ai++; 73549b5e25fSSatish Balay } 73649b5e25fSSatish Balay 737d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 738bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 73949b5e25fSSatish Balay 740dc0b31edSSatish Balay ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 74149b5e25fSSatish Balay PetscFunctionReturn(0); 74249b5e25fSSatish Balay } 74349b5e25fSSatish Balay 7444a2ae208SSatish Balay #undef __FUNCT__ 7454a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4" 746dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 74749b5e25fSSatish Balay { 74849b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 749d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4; 750d9ca1df4SBarry Smith const PetscScalar *x,*xb; 751d9ca1df4SBarry Smith const MatScalar *v; 7526849ba73SBarry Smith PetscErrorCode ierr; 753d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 754d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 75598c9bda7SSatish Balay PetscInt nonzerorow=0; 75649b5e25fSSatish Balay 75749b5e25fSSatish Balay PetscFunctionBegin; 758343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 759d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 7601ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 76149b5e25fSSatish Balay 76249b5e25fSSatish Balay v = a->a; 76349b5e25fSSatish Balay xb = x; 76449b5e25fSSatish Balay 76549b5e25fSSatish Balay for (i=0; i<mbs; i++) { 76649b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 76749b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 76849b5e25fSSatish Balay ib = aj + *ai; 769831a3094SHong Zhang jmin = 0; 77098c9bda7SSatish Balay nonzerorow += (n>0); 7717fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 77249b5e25fSSatish Balay z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 77349b5e25fSSatish Balay z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 77449b5e25fSSatish Balay z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 77549b5e25fSSatish Balay z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 776831a3094SHong Zhang v += 16; jmin++; 7777fbae186SHong Zhang } 778444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 779444d8c10SJed Brown PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 780831a3094SHong Zhang for (j=jmin; j<n; j++) { 78149b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 78249b5e25fSSatish Balay cval = ib[j]*4; 78349b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 78449b5e25fSSatish Balay z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 78549b5e25fSSatish Balay z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 78649b5e25fSSatish Balay z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 78749b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 78849b5e25fSSatish Balay z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 78949b5e25fSSatish Balay z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 79049b5e25fSSatish Balay z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 79149b5e25fSSatish Balay z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 79249b5e25fSSatish Balay v += 16; 79349b5e25fSSatish Balay } 79449b5e25fSSatish Balay xb +=4; ai++; 79549b5e25fSSatish Balay } 79649b5e25fSSatish Balay 797d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 798bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 79949b5e25fSSatish Balay 800dc0b31edSSatish Balay ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 80149b5e25fSSatish Balay PetscFunctionReturn(0); 80249b5e25fSSatish Balay } 80349b5e25fSSatish Balay 8044a2ae208SSatish Balay #undef __FUNCT__ 8054a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5" 806dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 80749b5e25fSSatish Balay { 80849b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 809d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,x5; 810d9ca1df4SBarry Smith const PetscScalar *x,*xb; 811d9ca1df4SBarry Smith const MatScalar *v; 8126849ba73SBarry Smith PetscErrorCode ierr; 813d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 814d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 81598c9bda7SSatish Balay PetscInt nonzerorow=0; 81649b5e25fSSatish Balay 81749b5e25fSSatish Balay PetscFunctionBegin; 818343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 819d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 8201ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 82149b5e25fSSatish Balay 82249b5e25fSSatish Balay v = a->a; 82349b5e25fSSatish Balay xb = x; 82449b5e25fSSatish Balay 82549b5e25fSSatish Balay for (i=0; i<mbs; i++) { 82649b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 82749b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 82849b5e25fSSatish Balay ib = aj + *ai; 829831a3094SHong Zhang jmin = 0; 83098c9bda7SSatish Balay nonzerorow += (n>0); 8317fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 83249b5e25fSSatish Balay z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 83349b5e25fSSatish Balay z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 83449b5e25fSSatish Balay z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 83549b5e25fSSatish Balay z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 83649b5e25fSSatish Balay z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 837831a3094SHong Zhang v += 25; jmin++; 8387fbae186SHong Zhang } 839444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 840444d8c10SJed Brown PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 841831a3094SHong Zhang for (j=jmin; j<n; j++) { 84249b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 84349b5e25fSSatish Balay cval = ib[j]*5; 84449b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 84549b5e25fSSatish Balay z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 84649b5e25fSSatish Balay z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 84749b5e25fSSatish Balay z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 84849b5e25fSSatish Balay z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 84949b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 85049b5e25fSSatish Balay z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 85149b5e25fSSatish Balay z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 85249b5e25fSSatish Balay z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 85349b5e25fSSatish Balay z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 85449b5e25fSSatish Balay z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 85549b5e25fSSatish Balay v += 25; 85649b5e25fSSatish Balay } 85749b5e25fSSatish Balay xb +=5; ai++; 85849b5e25fSSatish Balay } 85949b5e25fSSatish Balay 860d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 861bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 86249b5e25fSSatish Balay 863dc0b31edSSatish Balay ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 86449b5e25fSSatish Balay PetscFunctionReturn(0); 86549b5e25fSSatish Balay } 8664a2ae208SSatish Balay #undef __FUNCT__ 8674a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6" 868dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 86949b5e25fSSatish Balay { 87049b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 871d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,x5,x6; 872d9ca1df4SBarry Smith const PetscScalar *x,*xb; 873d9ca1df4SBarry Smith const MatScalar *v; 8746849ba73SBarry Smith PetscErrorCode ierr; 875d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 876d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 87798c9bda7SSatish Balay PetscInt nonzerorow=0; 87849b5e25fSSatish Balay 87949b5e25fSSatish Balay PetscFunctionBegin; 880343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 881d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 8821ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 88349b5e25fSSatish Balay 88449b5e25fSSatish Balay v = a->a; 88549b5e25fSSatish Balay xb = x; 88649b5e25fSSatish Balay 88749b5e25fSSatish Balay for (i=0; i<mbs; i++) { 88849b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 88949b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 89049b5e25fSSatish Balay ib = aj + *ai; 891831a3094SHong Zhang jmin = 0; 89298c9bda7SSatish Balay nonzerorow += (n>0); 8937fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 89449b5e25fSSatish Balay z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 89549b5e25fSSatish Balay z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 89649b5e25fSSatish Balay z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 89749b5e25fSSatish Balay z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 89849b5e25fSSatish Balay z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 89949b5e25fSSatish Balay z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 900831a3094SHong Zhang v += 36; jmin++; 9017fbae186SHong Zhang } 902444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 903444d8c10SJed Brown PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 904831a3094SHong Zhang for (j=jmin; j<n; j++) { 90549b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 90649b5e25fSSatish Balay cval = ib[j]*6; 90749b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 90849b5e25fSSatish Balay z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 90949b5e25fSSatish Balay z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 91049b5e25fSSatish Balay z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 91149b5e25fSSatish Balay z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 91249b5e25fSSatish Balay z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 91349b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 91449b5e25fSSatish Balay z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 91549b5e25fSSatish Balay z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 91649b5e25fSSatish Balay z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 91749b5e25fSSatish Balay z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 91849b5e25fSSatish Balay z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 91949b5e25fSSatish Balay z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 92049b5e25fSSatish Balay v += 36; 92149b5e25fSSatish Balay } 92249b5e25fSSatish Balay xb +=6; ai++; 92349b5e25fSSatish Balay } 92449b5e25fSSatish Balay 925d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 926bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 92749b5e25fSSatish Balay 928dc0b31edSSatish Balay ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 92949b5e25fSSatish Balay PetscFunctionReturn(0); 93049b5e25fSSatish Balay } 93149b5e25fSSatish Balay 9324a2ae208SSatish Balay #undef __FUNCT__ 9334a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7" 934dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 93549b5e25fSSatish Balay { 93649b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 937d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,x5,x6,x7; 938d9ca1df4SBarry Smith const PetscScalar *x,*xb; 939d9ca1df4SBarry Smith const MatScalar *v; 9406849ba73SBarry Smith PetscErrorCode ierr; 941d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 942d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 94398c9bda7SSatish Balay PetscInt nonzerorow=0; 94449b5e25fSSatish Balay 94549b5e25fSSatish Balay PetscFunctionBegin; 946343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 947d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 9481ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 94949b5e25fSSatish Balay 95049b5e25fSSatish Balay v = a->a; 95149b5e25fSSatish Balay xb = x; 95249b5e25fSSatish Balay 95349b5e25fSSatish Balay for (i=0; i<mbs; i++) { 95449b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 95549b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 95649b5e25fSSatish Balay ib = aj + *ai; 957831a3094SHong Zhang jmin = 0; 95898c9bda7SSatish Balay nonzerorow += (n>0); 9597fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 96049b5e25fSSatish Balay z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 96149b5e25fSSatish Balay z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 96249b5e25fSSatish Balay z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 96349b5e25fSSatish Balay z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 96449b5e25fSSatish Balay z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 96549b5e25fSSatish Balay z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 96649b5e25fSSatish Balay z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 967831a3094SHong Zhang v += 49; jmin++; 9687fbae186SHong Zhang } 969444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 970444d8c10SJed Brown PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 971831a3094SHong Zhang for (j=jmin; j<n; j++) { 97249b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 97349b5e25fSSatish Balay cval = ib[j]*7; 97449b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 97549b5e25fSSatish Balay z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 97649b5e25fSSatish Balay z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 97749b5e25fSSatish Balay z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 97849b5e25fSSatish Balay z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 97949b5e25fSSatish Balay z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 98049b5e25fSSatish Balay z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 98149b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 98249b5e25fSSatish Balay z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 98349b5e25fSSatish Balay z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 98449b5e25fSSatish Balay z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 98549b5e25fSSatish Balay z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 98649b5e25fSSatish Balay z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 98749b5e25fSSatish Balay z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 98849b5e25fSSatish Balay z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 98949b5e25fSSatish Balay v += 49; 99049b5e25fSSatish Balay } 99149b5e25fSSatish Balay xb +=7; ai++; 99249b5e25fSSatish Balay } 99349b5e25fSSatish Balay 994d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 995bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 99649b5e25fSSatish Balay 997dc0b31edSSatish Balay ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 99849b5e25fSSatish Balay PetscFunctionReturn(0); 99949b5e25fSSatish Balay } 100049b5e25fSSatish Balay 10014a2ae208SSatish Balay #undef __FUNCT__ 10024a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N" 1003dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 100449b5e25fSSatish Balay { 100549b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1006d9ca1df4SBarry Smith PetscScalar *z,*z_ptr=0,*zb,*work,*workt; 1007d9ca1df4SBarry Smith const PetscScalar *x,*x_ptr,*xb; 1008d9ca1df4SBarry Smith const MatScalar *v; 1009dfbe8321SBarry Smith PetscErrorCode ierr; 1010d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 1011d9ca1df4SBarry Smith const PetscInt *idx,*aj,*ii; 101298c9bda7SSatish Balay PetscInt nonzerorow=0; 101349b5e25fSSatish Balay 101449b5e25fSSatish Balay PetscFunctionBegin; 1015343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1016d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); x_ptr=x; 10171ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 101849b5e25fSSatish Balay 101949b5e25fSSatish Balay aj = a->j; 102049b5e25fSSatish Balay v = a->a; 102149b5e25fSSatish Balay ii = a->i; 102249b5e25fSSatish Balay 102349b5e25fSSatish Balay if (!a->mult_work) { 1024854ce69bSBarry Smith ierr = PetscMalloc1(A->rmap->n+1,&a->mult_work);CHKERRQ(ierr); 102549b5e25fSSatish Balay } 102649b5e25fSSatish Balay work = a->mult_work; 102749b5e25fSSatish Balay 102849b5e25fSSatish Balay 102949b5e25fSSatish Balay for (i=0; i<mbs; i++) { 103049b5e25fSSatish Balay n = ii[1] - ii[0]; ncols = n*bs; 103149b5e25fSSatish Balay workt = work; idx=aj+ii[0]; 103298c9bda7SSatish Balay nonzerorow += (n>0); 103349b5e25fSSatish Balay 103449b5e25fSSatish Balay /* upper triangular part */ 103549b5e25fSSatish Balay for (j=0; j<n; j++) { 103649b5e25fSSatish Balay xb = x_ptr + bs*(*idx++); 103749b5e25fSSatish Balay for (k=0; k<bs; k++) workt[k] = xb[k]; 103849b5e25fSSatish Balay workt += bs; 103949b5e25fSSatish Balay } 104049b5e25fSSatish Balay /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 104196b95a6bSBarry Smith PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 104249b5e25fSSatish Balay 104349b5e25fSSatish Balay /* strict lower triangular part */ 1044831a3094SHong Zhang idx = aj+ii[0]; 1045831a3094SHong Zhang if (*idx == i) { 104696b9376eSHong Zhang ncols -= bs; v += bs2; idx++; n--; 1047831a3094SHong Zhang } 104849b5e25fSSatish Balay if (ncols > 0) { 104949b5e25fSSatish Balay workt = work; 105087828ca2SBarry Smith ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 105196b95a6bSBarry Smith PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 1052831a3094SHong Zhang for (j=0; j<n; j++) { 1053831a3094SHong Zhang zb = z_ptr + bs*(*idx++); 105449b5e25fSSatish Balay for (k=0; k<bs; k++) zb[k] += workt[k]; 105549b5e25fSSatish Balay workt += bs; 105649b5e25fSSatish Balay } 105749b5e25fSSatish Balay } 105849b5e25fSSatish Balay 105949b5e25fSSatish Balay x += bs; v += n*bs2; z += bs; ii++; 106049b5e25fSSatish Balay } 106149b5e25fSSatish Balay 1062d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1063bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 106449b5e25fSSatish Balay 1065dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 106649b5e25fSSatish Balay PetscFunctionReturn(0); 106749b5e25fSSatish Balay } 106849b5e25fSSatish Balay 10694a2ae208SSatish Balay #undef __FUNCT__ 10704a2ae208SSatish Balay #define __FUNCT__ "MatScale_SeqSBAIJ" 1071f4df32b1SMatthew Knepley PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha) 107249b5e25fSSatish Balay { 107349b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data; 1074f4df32b1SMatthew Knepley PetscScalar oalpha = alpha; 1075efee365bSSatish Balay PetscErrorCode ierr; 1076c5df96a5SBarry Smith PetscBLASInt one = 1,totalnz; 107749b5e25fSSatish Balay 107849b5e25fSSatish Balay PetscFunctionBegin; 1079c5df96a5SBarry Smith ierr = PetscBLASIntCast(a->bs2*a->nz,&totalnz);CHKERRQ(ierr); 10808b83055fSJed Brown PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one)); 1081efee365bSSatish Balay ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 108249b5e25fSSatish Balay PetscFunctionReturn(0); 108349b5e25fSSatish Balay } 108449b5e25fSSatish Balay 10854a2ae208SSatish Balay #undef __FUNCT__ 10864a2ae208SSatish Balay #define __FUNCT__ "MatNorm_SeqSBAIJ" 1087dfbe8321SBarry Smith PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm) 108849b5e25fSSatish Balay { 108949b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1090d9ca1df4SBarry Smith const MatScalar *v = a->a; 109149b5e25fSSatish Balay PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1092d9ca1df4SBarry Smith PetscInt i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il; 10936849ba73SBarry Smith PetscErrorCode ierr; 1094d9ca1df4SBarry Smith const PetscInt *aj=a->j,*col; 109549b5e25fSSatish Balay 109649b5e25fSSatish Balay PetscFunctionBegin; 109749b5e25fSSatish Balay if (type == NORM_FROBENIUS) { 109849b5e25fSSatish Balay for (k=0; k<mbs; k++) { 109949b5e25fSSatish Balay jmin = a->i[k]; jmax = a->i[k+1]; 1100831a3094SHong Zhang col = aj + jmin; 1101831a3094SHong Zhang if (*col == k) { /* diagonal block */ 110249b5e25fSSatish Balay for (i=0; i<bs2; i++) { 110349b5e25fSSatish Balay sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++; 110449b5e25fSSatish Balay } 1105831a3094SHong Zhang jmin++; 1106831a3094SHong Zhang } 1107831a3094SHong Zhang for (j=jmin; j<jmax; j++) { /* off-diagonal blocks */ 110849b5e25fSSatish Balay for (i=0; i<bs2; i++) { 110949b5e25fSSatish Balay sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++; 111049b5e25fSSatish Balay } 111149b5e25fSSatish Balay } 111249b5e25fSSatish Balay } 11138f1a2a5eSBarry Smith *norm = PetscSqrtReal(sum_diag + 2*sum_off); 11140b8dc8d2SHong Zhang } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 1115dcca6d9dSJed Brown ierr = PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);CHKERRQ(ierr); 11160b8dc8d2SHong Zhang for (i=0; i<mbs; i++) jl[i] = mbs; 11170b8dc8d2SHong Zhang il[0] = 0; 111849b5e25fSSatish Balay 111949b5e25fSSatish Balay *norm = 0.0; 112049b5e25fSSatish Balay for (k=0; k<mbs; k++) { /* k_th block row */ 112149b5e25fSSatish Balay for (j=0; j<bs; j++) sum[j]=0.0; 112249b5e25fSSatish Balay /*-- col sum --*/ 112349b5e25fSSatish Balay i = jl[k]; /* first |A(i,k)| to be added */ 1124831a3094SHong Zhang /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1125831a3094SHong Zhang at step k */ 112649b5e25fSSatish Balay while (i<mbs) { 112749b5e25fSSatish Balay nexti = jl[i]; /* next block row to be added */ 112849b5e25fSSatish Balay ik = il[i]; /* block index of A(i,k) in the array a */ 112949b5e25fSSatish Balay for (j=0; j<bs; j++) { 113049b5e25fSSatish Balay v = a->a + ik*bs2 + j*bs; 113149b5e25fSSatish Balay for (k1=0; k1<bs; k1++) { 113249b5e25fSSatish Balay sum[j] += PetscAbsScalar(*v); v++; 113349b5e25fSSatish Balay } 113449b5e25fSSatish Balay } 113549b5e25fSSatish Balay /* update il, jl */ 1136831a3094SHong Zhang jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1137831a3094SHong Zhang jmax = a->i[i+1]; 113849b5e25fSSatish Balay if (jmin < jmax) { 113949b5e25fSSatish Balay il[i] = jmin; 114049b5e25fSSatish Balay j = a->j[jmin]; 114149b5e25fSSatish Balay jl[i] = jl[j]; jl[j]=i; 114249b5e25fSSatish Balay } 114349b5e25fSSatish Balay i = nexti; 114449b5e25fSSatish Balay } 114549b5e25fSSatish Balay /*-- row sum --*/ 114649b5e25fSSatish Balay jmin = a->i[k]; jmax = a->i[k+1]; 114749b5e25fSSatish Balay for (i=jmin; i<jmax; i++) { 114849b5e25fSSatish Balay for (j=0; j<bs; j++) { 114949b5e25fSSatish Balay v = a->a + i*bs2 + j; 115049b5e25fSSatish Balay for (k1=0; k1<bs; k1++) { 11510b8dc8d2SHong Zhang sum[j] += PetscAbsScalar(*v); v += bs; 115249b5e25fSSatish Balay } 115349b5e25fSSatish Balay } 115449b5e25fSSatish Balay } 115549b5e25fSSatish Balay /* add k_th block row to il, jl */ 1156831a3094SHong Zhang col = aj+jmin; 1157831a3094SHong Zhang if (*col == k) jmin++; 115849b5e25fSSatish Balay if (jmin < jmax) { 115949b5e25fSSatish Balay il[k] = jmin; 11600b8dc8d2SHong Zhang j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k; 116149b5e25fSSatish Balay } 116249b5e25fSSatish Balay for (j=0; j<bs; j++) { 116349b5e25fSSatish Balay if (sum[j] > *norm) *norm = sum[j]; 116449b5e25fSSatish Balay } 116549b5e25fSSatish Balay } 116674ed9c26SBarry Smith ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr); 1167f23aa3ddSBarry Smith } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 116849b5e25fSSatish Balay PetscFunctionReturn(0); 116949b5e25fSSatish Balay } 117049b5e25fSSatish Balay 11714a2ae208SSatish Balay #undef __FUNCT__ 11724a2ae208SSatish Balay #define __FUNCT__ "MatEqual_SeqSBAIJ" 1173ace3abfcSBarry Smith PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg) 117449b5e25fSSatish Balay { 117549b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data; 1176dfbe8321SBarry Smith PetscErrorCode ierr; 117749b5e25fSSatish Balay 117849b5e25fSSatish Balay PetscFunctionBegin; 117949b5e25fSSatish Balay /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1180d0f46423SBarry Smith if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1181ef511fbeSHong Zhang *flg = PETSC_FALSE; 1182ef511fbeSHong Zhang PetscFunctionReturn(0); 118349b5e25fSSatish Balay } 118449b5e25fSSatish Balay 118549b5e25fSSatish Balay /* if the a->i are the same */ 118613f74950SBarry Smith ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 118726fbe8dcSKarl Rupp if (!*flg) PetscFunctionReturn(0); 118849b5e25fSSatish Balay 118949b5e25fSSatish Balay /* if a->j are the same */ 119013f74950SBarry Smith ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 119126fbe8dcSKarl Rupp if (!*flg) PetscFunctionReturn(0); 119226fbe8dcSKarl Rupp 119349b5e25fSSatish Balay /* if a->a are the same */ 1194d0f46423SBarry Smith ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1195935af2e7SHong Zhang PetscFunctionReturn(0); 119649b5e25fSSatish Balay } 119749b5e25fSSatish Balay 11984a2ae208SSatish Balay #undef __FUNCT__ 11994a2ae208SSatish Balay #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ" 1200dfbe8321SBarry Smith PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v) 120149b5e25fSSatish Balay { 120249b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1203dfbe8321SBarry Smith PetscErrorCode ierr; 1204d9ca1df4SBarry Smith PetscInt i,j,k,row,bs,ambs,bs2; 1205d9ca1df4SBarry Smith const PetscInt *ai,*aj; 120687828ca2SBarry Smith PetscScalar *x,zero = 0.0; 1207d9ca1df4SBarry Smith const MatScalar *aa,*aa_j; 120849b5e25fSSatish Balay 120949b5e25fSSatish Balay PetscFunctionBegin; 1210d0f46423SBarry Smith bs = A->rmap->bs; 1211e32f2f54SBarry Smith if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1"); 121282799104SHong Zhang 121349b5e25fSSatish Balay aa = a->a; 12148a0c6efdSHong Zhang ambs = a->mbs; 12158a0c6efdSHong Zhang 12168a0c6efdSHong Zhang if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) { 12178a0c6efdSHong Zhang PetscInt *diag=a->diag; 12188a0c6efdSHong Zhang aa = a->a; 12198a0c6efdSHong Zhang ambs = a->mbs; 12208a0c6efdSHong Zhang ierr = VecGetArray(v,&x);CHKERRQ(ierr); 12218a0c6efdSHong Zhang for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]]; 12228a0c6efdSHong Zhang ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 12238a0c6efdSHong Zhang PetscFunctionReturn(0); 12248a0c6efdSHong Zhang } 12258a0c6efdSHong Zhang 122649b5e25fSSatish Balay ai = a->i; 122749b5e25fSSatish Balay aj = a->j; 122849b5e25fSSatish Balay bs2 = a->bs2; 12292dcb1b2aSMatthew Knepley ierr = VecSet(v,zero);CHKERRQ(ierr); 12301ebc52fbSHong Zhang ierr = VecGetArray(v,&x);CHKERRQ(ierr); 123149b5e25fSSatish Balay for (i=0; i<ambs; i++) { 123249b5e25fSSatish Balay j=ai[i]; 123349b5e25fSSatish Balay if (aj[j] == i) { /* if this is a diagonal element */ 123449b5e25fSSatish Balay row = i*bs; 123549b5e25fSSatish Balay aa_j = aa + j*bs2; 123649b5e25fSSatish Balay for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 123749b5e25fSSatish Balay } 123849b5e25fSSatish Balay } 12391ebc52fbSHong Zhang ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 124049b5e25fSSatish Balay PetscFunctionReturn(0); 124149b5e25fSSatish Balay } 124249b5e25fSSatish Balay 12434a2ae208SSatish Balay #undef __FUNCT__ 12444a2ae208SSatish Balay #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ" 1245dfbe8321SBarry Smith PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr) 124649b5e25fSSatish Balay { 124749b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1248d9ca1df4SBarry Smith PetscScalar x; 1249d9ca1df4SBarry Smith const PetscScalar *l,*li,*ri; 125049b5e25fSSatish Balay MatScalar *aa,*v; 1251dfbe8321SBarry Smith PetscErrorCode ierr; 12525e90f9d9SHong Zhang PetscInt i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2; 1253ace3abfcSBarry Smith PetscBool flg; 125449b5e25fSSatish Balay 125549b5e25fSSatish Balay PetscFunctionBegin; 1256b3bf805bSHong Zhang if (ll != rr) { 1257b3bf805bSHong Zhang ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr); 1258e7e72b3dSBarry Smith if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n"); 1259b3bf805bSHong Zhang } 1260b3bf805bSHong Zhang if (!ll) PetscFunctionReturn(0); 126149b5e25fSSatish Balay ai = a->i; 126249b5e25fSSatish Balay aj = a->j; 126349b5e25fSSatish Balay aa = a->a; 1264d0f46423SBarry Smith m = A->rmap->N; 1265d0f46423SBarry Smith bs = A->rmap->bs; 126649b5e25fSSatish Balay mbs = a->mbs; 126749b5e25fSSatish Balay bs2 = a->bs2; 126849b5e25fSSatish Balay 1269d9ca1df4SBarry Smith ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr); 127049b5e25fSSatish Balay ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1271e32f2f54SBarry Smith if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 127249b5e25fSSatish Balay for (i=0; i<mbs; i++) { /* for each block row */ 127349b5e25fSSatish Balay M = ai[i+1] - ai[i]; 127449b5e25fSSatish Balay li = l + i*bs; 127549b5e25fSSatish Balay v = aa + bs2*ai[i]; 127649b5e25fSSatish Balay for (j=0; j<M; j++) { /* for each block */ 127749b5e25fSSatish Balay ri = l + bs*aj[ai[i]+j]; 12785e90f9d9SHong Zhang for (k=0; k<bs; k++) { 127949b5e25fSSatish Balay x = ri[k]; 128049b5e25fSSatish Balay for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x; 128149b5e25fSSatish Balay } 128249b5e25fSSatish Balay } 128349b5e25fSSatish Balay } 1284d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr); 1285dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 128649b5e25fSSatish Balay PetscFunctionReturn(0); 128749b5e25fSSatish Balay } 128849b5e25fSSatish Balay 12894a2ae208SSatish Balay #undef __FUNCT__ 12904a2ae208SSatish Balay #define __FUNCT__ "MatGetInfo_SeqSBAIJ" 1291dfbe8321SBarry Smith PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info) 129249b5e25fSSatish Balay { 129349b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 129449b5e25fSSatish Balay 129549b5e25fSSatish Balay PetscFunctionBegin; 129649b5e25fSSatish Balay info->block_size = a->bs2; 1297ceed8ce5SJed Brown info->nz_allocated = a->bs2*a->maxnz; /*num. of nonzeros in upper triangular part */ 12986c6c5352SBarry Smith info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */ 129949b5e25fSSatish Balay info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 130049b5e25fSSatish Balay info->assemblies = A->num_ass; 13018e58a170SBarry Smith info->mallocs = A->info.mallocs; 13027adad957SLisandro Dalcin info->memory = ((PetscObject)A)->mem; 1303d5f3da31SBarry Smith if (A->factortype) { 130449b5e25fSSatish Balay info->fill_ratio_given = A->info.fill_ratio_given; 130549b5e25fSSatish Balay info->fill_ratio_needed = A->info.fill_ratio_needed; 130649b5e25fSSatish Balay info->factor_mallocs = A->info.factor_mallocs; 130749b5e25fSSatish Balay } else { 130849b5e25fSSatish Balay info->fill_ratio_given = 0; 130949b5e25fSSatish Balay info->fill_ratio_needed = 0; 131049b5e25fSSatish Balay info->factor_mallocs = 0; 131149b5e25fSSatish Balay } 131249b5e25fSSatish Balay PetscFunctionReturn(0); 131349b5e25fSSatish Balay } 131449b5e25fSSatish Balay 131549b5e25fSSatish Balay 13164a2ae208SSatish Balay #undef __FUNCT__ 13174a2ae208SSatish Balay #define __FUNCT__ "MatZeroEntries_SeqSBAIJ" 1318dfbe8321SBarry Smith PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 131949b5e25fSSatish Balay { 132049b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1321dfbe8321SBarry Smith PetscErrorCode ierr; 132249b5e25fSSatish Balay 132349b5e25fSSatish Balay PetscFunctionBegin; 132449b5e25fSSatish Balay ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 132549b5e25fSSatish Balay PetscFunctionReturn(0); 132649b5e25fSSatish Balay } 1327dc354874SHong Zhang 13284a2ae208SSatish Balay #undef __FUNCT__ 1329985db425SBarry Smith #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ" 1330985db425SBarry Smith /* 1331985db425SBarry Smith This code does not work since it only checks the upper triangular part of 1332985db425SBarry Smith the matrix. Hence it is not listed in the function table. 1333985db425SBarry Smith */ 1334985db425SBarry Smith PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[]) 1335dc354874SHong Zhang { 1336dc354874SHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1337dfbe8321SBarry Smith PetscErrorCode ierr; 1338d9ca1df4SBarry Smith PetscInt i,j,n,row,col,bs,mbs; 1339d9ca1df4SBarry Smith const PetscInt *ai,*aj; 1340c3fca9a7SHong Zhang PetscReal atmp; 1341d9ca1df4SBarry Smith const MatScalar *aa; 1342985db425SBarry Smith PetscScalar *x; 134313f74950SBarry Smith PetscInt ncols,brow,bcol,krow,kcol; 1344dc354874SHong Zhang 1345dc354874SHong Zhang PetscFunctionBegin; 1346e32f2f54SBarry Smith if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov"); 1347e32f2f54SBarry Smith if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1348d0f46423SBarry Smith bs = A->rmap->bs; 1349dc354874SHong Zhang aa = a->a; 1350dc354874SHong Zhang ai = a->i; 1351dc354874SHong Zhang aj = a->j; 135244117c81SHong Zhang mbs = a->mbs; 1353dc354874SHong Zhang 1354985db425SBarry Smith ierr = VecSet(v,0.0);CHKERRQ(ierr); 13551ebc52fbSHong Zhang ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1356dc354874SHong Zhang ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1357e32f2f54SBarry Smith if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 135844117c81SHong Zhang for (i=0; i<mbs; i++) { 1359d0f6400bSHong Zhang ncols = ai[1] - ai[0]; ai++; 1360d0f6400bSHong Zhang brow = bs*i; 136144117c81SHong Zhang for (j=0; j<ncols; j++) { 1362d0f6400bSHong Zhang bcol = bs*(*aj); 136344117c81SHong Zhang for (kcol=0; kcol<bs; kcol++) { 1364d0f6400bSHong Zhang col = bcol + kcol; /* col index */ 136544117c81SHong Zhang for (krow=0; krow<bs; krow++) { 1366d0f6400bSHong Zhang atmp = PetscAbsScalar(*aa); aa++; 1367d0f6400bSHong Zhang row = brow + krow; /* row index */ 1368c3fca9a7SHong Zhang if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1369c3fca9a7SHong Zhang if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 137044117c81SHong Zhang } 137144117c81SHong Zhang } 1372d0f6400bSHong Zhang aj++; 1373dc354874SHong Zhang } 1374dc354874SHong Zhang } 13751ebc52fbSHong Zhang ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1376dc354874SHong Zhang PetscFunctionReturn(0); 1377dc354874SHong Zhang } 1378