149b5e25fSSatish Balay 2c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h> 3c6db04a5SJed Brown #include <../src/mat/blockinvert.h> 4c6db04a5SJed Brown #include <petscbt.h> 5c6db04a5SJed Brown #include <../src/mat/impls/sbaij/seq/sbaij.h> 6c6db04a5SJed Brown #include <petscblaslapack.h> 749b5e25fSSatish Balay 84a2ae208SSatish Balay #undef __FUNCT__ 94a2ae208SSatish Balay #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ" 1013f74950SBarry Smith PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 1149b5e25fSSatish Balay { 125eee224dSHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 136849ba73SBarry Smith PetscErrorCode ierr; 145d0c19d7SBarry Smith PetscInt brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2; 155d0c19d7SBarry Smith const PetscInt *idx; 16*db41cccfSHong Zhang PetscBT table_out,table_in; 17d94109b8SHong Zhang 18d94109b8SHong Zhang PetscFunctionBegin; 19e32f2f54SBarry Smith if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 20d94109b8SHong Zhang mbs = a->mbs; 21d94109b8SHong Zhang ai = a->i; 22d94109b8SHong Zhang aj = a->j; 23d0f46423SBarry Smith bs = A->rmap->bs; 24*db41cccfSHong Zhang ierr = PetscBTCreate(mbs,table_out);CHKERRQ(ierr); 2513f74950SBarry Smith ierr = PetscMalloc((mbs+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr); 26d0f46423SBarry Smith ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr); 27*db41cccfSHong Zhang ierr = PetscBTCreate(mbs,table_in);CHKERRQ(ierr); 28d94109b8SHong Zhang 29d94109b8SHong Zhang for (i=0; i<is_max; i++) { /* for each is */ 30d94109b8SHong Zhang isz = 0; 31*db41cccfSHong Zhang ierr = PetscBTMemzero(mbs,table_out);CHKERRQ(ierr); 32d94109b8SHong Zhang 33d94109b8SHong Zhang /* Extract the indices, assume there can be duplicate entries */ 34d94109b8SHong Zhang ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 35d94109b8SHong Zhang ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 36d94109b8SHong Zhang 37*db41cccfSHong Zhang /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */ 38dbe03f88SHong Zhang bcol_max = 0; 39d94109b8SHong Zhang for (j=0; j<n ; ++j){ 40d94109b8SHong Zhang brow = idx[j]/bs; /* convert the indices into block indices */ 41e32f2f54SBarry Smith if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 42*db41cccfSHong Zhang if(!PetscBTLookupSet(table_out,brow)) { 43dbe03f88SHong Zhang nidx[isz++] = brow; 44dbe03f88SHong Zhang if (bcol_max < brow) bcol_max = brow; 45dbe03f88SHong Zhang } 46d94109b8SHong Zhang } 47d94109b8SHong Zhang ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 48d94109b8SHong Zhang ierr = ISDestroy(is[i]);CHKERRQ(ierr); 49d94109b8SHong Zhang 50d94109b8SHong Zhang k = 0; 51d94109b8SHong Zhang for (j=0; j<ov; j++){ /* for each overlap */ 52*db41cccfSHong Zhang /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 53*db41cccfSHong Zhang ierr = PetscBTMemzero(mbs,table_in);CHKERRQ(ierr); 54*db41cccfSHong Zhang for (l=k; l<isz; l++) { ierr = PetscBTSet(table_in,nidx[l]);CHKERRQ(ierr); } 55d94109b8SHong Zhang 56d94109b8SHong Zhang n = isz; /* length of the updated is[i] */ 57d94109b8SHong Zhang for (brow=0; brow<mbs; brow++){ 58d94109b8SHong Zhang start = ai[brow]; end = ai[brow+1]; 59*db41cccfSHong Zhang if (PetscBTLookup(table_in,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */ 60d94109b8SHong Zhang for (l = start; l<end ; l++){ 61d94109b8SHong Zhang bcol = aj[l]; 62*db41cccfSHong Zhang if (!PetscBTLookupSet(table_out,bcol)) {nidx[isz++] = bcol;} 63d94109b8SHong Zhang } 64d94109b8SHong Zhang k++; 65d94109b8SHong Zhang if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 66d94109b8SHong Zhang } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 67d94109b8SHong Zhang for (l = start; l<end ; l++){ 68d94109b8SHong Zhang bcol = aj[l]; 69dbe03f88SHong Zhang if (bcol > bcol_max) break; 70*db41cccfSHong Zhang if (PetscBTLookup(table_in,bcol)){ 71*db41cccfSHong Zhang if (!PetscBTLookupSet(table_out,brow)) {nidx[isz++] = brow;} 72d94109b8SHong Zhang break; /* for l = start; l<end ; l++) */ 73d94109b8SHong Zhang } 74d94109b8SHong Zhang } 75d94109b8SHong Zhang } 76d94109b8SHong Zhang } 77d94109b8SHong Zhang } /* for each overlap */ 78d94109b8SHong Zhang 79d94109b8SHong Zhang /* expand the Index Set */ 80d94109b8SHong Zhang for (j=0; j<isz; j++) { 81d94109b8SHong Zhang for (k=0; k<bs; k++) 82d94109b8SHong Zhang nidx2[j*bs+k] = nidx[j]*bs+k; 83d94109b8SHong Zhang } 8470b3c8c7SBarry Smith ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 85d94109b8SHong Zhang } 86*db41cccfSHong Zhang ierr = PetscBTDestroy(table_out);CHKERRQ(ierr); 87d94109b8SHong Zhang ierr = PetscFree(nidx);CHKERRQ(ierr); 88d94109b8SHong Zhang ierr = PetscFree(nidx2);CHKERRQ(ierr); 89*db41cccfSHong Zhang ierr = PetscBTDestroy(table_in);CHKERRQ(ierr); 905eee224dSHong Zhang PetscFunctionReturn(0); 9149b5e25fSSatish Balay } 9249b5e25fSSatish Balay 934a2ae208SSatish Balay #undef __FUNCT__ 944a2ae208SSatish Balay #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private" 954aa3045dSJed Brown PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 9649b5e25fSSatish Balay { 9749b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c; 986849ba73SBarry Smith PetscErrorCode ierr; 9913f74950SBarry Smith PetscInt *smap,i,k,kstart,kend,oldcols = a->mbs,*lens; 10013f74950SBarry Smith PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 1015d0c19d7SBarry Smith PetscInt nrows,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 1025d0c19d7SBarry Smith const PetscInt *irow,*aj = a->j,*ai = a->i; 10349b5e25fSSatish Balay MatScalar *mat_a; 10449b5e25fSSatish Balay Mat C; 105ace3abfcSBarry Smith PetscBool flag,sorted; 10649b5e25fSSatish Balay 10749b5e25fSSatish Balay PetscFunctionBegin; 108e32f2f54SBarry Smith if (isrow != iscol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro"); 10914ca34e6SBarry Smith ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr); 110e32f2f54SBarry Smith if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 11149b5e25fSSatish Balay 11249b5e25fSSatish Balay ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 11349b5e25fSSatish Balay ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 11449b5e25fSSatish Balay 11574ed9c26SBarry Smith ierr = PetscMalloc(oldcols*sizeof(PetscInt),&smap);CHKERRQ(ierr); 11674ed9c26SBarry Smith ierr = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr); 11749b5e25fSSatish Balay ssmap = smap; 11813f74950SBarry Smith ierr = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr); 11949b5e25fSSatish Balay for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */ 12049b5e25fSSatish Balay /* determine lens of each row */ 12149b5e25fSSatish Balay for (i=0; i<nrows; i++) { 12249b5e25fSSatish Balay kstart = ai[irow[i]]; 12349b5e25fSSatish Balay kend = kstart + a->ilen[irow[i]]; 12449b5e25fSSatish Balay lens[i] = 0; 12549b5e25fSSatish Balay for (k=kstart; k<kend; k++) { 12649b5e25fSSatish Balay if (ssmap[aj[k]]) { 12749b5e25fSSatish Balay lens[i]++; 12849b5e25fSSatish Balay } 12949b5e25fSSatish Balay } 13049b5e25fSSatish Balay } 13149b5e25fSSatish Balay /* Create and fill new matrix */ 13249b5e25fSSatish Balay if (scall == MAT_REUSE_MATRIX) { 13349b5e25fSSatish Balay c = (Mat_SeqSBAIJ *)((*B)->data); 13449b5e25fSSatish Balay 135e32f2f54SBarry Smith if (c->mbs!=nrows || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 13613f74950SBarry Smith ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 137e7e72b3dSBarry Smith if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 13813f74950SBarry Smith ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 13949b5e25fSSatish Balay C = *B; 14049b5e25fSSatish Balay } else { 1417adad957SLisandro Dalcin ierr = MatCreate(((PetscObject)A)->comm,&C);CHKERRQ(ierr); 142f69a0ea3SMatthew Knepley ierr = MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1437adad957SLisandro Dalcin ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 144ab93d7beSBarry Smith ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr); 14549b5e25fSSatish Balay } 14649b5e25fSSatish Balay c = (Mat_SeqSBAIJ *)(C->data); 14749b5e25fSSatish Balay for (i=0; i<nrows; i++) { 14849b5e25fSSatish Balay row = irow[i]; 14949b5e25fSSatish Balay kstart = ai[row]; 15049b5e25fSSatish Balay kend = kstart + a->ilen[row]; 15149b5e25fSSatish Balay mat_i = c->i[i]; 15249b5e25fSSatish Balay mat_j = c->j + mat_i; 15349b5e25fSSatish Balay mat_a = c->a + mat_i*bs2; 15449b5e25fSSatish Balay mat_ilen = c->ilen + i; 15549b5e25fSSatish Balay for (k=kstart; k<kend; k++) { 15649b5e25fSSatish Balay if ((tcol=ssmap[a->j[k]])) { 15749b5e25fSSatish Balay *mat_j++ = tcol - 1; 15849b5e25fSSatish Balay ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 15949b5e25fSSatish Balay mat_a += bs2; 16049b5e25fSSatish Balay (*mat_ilen)++; 16149b5e25fSSatish Balay } 16249b5e25fSSatish Balay } 16349b5e25fSSatish Balay } 16449b5e25fSSatish Balay 16549b5e25fSSatish Balay /* Free work space */ 16649b5e25fSSatish Balay ierr = PetscFree(smap);CHKERRQ(ierr); 16749b5e25fSSatish Balay ierr = PetscFree(lens);CHKERRQ(ierr); 16849b5e25fSSatish Balay ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 16949b5e25fSSatish Balay ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 17049b5e25fSSatish Balay 17149b5e25fSSatish Balay ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 17249b5e25fSSatish Balay *B = C; 17349b5e25fSSatish Balay PetscFunctionReturn(0); 17449b5e25fSSatish Balay } 17549b5e25fSSatish Balay 1764a2ae208SSatish Balay #undef __FUNCT__ 1774a2ae208SSatish Balay #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ" 1784aa3045dSJed Brown PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 17949b5e25fSSatish Balay { 18049b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 18149b5e25fSSatish Balay IS is1; 1826849ba73SBarry Smith PetscErrorCode ierr; 1835d0c19d7SBarry Smith PetscInt *vary,*iary,nrows,i,bs=A->rmap->bs,count; 1845d0c19d7SBarry Smith const PetscInt *irow; 18549b5e25fSSatish Balay 18649b5e25fSSatish Balay PetscFunctionBegin; 187e32f2f54SBarry Smith if (isrow != iscol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro"); 18849b5e25fSSatish Balay 18949b5e25fSSatish Balay ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 19049b5e25fSSatish Balay ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 19149b5e25fSSatish Balay 19249b5e25fSSatish Balay /* Verify if the indices corespond to each element in a block 19349b5e25fSSatish Balay and form the IS with compressed IS */ 19474ed9c26SBarry Smith ierr = PetscMalloc2(a->mbs,PetscInt,&vary,a->mbs,PetscInt,&iary);CHKERRQ(ierr); 19574ed9c26SBarry Smith ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr); 19649b5e25fSSatish Balay for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 19749b5e25fSSatish Balay 19849b5e25fSSatish Balay count = 0; 19949b5e25fSSatish Balay for (i=0; i<a->mbs; i++) { 200e32f2f54SBarry Smith if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Index set does not match blocks"); 20149b5e25fSSatish Balay if (vary[i]==bs) iary[count++] = i; 20249b5e25fSSatish Balay } 20349b5e25fSSatish Balay ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 20470b3c8c7SBarry Smith ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr); 20574ed9c26SBarry Smith ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 20649b5e25fSSatish Balay 2074aa3045dSJed Brown ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,scall,B);CHKERRQ(ierr); 20874ed9c26SBarry Smith ierr = ISDestroy(is1);CHKERRQ(ierr); 20949b5e25fSSatish Balay PetscFunctionReturn(0); 21049b5e25fSSatish Balay } 21149b5e25fSSatish Balay 2124a2ae208SSatish Balay #undef __FUNCT__ 2134a2ae208SSatish Balay #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ" 21413f74950SBarry Smith PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 21549b5e25fSSatish Balay { 2166849ba73SBarry Smith PetscErrorCode ierr; 21713f74950SBarry Smith PetscInt i; 21849b5e25fSSatish Balay 21949b5e25fSSatish Balay PetscFunctionBegin; 22049b5e25fSSatish Balay if (scall == MAT_INITIAL_MATRIX) { 22182502324SSatish Balay ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr); 22249b5e25fSSatish Balay } 22349b5e25fSSatish Balay 22449b5e25fSSatish Balay for (i=0; i<n; i++) { 2254aa3045dSJed Brown ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 22649b5e25fSSatish Balay } 22749b5e25fSSatish Balay PetscFunctionReturn(0); 22849b5e25fSSatish Balay } 22949b5e25fSSatish Balay 23049b5e25fSSatish Balay /* -------------------------------------------------------*/ 23149b5e25fSSatish Balay /* Should check that shapes of vectors and matrices match */ 23249b5e25fSSatish Balay /* -------------------------------------------------------*/ 23349b5e25fSSatish Balay 2344a2ae208SSatish Balay #undef __FUNCT__ 2354a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_2" 236dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz) 23749b5e25fSSatish Balay { 23849b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 23987828ca2SBarry Smith PetscScalar *x,*z,*xb,x1,x2,zero=0.0; 24049b5e25fSSatish Balay MatScalar *v; 2416849ba73SBarry Smith PetscErrorCode ierr; 24213f74950SBarry Smith PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 24398c9bda7SSatish Balay PetscInt nonzerorow=0; 24449b5e25fSSatish Balay 24549b5e25fSSatish Balay PetscFunctionBegin; 2462dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 2471ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2481ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 24949b5e25fSSatish Balay 25049b5e25fSSatish Balay v = a->a; 25149b5e25fSSatish Balay xb = x; 25249b5e25fSSatish Balay 25349b5e25fSSatish Balay for (i=0; i<mbs; i++) { 25449b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 25549b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; 25649b5e25fSSatish Balay ib = aj + *ai; 257831a3094SHong Zhang jmin = 0; 25898c9bda7SSatish Balay nonzerorow += (n>0); 2597fbae186SHong Zhang if (*ib == i){ /* (diag of A)*x */ 26049b5e25fSSatish Balay z[2*i] += v[0]*x1 + v[2]*x2; 26149b5e25fSSatish Balay z[2*i+1] += v[2]*x1 + v[3]*x2; 262831a3094SHong Zhang v += 4; jmin++; 2637fbae186SHong Zhang } 264444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 265444d8c10SJed Brown PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 266831a3094SHong Zhang for (j=jmin; j<n; j++) { 26749b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 26849b5e25fSSatish Balay cval = ib[j]*2; 26949b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2; 27049b5e25fSSatish Balay z[cval+1] += v[2]*x1 + v[3]*x2; 27149b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 27249b5e25fSSatish Balay z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 27349b5e25fSSatish Balay z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 27449b5e25fSSatish Balay v += 4; 27549b5e25fSSatish Balay } 27649b5e25fSSatish Balay xb +=2; ai++; 27749b5e25fSSatish Balay } 27849b5e25fSSatish Balay 2791ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2801ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 281dc0b31edSSatish Balay ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 28249b5e25fSSatish Balay PetscFunctionReturn(0); 28349b5e25fSSatish Balay } 28449b5e25fSSatish Balay 2854a2ae208SSatish Balay #undef __FUNCT__ 2864a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_3" 287dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz) 28849b5e25fSSatish Balay { 28949b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 29087828ca2SBarry Smith PetscScalar *x,*z,*xb,x1,x2,x3,zero=0.0; 29149b5e25fSSatish Balay MatScalar *v; 2926849ba73SBarry Smith PetscErrorCode ierr; 29313f74950SBarry Smith PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 29498c9bda7SSatish Balay PetscInt nonzerorow=0; 29549b5e25fSSatish Balay 29649b5e25fSSatish Balay PetscFunctionBegin; 2972dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 2981ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2991ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 30049b5e25fSSatish Balay 30149b5e25fSSatish Balay v = a->a; 30249b5e25fSSatish Balay xb = x; 30349b5e25fSSatish Balay 30449b5e25fSSatish Balay for (i=0; i<mbs; i++) { 30549b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 30649b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 30749b5e25fSSatish Balay ib = aj + *ai; 308831a3094SHong Zhang jmin = 0; 30998c9bda7SSatish Balay nonzerorow += (n>0); 3107fbae186SHong Zhang if (*ib == i){ /* (diag of A)*x */ 31149b5e25fSSatish Balay z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 31249b5e25fSSatish Balay z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 31349b5e25fSSatish Balay z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 314831a3094SHong Zhang v += 9; jmin++; 3157fbae186SHong Zhang } 316444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 317444d8c10SJed Brown PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 318831a3094SHong Zhang for (j=jmin; j<n; j++) { 31949b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 32049b5e25fSSatish Balay cval = ib[j]*3; 32149b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 32249b5e25fSSatish Balay z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 32349b5e25fSSatish Balay z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 32449b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 32549b5e25fSSatish Balay z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 32649b5e25fSSatish Balay z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 32749b5e25fSSatish Balay z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 32849b5e25fSSatish Balay v += 9; 32949b5e25fSSatish Balay } 33049b5e25fSSatish Balay xb +=3; ai++; 33149b5e25fSSatish Balay } 33249b5e25fSSatish Balay 3331ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3341ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 335dc0b31edSSatish Balay ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 33649b5e25fSSatish Balay PetscFunctionReturn(0); 33749b5e25fSSatish Balay } 33849b5e25fSSatish Balay 3394a2ae208SSatish Balay #undef __FUNCT__ 3404a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_4" 341dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz) 34249b5e25fSSatish Balay { 34349b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 34487828ca2SBarry Smith PetscScalar *x,*z,*xb,x1,x2,x3,x4,zero=0.0; 34549b5e25fSSatish Balay MatScalar *v; 3466849ba73SBarry Smith PetscErrorCode ierr; 34713f74950SBarry Smith PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 34898c9bda7SSatish Balay PetscInt nonzerorow=0; 34949b5e25fSSatish Balay 35049b5e25fSSatish Balay PetscFunctionBegin; 3512dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 3521ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3531ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 35449b5e25fSSatish Balay 35549b5e25fSSatish Balay v = a->a; 35649b5e25fSSatish Balay xb = x; 35749b5e25fSSatish Balay 35849b5e25fSSatish Balay for (i=0; i<mbs; i++) { 35949b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 36049b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 36149b5e25fSSatish Balay ib = aj + *ai; 362831a3094SHong Zhang jmin = 0; 36398c9bda7SSatish Balay nonzerorow += (n>0); 3647fbae186SHong Zhang if (*ib == i){ /* (diag of A)*x */ 36549b5e25fSSatish Balay z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 36649b5e25fSSatish Balay z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 36749b5e25fSSatish Balay z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 36849b5e25fSSatish Balay z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 369831a3094SHong Zhang v += 16; jmin++; 3707fbae186SHong Zhang } 371444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 372444d8c10SJed Brown PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 373831a3094SHong Zhang for (j=jmin; j<n; j++) { 37449b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 37549b5e25fSSatish Balay cval = ib[j]*4; 37649b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 37749b5e25fSSatish Balay z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 37849b5e25fSSatish Balay z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 37949b5e25fSSatish Balay z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 38049b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 38149b5e25fSSatish Balay z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 38249b5e25fSSatish Balay z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 38349b5e25fSSatish Balay z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 38449b5e25fSSatish Balay z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 38549b5e25fSSatish Balay v += 16; 38649b5e25fSSatish Balay } 38749b5e25fSSatish Balay xb +=4; ai++; 38849b5e25fSSatish Balay } 38949b5e25fSSatish Balay 3901ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3911ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 392dc0b31edSSatish Balay ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 39349b5e25fSSatish Balay PetscFunctionReturn(0); 39449b5e25fSSatish Balay } 39549b5e25fSSatish Balay 3964a2ae208SSatish Balay #undef __FUNCT__ 3974a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_5" 398dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz) 39949b5e25fSSatish Balay { 40049b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 40187828ca2SBarry Smith PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0; 40249b5e25fSSatish Balay MatScalar *v; 4036849ba73SBarry Smith PetscErrorCode ierr; 40413f74950SBarry Smith PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 40598c9bda7SSatish Balay PetscInt nonzerorow=0; 40649b5e25fSSatish Balay 40749b5e25fSSatish Balay PetscFunctionBegin; 4082dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 4091ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4101ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 41149b5e25fSSatish Balay 41249b5e25fSSatish Balay v = a->a; 41349b5e25fSSatish Balay xb = x; 41449b5e25fSSatish Balay 41549b5e25fSSatish Balay for (i=0; i<mbs; i++) { 41649b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 41749b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 41849b5e25fSSatish Balay ib = aj + *ai; 419831a3094SHong Zhang jmin = 0; 42098c9bda7SSatish Balay nonzerorow += (n>0); 4217fbae186SHong Zhang if (*ib == i){ /* (diag of A)*x */ 42249b5e25fSSatish Balay z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 42349b5e25fSSatish Balay z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 42449b5e25fSSatish Balay z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 42549b5e25fSSatish Balay z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 42649b5e25fSSatish Balay z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 427831a3094SHong Zhang v += 25; jmin++; 4287fbae186SHong Zhang } 429444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 430444d8c10SJed Brown PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 431831a3094SHong Zhang for (j=jmin; j<n; j++) { 43249b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 43349b5e25fSSatish Balay cval = ib[j]*5; 43449b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 43549b5e25fSSatish Balay z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 43649b5e25fSSatish Balay z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 43749b5e25fSSatish Balay z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 43849b5e25fSSatish Balay z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 43949b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 44049b5e25fSSatish Balay z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 44149b5e25fSSatish Balay z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 44249b5e25fSSatish Balay z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 44349b5e25fSSatish Balay z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 44449b5e25fSSatish Balay z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 44549b5e25fSSatish Balay v += 25; 44649b5e25fSSatish Balay } 44749b5e25fSSatish Balay xb +=5; ai++; 44849b5e25fSSatish Balay } 44949b5e25fSSatish Balay 4501ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4511ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 452dc0b31edSSatish Balay ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 45349b5e25fSSatish Balay PetscFunctionReturn(0); 45449b5e25fSSatish Balay } 45549b5e25fSSatish Balay 45649b5e25fSSatish Balay 4574a2ae208SSatish Balay #undef __FUNCT__ 4584a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_6" 459dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz) 46049b5e25fSSatish Balay { 46149b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 46287828ca2SBarry Smith PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0; 46349b5e25fSSatish Balay MatScalar *v; 4646849ba73SBarry Smith PetscErrorCode ierr; 46513f74950SBarry Smith PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 46698c9bda7SSatish Balay PetscInt nonzerorow=0; 46749b5e25fSSatish Balay 46849b5e25fSSatish Balay PetscFunctionBegin; 4692dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 4701ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4711ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 47249b5e25fSSatish Balay 47349b5e25fSSatish Balay v = a->a; 47449b5e25fSSatish Balay xb = x; 47549b5e25fSSatish Balay 47649b5e25fSSatish Balay for (i=0; i<mbs; i++) { 47749b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 47849b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 47949b5e25fSSatish Balay ib = aj + *ai; 480831a3094SHong Zhang jmin = 0; 48198c9bda7SSatish Balay nonzerorow += (n>0); 4827fbae186SHong Zhang if (*ib == i){ /* (diag of A)*x */ 48349b5e25fSSatish Balay z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 48449b5e25fSSatish Balay z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 48549b5e25fSSatish Balay z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 48649b5e25fSSatish Balay z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 48749b5e25fSSatish Balay z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 48849b5e25fSSatish Balay z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 489831a3094SHong Zhang v += 36; jmin++; 4907fbae186SHong Zhang } 491444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 492444d8c10SJed Brown PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 493831a3094SHong Zhang for (j=jmin; j<n; j++) { 49449b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 49549b5e25fSSatish Balay cval = ib[j]*6; 49649b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 49749b5e25fSSatish Balay z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 49849b5e25fSSatish Balay z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 49949b5e25fSSatish Balay z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 50049b5e25fSSatish Balay z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 50149b5e25fSSatish Balay z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 50249b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 50349b5e25fSSatish Balay z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 50449b5e25fSSatish Balay z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 50549b5e25fSSatish Balay z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 50649b5e25fSSatish Balay z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 50749b5e25fSSatish Balay z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 50849b5e25fSSatish Balay z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 50949b5e25fSSatish Balay v += 36; 51049b5e25fSSatish Balay } 51149b5e25fSSatish Balay xb +=6; ai++; 51249b5e25fSSatish Balay } 51349b5e25fSSatish Balay 5141ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5151ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 516dc0b31edSSatish Balay ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 51749b5e25fSSatish Balay PetscFunctionReturn(0); 51849b5e25fSSatish Balay } 5194a2ae208SSatish Balay #undef __FUNCT__ 5204a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_7" 521dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz) 52249b5e25fSSatish Balay { 52349b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 52487828ca2SBarry Smith PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0; 52549b5e25fSSatish Balay MatScalar *v; 5266849ba73SBarry Smith PetscErrorCode ierr; 52713f74950SBarry Smith PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 52898c9bda7SSatish Balay PetscInt nonzerorow=0; 52949b5e25fSSatish Balay 53049b5e25fSSatish Balay PetscFunctionBegin; 5312dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 5321ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5331ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 53449b5e25fSSatish Balay 53549b5e25fSSatish Balay v = a->a; 53649b5e25fSSatish Balay xb = x; 53749b5e25fSSatish Balay 53849b5e25fSSatish Balay for (i=0; i<mbs; i++) { 53949b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 54049b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 54149b5e25fSSatish Balay ib = aj + *ai; 542831a3094SHong Zhang jmin = 0; 54398c9bda7SSatish Balay nonzerorow += (n>0); 5447fbae186SHong Zhang if (*ib == i){ /* (diag of A)*x */ 54549b5e25fSSatish Balay z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 54649b5e25fSSatish Balay z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 54749b5e25fSSatish Balay z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 54849b5e25fSSatish Balay z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 54949b5e25fSSatish Balay z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 55049b5e25fSSatish Balay z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 55149b5e25fSSatish Balay z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 552831a3094SHong Zhang v += 49; jmin++; 5537fbae186SHong Zhang } 554444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 555444d8c10SJed Brown PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 556831a3094SHong Zhang for (j=jmin; j<n; j++) { 55749b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 55849b5e25fSSatish Balay cval = ib[j]*7; 55949b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 56049b5e25fSSatish Balay z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 56149b5e25fSSatish Balay z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 56249b5e25fSSatish Balay z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 56349b5e25fSSatish Balay z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 56449b5e25fSSatish Balay z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 56549b5e25fSSatish Balay z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 56649b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 56749b5e25fSSatish Balay z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 56849b5e25fSSatish Balay z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 56949b5e25fSSatish Balay z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 57049b5e25fSSatish Balay z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 57149b5e25fSSatish Balay z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 57249b5e25fSSatish Balay z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 57349b5e25fSSatish Balay z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 57449b5e25fSSatish Balay v += 49; 57549b5e25fSSatish Balay } 57649b5e25fSSatish Balay xb +=7; ai++; 57749b5e25fSSatish Balay } 5781ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5791ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 580dc0b31edSSatish Balay ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 58149b5e25fSSatish Balay PetscFunctionReturn(0); 58249b5e25fSSatish Balay } 58349b5e25fSSatish Balay 58449b5e25fSSatish Balay /* 58549b5e25fSSatish Balay This will not work with MatScalar == float because it calls the BLAS 58649b5e25fSSatish Balay */ 5874a2ae208SSatish Balay #undef __FUNCT__ 5884a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_N" 589dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz) 59049b5e25fSSatish Balay { 59149b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 59287828ca2SBarry Smith PetscScalar *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0; 5930b60a74dSHong Zhang MatScalar *v; 594dfbe8321SBarry Smith PetscErrorCode ierr; 595d0f46423SBarry Smith PetscInt mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 59698c9bda7SSatish Balay PetscInt nonzerorow=0; 59749b5e25fSSatish Balay 59849b5e25fSSatish Balay PetscFunctionBegin; 5992dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 6001ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 6011ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 60249b5e25fSSatish Balay 60349b5e25fSSatish Balay aj = a->j; 60449b5e25fSSatish Balay v = a->a; 60549b5e25fSSatish Balay ii = a->i; 60649b5e25fSSatish Balay 60749b5e25fSSatish Balay if (!a->mult_work) { 608d0f46423SBarry Smith ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 60949b5e25fSSatish Balay } 61049b5e25fSSatish Balay work = a->mult_work; 61149b5e25fSSatish Balay 61249b5e25fSSatish Balay for (i=0; i<mbs; i++) { 61349b5e25fSSatish Balay n = ii[1] - ii[0]; ncols = n*bs; 61449b5e25fSSatish Balay workt = work; idx=aj+ii[0]; 61598c9bda7SSatish Balay nonzerorow += (n>0); 61649b5e25fSSatish Balay 61749b5e25fSSatish Balay /* upper triangular part */ 61849b5e25fSSatish Balay for (j=0; j<n; j++) { 61949b5e25fSSatish Balay xb = x_ptr + bs*(*idx++); 62049b5e25fSSatish Balay for (k=0; k<bs; k++) workt[k] = xb[k]; 62149b5e25fSSatish Balay workt += bs; 62249b5e25fSSatish Balay } 62349b5e25fSSatish Balay /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 62449b5e25fSSatish Balay Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 62549b5e25fSSatish Balay 62649b5e25fSSatish Balay /* strict lower triangular part */ 627831a3094SHong Zhang idx = aj+ii[0]; 628831a3094SHong Zhang if (*idx == i){ 62996b9376eSHong Zhang ncols -= bs; v += bs2; idx++; n--; 630831a3094SHong Zhang } 63196b9376eSHong Zhang 63249b5e25fSSatish Balay if (ncols > 0){ 63349b5e25fSSatish Balay workt = work; 63487828ca2SBarry Smith ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 635831a3094SHong Zhang Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 636831a3094SHong Zhang for (j=0; j<n; j++) { 637831a3094SHong Zhang zb = z_ptr + bs*(*idx++); 63849b5e25fSSatish Balay for (k=0; k<bs; k++) zb[k] += workt[k] ; 63949b5e25fSSatish Balay workt += bs; 64049b5e25fSSatish Balay } 64149b5e25fSSatish Balay } 64249b5e25fSSatish Balay x += bs; v += n*bs2; z += bs; ii++; 64349b5e25fSSatish Balay } 64449b5e25fSSatish Balay 6451ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 6461ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 647dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr); 64849b5e25fSSatish Balay PetscFunctionReturn(0); 64949b5e25fSSatish Balay } 65049b5e25fSSatish Balay 6514a2ae208SSatish Balay #undef __FUNCT__ 6524a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1" 653dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 65449b5e25fSSatish Balay { 65549b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 656bba805e6SBarry Smith PetscScalar *x,*z,*xb,x1; 65749b5e25fSSatish Balay MatScalar *v; 6586849ba73SBarry Smith PetscErrorCode ierr; 65913f74950SBarry Smith PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 66098c9bda7SSatish Balay PetscInt nonzerorow=0; 66149b5e25fSSatish Balay 66249b5e25fSSatish Balay PetscFunctionBegin; 663343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 6641ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 6651ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 66649b5e25fSSatish Balay v = a->a; 66749b5e25fSSatish Balay xb = x; 66849b5e25fSSatish Balay 66949b5e25fSSatish Balay for (i=0; i<mbs; i++) { 67049b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th row of A */ 67149b5e25fSSatish Balay x1 = xb[0]; 67249b5e25fSSatish Balay ib = aj + *ai; 673831a3094SHong Zhang jmin = 0; 67498c9bda7SSatish Balay nonzerorow += (n>0); 675831a3094SHong Zhang if (*ib == i) { /* (diag of A)*x */ 676831a3094SHong Zhang z[i] += *v++ * x[*ib++]; jmin++; 677831a3094SHong Zhang } 678831a3094SHong Zhang for (j=jmin; j<n; j++) { 67949b5e25fSSatish Balay cval = *ib; 68049b5e25fSSatish Balay z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 68149b5e25fSSatish Balay z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 68249b5e25fSSatish Balay } 68349b5e25fSSatish Balay xb++; ai++; 68449b5e25fSSatish Balay } 68549b5e25fSSatish Balay 6861ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 687bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 68849b5e25fSSatish Balay 689dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 69049b5e25fSSatish Balay PetscFunctionReturn(0); 69149b5e25fSSatish Balay } 69249b5e25fSSatish Balay 6934a2ae208SSatish Balay #undef __FUNCT__ 6944a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2" 695dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 69649b5e25fSSatish Balay { 69749b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 698bba805e6SBarry Smith PetscScalar *x,*z,*xb,x1,x2; 69949b5e25fSSatish Balay MatScalar *v; 7006849ba73SBarry Smith PetscErrorCode ierr; 70113f74950SBarry Smith PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 70298c9bda7SSatish Balay PetscInt nonzerorow=0; 70349b5e25fSSatish Balay 70449b5e25fSSatish Balay PetscFunctionBegin; 705343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 7061ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 7071ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 70849b5e25fSSatish Balay 70949b5e25fSSatish Balay v = a->a; 71049b5e25fSSatish Balay xb = x; 71149b5e25fSSatish Balay 71249b5e25fSSatish Balay for (i=0; i<mbs; i++) { 71349b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 71449b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; 71549b5e25fSSatish Balay ib = aj + *ai; 716831a3094SHong Zhang jmin = 0; 71798c9bda7SSatish Balay nonzerorow += (n>0); 7187fbae186SHong Zhang if (*ib == i){ /* (diag of A)*x */ 71949b5e25fSSatish Balay z[2*i] += v[0]*x1 + v[2]*x2; 72049b5e25fSSatish Balay z[2*i+1] += v[2]*x1 + v[3]*x2; 721831a3094SHong Zhang v += 4; jmin++; 7227fbae186SHong Zhang } 723444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 724444d8c10SJed Brown PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 725831a3094SHong Zhang for (j=jmin; j<n; j++) { 72649b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 72749b5e25fSSatish Balay cval = ib[j]*2; 72849b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2; 72949b5e25fSSatish Balay z[cval+1] += v[2]*x1 + v[3]*x2; 73049b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 73149b5e25fSSatish Balay z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 73249b5e25fSSatish Balay z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 73349b5e25fSSatish Balay v += 4; 73449b5e25fSSatish Balay } 73549b5e25fSSatish Balay xb +=2; ai++; 73649b5e25fSSatish Balay } 7371ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 738bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 73949b5e25fSSatish Balay 740dc0b31edSSatish Balay ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 74149b5e25fSSatish Balay PetscFunctionReturn(0); 74249b5e25fSSatish Balay } 74349b5e25fSSatish Balay 7444a2ae208SSatish Balay #undef __FUNCT__ 7454a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3" 746dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 74749b5e25fSSatish Balay { 74849b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 749bba805e6SBarry Smith PetscScalar *x,*z,*xb,x1,x2,x3; 75049b5e25fSSatish Balay MatScalar *v; 7516849ba73SBarry Smith PetscErrorCode ierr; 75213f74950SBarry Smith PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 75398c9bda7SSatish Balay PetscInt nonzerorow=0; 75449b5e25fSSatish Balay 75549b5e25fSSatish Balay PetscFunctionBegin; 756343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 7571ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 7581ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 75949b5e25fSSatish Balay 76049b5e25fSSatish Balay v = a->a; 76149b5e25fSSatish Balay xb = x; 76249b5e25fSSatish Balay 76349b5e25fSSatish Balay for (i=0; i<mbs; i++) { 76449b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 76549b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 76649b5e25fSSatish Balay ib = aj + *ai; 767831a3094SHong Zhang jmin = 0; 76898c9bda7SSatish Balay nonzerorow += (n>0); 7697fbae186SHong Zhang if (*ib == i){ /* (diag of A)*x */ 77049b5e25fSSatish Balay z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 77149b5e25fSSatish Balay z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 77249b5e25fSSatish Balay z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 773831a3094SHong Zhang v += 9; jmin++; 7747fbae186SHong Zhang } 775444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 776444d8c10SJed Brown PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 777831a3094SHong Zhang for (j=jmin; j<n; j++) { 77849b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 77949b5e25fSSatish Balay cval = ib[j]*3; 78049b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 78149b5e25fSSatish Balay z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 78249b5e25fSSatish Balay z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 78349b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 78449b5e25fSSatish Balay z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 78549b5e25fSSatish Balay z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 78649b5e25fSSatish Balay z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 78749b5e25fSSatish Balay v += 9; 78849b5e25fSSatish Balay } 78949b5e25fSSatish Balay xb +=3; ai++; 79049b5e25fSSatish Balay } 79149b5e25fSSatish Balay 7921ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 793bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 79449b5e25fSSatish Balay 795dc0b31edSSatish Balay ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 79649b5e25fSSatish Balay PetscFunctionReturn(0); 79749b5e25fSSatish Balay } 79849b5e25fSSatish Balay 7994a2ae208SSatish Balay #undef __FUNCT__ 8004a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4" 801dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 80249b5e25fSSatish Balay { 80349b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 804bba805e6SBarry Smith PetscScalar *x,*z,*xb,x1,x2,x3,x4; 80549b5e25fSSatish Balay MatScalar *v; 8066849ba73SBarry Smith PetscErrorCode ierr; 80713f74950SBarry Smith PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 80898c9bda7SSatish Balay PetscInt nonzerorow=0; 80949b5e25fSSatish Balay 81049b5e25fSSatish Balay PetscFunctionBegin; 811343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 8121ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 8131ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 81449b5e25fSSatish Balay 81549b5e25fSSatish Balay v = a->a; 81649b5e25fSSatish Balay xb = x; 81749b5e25fSSatish Balay 81849b5e25fSSatish Balay for (i=0; i<mbs; i++) { 81949b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 82049b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 82149b5e25fSSatish Balay ib = aj + *ai; 822831a3094SHong Zhang jmin = 0; 82398c9bda7SSatish Balay nonzerorow += (n>0); 8247fbae186SHong Zhang if (*ib == i){ /* (diag of A)*x */ 82549b5e25fSSatish Balay z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 82649b5e25fSSatish Balay z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 82749b5e25fSSatish Balay z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 82849b5e25fSSatish Balay z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 829831a3094SHong Zhang v += 16; jmin++; 8307fbae186SHong Zhang } 831444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 832444d8c10SJed Brown PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 833831a3094SHong Zhang for (j=jmin; j<n; j++) { 83449b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 83549b5e25fSSatish Balay cval = ib[j]*4; 83649b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 83749b5e25fSSatish Balay z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 83849b5e25fSSatish Balay z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 83949b5e25fSSatish Balay z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 84049b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 84149b5e25fSSatish Balay z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 84249b5e25fSSatish Balay z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 84349b5e25fSSatish Balay z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 84449b5e25fSSatish Balay z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 84549b5e25fSSatish Balay v += 16; 84649b5e25fSSatish Balay } 84749b5e25fSSatish Balay xb +=4; ai++; 84849b5e25fSSatish Balay } 84949b5e25fSSatish Balay 8501ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 851bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 85249b5e25fSSatish Balay 853dc0b31edSSatish Balay ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 85449b5e25fSSatish Balay PetscFunctionReturn(0); 85549b5e25fSSatish Balay } 85649b5e25fSSatish Balay 8574a2ae208SSatish Balay #undef __FUNCT__ 8584a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5" 859dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 86049b5e25fSSatish Balay { 86149b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 862bba805e6SBarry Smith PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5; 86349b5e25fSSatish Balay MatScalar *v; 8646849ba73SBarry Smith PetscErrorCode ierr; 86513f74950SBarry Smith PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 86698c9bda7SSatish Balay PetscInt nonzerorow=0; 86749b5e25fSSatish Balay 86849b5e25fSSatish Balay PetscFunctionBegin; 869343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 8701ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 8711ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 87249b5e25fSSatish Balay 87349b5e25fSSatish Balay v = a->a; 87449b5e25fSSatish Balay xb = x; 87549b5e25fSSatish Balay 87649b5e25fSSatish Balay for (i=0; i<mbs; i++) { 87749b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 87849b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 87949b5e25fSSatish Balay ib = aj + *ai; 880831a3094SHong Zhang jmin = 0; 88198c9bda7SSatish Balay nonzerorow += (n>0); 8827fbae186SHong Zhang if (*ib == i){ /* (diag of A)*x */ 88349b5e25fSSatish Balay z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 88449b5e25fSSatish Balay z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 88549b5e25fSSatish Balay z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 88649b5e25fSSatish Balay z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 88749b5e25fSSatish Balay z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 888831a3094SHong Zhang v += 25; jmin++; 8897fbae186SHong Zhang } 890444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 891444d8c10SJed Brown PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 892831a3094SHong Zhang for (j=jmin; j<n; j++) { 89349b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 89449b5e25fSSatish Balay cval = ib[j]*5; 89549b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 89649b5e25fSSatish Balay z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 89749b5e25fSSatish Balay z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 89849b5e25fSSatish Balay z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 89949b5e25fSSatish Balay z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 90049b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 90149b5e25fSSatish Balay z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 90249b5e25fSSatish Balay z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 90349b5e25fSSatish Balay z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 90449b5e25fSSatish Balay z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 90549b5e25fSSatish Balay z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 90649b5e25fSSatish Balay v += 25; 90749b5e25fSSatish Balay } 90849b5e25fSSatish Balay xb +=5; ai++; 90949b5e25fSSatish Balay } 91049b5e25fSSatish Balay 9111ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 912bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 91349b5e25fSSatish Balay 914dc0b31edSSatish Balay ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 91549b5e25fSSatish Balay PetscFunctionReturn(0); 91649b5e25fSSatish Balay } 9174a2ae208SSatish Balay #undef __FUNCT__ 9184a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6" 919dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 92049b5e25fSSatish Balay { 92149b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 922bba805e6SBarry Smith PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6; 92349b5e25fSSatish Balay MatScalar *v; 9246849ba73SBarry Smith PetscErrorCode ierr; 92513f74950SBarry Smith PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 92698c9bda7SSatish Balay PetscInt nonzerorow=0; 92749b5e25fSSatish Balay 92849b5e25fSSatish Balay PetscFunctionBegin; 929343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 9301ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 9311ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 93249b5e25fSSatish Balay 93349b5e25fSSatish Balay v = a->a; 93449b5e25fSSatish Balay xb = x; 93549b5e25fSSatish Balay 93649b5e25fSSatish Balay for (i=0; i<mbs; i++) { 93749b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 93849b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 93949b5e25fSSatish Balay ib = aj + *ai; 940831a3094SHong Zhang jmin = 0; 94198c9bda7SSatish Balay nonzerorow += (n>0); 9427fbae186SHong Zhang if (*ib == i){ /* (diag of A)*x */ 94349b5e25fSSatish Balay z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 94449b5e25fSSatish Balay z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 94549b5e25fSSatish Balay z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 94649b5e25fSSatish Balay z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 94749b5e25fSSatish Balay z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 94849b5e25fSSatish Balay z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 949831a3094SHong Zhang v += 36; jmin++; 9507fbae186SHong Zhang } 951444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 952444d8c10SJed Brown PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 953831a3094SHong Zhang for (j=jmin; j<n; j++) { 95449b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 95549b5e25fSSatish Balay cval = ib[j]*6; 95649b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 95749b5e25fSSatish Balay z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 95849b5e25fSSatish Balay z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 95949b5e25fSSatish Balay z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 96049b5e25fSSatish Balay z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 96149b5e25fSSatish Balay z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 96249b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 96349b5e25fSSatish Balay z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 96449b5e25fSSatish Balay z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 96549b5e25fSSatish Balay z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 96649b5e25fSSatish Balay z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 96749b5e25fSSatish Balay z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 96849b5e25fSSatish Balay z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 96949b5e25fSSatish Balay v += 36; 97049b5e25fSSatish Balay } 97149b5e25fSSatish Balay xb +=6; ai++; 97249b5e25fSSatish Balay } 97349b5e25fSSatish Balay 9741ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 975bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 97649b5e25fSSatish Balay 977dc0b31edSSatish Balay ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 97849b5e25fSSatish Balay PetscFunctionReturn(0); 97949b5e25fSSatish Balay } 98049b5e25fSSatish Balay 9814a2ae208SSatish Balay #undef __FUNCT__ 9824a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7" 983dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 98449b5e25fSSatish Balay { 98549b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 986bba805e6SBarry Smith PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7; 98749b5e25fSSatish Balay MatScalar *v; 9886849ba73SBarry Smith PetscErrorCode ierr; 98913f74950SBarry Smith PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 99098c9bda7SSatish Balay PetscInt nonzerorow=0; 99149b5e25fSSatish Balay 99249b5e25fSSatish Balay PetscFunctionBegin; 993343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 9941ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 9951ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 99649b5e25fSSatish Balay 99749b5e25fSSatish Balay v = a->a; 99849b5e25fSSatish Balay xb = x; 99949b5e25fSSatish Balay 100049b5e25fSSatish Balay for (i=0; i<mbs; i++) { 100149b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 100249b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 100349b5e25fSSatish Balay ib = aj + *ai; 1004831a3094SHong Zhang jmin = 0; 100598c9bda7SSatish Balay nonzerorow += (n>0); 10067fbae186SHong Zhang if (*ib == i){ /* (diag of A)*x */ 100749b5e25fSSatish Balay z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 100849b5e25fSSatish Balay z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 100949b5e25fSSatish Balay z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 101049b5e25fSSatish Balay z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 101149b5e25fSSatish Balay z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 101249b5e25fSSatish Balay z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 101349b5e25fSSatish Balay z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1014831a3094SHong Zhang v += 49; jmin++; 10157fbae186SHong Zhang } 1016444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1017444d8c10SJed Brown PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1018831a3094SHong Zhang for (j=jmin; j<n; j++) { 101949b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 102049b5e25fSSatish Balay cval = ib[j]*7; 102149b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 102249b5e25fSSatish Balay z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 102349b5e25fSSatish Balay z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 102449b5e25fSSatish Balay z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 102549b5e25fSSatish Balay z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 102649b5e25fSSatish Balay z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 102749b5e25fSSatish Balay z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 102849b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 102949b5e25fSSatish Balay z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 103049b5e25fSSatish Balay z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 103149b5e25fSSatish Balay z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 103249b5e25fSSatish Balay z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 103349b5e25fSSatish Balay z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 103449b5e25fSSatish Balay z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 103549b5e25fSSatish Balay z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 103649b5e25fSSatish Balay v += 49; 103749b5e25fSSatish Balay } 103849b5e25fSSatish Balay xb +=7; ai++; 103949b5e25fSSatish Balay } 104049b5e25fSSatish Balay 10411ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1042bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 104349b5e25fSSatish Balay 1044dc0b31edSSatish Balay ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 104549b5e25fSSatish Balay PetscFunctionReturn(0); 104649b5e25fSSatish Balay } 104749b5e25fSSatish Balay 10484a2ae208SSatish Balay #undef __FUNCT__ 10494a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N" 1050dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 105149b5e25fSSatish Balay { 105249b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1053bba805e6SBarry Smith PetscScalar *x,*x_ptr,*z,*z_ptr=0,*xb,*zb,*work,*workt; 1054066653e3SSatish Balay MatScalar *v; 1055dfbe8321SBarry Smith PetscErrorCode ierr; 1056d0f46423SBarry Smith PetscInt mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 105798c9bda7SSatish Balay PetscInt nonzerorow=0; 105849b5e25fSSatish Balay 105949b5e25fSSatish Balay PetscFunctionBegin; 1060343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 10611ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 10621ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 106349b5e25fSSatish Balay 106449b5e25fSSatish Balay aj = a->j; 106549b5e25fSSatish Balay v = a->a; 106649b5e25fSSatish Balay ii = a->i; 106749b5e25fSSatish Balay 106849b5e25fSSatish Balay if (!a->mult_work) { 1069d0f46423SBarry Smith ierr = PetscMalloc((A->rmap->n+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 107049b5e25fSSatish Balay } 107149b5e25fSSatish Balay work = a->mult_work; 107249b5e25fSSatish Balay 107349b5e25fSSatish Balay 107449b5e25fSSatish Balay for (i=0; i<mbs; i++) { 107549b5e25fSSatish Balay n = ii[1] - ii[0]; ncols = n*bs; 107649b5e25fSSatish Balay workt = work; idx=aj+ii[0]; 107798c9bda7SSatish Balay nonzerorow += (n>0); 107849b5e25fSSatish Balay 107949b5e25fSSatish Balay /* upper triangular part */ 108049b5e25fSSatish Balay for (j=0; j<n; j++) { 108149b5e25fSSatish Balay xb = x_ptr + bs*(*idx++); 108249b5e25fSSatish Balay for (k=0; k<bs; k++) workt[k] = xb[k]; 108349b5e25fSSatish Balay workt += bs; 108449b5e25fSSatish Balay } 108549b5e25fSSatish Balay /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 108649b5e25fSSatish Balay Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 108749b5e25fSSatish Balay 108849b5e25fSSatish Balay /* strict lower triangular part */ 1089831a3094SHong Zhang idx = aj+ii[0]; 1090831a3094SHong Zhang if (*idx == i){ 109196b9376eSHong Zhang ncols -= bs; v += bs2; idx++; n--; 1092831a3094SHong Zhang } 109349b5e25fSSatish Balay if (ncols > 0){ 109449b5e25fSSatish Balay workt = work; 109587828ca2SBarry Smith ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1096831a3094SHong Zhang Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 1097831a3094SHong Zhang for (j=0; j<n; j++) { 1098831a3094SHong Zhang zb = z_ptr + bs*(*idx++); 109949b5e25fSSatish Balay for (k=0; k<bs; k++) zb[k] += workt[k] ; 110049b5e25fSSatish Balay workt += bs; 110149b5e25fSSatish Balay } 110249b5e25fSSatish Balay } 110349b5e25fSSatish Balay 110449b5e25fSSatish Balay x += bs; v += n*bs2; z += bs; ii++; 110549b5e25fSSatish Balay } 110649b5e25fSSatish Balay 11071ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1108bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 110949b5e25fSSatish Balay 1110dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 111149b5e25fSSatish Balay PetscFunctionReturn(0); 111249b5e25fSSatish Balay } 111349b5e25fSSatish Balay 11144a2ae208SSatish Balay #undef __FUNCT__ 11154a2ae208SSatish Balay #define __FUNCT__ "MatScale_SeqSBAIJ" 1116f4df32b1SMatthew Knepley PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha) 111749b5e25fSSatish Balay { 111849b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data; 1119f4df32b1SMatthew Knepley PetscScalar oalpha = alpha; 1120efee365bSSatish Balay PetscErrorCode ierr; 11210805154bSBarry Smith PetscBLASInt one = 1,totalnz = PetscBLASIntCast(a->bs2*a->nz); 112249b5e25fSSatish Balay 112349b5e25fSSatish Balay PetscFunctionBegin; 1124f4df32b1SMatthew Knepley BLASscal_(&totalnz,&oalpha,a->a,&one); 1125efee365bSSatish Balay ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 112649b5e25fSSatish Balay PetscFunctionReturn(0); 112749b5e25fSSatish Balay } 112849b5e25fSSatish Balay 11294a2ae208SSatish Balay #undef __FUNCT__ 11304a2ae208SSatish Balay #define __FUNCT__ "MatNorm_SeqSBAIJ" 1131dfbe8321SBarry Smith PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm) 113249b5e25fSSatish Balay { 113349b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 113449b5e25fSSatish Balay MatScalar *v = a->a; 113549b5e25fSSatish Balay PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1136d0f46423SBarry Smith PetscInt i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j; 11376849ba73SBarry Smith PetscErrorCode ierr; 113813f74950SBarry Smith PetscInt *jl,*il,jmin,jmax,nexti,ik,*col; 113949b5e25fSSatish Balay 114049b5e25fSSatish Balay PetscFunctionBegin; 114149b5e25fSSatish Balay if (type == NORM_FROBENIUS) { 114249b5e25fSSatish Balay for (k=0; k<mbs; k++){ 114349b5e25fSSatish Balay jmin = a->i[k]; jmax = a->i[k+1]; 1144831a3094SHong Zhang col = aj + jmin; 1145831a3094SHong Zhang if (*col == k){ /* diagonal block */ 114649b5e25fSSatish Balay for (i=0; i<bs2; i++){ 114749b5e25fSSatish Balay #if defined(PETSC_USE_COMPLEX) 114849b5e25fSSatish Balay sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++; 114949b5e25fSSatish Balay #else 115049b5e25fSSatish Balay sum_diag += (*v)*(*v); v++; 115149b5e25fSSatish Balay #endif 115249b5e25fSSatish Balay } 1153831a3094SHong Zhang jmin++; 1154831a3094SHong Zhang } 1155831a3094SHong Zhang for (j=jmin; j<jmax; j++){ /* off-diagonal blocks */ 115649b5e25fSSatish Balay for (i=0; i<bs2; i++){ 115749b5e25fSSatish Balay #if defined(PETSC_USE_COMPLEX) 115849b5e25fSSatish Balay sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++; 115949b5e25fSSatish Balay #else 116049b5e25fSSatish Balay sum_off += (*v)*(*v); v++; 116149b5e25fSSatish Balay #endif 116249b5e25fSSatish Balay } 116349b5e25fSSatish Balay } 116449b5e25fSSatish Balay } 116549b5e25fSSatish Balay *norm = sqrt(sum_diag + 2*sum_off); 11660b8dc8d2SHong Zhang } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 116774ed9c26SBarry Smith ierr = PetscMalloc3(bs,PetscReal,&sum,mbs,PetscInt,&il,mbs,PetscInt,&jl);CHKERRQ(ierr); 11680b8dc8d2SHong Zhang for (i=0; i<mbs; i++) jl[i] = mbs; 11690b8dc8d2SHong Zhang il[0] = 0; 117049b5e25fSSatish Balay 117149b5e25fSSatish Balay *norm = 0.0; 117249b5e25fSSatish Balay for (k=0; k<mbs; k++) { /* k_th block row */ 117349b5e25fSSatish Balay for (j=0; j<bs; j++) sum[j]=0.0; 117449b5e25fSSatish Balay /*-- col sum --*/ 117549b5e25fSSatish Balay i = jl[k]; /* first |A(i,k)| to be added */ 1176831a3094SHong Zhang /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1177831a3094SHong Zhang at step k */ 117849b5e25fSSatish Balay while (i<mbs){ 117949b5e25fSSatish Balay nexti = jl[i]; /* next block row to be added */ 118049b5e25fSSatish Balay ik = il[i]; /* block index of A(i,k) in the array a */ 118149b5e25fSSatish Balay for (j=0; j<bs; j++){ 118249b5e25fSSatish Balay v = a->a + ik*bs2 + j*bs; 118349b5e25fSSatish Balay for (k1=0; k1<bs; k1++) { 118449b5e25fSSatish Balay sum[j] += PetscAbsScalar(*v); v++; 118549b5e25fSSatish Balay } 118649b5e25fSSatish Balay } 118749b5e25fSSatish Balay /* update il, jl */ 1188831a3094SHong Zhang jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1189831a3094SHong Zhang jmax = a->i[i+1]; 119049b5e25fSSatish Balay if (jmin < jmax){ 119149b5e25fSSatish Balay il[i] = jmin; 119249b5e25fSSatish Balay j = a->j[jmin]; 119349b5e25fSSatish Balay jl[i] = jl[j]; jl[j]=i; 119449b5e25fSSatish Balay } 119549b5e25fSSatish Balay i = nexti; 119649b5e25fSSatish Balay } 119749b5e25fSSatish Balay /*-- row sum --*/ 119849b5e25fSSatish Balay jmin = a->i[k]; jmax = a->i[k+1]; 119949b5e25fSSatish Balay for (i=jmin; i<jmax; i++) { 120049b5e25fSSatish Balay for (j=0; j<bs; j++){ 120149b5e25fSSatish Balay v = a->a + i*bs2 + j; 120249b5e25fSSatish Balay for (k1=0; k1<bs; k1++){ 12030b8dc8d2SHong Zhang sum[j] += PetscAbsScalar(*v); v += bs; 120449b5e25fSSatish Balay } 120549b5e25fSSatish Balay } 120649b5e25fSSatish Balay } 120749b5e25fSSatish Balay /* add k_th block row to il, jl */ 1208831a3094SHong Zhang col = aj+jmin; 1209831a3094SHong Zhang if (*col == k) jmin++; 121049b5e25fSSatish Balay if (jmin < jmax){ 121149b5e25fSSatish Balay il[k] = jmin; 12120b8dc8d2SHong Zhang j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k; 121349b5e25fSSatish Balay } 121449b5e25fSSatish Balay for (j=0; j<bs; j++){ 121549b5e25fSSatish Balay if (sum[j] > *norm) *norm = sum[j]; 121649b5e25fSSatish Balay } 121749b5e25fSSatish Balay } 121874ed9c26SBarry Smith ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr); 121949b5e25fSSatish Balay } else { 1220e32f2f54SBarry Smith SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 122149b5e25fSSatish Balay } 122249b5e25fSSatish Balay PetscFunctionReturn(0); 122349b5e25fSSatish Balay } 122449b5e25fSSatish Balay 12254a2ae208SSatish Balay #undef __FUNCT__ 12264a2ae208SSatish Balay #define __FUNCT__ "MatEqual_SeqSBAIJ" 1227ace3abfcSBarry Smith PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg) 122849b5e25fSSatish Balay { 122949b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data; 1230dfbe8321SBarry Smith PetscErrorCode ierr; 123149b5e25fSSatish Balay 123249b5e25fSSatish Balay PetscFunctionBegin; 123349b5e25fSSatish Balay 123449b5e25fSSatish Balay /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1235d0f46423SBarry Smith if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1236ef511fbeSHong Zhang *flg = PETSC_FALSE; 1237ef511fbeSHong Zhang PetscFunctionReturn(0); 123849b5e25fSSatish Balay } 123949b5e25fSSatish Balay 124049b5e25fSSatish Balay /* if the a->i are the same */ 124113f74950SBarry Smith ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1242abc0a331SBarry Smith if (!*flg) { 124349b5e25fSSatish Balay PetscFunctionReturn(0); 124449b5e25fSSatish Balay } 124549b5e25fSSatish Balay 124649b5e25fSSatish Balay /* if a->j are the same */ 124713f74950SBarry Smith ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1248abc0a331SBarry Smith if (!*flg) { 124949b5e25fSSatish Balay PetscFunctionReturn(0); 125049b5e25fSSatish Balay } 125149b5e25fSSatish Balay /* if a->a are the same */ 1252d0f46423SBarry Smith ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1253935af2e7SHong Zhang PetscFunctionReturn(0); 125449b5e25fSSatish Balay } 125549b5e25fSSatish Balay 12564a2ae208SSatish Balay #undef __FUNCT__ 12574a2ae208SSatish Balay #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ" 1258dfbe8321SBarry Smith PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v) 125949b5e25fSSatish Balay { 126049b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1261dfbe8321SBarry Smith PetscErrorCode ierr; 126213f74950SBarry Smith PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 126387828ca2SBarry Smith PetscScalar *x,zero = 0.0; 126449b5e25fSSatish Balay MatScalar *aa,*aa_j; 126549b5e25fSSatish Balay 126649b5e25fSSatish Balay PetscFunctionBegin; 1267d0f46423SBarry Smith bs = A->rmap->bs; 1268e32f2f54SBarry Smith if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1"); 126982799104SHong Zhang 127049b5e25fSSatish Balay aa = a->a; 127149b5e25fSSatish Balay ai = a->i; 127249b5e25fSSatish Balay aj = a->j; 127349b5e25fSSatish Balay ambs = a->mbs; 127449b5e25fSSatish Balay bs2 = a->bs2; 127549b5e25fSSatish Balay 12762dcb1b2aSMatthew Knepley ierr = VecSet(v,zero);CHKERRQ(ierr); 12771ebc52fbSHong Zhang ierr = VecGetArray(v,&x);CHKERRQ(ierr); 127849b5e25fSSatish Balay ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1279e32f2f54SBarry Smith if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 128049b5e25fSSatish Balay for (i=0; i<ambs; i++) { 128149b5e25fSSatish Balay j=ai[i]; 128249b5e25fSSatish Balay if (aj[j] == i) { /* if this is a diagonal element */ 128349b5e25fSSatish Balay row = i*bs; 128449b5e25fSSatish Balay aa_j = aa + j*bs2; 1285d5f3da31SBarry Smith if (A->factortype && bs==1){ 128682799104SHong Zhang for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k]; 128782799104SHong Zhang } else { 128849b5e25fSSatish Balay for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 128949b5e25fSSatish Balay } 129049b5e25fSSatish Balay } 129182799104SHong Zhang } 129282799104SHong Zhang 12931ebc52fbSHong Zhang ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 129449b5e25fSSatish Balay PetscFunctionReturn(0); 129549b5e25fSSatish Balay } 129649b5e25fSSatish Balay 12974a2ae208SSatish Balay #undef __FUNCT__ 12984a2ae208SSatish Balay #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ" 1299dfbe8321SBarry Smith PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr) 130049b5e25fSSatish Balay { 130149b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 13025e90f9d9SHong Zhang PetscScalar *l,x,*li,*ri; 130349b5e25fSSatish Balay MatScalar *aa,*v; 1304dfbe8321SBarry Smith PetscErrorCode ierr; 13055e90f9d9SHong Zhang PetscInt i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2; 1306ace3abfcSBarry Smith PetscBool flg; 130749b5e25fSSatish Balay 130849b5e25fSSatish Balay PetscFunctionBegin; 1309b3bf805bSHong Zhang if (ll != rr){ 1310b3bf805bSHong Zhang ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr); 1311e7e72b3dSBarry Smith if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n"); 1312b3bf805bSHong Zhang } 1313b3bf805bSHong Zhang if (!ll) PetscFunctionReturn(0); 131449b5e25fSSatish Balay ai = a->i; 131549b5e25fSSatish Balay aj = a->j; 131649b5e25fSSatish Balay aa = a->a; 1317d0f46423SBarry Smith m = A->rmap->N; 1318d0f46423SBarry Smith bs = A->rmap->bs; 131949b5e25fSSatish Balay mbs = a->mbs; 132049b5e25fSSatish Balay bs2 = a->bs2; 132149b5e25fSSatish Balay 13221ebc52fbSHong Zhang ierr = VecGetArray(ll,&l);CHKERRQ(ierr); 132349b5e25fSSatish Balay ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1324e32f2f54SBarry Smith if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 132549b5e25fSSatish Balay for (i=0; i<mbs; i++) { /* for each block row */ 132649b5e25fSSatish Balay M = ai[i+1] - ai[i]; 132749b5e25fSSatish Balay li = l + i*bs; 132849b5e25fSSatish Balay v = aa + bs2*ai[i]; 132949b5e25fSSatish Balay for (j=0; j<M; j++) { /* for each block */ 133049b5e25fSSatish Balay ri = l + bs*aj[ai[i]+j]; 13315e90f9d9SHong Zhang for (k=0; k<bs; k++) { 133249b5e25fSSatish Balay x = ri[k]; 133349b5e25fSSatish Balay for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x; 133449b5e25fSSatish Balay } 133549b5e25fSSatish Balay } 133649b5e25fSSatish Balay } 13371ebc52fbSHong Zhang ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr); 1338dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 133949b5e25fSSatish Balay PetscFunctionReturn(0); 134049b5e25fSSatish Balay } 134149b5e25fSSatish Balay 13424a2ae208SSatish Balay #undef __FUNCT__ 13434a2ae208SSatish Balay #define __FUNCT__ "MatGetInfo_SeqSBAIJ" 1344dfbe8321SBarry Smith PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info) 134549b5e25fSSatish Balay { 134649b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 134749b5e25fSSatish Balay 134849b5e25fSSatish Balay PetscFunctionBegin; 134949b5e25fSSatish Balay info->block_size = a->bs2; 1350ceed8ce5SJed Brown info->nz_allocated = a->bs2*a->maxnz; /*num. of nonzeros in upper triangular part */ 13516c6c5352SBarry Smith info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */ 135249b5e25fSSatish Balay info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 135349b5e25fSSatish Balay info->assemblies = A->num_ass; 13548e58a170SBarry Smith info->mallocs = A->info.mallocs; 13557adad957SLisandro Dalcin info->memory = ((PetscObject)A)->mem; 1356d5f3da31SBarry Smith if (A->factortype) { 135749b5e25fSSatish Balay info->fill_ratio_given = A->info.fill_ratio_given; 135849b5e25fSSatish Balay info->fill_ratio_needed = A->info.fill_ratio_needed; 135949b5e25fSSatish Balay info->factor_mallocs = A->info.factor_mallocs; 136049b5e25fSSatish Balay } else { 136149b5e25fSSatish Balay info->fill_ratio_given = 0; 136249b5e25fSSatish Balay info->fill_ratio_needed = 0; 136349b5e25fSSatish Balay info->factor_mallocs = 0; 136449b5e25fSSatish Balay } 136549b5e25fSSatish Balay PetscFunctionReturn(0); 136649b5e25fSSatish Balay } 136749b5e25fSSatish Balay 136849b5e25fSSatish Balay 13694a2ae208SSatish Balay #undef __FUNCT__ 13704a2ae208SSatish Balay #define __FUNCT__ "MatZeroEntries_SeqSBAIJ" 1371dfbe8321SBarry Smith PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 137249b5e25fSSatish Balay { 137349b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1374dfbe8321SBarry Smith PetscErrorCode ierr; 137549b5e25fSSatish Balay 137649b5e25fSSatish Balay PetscFunctionBegin; 137749b5e25fSSatish Balay ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 137849b5e25fSSatish Balay PetscFunctionReturn(0); 137949b5e25fSSatish Balay } 1380dc354874SHong Zhang 13814a2ae208SSatish Balay #undef __FUNCT__ 1382985db425SBarry Smith #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ" 1383985db425SBarry Smith /* 1384985db425SBarry Smith This code does not work since it only checks the upper triangular part of 1385985db425SBarry Smith the matrix. Hence it is not listed in the function table. 1386985db425SBarry Smith */ 1387985db425SBarry Smith PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[]) 1388dc354874SHong Zhang { 1389dc354874SHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1390dfbe8321SBarry Smith PetscErrorCode ierr; 139113f74950SBarry Smith PetscInt i,j,n,row,col,bs,*ai,*aj,mbs; 1392c3fca9a7SHong Zhang PetscReal atmp; 1393273d9f13SBarry Smith MatScalar *aa; 1394985db425SBarry Smith PetscScalar *x; 139513f74950SBarry Smith PetscInt ncols,brow,bcol,krow,kcol; 1396dc354874SHong Zhang 1397dc354874SHong Zhang PetscFunctionBegin; 1398e32f2f54SBarry Smith if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov"); 1399e32f2f54SBarry Smith if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1400d0f46423SBarry Smith bs = A->rmap->bs; 1401dc354874SHong Zhang aa = a->a; 1402dc354874SHong Zhang ai = a->i; 1403dc354874SHong Zhang aj = a->j; 140444117c81SHong Zhang mbs = a->mbs; 1405dc354874SHong Zhang 1406985db425SBarry Smith ierr = VecSet(v,0.0);CHKERRQ(ierr); 14071ebc52fbSHong Zhang ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1408dc354874SHong Zhang ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1409e32f2f54SBarry Smith if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 141044117c81SHong Zhang for (i=0; i<mbs; i++) { 1411d0f6400bSHong Zhang ncols = ai[1] - ai[0]; ai++; 1412d0f6400bSHong Zhang brow = bs*i; 141344117c81SHong Zhang for (j=0; j<ncols; j++){ 1414d0f6400bSHong Zhang bcol = bs*(*aj); 141544117c81SHong Zhang for (kcol=0; kcol<bs; kcol++){ 1416d0f6400bSHong Zhang col = bcol + kcol; /* col index */ 141744117c81SHong Zhang for (krow=0; krow<bs; krow++){ 1418d0f6400bSHong Zhang atmp = PetscAbsScalar(*aa); aa++; 1419d0f6400bSHong Zhang row = brow + krow; /* row index */ 1420c3fca9a7SHong Zhang if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1421c3fca9a7SHong Zhang if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 142244117c81SHong Zhang } 142344117c81SHong Zhang } 1424d0f6400bSHong Zhang aj++; 1425dc354874SHong Zhang } 1426dc354874SHong Zhang } 14271ebc52fbSHong Zhang ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1428dc354874SHong Zhang PetscFunctionReturn(0); 1429dc354874SHong Zhang } 1430