149b5e25fSSatish Balay 2c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h> 306873bf2SBarry Smith #include <petsc-private/kernels/blockinvert.h> 4c6db04a5SJed Brown #include <petscbt.h> 5c6db04a5SJed Brown #include <../src/mat/impls/sbaij/seq/sbaij.h> 6c6db04a5SJed Brown #include <petscblaslapack.h> 749b5e25fSSatish Balay 84a2ae208SSatish Balay #undef __FUNCT__ 94a2ae208SSatish Balay #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ" 1013f74950SBarry Smith PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 1149b5e25fSSatish Balay { 125eee224dSHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 136849ba73SBarry Smith PetscErrorCode ierr; 145d0c19d7SBarry Smith PetscInt brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2; 155d0c19d7SBarry Smith const PetscInt *idx; 16db41cccfSHong Zhang PetscBT table_out,table_in; 17d94109b8SHong Zhang 18d94109b8SHong Zhang PetscFunctionBegin; 19e32f2f54SBarry Smith if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 20d94109b8SHong Zhang mbs = a->mbs; 21d94109b8SHong Zhang ai = a->i; 22d94109b8SHong Zhang aj = a->j; 23d0f46423SBarry Smith bs = A->rmap->bs; 2453b8de81SBarry Smith ierr = PetscBTCreate(mbs,&table_out);CHKERRQ(ierr); 25854ce69bSBarry Smith ierr = PetscMalloc1(mbs+1,&nidx);CHKERRQ(ierr); 26854ce69bSBarry Smith ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr); 2753b8de81SBarry Smith ierr = PetscBTCreate(mbs,&table_in);CHKERRQ(ierr); 28d94109b8SHong Zhang 29d94109b8SHong Zhang for (i=0; i<is_max; i++) { /* for each is */ 30d94109b8SHong Zhang isz = 0; 31db41cccfSHong Zhang ierr = PetscBTMemzero(mbs,table_out);CHKERRQ(ierr); 32d94109b8SHong Zhang 33d94109b8SHong Zhang /* Extract the indices, assume there can be duplicate entries */ 34d94109b8SHong Zhang ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 35d94109b8SHong Zhang ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 36d94109b8SHong Zhang 37db41cccfSHong Zhang /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */ 38dbe03f88SHong Zhang bcol_max = 0; 39d94109b8SHong Zhang for (j=0; j<n; ++j) { 40d94109b8SHong Zhang brow = idx[j]/bs; /* convert the indices into block indices */ 41e32f2f54SBarry Smith if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 42db41cccfSHong Zhang if (!PetscBTLookupSet(table_out,brow)) { 43dbe03f88SHong Zhang nidx[isz++] = brow; 44dbe03f88SHong Zhang if (bcol_max < brow) bcol_max = brow; 45dbe03f88SHong Zhang } 46d94109b8SHong Zhang } 47d94109b8SHong Zhang ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 486bf464f9SBarry Smith ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 49d94109b8SHong Zhang 50d94109b8SHong Zhang k = 0; 51d94109b8SHong Zhang for (j=0; j<ov; j++) { /* for each overlap */ 52db41cccfSHong Zhang /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 53db41cccfSHong Zhang ierr = PetscBTMemzero(mbs,table_in);CHKERRQ(ierr); 54db41cccfSHong Zhang for (l=k; l<isz; l++) { ierr = PetscBTSet(table_in,nidx[l]);CHKERRQ(ierr); } 55d94109b8SHong Zhang 56d94109b8SHong Zhang n = isz; /* length of the updated is[i] */ 57d94109b8SHong Zhang for (brow=0; brow<mbs; brow++) { 58d94109b8SHong Zhang start = ai[brow]; end = ai[brow+1]; 59db41cccfSHong Zhang if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */ 60d94109b8SHong Zhang for (l = start; l<end; l++) { 61d94109b8SHong Zhang bcol = aj[l]; 62d7b97159SHong Zhang if (!PetscBTLookupSet(table_out,bcol)) { 63d7b97159SHong Zhang nidx[isz++] = bcol; 64d7b97159SHong Zhang if (bcol_max < bcol) bcol_max = bcol; 65d7b97159SHong Zhang } 66d94109b8SHong Zhang } 67d94109b8SHong Zhang k++; 68d94109b8SHong Zhang if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 69d94109b8SHong Zhang } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 70d94109b8SHong Zhang for (l = start; l<end; l++) { 71d94109b8SHong Zhang bcol = aj[l]; 72dbe03f88SHong Zhang if (bcol > bcol_max) break; 73db41cccfSHong Zhang if (PetscBTLookup(table_in,bcol)) { 7426fbe8dcSKarl Rupp if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow; 75d94109b8SHong Zhang break; /* for l = start; l<end ; l++) */ 76d94109b8SHong Zhang } 77d94109b8SHong Zhang } 78d94109b8SHong Zhang } 79d94109b8SHong Zhang } 80d94109b8SHong Zhang } /* for each overlap */ 81d94109b8SHong Zhang 82d94109b8SHong Zhang /* expand the Index Set */ 83d94109b8SHong Zhang for (j=0; j<isz; j++) { 8426fbe8dcSKarl Rupp for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k; 85d94109b8SHong Zhang } 8670b3c8c7SBarry Smith ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 87d94109b8SHong Zhang } 8894bacf5dSBarry Smith ierr = PetscBTDestroy(&table_out);CHKERRQ(ierr); 89d94109b8SHong Zhang ierr = PetscFree(nidx);CHKERRQ(ierr); 90d94109b8SHong Zhang ierr = PetscFree(nidx2);CHKERRQ(ierr); 9194bacf5dSBarry Smith ierr = PetscBTDestroy(&table_in);CHKERRQ(ierr); 925eee224dSHong Zhang PetscFunctionReturn(0); 9349b5e25fSSatish Balay } 9449b5e25fSSatish Balay 954a2ae208SSatish Balay #undef __FUNCT__ 964a2ae208SSatish Balay #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private" 97*8f46ffcaSHong Zhang PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) //rm 9849b5e25fSSatish Balay { 9949b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c; 1006849ba73SBarry Smith PetscErrorCode ierr; 10113f74950SBarry Smith PetscInt *smap,i,k,kstart,kend,oldcols = a->mbs,*lens; 10213f74950SBarry Smith PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 1035d0c19d7SBarry Smith PetscInt nrows,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 1045d0c19d7SBarry Smith const PetscInt *irow,*aj = a->j,*ai = a->i; 10549b5e25fSSatish Balay MatScalar *mat_a; 10649b5e25fSSatish Balay Mat C; 107ace3abfcSBarry Smith PetscBool flag,sorted; 10849b5e25fSSatish Balay 10949b5e25fSSatish Balay PetscFunctionBegin; 110*8f46ffcaSHong Zhang if (isrow != iscol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isrow"); //rm!!! 111*8f46ffcaSHong Zhang ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr); //rm!!! 112*8f46ffcaSHong Zhang if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); //rm!!! 11349b5e25fSSatish Balay 11449b5e25fSSatish Balay ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 11549b5e25fSSatish Balay ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 11649b5e25fSSatish Balay 117785e854fSJed Brown ierr = PetscMalloc1(oldcols,&smap);CHKERRQ(ierr); 11874ed9c26SBarry Smith ierr = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr); 11949b5e25fSSatish Balay ssmap = smap; 120854ce69bSBarry Smith ierr = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr); 12149b5e25fSSatish Balay for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */ 12249b5e25fSSatish Balay /* determine lens of each row */ 12349b5e25fSSatish Balay for (i=0; i<nrows; i++) { 12449b5e25fSSatish Balay kstart = ai[irow[i]]; 12549b5e25fSSatish Balay kend = kstart + a->ilen[irow[i]]; 12649b5e25fSSatish Balay lens[i] = 0; 12749b5e25fSSatish Balay for (k=kstart; k<kend; k++) { 12826fbe8dcSKarl Rupp if (ssmap[aj[k]]) lens[i]++; 12949b5e25fSSatish Balay } 13049b5e25fSSatish Balay } 13149b5e25fSSatish Balay /* Create and fill new matrix */ 13249b5e25fSSatish Balay if (scall == MAT_REUSE_MATRIX) { 13349b5e25fSSatish Balay c = (Mat_SeqSBAIJ*)((*B)->data); 13449b5e25fSSatish Balay 135e32f2f54SBarry Smith if (c->mbs!=nrows || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 13613f74950SBarry Smith ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 137e7e72b3dSBarry Smith if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 13813f74950SBarry Smith ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 13949b5e25fSSatish Balay C = *B; 14049b5e25fSSatish Balay } else { 141ce94432eSBarry Smith ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr); 142f69a0ea3SMatthew Knepley ierr = MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1437adad957SLisandro Dalcin ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 144ab93d7beSBarry Smith ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr); 14549b5e25fSSatish Balay } 14649b5e25fSSatish Balay c = (Mat_SeqSBAIJ*)(C->data); 14749b5e25fSSatish Balay for (i=0; i<nrows; i++) { 14849b5e25fSSatish Balay row = irow[i]; 14949b5e25fSSatish Balay kstart = ai[row]; 15049b5e25fSSatish Balay kend = kstart + a->ilen[row]; 15149b5e25fSSatish Balay mat_i = c->i[i]; 15249b5e25fSSatish Balay mat_j = c->j + mat_i; 15349b5e25fSSatish Balay mat_a = c->a + mat_i*bs2; 15449b5e25fSSatish Balay mat_ilen = c->ilen + i; 15549b5e25fSSatish Balay for (k=kstart; k<kend; k++) { 15649b5e25fSSatish Balay if ((tcol=ssmap[a->j[k]])) { 15749b5e25fSSatish Balay *mat_j++ = tcol - 1; 15849b5e25fSSatish Balay ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 15949b5e25fSSatish Balay mat_a += bs2; 16049b5e25fSSatish Balay (*mat_ilen)++; 16149b5e25fSSatish Balay } 16249b5e25fSSatish Balay } 16349b5e25fSSatish Balay } 16449b5e25fSSatish Balay 16549b5e25fSSatish Balay /* Free work space */ 16649b5e25fSSatish Balay ierr = PetscFree(smap);CHKERRQ(ierr); 16749b5e25fSSatish Balay ierr = PetscFree(lens);CHKERRQ(ierr); 16849b5e25fSSatish Balay ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 16949b5e25fSSatish Balay ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 17049b5e25fSSatish Balay 17149b5e25fSSatish Balay ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 17249b5e25fSSatish Balay *B = C; 17349b5e25fSSatish Balay PetscFunctionReturn(0); 17449b5e25fSSatish Balay } 17549b5e25fSSatish Balay 1764a2ae208SSatish Balay #undef __FUNCT__ 1774a2ae208SSatish Balay #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ" 1784aa3045dSJed Brown PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 17949b5e25fSSatish Balay { 18049b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 18149b5e25fSSatish Balay IS is1; 1826849ba73SBarry Smith PetscErrorCode ierr; 1835d0c19d7SBarry Smith PetscInt *vary,*iary,nrows,i,bs=A->rmap->bs,count; 1845d0c19d7SBarry Smith const PetscInt *irow; 18549b5e25fSSatish Balay 18649b5e25fSSatish Balay PetscFunctionBegin; 187*8f46ffcaSHong Zhang if (isrow != iscol) { 188*8f46ffcaSHong Zhang PetscBool isequal; 189*8f46ffcaSHong Zhang ierr = ISEqual(isrow,iscol,&isequal);CHKERRQ(ierr); 190*8f46ffcaSHong Zhang if (!isequal) 191*8f46ffcaSHong Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isrow"); 192*8f46ffcaSHong Zhang } 19349b5e25fSSatish Balay 194*8f46ffcaSHong Zhang ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); //isrow == iscol, so delete below!!! 19549b5e25fSSatish Balay ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 19649b5e25fSSatish Balay 19749b5e25fSSatish Balay /* Verify if the indices corespond to each element in a block 198*8f46ffcaSHong Zhang and form the IS with compressed IS -- also sort it!!! */ 199dcca6d9dSJed Brown ierr = PetscMalloc2(a->mbs,&vary,a->mbs,&iary);CHKERRQ(ierr); 20074ed9c26SBarry Smith ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr); 20149b5e25fSSatish Balay for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 20249b5e25fSSatish Balay 20349b5e25fSSatish Balay count = 0; 20449b5e25fSSatish Balay for (i=0; i<a->mbs; i++) { 205e32f2f54SBarry Smith if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Index set does not match blocks"); 20649b5e25fSSatish Balay if (vary[i]==bs) iary[count++] = i; 20749b5e25fSSatish Balay } 20849b5e25fSSatish Balay ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 20970b3c8c7SBarry Smith ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr); 21074ed9c26SBarry Smith ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 21149b5e25fSSatish Balay 212*8f46ffcaSHong Zhang ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,scall,B);CHKERRQ(ierr); //remove a is1!!! 2136bf464f9SBarry Smith ierr = ISDestroy(&is1);CHKERRQ(ierr); 21449b5e25fSSatish Balay PetscFunctionReturn(0); 21549b5e25fSSatish Balay } 21649b5e25fSSatish Balay 2174a2ae208SSatish Balay #undef __FUNCT__ 2184a2ae208SSatish Balay #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ" 21913f74950SBarry Smith PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 22049b5e25fSSatish Balay { 2216849ba73SBarry Smith PetscErrorCode ierr; 22213f74950SBarry Smith PetscInt i; 22349b5e25fSSatish Balay 22449b5e25fSSatish Balay PetscFunctionBegin; 22549b5e25fSSatish Balay if (scall == MAT_INITIAL_MATRIX) { 226854ce69bSBarry Smith ierr = PetscMalloc1(n+1,B);CHKERRQ(ierr); 22749b5e25fSSatish Balay } 22849b5e25fSSatish Balay 22949b5e25fSSatish Balay for (i=0; i<n; i++) { 2304aa3045dSJed Brown ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 23149b5e25fSSatish Balay } 23249b5e25fSSatish Balay PetscFunctionReturn(0); 23349b5e25fSSatish Balay } 23449b5e25fSSatish Balay 23549b5e25fSSatish Balay /* -------------------------------------------------------*/ 23649b5e25fSSatish Balay /* Should check that shapes of vectors and matrices match */ 23749b5e25fSSatish Balay /* -------------------------------------------------------*/ 23849b5e25fSSatish Balay 2394a2ae208SSatish Balay #undef __FUNCT__ 2404a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_2" 241dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz) 24249b5e25fSSatish Balay { 24349b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 244d9ca1df4SBarry Smith PetscScalar *z,x1,x2,zero=0.0; 245d9ca1df4SBarry Smith const PetscScalar *x,*xb; 246d9ca1df4SBarry Smith const MatScalar *v; 2476849ba73SBarry Smith PetscErrorCode ierr; 248d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 249d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 25098c9bda7SSatish Balay PetscInt nonzerorow=0; 25149b5e25fSSatish Balay 25249b5e25fSSatish Balay PetscFunctionBegin; 2532dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 254d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2551ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 25649b5e25fSSatish Balay 25749b5e25fSSatish Balay v = a->a; 25849b5e25fSSatish Balay xb = x; 25949b5e25fSSatish Balay 26049b5e25fSSatish Balay for (i=0; i<mbs; i++) { 26149b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 26249b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; 26349b5e25fSSatish Balay ib = aj + *ai; 264831a3094SHong Zhang jmin = 0; 26598c9bda7SSatish Balay nonzerorow += (n>0); 2667fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 26749b5e25fSSatish Balay z[2*i] += v[0]*x1 + v[2]*x2; 26849b5e25fSSatish Balay z[2*i+1] += v[2]*x1 + v[3]*x2; 269831a3094SHong Zhang v += 4; jmin++; 2707fbae186SHong Zhang } 271444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 272444d8c10SJed Brown PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 273831a3094SHong Zhang for (j=jmin; j<n; j++) { 27449b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 27549b5e25fSSatish Balay cval = ib[j]*2; 27649b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2; 27749b5e25fSSatish Balay z[cval+1] += v[2]*x1 + v[3]*x2; 27849b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 27949b5e25fSSatish Balay z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 28049b5e25fSSatish Balay z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 28149b5e25fSSatish Balay v += 4; 28249b5e25fSSatish Balay } 28349b5e25fSSatish Balay xb +=2; ai++; 28449b5e25fSSatish Balay } 28549b5e25fSSatish Balay 286d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2871ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 288dc0b31edSSatish Balay ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 28949b5e25fSSatish Balay PetscFunctionReturn(0); 29049b5e25fSSatish Balay } 29149b5e25fSSatish Balay 2924a2ae208SSatish Balay #undef __FUNCT__ 2934a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_3" 294dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz) 29549b5e25fSSatish Balay { 29649b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 297d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,zero=0.0; 298d9ca1df4SBarry Smith const PetscScalar *x,*xb; 299d9ca1df4SBarry Smith const MatScalar *v; 3006849ba73SBarry Smith PetscErrorCode ierr; 301d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 302d9ca1df4SBarry Smith const PetscInt *aj = a->j,*ai = a->i,*ib; 30398c9bda7SSatish Balay PetscInt nonzerorow=0; 30449b5e25fSSatish Balay 30549b5e25fSSatish Balay PetscFunctionBegin; 3062dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 307d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 3081ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 30949b5e25fSSatish Balay 31049b5e25fSSatish Balay v = a->a; 31149b5e25fSSatish Balay xb = x; 31249b5e25fSSatish Balay 31349b5e25fSSatish Balay for (i=0; i<mbs; i++) { 31449b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 31549b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 31649b5e25fSSatish Balay ib = aj + *ai; 317831a3094SHong Zhang jmin = 0; 31898c9bda7SSatish Balay nonzerorow += (n>0); 3197fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 32049b5e25fSSatish Balay z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 32149b5e25fSSatish Balay z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 32249b5e25fSSatish Balay z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 323831a3094SHong Zhang v += 9; jmin++; 3247fbae186SHong Zhang } 325444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 326444d8c10SJed Brown PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 327831a3094SHong Zhang for (j=jmin; j<n; j++) { 32849b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 32949b5e25fSSatish Balay cval = ib[j]*3; 33049b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 33149b5e25fSSatish Balay z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 33249b5e25fSSatish Balay z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 33349b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 33449b5e25fSSatish Balay z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 33549b5e25fSSatish Balay z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 33649b5e25fSSatish Balay z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 33749b5e25fSSatish Balay v += 9; 33849b5e25fSSatish Balay } 33949b5e25fSSatish Balay xb +=3; ai++; 34049b5e25fSSatish Balay } 34149b5e25fSSatish Balay 342d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 3431ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 344dc0b31edSSatish Balay ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 34549b5e25fSSatish Balay PetscFunctionReturn(0); 34649b5e25fSSatish Balay } 34749b5e25fSSatish Balay 3484a2ae208SSatish Balay #undef __FUNCT__ 3494a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_4" 350dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz) 35149b5e25fSSatish Balay { 35249b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 353d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,zero=0.0; 354d9ca1df4SBarry Smith const PetscScalar *x,*xb; 355d9ca1df4SBarry Smith const MatScalar *v; 3566849ba73SBarry Smith PetscErrorCode ierr; 357d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 358d9ca1df4SBarry Smith const PetscInt *aj = a->j,*ai = a->i,*ib; 35998c9bda7SSatish Balay PetscInt nonzerorow = 0; 36049b5e25fSSatish Balay 36149b5e25fSSatish Balay PetscFunctionBegin; 3622dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 363d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 3641ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 36549b5e25fSSatish Balay 36649b5e25fSSatish Balay v = a->a; 36749b5e25fSSatish Balay xb = x; 36849b5e25fSSatish Balay 36949b5e25fSSatish Balay for (i=0; i<mbs; i++) { 37049b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 37149b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 37249b5e25fSSatish Balay ib = aj + *ai; 373831a3094SHong Zhang jmin = 0; 37498c9bda7SSatish Balay nonzerorow += (n>0); 3757fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 37649b5e25fSSatish Balay z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 37749b5e25fSSatish Balay z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 37849b5e25fSSatish Balay z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 37949b5e25fSSatish Balay z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 380831a3094SHong Zhang v += 16; jmin++; 3817fbae186SHong Zhang } 382444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 383444d8c10SJed Brown PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 384831a3094SHong Zhang for (j=jmin; j<n; j++) { 38549b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 38649b5e25fSSatish Balay cval = ib[j]*4; 38749b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 38849b5e25fSSatish Balay z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 38949b5e25fSSatish Balay z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 39049b5e25fSSatish Balay z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 39149b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 39249b5e25fSSatish Balay z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 39349b5e25fSSatish Balay z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 39449b5e25fSSatish Balay z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 39549b5e25fSSatish Balay z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 39649b5e25fSSatish Balay v += 16; 39749b5e25fSSatish Balay } 39849b5e25fSSatish Balay xb +=4; ai++; 39949b5e25fSSatish Balay } 40049b5e25fSSatish Balay 401d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 4021ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 403dc0b31edSSatish Balay ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 40449b5e25fSSatish Balay PetscFunctionReturn(0); 40549b5e25fSSatish Balay } 40649b5e25fSSatish Balay 4074a2ae208SSatish Balay #undef __FUNCT__ 4084a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_5" 409dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz) 41049b5e25fSSatish Balay { 41149b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 412d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,x5,zero=0.0; 413d9ca1df4SBarry Smith const PetscScalar *x,*xb; 414d9ca1df4SBarry Smith const MatScalar *v; 4156849ba73SBarry Smith PetscErrorCode ierr; 416d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 417d9ca1df4SBarry Smith const PetscInt *aj = a->j,*ai = a->i,*ib; 41898c9bda7SSatish Balay PetscInt nonzerorow=0; 41949b5e25fSSatish Balay 42049b5e25fSSatish Balay PetscFunctionBegin; 4212dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 422d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 4231ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 42449b5e25fSSatish Balay 42549b5e25fSSatish Balay v = a->a; 42649b5e25fSSatish Balay xb = x; 42749b5e25fSSatish Balay 42849b5e25fSSatish Balay for (i=0; i<mbs; i++) { 42949b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 43049b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 43149b5e25fSSatish Balay ib = aj + *ai; 432831a3094SHong Zhang jmin = 0; 43398c9bda7SSatish Balay nonzerorow += (n>0); 4347fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 43549b5e25fSSatish Balay z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 43649b5e25fSSatish Balay z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 43749b5e25fSSatish Balay z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 43849b5e25fSSatish Balay z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 43949b5e25fSSatish Balay z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 440831a3094SHong Zhang v += 25; jmin++; 4417fbae186SHong Zhang } 442444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 443444d8c10SJed Brown PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 444831a3094SHong Zhang for (j=jmin; j<n; j++) { 44549b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 44649b5e25fSSatish Balay cval = ib[j]*5; 44749b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 44849b5e25fSSatish Balay z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 44949b5e25fSSatish Balay z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 45049b5e25fSSatish Balay z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 45149b5e25fSSatish Balay z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 45249b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 45349b5e25fSSatish Balay z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 45449b5e25fSSatish Balay z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 45549b5e25fSSatish Balay z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 45649b5e25fSSatish Balay z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 45749b5e25fSSatish Balay z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 45849b5e25fSSatish Balay v += 25; 45949b5e25fSSatish Balay } 46049b5e25fSSatish Balay xb +=5; ai++; 46149b5e25fSSatish Balay } 46249b5e25fSSatish Balay 463d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 4641ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 465dc0b31edSSatish Balay ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 46649b5e25fSSatish Balay PetscFunctionReturn(0); 46749b5e25fSSatish Balay } 46849b5e25fSSatish Balay 46949b5e25fSSatish Balay 4704a2ae208SSatish Balay #undef __FUNCT__ 4714a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_6" 472dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz) 47349b5e25fSSatish Balay { 47449b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 475d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,x5,x6,zero=0.0; 476d9ca1df4SBarry Smith const PetscScalar *x,*xb; 477d9ca1df4SBarry Smith const MatScalar *v; 4786849ba73SBarry Smith PetscErrorCode ierr; 479d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 480d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 48198c9bda7SSatish Balay PetscInt nonzerorow=0; 48249b5e25fSSatish Balay 48349b5e25fSSatish Balay PetscFunctionBegin; 4842dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 485d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 4861ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 48749b5e25fSSatish Balay 48849b5e25fSSatish Balay v = a->a; 48949b5e25fSSatish Balay xb = x; 49049b5e25fSSatish Balay 49149b5e25fSSatish Balay for (i=0; i<mbs; i++) { 49249b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 49349b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 49449b5e25fSSatish Balay ib = aj + *ai; 495831a3094SHong Zhang jmin = 0; 49698c9bda7SSatish Balay nonzerorow += (n>0); 4977fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 49849b5e25fSSatish Balay z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 49949b5e25fSSatish Balay z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 50049b5e25fSSatish Balay z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 50149b5e25fSSatish Balay z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 50249b5e25fSSatish Balay z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 50349b5e25fSSatish Balay z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 504831a3094SHong Zhang v += 36; jmin++; 5057fbae186SHong Zhang } 506444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 507444d8c10SJed Brown PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 508831a3094SHong Zhang for (j=jmin; j<n; j++) { 50949b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 51049b5e25fSSatish Balay cval = ib[j]*6; 51149b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 51249b5e25fSSatish Balay z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 51349b5e25fSSatish Balay z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 51449b5e25fSSatish Balay z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 51549b5e25fSSatish Balay z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 51649b5e25fSSatish Balay z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 51749b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 51849b5e25fSSatish Balay z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 51949b5e25fSSatish Balay z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 52049b5e25fSSatish Balay z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 52149b5e25fSSatish Balay z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 52249b5e25fSSatish Balay z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 52349b5e25fSSatish Balay z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 52449b5e25fSSatish Balay v += 36; 52549b5e25fSSatish Balay } 52649b5e25fSSatish Balay xb +=6; ai++; 52749b5e25fSSatish Balay } 52849b5e25fSSatish Balay 529d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 5301ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 531dc0b31edSSatish Balay ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 53249b5e25fSSatish Balay PetscFunctionReturn(0); 53349b5e25fSSatish Balay } 5344a2ae208SSatish Balay #undef __FUNCT__ 5354a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_7" 536dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz) 53749b5e25fSSatish Balay { 53849b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 539d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0; 540d9ca1df4SBarry Smith const PetscScalar *x,*xb; 541d9ca1df4SBarry Smith const MatScalar *v; 5426849ba73SBarry Smith PetscErrorCode ierr; 543d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 544d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 54598c9bda7SSatish Balay PetscInt nonzerorow=0; 54649b5e25fSSatish Balay 54749b5e25fSSatish Balay PetscFunctionBegin; 5482dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 549d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 5501ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 55149b5e25fSSatish Balay 55249b5e25fSSatish Balay v = a->a; 55349b5e25fSSatish Balay xb = x; 55449b5e25fSSatish Balay 55549b5e25fSSatish Balay for (i=0; i<mbs; i++) { 55649b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 55749b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 55849b5e25fSSatish Balay ib = aj + *ai; 559831a3094SHong Zhang jmin = 0; 56098c9bda7SSatish Balay nonzerorow += (n>0); 5617fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 56249b5e25fSSatish Balay z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 56349b5e25fSSatish Balay z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 56449b5e25fSSatish Balay z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 56549b5e25fSSatish Balay z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 56649b5e25fSSatish Balay z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 56749b5e25fSSatish Balay z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 56849b5e25fSSatish Balay z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 569831a3094SHong Zhang v += 49; jmin++; 5707fbae186SHong Zhang } 571444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 572444d8c10SJed Brown PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 573831a3094SHong Zhang for (j=jmin; j<n; j++) { 57449b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 57549b5e25fSSatish Balay cval = ib[j]*7; 57649b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 57749b5e25fSSatish Balay z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 57849b5e25fSSatish Balay z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 57949b5e25fSSatish Balay z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 58049b5e25fSSatish Balay z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 58149b5e25fSSatish Balay z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 58249b5e25fSSatish Balay z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 58349b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 58449b5e25fSSatish Balay z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 58549b5e25fSSatish Balay z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 58649b5e25fSSatish Balay z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 58749b5e25fSSatish Balay z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 58849b5e25fSSatish Balay z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 58949b5e25fSSatish Balay z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 59049b5e25fSSatish Balay z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 59149b5e25fSSatish Balay v += 49; 59249b5e25fSSatish Balay } 59349b5e25fSSatish Balay xb +=7; ai++; 59449b5e25fSSatish Balay } 595d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 5961ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 597dc0b31edSSatish Balay ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 59849b5e25fSSatish Balay PetscFunctionReturn(0); 59949b5e25fSSatish Balay } 60049b5e25fSSatish Balay 60149b5e25fSSatish Balay /* 60249b5e25fSSatish Balay This will not work with MatScalar == float because it calls the BLAS 60349b5e25fSSatish Balay */ 6044a2ae208SSatish Balay #undef __FUNCT__ 6054a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_N" 606dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz) 60749b5e25fSSatish Balay { 60849b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 609d9ca1df4SBarry Smith PetscScalar *z,*z_ptr,*zb,*work,*workt,zero=0.0; 610d9ca1df4SBarry Smith const PetscScalar *x,*x_ptr,*xb; 611d9ca1df4SBarry Smith const MatScalar *v; 612dfbe8321SBarry Smith PetscErrorCode ierr; 613d9ca1df4SBarry Smith PetscInt mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 614d9ca1df4SBarry Smith const PetscInt *idx,*aj,*ii; 61598c9bda7SSatish Balay PetscInt nonzerorow=0; 61649b5e25fSSatish Balay 61749b5e25fSSatish Balay PetscFunctionBegin; 6182dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 619d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);x_ptr = x; 6201ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 62149b5e25fSSatish Balay 62249b5e25fSSatish Balay aj = a->j; 62349b5e25fSSatish Balay v = a->a; 62449b5e25fSSatish Balay ii = a->i; 62549b5e25fSSatish Balay 62649b5e25fSSatish Balay if (!a->mult_work) { 627854ce69bSBarry Smith ierr = PetscMalloc1(A->rmap->N+1,&a->mult_work);CHKERRQ(ierr); 62849b5e25fSSatish Balay } 62949b5e25fSSatish Balay work = a->mult_work; 63049b5e25fSSatish Balay 63149b5e25fSSatish Balay for (i=0; i<mbs; i++) { 63249b5e25fSSatish Balay n = ii[1] - ii[0]; ncols = n*bs; 63349b5e25fSSatish Balay workt = work; idx=aj+ii[0]; 63498c9bda7SSatish Balay nonzerorow += (n>0); 63549b5e25fSSatish Balay 63649b5e25fSSatish Balay /* upper triangular part */ 63749b5e25fSSatish Balay for (j=0; j<n; j++) { 63849b5e25fSSatish Balay xb = x_ptr + bs*(*idx++); 63949b5e25fSSatish Balay for (k=0; k<bs; k++) workt[k] = xb[k]; 64049b5e25fSSatish Balay workt += bs; 64149b5e25fSSatish Balay } 64249b5e25fSSatish Balay /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 64396b95a6bSBarry Smith PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 64449b5e25fSSatish Balay 64549b5e25fSSatish Balay /* strict lower triangular part */ 646831a3094SHong Zhang idx = aj+ii[0]; 647831a3094SHong Zhang if (*idx == i) { 64896b9376eSHong Zhang ncols -= bs; v += bs2; idx++; n--; 649831a3094SHong Zhang } 65096b9376eSHong Zhang 65149b5e25fSSatish Balay if (ncols > 0) { 65249b5e25fSSatish Balay workt = work; 65387828ca2SBarry Smith ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 65496b95a6bSBarry Smith PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 655831a3094SHong Zhang for (j=0; j<n; j++) { 656831a3094SHong Zhang zb = z_ptr + bs*(*idx++); 65749b5e25fSSatish Balay for (k=0; k<bs; k++) zb[k] += workt[k]; 65849b5e25fSSatish Balay workt += bs; 65949b5e25fSSatish Balay } 66049b5e25fSSatish Balay } 66149b5e25fSSatish Balay x += bs; v += n*bs2; z += bs; ii++; 66249b5e25fSSatish Balay } 66349b5e25fSSatish Balay 664d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 6651ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 666dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr); 66749b5e25fSSatish Balay PetscFunctionReturn(0); 66849b5e25fSSatish Balay } 66949b5e25fSSatish Balay 6704a2ae208SSatish Balay #undef __FUNCT__ 6714a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1" 672dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 67349b5e25fSSatish Balay { 67449b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 675d9ca1df4SBarry Smith PetscScalar *z,x1; 676d9ca1df4SBarry Smith const PetscScalar *x,*xb; 677d9ca1df4SBarry Smith const MatScalar *v; 6786849ba73SBarry Smith PetscErrorCode ierr; 679d9ca1df4SBarry Smith PetscInt mbs =a->mbs,i,n,cval,j,jmin; 680d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 68198c9bda7SSatish Balay PetscInt nonzerorow=0; 68249b5e25fSSatish Balay 68349b5e25fSSatish Balay PetscFunctionBegin; 684343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 685d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 6861ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 68749b5e25fSSatish Balay v = a->a; 68849b5e25fSSatish Balay xb = x; 68949b5e25fSSatish Balay 69049b5e25fSSatish Balay for (i=0; i<mbs; i++) { 69149b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th row of A */ 69249b5e25fSSatish Balay x1 = xb[0]; 69349b5e25fSSatish Balay ib = aj + *ai; 694831a3094SHong Zhang jmin = 0; 69598c9bda7SSatish Balay nonzerorow += (n>0); 696831a3094SHong Zhang if (*ib == i) { /* (diag of A)*x */ 697831a3094SHong Zhang z[i] += *v++ * x[*ib++]; jmin++; 698831a3094SHong Zhang } 699831a3094SHong Zhang for (j=jmin; j<n; j++) { 70049b5e25fSSatish Balay cval = *ib; 70149b5e25fSSatish Balay z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 70249b5e25fSSatish Balay z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 70349b5e25fSSatish Balay } 70449b5e25fSSatish Balay xb++; ai++; 70549b5e25fSSatish Balay } 70649b5e25fSSatish Balay 707d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 708bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 70949b5e25fSSatish Balay 710dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 71149b5e25fSSatish Balay PetscFunctionReturn(0); 71249b5e25fSSatish Balay } 71349b5e25fSSatish Balay 7144a2ae208SSatish Balay #undef __FUNCT__ 7154a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2" 716dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 71749b5e25fSSatish Balay { 71849b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 719d9ca1df4SBarry Smith PetscScalar *z,x1,x2; 720d9ca1df4SBarry Smith const PetscScalar *x,*xb; 721d9ca1df4SBarry Smith const MatScalar *v; 7226849ba73SBarry Smith PetscErrorCode ierr; 723d9ca1df4SBarry Smith PetscInt mbs =a->mbs,i,n,cval,j,jmin; 724d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 72598c9bda7SSatish Balay PetscInt nonzerorow=0; 72649b5e25fSSatish Balay 72749b5e25fSSatish Balay PetscFunctionBegin; 728343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 729d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 7301ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 73149b5e25fSSatish Balay 73249b5e25fSSatish Balay v = a->a; 73349b5e25fSSatish Balay xb = x; 73449b5e25fSSatish Balay 73549b5e25fSSatish Balay for (i=0; i<mbs; i++) { 73649b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 73749b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; 73849b5e25fSSatish Balay ib = aj + *ai; 739831a3094SHong Zhang jmin = 0; 74098c9bda7SSatish Balay nonzerorow += (n>0); 7417fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 74249b5e25fSSatish Balay z[2*i] += v[0]*x1 + v[2]*x2; 74349b5e25fSSatish Balay z[2*i+1] += v[2]*x1 + v[3]*x2; 744831a3094SHong Zhang v += 4; jmin++; 7457fbae186SHong Zhang } 746444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 747444d8c10SJed Brown PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 748831a3094SHong Zhang for (j=jmin; j<n; j++) { 74949b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 75049b5e25fSSatish Balay cval = ib[j]*2; 75149b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2; 75249b5e25fSSatish Balay z[cval+1] += v[2]*x1 + v[3]*x2; 75349b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 75449b5e25fSSatish Balay z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 75549b5e25fSSatish Balay z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 75649b5e25fSSatish Balay v += 4; 75749b5e25fSSatish Balay } 75849b5e25fSSatish Balay xb +=2; ai++; 75949b5e25fSSatish Balay } 760d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 761bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 76249b5e25fSSatish Balay 763dc0b31edSSatish Balay ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 76449b5e25fSSatish Balay PetscFunctionReturn(0); 76549b5e25fSSatish Balay } 76649b5e25fSSatish Balay 7674a2ae208SSatish Balay #undef __FUNCT__ 7684a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3" 769dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 77049b5e25fSSatish Balay { 77149b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 772d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3; 773d9ca1df4SBarry Smith const PetscScalar *x,*xb; 774d9ca1df4SBarry Smith const MatScalar *v; 7756849ba73SBarry Smith PetscErrorCode ierr; 776d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 777d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 77898c9bda7SSatish Balay PetscInt nonzerorow=0; 77949b5e25fSSatish Balay 78049b5e25fSSatish Balay PetscFunctionBegin; 781343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 782d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 7831ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 78449b5e25fSSatish Balay 78549b5e25fSSatish Balay v = a->a; 78649b5e25fSSatish Balay xb = x; 78749b5e25fSSatish Balay 78849b5e25fSSatish Balay for (i=0; i<mbs; i++) { 78949b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 79049b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 79149b5e25fSSatish Balay ib = aj + *ai; 792831a3094SHong Zhang jmin = 0; 79398c9bda7SSatish Balay nonzerorow += (n>0); 7947fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 79549b5e25fSSatish Balay z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 79649b5e25fSSatish Balay z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 79749b5e25fSSatish Balay z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 798831a3094SHong Zhang v += 9; jmin++; 7997fbae186SHong Zhang } 800444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 801444d8c10SJed Brown PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 802831a3094SHong Zhang for (j=jmin; j<n; j++) { 80349b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 80449b5e25fSSatish Balay cval = ib[j]*3; 80549b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 80649b5e25fSSatish Balay z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 80749b5e25fSSatish Balay z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 80849b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 80949b5e25fSSatish Balay z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 81049b5e25fSSatish Balay z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 81149b5e25fSSatish Balay z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 81249b5e25fSSatish Balay v += 9; 81349b5e25fSSatish Balay } 81449b5e25fSSatish Balay xb +=3; ai++; 81549b5e25fSSatish Balay } 81649b5e25fSSatish Balay 817d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 818bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 81949b5e25fSSatish Balay 820dc0b31edSSatish Balay ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 82149b5e25fSSatish Balay PetscFunctionReturn(0); 82249b5e25fSSatish Balay } 82349b5e25fSSatish Balay 8244a2ae208SSatish Balay #undef __FUNCT__ 8254a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4" 826dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 82749b5e25fSSatish Balay { 82849b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 829d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4; 830d9ca1df4SBarry Smith const PetscScalar *x,*xb; 831d9ca1df4SBarry Smith const MatScalar *v; 8326849ba73SBarry Smith PetscErrorCode ierr; 833d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 834d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 83598c9bda7SSatish Balay PetscInt nonzerorow=0; 83649b5e25fSSatish Balay 83749b5e25fSSatish Balay PetscFunctionBegin; 838343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 839d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 8401ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 84149b5e25fSSatish Balay 84249b5e25fSSatish Balay v = a->a; 84349b5e25fSSatish Balay xb = x; 84449b5e25fSSatish Balay 84549b5e25fSSatish Balay for (i=0; i<mbs; i++) { 84649b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 84749b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 84849b5e25fSSatish Balay ib = aj + *ai; 849831a3094SHong Zhang jmin = 0; 85098c9bda7SSatish Balay nonzerorow += (n>0); 8517fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 85249b5e25fSSatish Balay z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 85349b5e25fSSatish Balay z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 85449b5e25fSSatish Balay z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 85549b5e25fSSatish Balay z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 856831a3094SHong Zhang v += 16; jmin++; 8577fbae186SHong Zhang } 858444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 859444d8c10SJed Brown PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 860831a3094SHong Zhang for (j=jmin; j<n; j++) { 86149b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 86249b5e25fSSatish Balay cval = ib[j]*4; 86349b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 86449b5e25fSSatish Balay z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 86549b5e25fSSatish Balay z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 86649b5e25fSSatish Balay z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 86749b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 86849b5e25fSSatish Balay z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 86949b5e25fSSatish Balay z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 87049b5e25fSSatish Balay z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 87149b5e25fSSatish Balay z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 87249b5e25fSSatish Balay v += 16; 87349b5e25fSSatish Balay } 87449b5e25fSSatish Balay xb +=4; ai++; 87549b5e25fSSatish Balay } 87649b5e25fSSatish Balay 877d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 878bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 87949b5e25fSSatish Balay 880dc0b31edSSatish Balay ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 88149b5e25fSSatish Balay PetscFunctionReturn(0); 88249b5e25fSSatish Balay } 88349b5e25fSSatish Balay 8844a2ae208SSatish Balay #undef __FUNCT__ 8854a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5" 886dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 88749b5e25fSSatish Balay { 88849b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 889d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,x5; 890d9ca1df4SBarry Smith const PetscScalar *x,*xb; 891d9ca1df4SBarry Smith const MatScalar *v; 8926849ba73SBarry Smith PetscErrorCode ierr; 893d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 894d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 89598c9bda7SSatish Balay PetscInt nonzerorow=0; 89649b5e25fSSatish Balay 89749b5e25fSSatish Balay PetscFunctionBegin; 898343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 899d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 9001ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 90149b5e25fSSatish Balay 90249b5e25fSSatish Balay v = a->a; 90349b5e25fSSatish Balay xb = x; 90449b5e25fSSatish Balay 90549b5e25fSSatish Balay for (i=0; i<mbs; i++) { 90649b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 90749b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 90849b5e25fSSatish Balay ib = aj + *ai; 909831a3094SHong Zhang jmin = 0; 91098c9bda7SSatish Balay nonzerorow += (n>0); 9117fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 91249b5e25fSSatish Balay z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 91349b5e25fSSatish Balay z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 91449b5e25fSSatish Balay z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 91549b5e25fSSatish Balay z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 91649b5e25fSSatish Balay z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 917831a3094SHong Zhang v += 25; jmin++; 9187fbae186SHong Zhang } 919444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 920444d8c10SJed Brown PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 921831a3094SHong Zhang for (j=jmin; j<n; j++) { 92249b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 92349b5e25fSSatish Balay cval = ib[j]*5; 92449b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 92549b5e25fSSatish Balay z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 92649b5e25fSSatish Balay z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 92749b5e25fSSatish Balay z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 92849b5e25fSSatish Balay z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 92949b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 93049b5e25fSSatish Balay z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 93149b5e25fSSatish Balay z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 93249b5e25fSSatish Balay z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 93349b5e25fSSatish Balay z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 93449b5e25fSSatish Balay z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 93549b5e25fSSatish Balay v += 25; 93649b5e25fSSatish Balay } 93749b5e25fSSatish Balay xb +=5; ai++; 93849b5e25fSSatish Balay } 93949b5e25fSSatish Balay 940d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 941bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 94249b5e25fSSatish Balay 943dc0b31edSSatish Balay ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 94449b5e25fSSatish Balay PetscFunctionReturn(0); 94549b5e25fSSatish Balay } 9464a2ae208SSatish Balay #undef __FUNCT__ 9474a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6" 948dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 94949b5e25fSSatish Balay { 95049b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 951d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,x5,x6; 952d9ca1df4SBarry Smith const PetscScalar *x,*xb; 953d9ca1df4SBarry Smith const MatScalar *v; 9546849ba73SBarry Smith PetscErrorCode ierr; 955d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 956d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 95798c9bda7SSatish Balay PetscInt nonzerorow=0; 95849b5e25fSSatish Balay 95949b5e25fSSatish Balay PetscFunctionBegin; 960343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 961d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 9621ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 96349b5e25fSSatish Balay 96449b5e25fSSatish Balay v = a->a; 96549b5e25fSSatish Balay xb = x; 96649b5e25fSSatish Balay 96749b5e25fSSatish Balay for (i=0; i<mbs; i++) { 96849b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 96949b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 97049b5e25fSSatish Balay ib = aj + *ai; 971831a3094SHong Zhang jmin = 0; 97298c9bda7SSatish Balay nonzerorow += (n>0); 9737fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 97449b5e25fSSatish Balay z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 97549b5e25fSSatish Balay z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 97649b5e25fSSatish Balay z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 97749b5e25fSSatish Balay z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 97849b5e25fSSatish Balay z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 97949b5e25fSSatish Balay z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 980831a3094SHong Zhang v += 36; jmin++; 9817fbae186SHong Zhang } 982444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 983444d8c10SJed Brown PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 984831a3094SHong Zhang for (j=jmin; j<n; j++) { 98549b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 98649b5e25fSSatish Balay cval = ib[j]*6; 98749b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 98849b5e25fSSatish Balay z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 98949b5e25fSSatish Balay z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 99049b5e25fSSatish Balay z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 99149b5e25fSSatish Balay z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 99249b5e25fSSatish Balay z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 99349b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 99449b5e25fSSatish Balay z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 99549b5e25fSSatish Balay z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 99649b5e25fSSatish Balay z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 99749b5e25fSSatish Balay z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 99849b5e25fSSatish Balay z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 99949b5e25fSSatish Balay z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 100049b5e25fSSatish Balay v += 36; 100149b5e25fSSatish Balay } 100249b5e25fSSatish Balay xb +=6; ai++; 100349b5e25fSSatish Balay } 100449b5e25fSSatish Balay 1005d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1006bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 100749b5e25fSSatish Balay 1008dc0b31edSSatish Balay ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 100949b5e25fSSatish Balay PetscFunctionReturn(0); 101049b5e25fSSatish Balay } 101149b5e25fSSatish Balay 10124a2ae208SSatish Balay #undef __FUNCT__ 10134a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7" 1014dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 101549b5e25fSSatish Balay { 101649b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1017d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,x5,x6,x7; 1018d9ca1df4SBarry Smith const PetscScalar *x,*xb; 1019d9ca1df4SBarry Smith const MatScalar *v; 10206849ba73SBarry Smith PetscErrorCode ierr; 1021d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 1022d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 102398c9bda7SSatish Balay PetscInt nonzerorow=0; 102449b5e25fSSatish Balay 102549b5e25fSSatish Balay PetscFunctionBegin; 1026343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1027d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 10281ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 102949b5e25fSSatish Balay 103049b5e25fSSatish Balay v = a->a; 103149b5e25fSSatish Balay xb = x; 103249b5e25fSSatish Balay 103349b5e25fSSatish Balay for (i=0; i<mbs; i++) { 103449b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 103549b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 103649b5e25fSSatish Balay ib = aj + *ai; 1037831a3094SHong Zhang jmin = 0; 103898c9bda7SSatish Balay nonzerorow += (n>0); 10397fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 104049b5e25fSSatish Balay z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 104149b5e25fSSatish Balay z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 104249b5e25fSSatish Balay z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 104349b5e25fSSatish Balay z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 104449b5e25fSSatish Balay z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 104549b5e25fSSatish Balay z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 104649b5e25fSSatish Balay z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1047831a3094SHong Zhang v += 49; jmin++; 10487fbae186SHong Zhang } 1049444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1050444d8c10SJed Brown PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1051831a3094SHong Zhang for (j=jmin; j<n; j++) { 105249b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 105349b5e25fSSatish Balay cval = ib[j]*7; 105449b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 105549b5e25fSSatish Balay z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 105649b5e25fSSatish Balay z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 105749b5e25fSSatish Balay z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 105849b5e25fSSatish Balay z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 105949b5e25fSSatish Balay z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 106049b5e25fSSatish Balay z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 106149b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 106249b5e25fSSatish Balay z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 106349b5e25fSSatish Balay z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 106449b5e25fSSatish Balay z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 106549b5e25fSSatish Balay z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 106649b5e25fSSatish Balay z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 106749b5e25fSSatish Balay z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 106849b5e25fSSatish Balay z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 106949b5e25fSSatish Balay v += 49; 107049b5e25fSSatish Balay } 107149b5e25fSSatish Balay xb +=7; ai++; 107249b5e25fSSatish Balay } 107349b5e25fSSatish Balay 1074d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1075bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 107649b5e25fSSatish Balay 1077dc0b31edSSatish Balay ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 107849b5e25fSSatish Balay PetscFunctionReturn(0); 107949b5e25fSSatish Balay } 108049b5e25fSSatish Balay 10814a2ae208SSatish Balay #undef __FUNCT__ 10824a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N" 1083dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 108449b5e25fSSatish Balay { 108549b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1086d9ca1df4SBarry Smith PetscScalar *z,*z_ptr=0,*zb,*work,*workt; 1087d9ca1df4SBarry Smith const PetscScalar *x,*x_ptr,*xb; 1088d9ca1df4SBarry Smith const MatScalar *v; 1089dfbe8321SBarry Smith PetscErrorCode ierr; 1090d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 1091d9ca1df4SBarry Smith const PetscInt *idx,*aj,*ii; 109298c9bda7SSatish Balay PetscInt nonzerorow=0; 109349b5e25fSSatish Balay 109449b5e25fSSatish Balay PetscFunctionBegin; 1095343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1096d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); x_ptr=x; 10971ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 109849b5e25fSSatish Balay 109949b5e25fSSatish Balay aj = a->j; 110049b5e25fSSatish Balay v = a->a; 110149b5e25fSSatish Balay ii = a->i; 110249b5e25fSSatish Balay 110349b5e25fSSatish Balay if (!a->mult_work) { 1104854ce69bSBarry Smith ierr = PetscMalloc1(A->rmap->n+1,&a->mult_work);CHKERRQ(ierr); 110549b5e25fSSatish Balay } 110649b5e25fSSatish Balay work = a->mult_work; 110749b5e25fSSatish Balay 110849b5e25fSSatish Balay 110949b5e25fSSatish Balay for (i=0; i<mbs; i++) { 111049b5e25fSSatish Balay n = ii[1] - ii[0]; ncols = n*bs; 111149b5e25fSSatish Balay workt = work; idx=aj+ii[0]; 111298c9bda7SSatish Balay nonzerorow += (n>0); 111349b5e25fSSatish Balay 111449b5e25fSSatish Balay /* upper triangular part */ 111549b5e25fSSatish Balay for (j=0; j<n; j++) { 111649b5e25fSSatish Balay xb = x_ptr + bs*(*idx++); 111749b5e25fSSatish Balay for (k=0; k<bs; k++) workt[k] = xb[k]; 111849b5e25fSSatish Balay workt += bs; 111949b5e25fSSatish Balay } 112049b5e25fSSatish Balay /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 112196b95a6bSBarry Smith PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 112249b5e25fSSatish Balay 112349b5e25fSSatish Balay /* strict lower triangular part */ 1124831a3094SHong Zhang idx = aj+ii[0]; 1125831a3094SHong Zhang if (*idx == i) { 112696b9376eSHong Zhang ncols -= bs; v += bs2; idx++; n--; 1127831a3094SHong Zhang } 112849b5e25fSSatish Balay if (ncols > 0) { 112949b5e25fSSatish Balay workt = work; 113087828ca2SBarry Smith ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 113196b95a6bSBarry Smith PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 1132831a3094SHong Zhang for (j=0; j<n; j++) { 1133831a3094SHong Zhang zb = z_ptr + bs*(*idx++); 113449b5e25fSSatish Balay for (k=0; k<bs; k++) zb[k] += workt[k]; 113549b5e25fSSatish Balay workt += bs; 113649b5e25fSSatish Balay } 113749b5e25fSSatish Balay } 113849b5e25fSSatish Balay 113949b5e25fSSatish Balay x += bs; v += n*bs2; z += bs; ii++; 114049b5e25fSSatish Balay } 114149b5e25fSSatish Balay 1142d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1143bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 114449b5e25fSSatish Balay 1145dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 114649b5e25fSSatish Balay PetscFunctionReturn(0); 114749b5e25fSSatish Balay } 114849b5e25fSSatish Balay 11494a2ae208SSatish Balay #undef __FUNCT__ 11504a2ae208SSatish Balay #define __FUNCT__ "MatScale_SeqSBAIJ" 1151f4df32b1SMatthew Knepley PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha) 115249b5e25fSSatish Balay { 115349b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data; 1154f4df32b1SMatthew Knepley PetscScalar oalpha = alpha; 1155efee365bSSatish Balay PetscErrorCode ierr; 1156c5df96a5SBarry Smith PetscBLASInt one = 1,totalnz; 115749b5e25fSSatish Balay 115849b5e25fSSatish Balay PetscFunctionBegin; 1159c5df96a5SBarry Smith ierr = PetscBLASIntCast(a->bs2*a->nz,&totalnz);CHKERRQ(ierr); 11608b83055fSJed Brown PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one)); 1161efee365bSSatish Balay ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 116249b5e25fSSatish Balay PetscFunctionReturn(0); 116349b5e25fSSatish Balay } 116449b5e25fSSatish Balay 11654a2ae208SSatish Balay #undef __FUNCT__ 11664a2ae208SSatish Balay #define __FUNCT__ "MatNorm_SeqSBAIJ" 1167dfbe8321SBarry Smith PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm) 116849b5e25fSSatish Balay { 116949b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1170d9ca1df4SBarry Smith const MatScalar *v = a->a; 117149b5e25fSSatish Balay PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1172d9ca1df4SBarry Smith PetscInt i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il; 11736849ba73SBarry Smith PetscErrorCode ierr; 1174d9ca1df4SBarry Smith const PetscInt *aj=a->j,*col; 117549b5e25fSSatish Balay 117649b5e25fSSatish Balay PetscFunctionBegin; 117749b5e25fSSatish Balay if (type == NORM_FROBENIUS) { 117849b5e25fSSatish Balay for (k=0; k<mbs; k++) { 117949b5e25fSSatish Balay jmin = a->i[k]; jmax = a->i[k+1]; 1180831a3094SHong Zhang col = aj + jmin; 1181831a3094SHong Zhang if (*col == k) { /* diagonal block */ 118249b5e25fSSatish Balay for (i=0; i<bs2; i++) { 118349b5e25fSSatish Balay sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++; 118449b5e25fSSatish Balay } 1185831a3094SHong Zhang jmin++; 1186831a3094SHong Zhang } 1187831a3094SHong Zhang for (j=jmin; j<jmax; j++) { /* off-diagonal blocks */ 118849b5e25fSSatish Balay for (i=0; i<bs2; i++) { 118949b5e25fSSatish Balay sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++; 119049b5e25fSSatish Balay } 119149b5e25fSSatish Balay } 119249b5e25fSSatish Balay } 11938f1a2a5eSBarry Smith *norm = PetscSqrtReal(sum_diag + 2*sum_off); 11940b8dc8d2SHong Zhang } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 1195dcca6d9dSJed Brown ierr = PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);CHKERRQ(ierr); 11960b8dc8d2SHong Zhang for (i=0; i<mbs; i++) jl[i] = mbs; 11970b8dc8d2SHong Zhang il[0] = 0; 119849b5e25fSSatish Balay 119949b5e25fSSatish Balay *norm = 0.0; 120049b5e25fSSatish Balay for (k=0; k<mbs; k++) { /* k_th block row */ 120149b5e25fSSatish Balay for (j=0; j<bs; j++) sum[j]=0.0; 120249b5e25fSSatish Balay /*-- col sum --*/ 120349b5e25fSSatish Balay i = jl[k]; /* first |A(i,k)| to be added */ 1204831a3094SHong Zhang /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1205831a3094SHong Zhang at step k */ 120649b5e25fSSatish Balay while (i<mbs) { 120749b5e25fSSatish Balay nexti = jl[i]; /* next block row to be added */ 120849b5e25fSSatish Balay ik = il[i]; /* block index of A(i,k) in the array a */ 120949b5e25fSSatish Balay for (j=0; j<bs; j++) { 121049b5e25fSSatish Balay v = a->a + ik*bs2 + j*bs; 121149b5e25fSSatish Balay for (k1=0; k1<bs; k1++) { 121249b5e25fSSatish Balay sum[j] += PetscAbsScalar(*v); v++; 121349b5e25fSSatish Balay } 121449b5e25fSSatish Balay } 121549b5e25fSSatish Balay /* update il, jl */ 1216831a3094SHong Zhang jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1217831a3094SHong Zhang jmax = a->i[i+1]; 121849b5e25fSSatish Balay if (jmin < jmax) { 121949b5e25fSSatish Balay il[i] = jmin; 122049b5e25fSSatish Balay j = a->j[jmin]; 122149b5e25fSSatish Balay jl[i] = jl[j]; jl[j]=i; 122249b5e25fSSatish Balay } 122349b5e25fSSatish Balay i = nexti; 122449b5e25fSSatish Balay } 122549b5e25fSSatish Balay /*-- row sum --*/ 122649b5e25fSSatish Balay jmin = a->i[k]; jmax = a->i[k+1]; 122749b5e25fSSatish Balay for (i=jmin; i<jmax; i++) { 122849b5e25fSSatish Balay for (j=0; j<bs; j++) { 122949b5e25fSSatish Balay v = a->a + i*bs2 + j; 123049b5e25fSSatish Balay for (k1=0; k1<bs; k1++) { 12310b8dc8d2SHong Zhang sum[j] += PetscAbsScalar(*v); v += bs; 123249b5e25fSSatish Balay } 123349b5e25fSSatish Balay } 123449b5e25fSSatish Balay } 123549b5e25fSSatish Balay /* add k_th block row to il, jl */ 1236831a3094SHong Zhang col = aj+jmin; 1237831a3094SHong Zhang if (*col == k) jmin++; 123849b5e25fSSatish Balay if (jmin < jmax) { 123949b5e25fSSatish Balay il[k] = jmin; 12400b8dc8d2SHong Zhang j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k; 124149b5e25fSSatish Balay } 124249b5e25fSSatish Balay for (j=0; j<bs; j++) { 124349b5e25fSSatish Balay if (sum[j] > *norm) *norm = sum[j]; 124449b5e25fSSatish Balay } 124549b5e25fSSatish Balay } 124674ed9c26SBarry Smith ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr); 1247f23aa3ddSBarry Smith } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 124849b5e25fSSatish Balay PetscFunctionReturn(0); 124949b5e25fSSatish Balay } 125049b5e25fSSatish Balay 12514a2ae208SSatish Balay #undef __FUNCT__ 12524a2ae208SSatish Balay #define __FUNCT__ "MatEqual_SeqSBAIJ" 1253ace3abfcSBarry Smith PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg) 125449b5e25fSSatish Balay { 125549b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data; 1256dfbe8321SBarry Smith PetscErrorCode ierr; 125749b5e25fSSatish Balay 125849b5e25fSSatish Balay PetscFunctionBegin; 125949b5e25fSSatish Balay /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1260d0f46423SBarry Smith if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1261ef511fbeSHong Zhang *flg = PETSC_FALSE; 1262ef511fbeSHong Zhang PetscFunctionReturn(0); 126349b5e25fSSatish Balay } 126449b5e25fSSatish Balay 126549b5e25fSSatish Balay /* if the a->i are the same */ 126613f74950SBarry Smith ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 126726fbe8dcSKarl Rupp if (!*flg) PetscFunctionReturn(0); 126849b5e25fSSatish Balay 126949b5e25fSSatish Balay /* if a->j are the same */ 127013f74950SBarry Smith ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 127126fbe8dcSKarl Rupp if (!*flg) PetscFunctionReturn(0); 127226fbe8dcSKarl Rupp 127349b5e25fSSatish Balay /* if a->a are the same */ 1274d0f46423SBarry Smith ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1275935af2e7SHong Zhang PetscFunctionReturn(0); 127649b5e25fSSatish Balay } 127749b5e25fSSatish Balay 12784a2ae208SSatish Balay #undef __FUNCT__ 12794a2ae208SSatish Balay #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ" 1280dfbe8321SBarry Smith PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v) 128149b5e25fSSatish Balay { 128249b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1283dfbe8321SBarry Smith PetscErrorCode ierr; 1284d9ca1df4SBarry Smith PetscInt i,j,k,row,bs,ambs,bs2; 1285d9ca1df4SBarry Smith const PetscInt *ai,*aj; 128687828ca2SBarry Smith PetscScalar *x,zero = 0.0; 1287d9ca1df4SBarry Smith const MatScalar *aa,*aa_j; 128849b5e25fSSatish Balay 128949b5e25fSSatish Balay PetscFunctionBegin; 1290d0f46423SBarry Smith bs = A->rmap->bs; 1291e32f2f54SBarry Smith if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1"); 129282799104SHong Zhang 129349b5e25fSSatish Balay aa = a->a; 12948a0c6efdSHong Zhang ambs = a->mbs; 12958a0c6efdSHong Zhang 12968a0c6efdSHong Zhang if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) { 12978a0c6efdSHong Zhang PetscInt *diag=a->diag; 12988a0c6efdSHong Zhang aa = a->a; 12998a0c6efdSHong Zhang ambs = a->mbs; 13008a0c6efdSHong Zhang ierr = VecGetArray(v,&x);CHKERRQ(ierr); 13018a0c6efdSHong Zhang for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]]; 13028a0c6efdSHong Zhang ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 13038a0c6efdSHong Zhang PetscFunctionReturn(0); 13048a0c6efdSHong Zhang } 13058a0c6efdSHong Zhang 130649b5e25fSSatish Balay ai = a->i; 130749b5e25fSSatish Balay aj = a->j; 130849b5e25fSSatish Balay bs2 = a->bs2; 13092dcb1b2aSMatthew Knepley ierr = VecSet(v,zero);CHKERRQ(ierr); 13101ebc52fbSHong Zhang ierr = VecGetArray(v,&x);CHKERRQ(ierr); 131149b5e25fSSatish Balay for (i=0; i<ambs; i++) { 131249b5e25fSSatish Balay j=ai[i]; 131349b5e25fSSatish Balay if (aj[j] == i) { /* if this is a diagonal element */ 131449b5e25fSSatish Balay row = i*bs; 131549b5e25fSSatish Balay aa_j = aa + j*bs2; 131649b5e25fSSatish Balay for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 131749b5e25fSSatish Balay } 131849b5e25fSSatish Balay } 13191ebc52fbSHong Zhang ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 132049b5e25fSSatish Balay PetscFunctionReturn(0); 132149b5e25fSSatish Balay } 132249b5e25fSSatish Balay 13234a2ae208SSatish Balay #undef __FUNCT__ 13244a2ae208SSatish Balay #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ" 1325dfbe8321SBarry Smith PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr) 132649b5e25fSSatish Balay { 132749b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1328d9ca1df4SBarry Smith PetscScalar x; 1329d9ca1df4SBarry Smith const PetscScalar *l,*li,*ri; 133049b5e25fSSatish Balay MatScalar *aa,*v; 1331dfbe8321SBarry Smith PetscErrorCode ierr; 13325e90f9d9SHong Zhang PetscInt i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2; 1333ace3abfcSBarry Smith PetscBool flg; 133449b5e25fSSatish Balay 133549b5e25fSSatish Balay PetscFunctionBegin; 1336b3bf805bSHong Zhang if (ll != rr) { 1337b3bf805bSHong Zhang ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr); 1338e7e72b3dSBarry Smith if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n"); 1339b3bf805bSHong Zhang } 1340b3bf805bSHong Zhang if (!ll) PetscFunctionReturn(0); 134149b5e25fSSatish Balay ai = a->i; 134249b5e25fSSatish Balay aj = a->j; 134349b5e25fSSatish Balay aa = a->a; 1344d0f46423SBarry Smith m = A->rmap->N; 1345d0f46423SBarry Smith bs = A->rmap->bs; 134649b5e25fSSatish Balay mbs = a->mbs; 134749b5e25fSSatish Balay bs2 = a->bs2; 134849b5e25fSSatish Balay 1349d9ca1df4SBarry Smith ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr); 135049b5e25fSSatish Balay ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1351e32f2f54SBarry Smith if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 135249b5e25fSSatish Balay for (i=0; i<mbs; i++) { /* for each block row */ 135349b5e25fSSatish Balay M = ai[i+1] - ai[i]; 135449b5e25fSSatish Balay li = l + i*bs; 135549b5e25fSSatish Balay v = aa + bs2*ai[i]; 135649b5e25fSSatish Balay for (j=0; j<M; j++) { /* for each block */ 135749b5e25fSSatish Balay ri = l + bs*aj[ai[i]+j]; 13585e90f9d9SHong Zhang for (k=0; k<bs; k++) { 135949b5e25fSSatish Balay x = ri[k]; 136049b5e25fSSatish Balay for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x; 136149b5e25fSSatish Balay } 136249b5e25fSSatish Balay } 136349b5e25fSSatish Balay } 1364d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr); 1365dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 136649b5e25fSSatish Balay PetscFunctionReturn(0); 136749b5e25fSSatish Balay } 136849b5e25fSSatish Balay 13694a2ae208SSatish Balay #undef __FUNCT__ 13704a2ae208SSatish Balay #define __FUNCT__ "MatGetInfo_SeqSBAIJ" 1371dfbe8321SBarry Smith PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info) 137249b5e25fSSatish Balay { 137349b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 137449b5e25fSSatish Balay 137549b5e25fSSatish Balay PetscFunctionBegin; 137649b5e25fSSatish Balay info->block_size = a->bs2; 1377ceed8ce5SJed Brown info->nz_allocated = a->bs2*a->maxnz; /*num. of nonzeros in upper triangular part */ 13786c6c5352SBarry Smith info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */ 137949b5e25fSSatish Balay info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 138049b5e25fSSatish Balay info->assemblies = A->num_ass; 13818e58a170SBarry Smith info->mallocs = A->info.mallocs; 13827adad957SLisandro Dalcin info->memory = ((PetscObject)A)->mem; 1383d5f3da31SBarry Smith if (A->factortype) { 138449b5e25fSSatish Balay info->fill_ratio_given = A->info.fill_ratio_given; 138549b5e25fSSatish Balay info->fill_ratio_needed = A->info.fill_ratio_needed; 138649b5e25fSSatish Balay info->factor_mallocs = A->info.factor_mallocs; 138749b5e25fSSatish Balay } else { 138849b5e25fSSatish Balay info->fill_ratio_given = 0; 138949b5e25fSSatish Balay info->fill_ratio_needed = 0; 139049b5e25fSSatish Balay info->factor_mallocs = 0; 139149b5e25fSSatish Balay } 139249b5e25fSSatish Balay PetscFunctionReturn(0); 139349b5e25fSSatish Balay } 139449b5e25fSSatish Balay 139549b5e25fSSatish Balay 13964a2ae208SSatish Balay #undef __FUNCT__ 13974a2ae208SSatish Balay #define __FUNCT__ "MatZeroEntries_SeqSBAIJ" 1398dfbe8321SBarry Smith PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 139949b5e25fSSatish Balay { 140049b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1401dfbe8321SBarry Smith PetscErrorCode ierr; 140249b5e25fSSatish Balay 140349b5e25fSSatish Balay PetscFunctionBegin; 140449b5e25fSSatish Balay ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 140549b5e25fSSatish Balay PetscFunctionReturn(0); 140649b5e25fSSatish Balay } 1407dc354874SHong Zhang 14084a2ae208SSatish Balay #undef __FUNCT__ 1409985db425SBarry Smith #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ" 1410985db425SBarry Smith /* 1411985db425SBarry Smith This code does not work since it only checks the upper triangular part of 1412985db425SBarry Smith the matrix. Hence it is not listed in the function table. 1413985db425SBarry Smith */ 1414985db425SBarry Smith PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[]) 1415dc354874SHong Zhang { 1416dc354874SHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1417dfbe8321SBarry Smith PetscErrorCode ierr; 1418d9ca1df4SBarry Smith PetscInt i,j,n,row,col,bs,mbs; 1419d9ca1df4SBarry Smith const PetscInt *ai,*aj; 1420c3fca9a7SHong Zhang PetscReal atmp; 1421d9ca1df4SBarry Smith const MatScalar *aa; 1422985db425SBarry Smith PetscScalar *x; 142313f74950SBarry Smith PetscInt ncols,brow,bcol,krow,kcol; 1424dc354874SHong Zhang 1425dc354874SHong Zhang PetscFunctionBegin; 1426e32f2f54SBarry Smith if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov"); 1427e32f2f54SBarry Smith if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1428d0f46423SBarry Smith bs = A->rmap->bs; 1429dc354874SHong Zhang aa = a->a; 1430dc354874SHong Zhang ai = a->i; 1431dc354874SHong Zhang aj = a->j; 143244117c81SHong Zhang mbs = a->mbs; 1433dc354874SHong Zhang 1434985db425SBarry Smith ierr = VecSet(v,0.0);CHKERRQ(ierr); 14351ebc52fbSHong Zhang ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1436dc354874SHong Zhang ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1437e32f2f54SBarry Smith if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 143844117c81SHong Zhang for (i=0; i<mbs; i++) { 1439d0f6400bSHong Zhang ncols = ai[1] - ai[0]; ai++; 1440d0f6400bSHong Zhang brow = bs*i; 144144117c81SHong Zhang for (j=0; j<ncols; j++) { 1442d0f6400bSHong Zhang bcol = bs*(*aj); 144344117c81SHong Zhang for (kcol=0; kcol<bs; kcol++) { 1444d0f6400bSHong Zhang col = bcol + kcol; /* col index */ 144544117c81SHong Zhang for (krow=0; krow<bs; krow++) { 1446d0f6400bSHong Zhang atmp = PetscAbsScalar(*aa); aa++; 1447d0f6400bSHong Zhang row = brow + krow; /* row index */ 1448c3fca9a7SHong Zhang if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1449c3fca9a7SHong Zhang if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 145044117c81SHong Zhang } 145144117c81SHong Zhang } 1452d0f6400bSHong Zhang aj++; 1453dc354874SHong Zhang } 1454dc354874SHong Zhang } 14551ebc52fbSHong Zhang ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1456dc354874SHong Zhang PetscFunctionReturn(0); 1457dc354874SHong Zhang } 1458