149b5e25fSSatish Balay 2c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h> 3af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h> 4c6db04a5SJed Brown #include <petscbt.h> 5c6db04a5SJed Brown #include <../src/mat/impls/sbaij/seq/sbaij.h> 6c6db04a5SJed Brown #include <petscblaslapack.h> 749b5e25fSSatish Balay 84a2ae208SSatish Balay #undef __FUNCT__ 94a2ae208SSatish Balay #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ" 1013f74950SBarry Smith PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 1149b5e25fSSatish Balay { 125eee224dSHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 136849ba73SBarry Smith PetscErrorCode ierr; 145d0c19d7SBarry Smith PetscInt brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2; 155d0c19d7SBarry Smith const PetscInt *idx; 16db41cccfSHong Zhang PetscBT table_out,table_in; 17d94109b8SHong Zhang 18d94109b8SHong Zhang PetscFunctionBegin; 19e32f2f54SBarry Smith if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 20d94109b8SHong Zhang mbs = a->mbs; 21d94109b8SHong Zhang ai = a->i; 22d94109b8SHong Zhang aj = a->j; 23d0f46423SBarry Smith bs = A->rmap->bs; 2453b8de81SBarry Smith ierr = PetscBTCreate(mbs,&table_out);CHKERRQ(ierr); 25854ce69bSBarry Smith ierr = PetscMalloc1(mbs+1,&nidx);CHKERRQ(ierr); 26854ce69bSBarry Smith ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr); 2753b8de81SBarry Smith ierr = PetscBTCreate(mbs,&table_in);CHKERRQ(ierr); 28d94109b8SHong Zhang 29d94109b8SHong Zhang for (i=0; i<is_max; i++) { /* for each is */ 30d94109b8SHong Zhang isz = 0; 31db41cccfSHong Zhang ierr = PetscBTMemzero(mbs,table_out);CHKERRQ(ierr); 32d94109b8SHong Zhang 33d94109b8SHong Zhang /* Extract the indices, assume there can be duplicate entries */ 34d94109b8SHong Zhang ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 35d94109b8SHong Zhang ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 36d94109b8SHong Zhang 37db41cccfSHong Zhang /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */ 38dbe03f88SHong Zhang bcol_max = 0; 39d94109b8SHong Zhang for (j=0; j<n; ++j) { 40d94109b8SHong Zhang brow = idx[j]/bs; /* convert the indices into block indices */ 41e32f2f54SBarry Smith if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 42db41cccfSHong Zhang if (!PetscBTLookupSet(table_out,brow)) { 43dbe03f88SHong Zhang nidx[isz++] = brow; 44dbe03f88SHong Zhang if (bcol_max < brow) bcol_max = brow; 45dbe03f88SHong Zhang } 46d94109b8SHong Zhang } 47d94109b8SHong Zhang ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 486bf464f9SBarry Smith ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 49d94109b8SHong Zhang 50d94109b8SHong Zhang k = 0; 51d94109b8SHong Zhang for (j=0; j<ov; j++) { /* for each overlap */ 52db41cccfSHong Zhang /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 53db41cccfSHong Zhang ierr = PetscBTMemzero(mbs,table_in);CHKERRQ(ierr); 54db41cccfSHong Zhang for (l=k; l<isz; l++) { ierr = PetscBTSet(table_in,nidx[l]);CHKERRQ(ierr); } 55d94109b8SHong Zhang 56d94109b8SHong Zhang n = isz; /* length of the updated is[i] */ 57d94109b8SHong Zhang for (brow=0; brow<mbs; brow++) { 58d94109b8SHong Zhang start = ai[brow]; end = ai[brow+1]; 59db41cccfSHong Zhang if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */ 60d94109b8SHong Zhang for (l = start; l<end; l++) { 61d94109b8SHong Zhang bcol = aj[l]; 62d7b97159SHong Zhang if (!PetscBTLookupSet(table_out,bcol)) { 63d7b97159SHong Zhang nidx[isz++] = bcol; 64d7b97159SHong Zhang if (bcol_max < bcol) bcol_max = bcol; 65d7b97159SHong Zhang } 66d94109b8SHong Zhang } 67d94109b8SHong Zhang k++; 68d94109b8SHong Zhang if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 69d94109b8SHong Zhang } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 70d94109b8SHong Zhang for (l = start; l<end; l++) { 71d94109b8SHong Zhang bcol = aj[l]; 72dbe03f88SHong Zhang if (bcol > bcol_max) break; 73db41cccfSHong Zhang if (PetscBTLookup(table_in,bcol)) { 7426fbe8dcSKarl Rupp if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow; 75d94109b8SHong Zhang break; /* for l = start; l<end ; l++) */ 76d94109b8SHong Zhang } 77d94109b8SHong Zhang } 78d94109b8SHong Zhang } 79d94109b8SHong Zhang } 80d94109b8SHong Zhang } /* for each overlap */ 81d94109b8SHong Zhang 82d94109b8SHong Zhang /* expand the Index Set */ 83d94109b8SHong Zhang for (j=0; j<isz; j++) { 8426fbe8dcSKarl Rupp for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k; 85d94109b8SHong Zhang } 8670b3c8c7SBarry Smith ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 87d94109b8SHong Zhang } 8894bacf5dSBarry Smith ierr = PetscBTDestroy(&table_out);CHKERRQ(ierr); 89d94109b8SHong Zhang ierr = PetscFree(nidx);CHKERRQ(ierr); 90d94109b8SHong Zhang ierr = PetscFree(nidx2);CHKERRQ(ierr); 9194bacf5dSBarry Smith ierr = PetscBTDestroy(&table_in);CHKERRQ(ierr); 925eee224dSHong Zhang PetscFunctionReturn(0); 9349b5e25fSSatish Balay } 9449b5e25fSSatish Balay 95847daeccSHong Zhang /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc. 96847daeccSHong Zhang Zero some ops' to avoid invalid usse */ 974a2ae208SSatish Balay #undef __FUNCT__ 98847daeccSHong Zhang #define __FUNCT__ "MatSeqSBAIJZeroOps_Private" 99847daeccSHong Zhang PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq) 10049b5e25fSSatish Balay { 1016849ba73SBarry Smith PetscErrorCode ierr; 10249b5e25fSSatish Balay 10349b5e25fSSatish Balay PetscFunctionBegin; 104847daeccSHong Zhang ierr = MatSetOption(Bseq,MAT_SYMMETRIC,PETSC_FALSE);CHKERRQ(ierr); 105847daeccSHong Zhang Bseq->ops->mult = 0; 106847daeccSHong Zhang Bseq->ops->multadd = 0; 107847daeccSHong Zhang Bseq->ops->multtranspose = 0; 108847daeccSHong Zhang Bseq->ops->multtransposeadd = 0; 109847daeccSHong Zhang Bseq->ops->lufactor = 0; 110847daeccSHong Zhang Bseq->ops->choleskyfactor = 0; 111847daeccSHong Zhang Bseq->ops->lufactorsymbolic = 0; 112847daeccSHong Zhang Bseq->ops->choleskyfactorsymbolic = 0; 113847daeccSHong Zhang Bseq->ops->getinertia = 0; 11449b5e25fSSatish Balay PetscFunctionReturn(0); 11549b5e25fSSatish Balay } 11649b5e25fSSatish Balay 117*e50a8114SHong Zhang /* same as MatGetSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */ 118*e50a8114SHong Zhang #undef __FUNCT__ 119*e50a8114SHong Zhang #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private" 120*e50a8114SHong Zhang PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 121*e50a8114SHong Zhang { 122*e50a8114SHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c; 123*e50a8114SHong Zhang PetscErrorCode ierr; 124*e50a8114SHong Zhang PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 125*e50a8114SHong Zhang PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 126*e50a8114SHong Zhang const PetscInt *irow,*icol; 127*e50a8114SHong Zhang PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 128*e50a8114SHong Zhang PetscInt *aj = a->j,*ai = a->i; 129*e50a8114SHong Zhang MatScalar *mat_a; 130*e50a8114SHong Zhang Mat C; 131*e50a8114SHong Zhang PetscBool flag; 132*e50a8114SHong Zhang 133*e50a8114SHong Zhang PetscFunctionBegin; 134*e50a8114SHong Zhang 135*e50a8114SHong Zhang ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 136*e50a8114SHong Zhang ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 137*e50a8114SHong Zhang ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 138*e50a8114SHong Zhang ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 139*e50a8114SHong Zhang 140*e50a8114SHong Zhang ierr = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr); 141*e50a8114SHong Zhang ssmap = smap; 142*e50a8114SHong Zhang ierr = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr); 143*e50a8114SHong Zhang for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 144*e50a8114SHong Zhang /* determine lens of each row */ 145*e50a8114SHong Zhang for (i=0; i<nrows; i++) { 146*e50a8114SHong Zhang kstart = ai[irow[i]]; 147*e50a8114SHong Zhang kend = kstart + a->ilen[irow[i]]; 148*e50a8114SHong Zhang lens[i] = 0; 149*e50a8114SHong Zhang for (k=kstart; k<kend; k++) { 150*e50a8114SHong Zhang if (ssmap[aj[k]]) lens[i]++; 151*e50a8114SHong Zhang } 152*e50a8114SHong Zhang } 153*e50a8114SHong Zhang /* Create and fill new matrix */ 154*e50a8114SHong Zhang if (scall == MAT_REUSE_MATRIX) { 155*e50a8114SHong Zhang c = (Mat_SeqSBAIJ*)((*B)->data); 156*e50a8114SHong Zhang 157*e50a8114SHong Zhang if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 158*e50a8114SHong Zhang ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 159*e50a8114SHong Zhang if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 160*e50a8114SHong Zhang ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 161*e50a8114SHong Zhang C = *B; 162*e50a8114SHong Zhang } else { 163*e50a8114SHong Zhang ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr); 164*e50a8114SHong Zhang ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 165*e50a8114SHong Zhang ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 166*e50a8114SHong Zhang ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr); 167*e50a8114SHong Zhang } 168*e50a8114SHong Zhang c = (Mat_SeqSBAIJ*)(C->data); 169*e50a8114SHong Zhang for (i=0; i<nrows; i++) { 170*e50a8114SHong Zhang row = irow[i]; 171*e50a8114SHong Zhang kstart = ai[row]; 172*e50a8114SHong Zhang kend = kstart + a->ilen[row]; 173*e50a8114SHong Zhang mat_i = c->i[i]; 174*e50a8114SHong Zhang mat_j = c->j + mat_i; 175*e50a8114SHong Zhang mat_a = c->a + mat_i*bs2; 176*e50a8114SHong Zhang mat_ilen = c->ilen + i; 177*e50a8114SHong Zhang for (k=kstart; k<kend; k++) { 178*e50a8114SHong Zhang if ((tcol=ssmap[a->j[k]])) { 179*e50a8114SHong Zhang *mat_j++ = tcol - 1; 180*e50a8114SHong Zhang ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 181*e50a8114SHong Zhang mat_a += bs2; 182*e50a8114SHong Zhang (*mat_ilen)++; 183*e50a8114SHong Zhang } 184*e50a8114SHong Zhang } 185*e50a8114SHong Zhang } 186*e50a8114SHong Zhang /* sort */ 187*e50a8114SHong Zhang { 188*e50a8114SHong Zhang MatScalar *work; 189*e50a8114SHong Zhang 190*e50a8114SHong Zhang ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr); 191*e50a8114SHong Zhang for (i=0; i<nrows; i++) { 192*e50a8114SHong Zhang PetscInt ilen; 193*e50a8114SHong Zhang mat_i = c->i[i]; 194*e50a8114SHong Zhang mat_j = c->j + mat_i; 195*e50a8114SHong Zhang mat_a = c->a + mat_i*bs2; 196*e50a8114SHong Zhang ilen = c->ilen[i]; 197*e50a8114SHong Zhang ierr = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr); 198*e50a8114SHong Zhang } 199*e50a8114SHong Zhang ierr = PetscFree(work);CHKERRQ(ierr); 200*e50a8114SHong Zhang } 201*e50a8114SHong Zhang 202*e50a8114SHong Zhang /* Free work space */ 203*e50a8114SHong Zhang ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 204*e50a8114SHong Zhang ierr = PetscFree(smap);CHKERRQ(ierr); 205*e50a8114SHong Zhang ierr = PetscFree(lens);CHKERRQ(ierr); 206*e50a8114SHong Zhang ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 207*e50a8114SHong Zhang ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 208*e50a8114SHong Zhang 209*e50a8114SHong Zhang ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 210*e50a8114SHong Zhang *B = C; 211*e50a8114SHong Zhang PetscFunctionReturn(0); 212*e50a8114SHong Zhang } 213*e50a8114SHong Zhang 2144a2ae208SSatish Balay #undef __FUNCT__ 2154a2ae208SSatish Balay #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ" 2164aa3045dSJed Brown PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 21749b5e25fSSatish Balay { 218*e50a8114SHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 219*e50a8114SHong Zhang IS is1,is2; 2206849ba73SBarry Smith PetscErrorCode ierr; 221*e50a8114SHong Zhang PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs; 222*e50a8114SHong Zhang const PetscInt *irow,*icol; 22349b5e25fSSatish Balay 22449b5e25fSSatish Balay PetscFunctionBegin; 225*e50a8114SHong Zhang ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 226*e50a8114SHong Zhang ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 227*e50a8114SHong Zhang ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 228*e50a8114SHong Zhang ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 229*e50a8114SHong Zhang 230*e50a8114SHong Zhang /* Verify if the indices corespond to each element in a block 231*e50a8114SHong Zhang and form the IS with compressed IS */ 232*e50a8114SHong Zhang maxmnbs = PetscMax(a->mbs,a->nbs); 233*e50a8114SHong Zhang ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr); 234*e50a8114SHong Zhang ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr); 235*e50a8114SHong Zhang for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 236*e50a8114SHong Zhang for (i=0; i<a->mbs; i++) { 237*e50a8114SHong Zhang if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 238*e50a8114SHong Zhang } 239*e50a8114SHong Zhang count = 0; 240*e50a8114SHong Zhang for (i=0; i<nrows; i++) { 241*e50a8114SHong Zhang PetscInt j = irow[i] / bs; 242*e50a8114SHong Zhang if ((vary[j]--)==bs) iary[count++] = j; 243*e50a8114SHong Zhang } 244*e50a8114SHong Zhang ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr); 245*e50a8114SHong Zhang 246*e50a8114SHong Zhang ierr = PetscMemzero(vary,(a->nbs)*sizeof(PetscInt));CHKERRQ(ierr); 247*e50a8114SHong Zhang for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 248*e50a8114SHong Zhang for (i=0; i<a->nbs; i++) { 249*e50a8114SHong Zhang if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc"); 250*e50a8114SHong Zhang } 251*e50a8114SHong Zhang count = 0; 252*e50a8114SHong Zhang for (i=0; i<ncols; i++) { 253*e50a8114SHong Zhang PetscInt j = icol[i] / bs; 254*e50a8114SHong Zhang if ((vary[j]--)==bs) iary[count++] = j; 255*e50a8114SHong Zhang } 256*e50a8114SHong Zhang ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr); 257*e50a8114SHong Zhang ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 258*e50a8114SHong Zhang ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 259*e50a8114SHong Zhang ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 260*e50a8114SHong Zhang 261*e50a8114SHong Zhang ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr); 262*e50a8114SHong Zhang ierr = ISDestroy(&is1);CHKERRQ(ierr); 263*e50a8114SHong Zhang ierr = ISDestroy(&is2);CHKERRQ(ierr); 264847daeccSHong Zhang 2658f46ffcaSHong Zhang if (isrow != iscol) { 2668f46ffcaSHong Zhang PetscBool isequal; 2678f46ffcaSHong Zhang ierr = ISEqual(isrow,iscol,&isequal);CHKERRQ(ierr); 268847daeccSHong Zhang if (!isequal) { 269847daeccSHong Zhang ierr = MatSeqSBAIJZeroOps_Private(*B);CHKERRQ(ierr); 2708f46ffcaSHong Zhang } 27149b5e25fSSatish Balay } 27249b5e25fSSatish Balay PetscFunctionReturn(0); 27349b5e25fSSatish Balay } 27449b5e25fSSatish Balay 2754a2ae208SSatish Balay #undef __FUNCT__ 2764a2ae208SSatish Balay #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ" 27713f74950SBarry Smith PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 27849b5e25fSSatish Balay { 2796849ba73SBarry Smith PetscErrorCode ierr; 28013f74950SBarry Smith PetscInt i; 28149b5e25fSSatish Balay 28249b5e25fSSatish Balay PetscFunctionBegin; 283*e50a8114SHong Zhang if (scall == MAT_INITIAL_MATRIX) { 284*e50a8114SHong Zhang ierr = PetscMalloc1(n+1,B);CHKERRQ(ierr); 285afebec48SHong Zhang } 286*e50a8114SHong Zhang 287*e50a8114SHong Zhang for (i=0; i<n; i++) { 288*e50a8114SHong Zhang ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 28949b5e25fSSatish Balay } 29049b5e25fSSatish Balay PetscFunctionReturn(0); 29149b5e25fSSatish Balay } 29249b5e25fSSatish Balay 29349b5e25fSSatish Balay /* -------------------------------------------------------*/ 29449b5e25fSSatish Balay /* Should check that shapes of vectors and matrices match */ 29549b5e25fSSatish Balay /* -------------------------------------------------------*/ 29649b5e25fSSatish Balay 2974a2ae208SSatish Balay #undef __FUNCT__ 2984a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_2" 299dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz) 30049b5e25fSSatish Balay { 30149b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 302d9ca1df4SBarry Smith PetscScalar *z,x1,x2,zero=0.0; 303d9ca1df4SBarry Smith const PetscScalar *x,*xb; 304d9ca1df4SBarry Smith const MatScalar *v; 3056849ba73SBarry Smith PetscErrorCode ierr; 306d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 307d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 30898c9bda7SSatish Balay PetscInt nonzerorow=0; 30949b5e25fSSatish Balay 31049b5e25fSSatish Balay PetscFunctionBegin; 3112dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 312d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 3131ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 31449b5e25fSSatish Balay 31549b5e25fSSatish Balay v = a->a; 31649b5e25fSSatish Balay xb = x; 31749b5e25fSSatish Balay 31849b5e25fSSatish Balay for (i=0; i<mbs; i++) { 31949b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 32049b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; 32149b5e25fSSatish Balay ib = aj + *ai; 322831a3094SHong Zhang jmin = 0; 32398c9bda7SSatish Balay nonzerorow += (n>0); 3247fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 32549b5e25fSSatish Balay z[2*i] += v[0]*x1 + v[2]*x2; 32649b5e25fSSatish Balay z[2*i+1] += v[2]*x1 + v[3]*x2; 327831a3094SHong Zhang v += 4; jmin++; 3287fbae186SHong Zhang } 329444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 330444d8c10SJed Brown PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 331831a3094SHong Zhang for (j=jmin; j<n; j++) { 33249b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 33349b5e25fSSatish Balay cval = ib[j]*2; 33449b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2; 33549b5e25fSSatish Balay z[cval+1] += v[2]*x1 + v[3]*x2; 33649b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 33749b5e25fSSatish Balay z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 33849b5e25fSSatish Balay z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 33949b5e25fSSatish Balay v += 4; 34049b5e25fSSatish Balay } 34149b5e25fSSatish Balay xb +=2; ai++; 34249b5e25fSSatish Balay } 34349b5e25fSSatish Balay 344d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 3451ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 346dc0b31edSSatish Balay ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 34749b5e25fSSatish Balay PetscFunctionReturn(0); 34849b5e25fSSatish Balay } 34949b5e25fSSatish Balay 3504a2ae208SSatish Balay #undef __FUNCT__ 3514a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_3" 352dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz) 35349b5e25fSSatish Balay { 35449b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 355d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,zero=0.0; 356d9ca1df4SBarry Smith const PetscScalar *x,*xb; 357d9ca1df4SBarry Smith const MatScalar *v; 3586849ba73SBarry Smith PetscErrorCode ierr; 359d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 360d9ca1df4SBarry Smith const PetscInt *aj = a->j,*ai = a->i,*ib; 36198c9bda7SSatish Balay PetscInt nonzerorow=0; 36249b5e25fSSatish Balay 36349b5e25fSSatish Balay PetscFunctionBegin; 3642dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 365d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 3661ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 36749b5e25fSSatish Balay 36849b5e25fSSatish Balay v = a->a; 36949b5e25fSSatish Balay xb = x; 37049b5e25fSSatish Balay 37149b5e25fSSatish Balay for (i=0; i<mbs; i++) { 37249b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 37349b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 37449b5e25fSSatish Balay ib = aj + *ai; 375831a3094SHong Zhang jmin = 0; 37698c9bda7SSatish Balay nonzerorow += (n>0); 3777fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 37849b5e25fSSatish Balay z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 37949b5e25fSSatish Balay z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 38049b5e25fSSatish Balay z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 381831a3094SHong Zhang v += 9; jmin++; 3827fbae186SHong Zhang } 383444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 384444d8c10SJed Brown PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 385831a3094SHong Zhang for (j=jmin; j<n; j++) { 38649b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 38749b5e25fSSatish Balay cval = ib[j]*3; 38849b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 38949b5e25fSSatish Balay z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 39049b5e25fSSatish Balay z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 39149b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 39249b5e25fSSatish Balay z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 39349b5e25fSSatish Balay z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 39449b5e25fSSatish Balay z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 39549b5e25fSSatish Balay v += 9; 39649b5e25fSSatish Balay } 39749b5e25fSSatish Balay xb +=3; ai++; 39849b5e25fSSatish Balay } 39949b5e25fSSatish Balay 400d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 4011ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 402dc0b31edSSatish Balay ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 40349b5e25fSSatish Balay PetscFunctionReturn(0); 40449b5e25fSSatish Balay } 40549b5e25fSSatish Balay 4064a2ae208SSatish Balay #undef __FUNCT__ 4074a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_4" 408dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz) 40949b5e25fSSatish Balay { 41049b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 411d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,zero=0.0; 412d9ca1df4SBarry Smith const PetscScalar *x,*xb; 413d9ca1df4SBarry Smith const MatScalar *v; 4146849ba73SBarry Smith PetscErrorCode ierr; 415d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 416d9ca1df4SBarry Smith const PetscInt *aj = a->j,*ai = a->i,*ib; 41798c9bda7SSatish Balay PetscInt nonzerorow = 0; 41849b5e25fSSatish Balay 41949b5e25fSSatish Balay PetscFunctionBegin; 4202dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 421d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 4221ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 42349b5e25fSSatish Balay 42449b5e25fSSatish Balay v = a->a; 42549b5e25fSSatish Balay xb = x; 42649b5e25fSSatish Balay 42749b5e25fSSatish Balay for (i=0; i<mbs; i++) { 42849b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 42949b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 43049b5e25fSSatish Balay ib = aj + *ai; 431831a3094SHong Zhang jmin = 0; 43298c9bda7SSatish Balay nonzerorow += (n>0); 4337fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 43449b5e25fSSatish Balay z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 43549b5e25fSSatish Balay z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 43649b5e25fSSatish Balay z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 43749b5e25fSSatish Balay z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 438831a3094SHong Zhang v += 16; jmin++; 4397fbae186SHong Zhang } 440444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 441444d8c10SJed Brown PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 442831a3094SHong Zhang for (j=jmin; j<n; j++) { 44349b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 44449b5e25fSSatish Balay cval = ib[j]*4; 44549b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 44649b5e25fSSatish Balay z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 44749b5e25fSSatish Balay z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 44849b5e25fSSatish Balay z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 44949b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 45049b5e25fSSatish Balay z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 45149b5e25fSSatish Balay z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 45249b5e25fSSatish Balay z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 45349b5e25fSSatish Balay z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 45449b5e25fSSatish Balay v += 16; 45549b5e25fSSatish Balay } 45649b5e25fSSatish Balay xb +=4; ai++; 45749b5e25fSSatish Balay } 45849b5e25fSSatish Balay 459d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 4601ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 461dc0b31edSSatish Balay ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 46249b5e25fSSatish Balay PetscFunctionReturn(0); 46349b5e25fSSatish Balay } 46449b5e25fSSatish Balay 4654a2ae208SSatish Balay #undef __FUNCT__ 4664a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_5" 467dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz) 46849b5e25fSSatish Balay { 46949b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 470d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,x5,zero=0.0; 471d9ca1df4SBarry Smith const PetscScalar *x,*xb; 472d9ca1df4SBarry Smith const MatScalar *v; 4736849ba73SBarry Smith PetscErrorCode ierr; 474d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 475d9ca1df4SBarry Smith const PetscInt *aj = a->j,*ai = a->i,*ib; 47698c9bda7SSatish Balay PetscInt nonzerorow=0; 47749b5e25fSSatish Balay 47849b5e25fSSatish Balay PetscFunctionBegin; 4792dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 480d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 4811ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 48249b5e25fSSatish Balay 48349b5e25fSSatish Balay v = a->a; 48449b5e25fSSatish Balay xb = x; 48549b5e25fSSatish Balay 48649b5e25fSSatish Balay for (i=0; i<mbs; i++) { 48749b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 48849b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 48949b5e25fSSatish Balay ib = aj + *ai; 490831a3094SHong Zhang jmin = 0; 49198c9bda7SSatish Balay nonzerorow += (n>0); 4927fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 49349b5e25fSSatish Balay z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 49449b5e25fSSatish Balay z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 49549b5e25fSSatish Balay z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 49649b5e25fSSatish Balay z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 49749b5e25fSSatish Balay z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 498831a3094SHong Zhang v += 25; jmin++; 4997fbae186SHong Zhang } 500444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 501444d8c10SJed Brown PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 502831a3094SHong Zhang for (j=jmin; j<n; j++) { 50349b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 50449b5e25fSSatish Balay cval = ib[j]*5; 50549b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 50649b5e25fSSatish Balay z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 50749b5e25fSSatish Balay z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 50849b5e25fSSatish Balay z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 50949b5e25fSSatish Balay z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 51049b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 51149b5e25fSSatish Balay z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 51249b5e25fSSatish Balay z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 51349b5e25fSSatish Balay z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 51449b5e25fSSatish Balay z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 51549b5e25fSSatish Balay z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 51649b5e25fSSatish Balay v += 25; 51749b5e25fSSatish Balay } 51849b5e25fSSatish Balay xb +=5; ai++; 51949b5e25fSSatish Balay } 52049b5e25fSSatish Balay 521d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 5221ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 523dc0b31edSSatish Balay ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 52449b5e25fSSatish Balay PetscFunctionReturn(0); 52549b5e25fSSatish Balay } 52649b5e25fSSatish Balay 52749b5e25fSSatish Balay 5284a2ae208SSatish Balay #undef __FUNCT__ 5294a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_6" 530dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz) 53149b5e25fSSatish Balay { 53249b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 533d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,x5,x6,zero=0.0; 534d9ca1df4SBarry Smith const PetscScalar *x,*xb; 535d9ca1df4SBarry Smith const MatScalar *v; 5366849ba73SBarry Smith PetscErrorCode ierr; 537d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 538d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 53998c9bda7SSatish Balay PetscInt nonzerorow=0; 54049b5e25fSSatish Balay 54149b5e25fSSatish Balay PetscFunctionBegin; 5422dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 543d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 5441ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 54549b5e25fSSatish Balay 54649b5e25fSSatish Balay v = a->a; 54749b5e25fSSatish Balay xb = x; 54849b5e25fSSatish Balay 54949b5e25fSSatish Balay for (i=0; i<mbs; i++) { 55049b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 55149b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 55249b5e25fSSatish Balay ib = aj + *ai; 553831a3094SHong Zhang jmin = 0; 55498c9bda7SSatish Balay nonzerorow += (n>0); 5557fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 55649b5e25fSSatish Balay z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 55749b5e25fSSatish Balay z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 55849b5e25fSSatish Balay z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 55949b5e25fSSatish Balay z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 56049b5e25fSSatish Balay z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 56149b5e25fSSatish Balay z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 562831a3094SHong Zhang v += 36; jmin++; 5637fbae186SHong Zhang } 564444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 565444d8c10SJed Brown PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 566831a3094SHong Zhang for (j=jmin; j<n; j++) { 56749b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 56849b5e25fSSatish Balay cval = ib[j]*6; 56949b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 57049b5e25fSSatish Balay z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 57149b5e25fSSatish Balay z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 57249b5e25fSSatish Balay z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 57349b5e25fSSatish Balay z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 57449b5e25fSSatish Balay z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 57549b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 57649b5e25fSSatish Balay z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 57749b5e25fSSatish Balay z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 57849b5e25fSSatish Balay z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 57949b5e25fSSatish Balay z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 58049b5e25fSSatish Balay z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 58149b5e25fSSatish Balay z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 58249b5e25fSSatish Balay v += 36; 58349b5e25fSSatish Balay } 58449b5e25fSSatish Balay xb +=6; ai++; 58549b5e25fSSatish Balay } 58649b5e25fSSatish Balay 587d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 5881ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 589dc0b31edSSatish Balay ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 59049b5e25fSSatish Balay PetscFunctionReturn(0); 59149b5e25fSSatish Balay } 5924a2ae208SSatish Balay #undef __FUNCT__ 5934a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_7" 594dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz) 59549b5e25fSSatish Balay { 59649b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 597d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0; 598d9ca1df4SBarry Smith const PetscScalar *x,*xb; 599d9ca1df4SBarry Smith const MatScalar *v; 6006849ba73SBarry Smith PetscErrorCode ierr; 601d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 602d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 60398c9bda7SSatish Balay PetscInt nonzerorow=0; 60449b5e25fSSatish Balay 60549b5e25fSSatish Balay PetscFunctionBegin; 6062dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 607d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 6081ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 60949b5e25fSSatish Balay 61049b5e25fSSatish Balay v = a->a; 61149b5e25fSSatish Balay xb = x; 61249b5e25fSSatish Balay 61349b5e25fSSatish Balay for (i=0; i<mbs; i++) { 61449b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 61549b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 61649b5e25fSSatish Balay ib = aj + *ai; 617831a3094SHong Zhang jmin = 0; 61898c9bda7SSatish Balay nonzerorow += (n>0); 6197fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 62049b5e25fSSatish Balay z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 62149b5e25fSSatish Balay z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 62249b5e25fSSatish Balay z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 62349b5e25fSSatish Balay z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 62449b5e25fSSatish Balay z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 62549b5e25fSSatish Balay z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 62649b5e25fSSatish Balay z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 627831a3094SHong Zhang v += 49; jmin++; 6287fbae186SHong Zhang } 629444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 630444d8c10SJed Brown PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 631831a3094SHong Zhang for (j=jmin; j<n; j++) { 63249b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 63349b5e25fSSatish Balay cval = ib[j]*7; 63449b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 63549b5e25fSSatish Balay z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 63649b5e25fSSatish Balay z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 63749b5e25fSSatish Balay z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 63849b5e25fSSatish Balay z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 63949b5e25fSSatish Balay z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 64049b5e25fSSatish Balay z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 64149b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 64249b5e25fSSatish Balay z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 64349b5e25fSSatish Balay z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 64449b5e25fSSatish Balay z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 64549b5e25fSSatish Balay z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 64649b5e25fSSatish Balay z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 64749b5e25fSSatish Balay z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 64849b5e25fSSatish Balay z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 64949b5e25fSSatish Balay v += 49; 65049b5e25fSSatish Balay } 65149b5e25fSSatish Balay xb +=7; ai++; 65249b5e25fSSatish Balay } 653d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 6541ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 655dc0b31edSSatish Balay ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 65649b5e25fSSatish Balay PetscFunctionReturn(0); 65749b5e25fSSatish Balay } 65849b5e25fSSatish Balay 65949b5e25fSSatish Balay /* 66049b5e25fSSatish Balay This will not work with MatScalar == float because it calls the BLAS 66149b5e25fSSatish Balay */ 6624a2ae208SSatish Balay #undef __FUNCT__ 6634a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_N" 664dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz) 66549b5e25fSSatish Balay { 66649b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 667d9ca1df4SBarry Smith PetscScalar *z,*z_ptr,*zb,*work,*workt,zero=0.0; 668d9ca1df4SBarry Smith const PetscScalar *x,*x_ptr,*xb; 669d9ca1df4SBarry Smith const MatScalar *v; 670dfbe8321SBarry Smith PetscErrorCode ierr; 671d9ca1df4SBarry Smith PetscInt mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 672d9ca1df4SBarry Smith const PetscInt *idx,*aj,*ii; 67398c9bda7SSatish Balay PetscInt nonzerorow=0; 67449b5e25fSSatish Balay 67549b5e25fSSatish Balay PetscFunctionBegin; 6762dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 677d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);x_ptr = x; 6781ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 67949b5e25fSSatish Balay 68049b5e25fSSatish Balay aj = a->j; 68149b5e25fSSatish Balay v = a->a; 68249b5e25fSSatish Balay ii = a->i; 68349b5e25fSSatish Balay 68449b5e25fSSatish Balay if (!a->mult_work) { 685854ce69bSBarry Smith ierr = PetscMalloc1(A->rmap->N+1,&a->mult_work);CHKERRQ(ierr); 68649b5e25fSSatish Balay } 68749b5e25fSSatish Balay work = a->mult_work; 68849b5e25fSSatish Balay 68949b5e25fSSatish Balay for (i=0; i<mbs; i++) { 69049b5e25fSSatish Balay n = ii[1] - ii[0]; ncols = n*bs; 69149b5e25fSSatish Balay workt = work; idx=aj+ii[0]; 69298c9bda7SSatish Balay nonzerorow += (n>0); 69349b5e25fSSatish Balay 69449b5e25fSSatish Balay /* upper triangular part */ 69549b5e25fSSatish Balay for (j=0; j<n; j++) { 69649b5e25fSSatish Balay xb = x_ptr + bs*(*idx++); 69749b5e25fSSatish Balay for (k=0; k<bs; k++) workt[k] = xb[k]; 69849b5e25fSSatish Balay workt += bs; 69949b5e25fSSatish Balay } 70049b5e25fSSatish Balay /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 70196b95a6bSBarry Smith PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 70249b5e25fSSatish Balay 70349b5e25fSSatish Balay /* strict lower triangular part */ 704831a3094SHong Zhang idx = aj+ii[0]; 705831a3094SHong Zhang if (*idx == i) { 70696b9376eSHong Zhang ncols -= bs; v += bs2; idx++; n--; 707831a3094SHong Zhang } 70896b9376eSHong Zhang 70949b5e25fSSatish Balay if (ncols > 0) { 71049b5e25fSSatish Balay workt = work; 71187828ca2SBarry Smith ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 71296b95a6bSBarry Smith PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 713831a3094SHong Zhang for (j=0; j<n; j++) { 714831a3094SHong Zhang zb = z_ptr + bs*(*idx++); 71549b5e25fSSatish Balay for (k=0; k<bs; k++) zb[k] += workt[k]; 71649b5e25fSSatish Balay workt += bs; 71749b5e25fSSatish Balay } 71849b5e25fSSatish Balay } 71949b5e25fSSatish Balay x += bs; v += n*bs2; z += bs; ii++; 72049b5e25fSSatish Balay } 72149b5e25fSSatish Balay 722d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 7231ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 724dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr); 72549b5e25fSSatish Balay PetscFunctionReturn(0); 72649b5e25fSSatish Balay } 72749b5e25fSSatish Balay 7284a2ae208SSatish Balay #undef __FUNCT__ 7294a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1" 730dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 73149b5e25fSSatish Balay { 73249b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 733d9ca1df4SBarry Smith PetscScalar *z,x1; 734d9ca1df4SBarry Smith const PetscScalar *x,*xb; 735d9ca1df4SBarry Smith const MatScalar *v; 7366849ba73SBarry Smith PetscErrorCode ierr; 737d9ca1df4SBarry Smith PetscInt mbs =a->mbs,i,n,cval,j,jmin; 738d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 73998c9bda7SSatish Balay PetscInt nonzerorow=0; 74049b5e25fSSatish Balay 74149b5e25fSSatish Balay PetscFunctionBegin; 742343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 743d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 7441ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 74549b5e25fSSatish Balay v = a->a; 74649b5e25fSSatish Balay xb = x; 74749b5e25fSSatish Balay 74849b5e25fSSatish Balay for (i=0; i<mbs; i++) { 74949b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th row of A */ 75049b5e25fSSatish Balay x1 = xb[0]; 75149b5e25fSSatish Balay ib = aj + *ai; 752831a3094SHong Zhang jmin = 0; 75398c9bda7SSatish Balay nonzerorow += (n>0); 754831a3094SHong Zhang if (*ib == i) { /* (diag of A)*x */ 755831a3094SHong Zhang z[i] += *v++ * x[*ib++]; jmin++; 756831a3094SHong Zhang } 757831a3094SHong Zhang for (j=jmin; j<n; j++) { 75849b5e25fSSatish Balay cval = *ib; 75949b5e25fSSatish Balay z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 76049b5e25fSSatish Balay z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 76149b5e25fSSatish Balay } 76249b5e25fSSatish Balay xb++; ai++; 76349b5e25fSSatish Balay } 76449b5e25fSSatish Balay 765d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 766bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 76749b5e25fSSatish Balay 768dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 76949b5e25fSSatish Balay PetscFunctionReturn(0); 77049b5e25fSSatish Balay } 77149b5e25fSSatish Balay 7724a2ae208SSatish Balay #undef __FUNCT__ 7734a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2" 774dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 77549b5e25fSSatish Balay { 77649b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 777d9ca1df4SBarry Smith PetscScalar *z,x1,x2; 778d9ca1df4SBarry Smith const PetscScalar *x,*xb; 779d9ca1df4SBarry Smith const MatScalar *v; 7806849ba73SBarry Smith PetscErrorCode ierr; 781d9ca1df4SBarry Smith PetscInt mbs =a->mbs,i,n,cval,j,jmin; 782d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 78398c9bda7SSatish Balay PetscInt nonzerorow=0; 78449b5e25fSSatish Balay 78549b5e25fSSatish Balay PetscFunctionBegin; 786343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 787d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 7881ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 78949b5e25fSSatish Balay 79049b5e25fSSatish Balay v = a->a; 79149b5e25fSSatish Balay xb = x; 79249b5e25fSSatish Balay 79349b5e25fSSatish Balay for (i=0; i<mbs; i++) { 79449b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 79549b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; 79649b5e25fSSatish Balay ib = aj + *ai; 797831a3094SHong Zhang jmin = 0; 79898c9bda7SSatish Balay nonzerorow += (n>0); 7997fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 80049b5e25fSSatish Balay z[2*i] += v[0]*x1 + v[2]*x2; 80149b5e25fSSatish Balay z[2*i+1] += v[2]*x1 + v[3]*x2; 802831a3094SHong Zhang v += 4; jmin++; 8037fbae186SHong Zhang } 804444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 805444d8c10SJed Brown PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 806831a3094SHong Zhang for (j=jmin; j<n; j++) { 80749b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 80849b5e25fSSatish Balay cval = ib[j]*2; 80949b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2; 81049b5e25fSSatish Balay z[cval+1] += v[2]*x1 + v[3]*x2; 81149b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 81249b5e25fSSatish Balay z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 81349b5e25fSSatish Balay z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 81449b5e25fSSatish Balay v += 4; 81549b5e25fSSatish Balay } 81649b5e25fSSatish Balay xb +=2; ai++; 81749b5e25fSSatish Balay } 818d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 819bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 82049b5e25fSSatish Balay 821dc0b31edSSatish Balay ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 82249b5e25fSSatish Balay PetscFunctionReturn(0); 82349b5e25fSSatish Balay } 82449b5e25fSSatish Balay 8254a2ae208SSatish Balay #undef __FUNCT__ 8264a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3" 827dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 82849b5e25fSSatish Balay { 82949b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 830d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3; 831d9ca1df4SBarry Smith const PetscScalar *x,*xb; 832d9ca1df4SBarry Smith const MatScalar *v; 8336849ba73SBarry Smith PetscErrorCode ierr; 834d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 835d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 83698c9bda7SSatish Balay PetscInt nonzerorow=0; 83749b5e25fSSatish Balay 83849b5e25fSSatish Balay PetscFunctionBegin; 839343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 840d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 8411ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 84249b5e25fSSatish Balay 84349b5e25fSSatish Balay v = a->a; 84449b5e25fSSatish Balay xb = x; 84549b5e25fSSatish Balay 84649b5e25fSSatish Balay for (i=0; i<mbs; i++) { 84749b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 84849b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 84949b5e25fSSatish Balay ib = aj + *ai; 850831a3094SHong Zhang jmin = 0; 85198c9bda7SSatish Balay nonzerorow += (n>0); 8527fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 85349b5e25fSSatish Balay z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 85449b5e25fSSatish Balay z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 85549b5e25fSSatish Balay z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 856831a3094SHong Zhang v += 9; jmin++; 8577fbae186SHong Zhang } 858444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 859444d8c10SJed Brown PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 860831a3094SHong Zhang for (j=jmin; j<n; j++) { 86149b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 86249b5e25fSSatish Balay cval = ib[j]*3; 86349b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 86449b5e25fSSatish Balay z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 86549b5e25fSSatish Balay z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 86649b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 86749b5e25fSSatish Balay z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 86849b5e25fSSatish Balay z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 86949b5e25fSSatish Balay z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 87049b5e25fSSatish Balay v += 9; 87149b5e25fSSatish Balay } 87249b5e25fSSatish Balay xb +=3; ai++; 87349b5e25fSSatish Balay } 87449b5e25fSSatish Balay 875d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 876bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 87749b5e25fSSatish Balay 878dc0b31edSSatish Balay ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 87949b5e25fSSatish Balay PetscFunctionReturn(0); 88049b5e25fSSatish Balay } 88149b5e25fSSatish Balay 8824a2ae208SSatish Balay #undef __FUNCT__ 8834a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4" 884dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 88549b5e25fSSatish Balay { 88649b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 887d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4; 888d9ca1df4SBarry Smith const PetscScalar *x,*xb; 889d9ca1df4SBarry Smith const MatScalar *v; 8906849ba73SBarry Smith PetscErrorCode ierr; 891d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 892d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 89398c9bda7SSatish Balay PetscInt nonzerorow=0; 89449b5e25fSSatish Balay 89549b5e25fSSatish Balay PetscFunctionBegin; 896343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 897d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 8981ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 89949b5e25fSSatish Balay 90049b5e25fSSatish Balay v = a->a; 90149b5e25fSSatish Balay xb = x; 90249b5e25fSSatish Balay 90349b5e25fSSatish Balay for (i=0; i<mbs; i++) { 90449b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 90549b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 90649b5e25fSSatish Balay ib = aj + *ai; 907831a3094SHong Zhang jmin = 0; 90898c9bda7SSatish Balay nonzerorow += (n>0); 9097fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 91049b5e25fSSatish Balay z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 91149b5e25fSSatish Balay z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 91249b5e25fSSatish Balay z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 91349b5e25fSSatish Balay z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 914831a3094SHong Zhang v += 16; jmin++; 9157fbae186SHong Zhang } 916444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 917444d8c10SJed Brown PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 918831a3094SHong Zhang for (j=jmin; j<n; j++) { 91949b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 92049b5e25fSSatish Balay cval = ib[j]*4; 92149b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 92249b5e25fSSatish Balay z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 92349b5e25fSSatish Balay z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 92449b5e25fSSatish Balay z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 92549b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 92649b5e25fSSatish Balay z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 92749b5e25fSSatish Balay z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 92849b5e25fSSatish Balay z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 92949b5e25fSSatish Balay z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 93049b5e25fSSatish Balay v += 16; 93149b5e25fSSatish Balay } 93249b5e25fSSatish Balay xb +=4; ai++; 93349b5e25fSSatish Balay } 93449b5e25fSSatish Balay 935d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 936bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 93749b5e25fSSatish Balay 938dc0b31edSSatish Balay ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 93949b5e25fSSatish Balay PetscFunctionReturn(0); 94049b5e25fSSatish Balay } 94149b5e25fSSatish Balay 9424a2ae208SSatish Balay #undef __FUNCT__ 9434a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5" 944dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 94549b5e25fSSatish Balay { 94649b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 947d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,x5; 948d9ca1df4SBarry Smith const PetscScalar *x,*xb; 949d9ca1df4SBarry Smith const MatScalar *v; 9506849ba73SBarry Smith PetscErrorCode ierr; 951d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 952d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 95398c9bda7SSatish Balay PetscInt nonzerorow=0; 95449b5e25fSSatish Balay 95549b5e25fSSatish Balay PetscFunctionBegin; 956343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 957d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 9581ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 95949b5e25fSSatish Balay 96049b5e25fSSatish Balay v = a->a; 96149b5e25fSSatish Balay xb = x; 96249b5e25fSSatish Balay 96349b5e25fSSatish Balay for (i=0; i<mbs; i++) { 96449b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 96549b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 96649b5e25fSSatish Balay ib = aj + *ai; 967831a3094SHong Zhang jmin = 0; 96898c9bda7SSatish Balay nonzerorow += (n>0); 9697fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 97049b5e25fSSatish Balay z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 97149b5e25fSSatish Balay z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 97249b5e25fSSatish Balay z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 97349b5e25fSSatish Balay z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 97449b5e25fSSatish Balay z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 975831a3094SHong Zhang v += 25; jmin++; 9767fbae186SHong Zhang } 977444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 978444d8c10SJed Brown PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 979831a3094SHong Zhang for (j=jmin; j<n; j++) { 98049b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 98149b5e25fSSatish Balay cval = ib[j]*5; 98249b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 98349b5e25fSSatish Balay z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 98449b5e25fSSatish Balay z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 98549b5e25fSSatish Balay z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 98649b5e25fSSatish Balay z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 98749b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 98849b5e25fSSatish Balay z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 98949b5e25fSSatish Balay z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 99049b5e25fSSatish Balay z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 99149b5e25fSSatish Balay z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 99249b5e25fSSatish Balay z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 99349b5e25fSSatish Balay v += 25; 99449b5e25fSSatish Balay } 99549b5e25fSSatish Balay xb +=5; ai++; 99649b5e25fSSatish Balay } 99749b5e25fSSatish Balay 998d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 999bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 100049b5e25fSSatish Balay 1001dc0b31edSSatish Balay ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 100249b5e25fSSatish Balay PetscFunctionReturn(0); 100349b5e25fSSatish Balay } 10044a2ae208SSatish Balay #undef __FUNCT__ 10054a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6" 1006dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 100749b5e25fSSatish Balay { 100849b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1009d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,x5,x6; 1010d9ca1df4SBarry Smith const PetscScalar *x,*xb; 1011d9ca1df4SBarry Smith const MatScalar *v; 10126849ba73SBarry Smith PetscErrorCode ierr; 1013d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 1014d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 101598c9bda7SSatish Balay PetscInt nonzerorow=0; 101649b5e25fSSatish Balay 101749b5e25fSSatish Balay PetscFunctionBegin; 1018343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1019d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 10201ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 102149b5e25fSSatish Balay 102249b5e25fSSatish Balay v = a->a; 102349b5e25fSSatish Balay xb = x; 102449b5e25fSSatish Balay 102549b5e25fSSatish Balay for (i=0; i<mbs; i++) { 102649b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 102749b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 102849b5e25fSSatish Balay ib = aj + *ai; 1029831a3094SHong Zhang jmin = 0; 103098c9bda7SSatish Balay nonzerorow += (n>0); 10317fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 103249b5e25fSSatish Balay z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 103349b5e25fSSatish Balay z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 103449b5e25fSSatish Balay z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 103549b5e25fSSatish Balay z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 103649b5e25fSSatish Balay z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 103749b5e25fSSatish Balay z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 1038831a3094SHong Zhang v += 36; jmin++; 10397fbae186SHong Zhang } 1040444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1041444d8c10SJed Brown PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1042831a3094SHong Zhang for (j=jmin; j<n; j++) { 104349b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 104449b5e25fSSatish Balay cval = ib[j]*6; 104549b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 104649b5e25fSSatish Balay z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 104749b5e25fSSatish Balay z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 104849b5e25fSSatish Balay z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 104949b5e25fSSatish Balay z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 105049b5e25fSSatish Balay z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 105149b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 105249b5e25fSSatish Balay z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 105349b5e25fSSatish Balay z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 105449b5e25fSSatish Balay z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 105549b5e25fSSatish Balay z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 105649b5e25fSSatish Balay z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 105749b5e25fSSatish Balay z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 105849b5e25fSSatish Balay v += 36; 105949b5e25fSSatish Balay } 106049b5e25fSSatish Balay xb +=6; ai++; 106149b5e25fSSatish Balay } 106249b5e25fSSatish Balay 1063d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1064bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 106549b5e25fSSatish Balay 1066dc0b31edSSatish Balay ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 106749b5e25fSSatish Balay PetscFunctionReturn(0); 106849b5e25fSSatish Balay } 106949b5e25fSSatish Balay 10704a2ae208SSatish Balay #undef __FUNCT__ 10714a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7" 1072dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 107349b5e25fSSatish Balay { 107449b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1075d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,x5,x6,x7; 1076d9ca1df4SBarry Smith const PetscScalar *x,*xb; 1077d9ca1df4SBarry Smith const MatScalar *v; 10786849ba73SBarry Smith PetscErrorCode ierr; 1079d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 1080d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 108198c9bda7SSatish Balay PetscInt nonzerorow=0; 108249b5e25fSSatish Balay 108349b5e25fSSatish Balay PetscFunctionBegin; 1084343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1085d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 10861ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 108749b5e25fSSatish Balay 108849b5e25fSSatish Balay v = a->a; 108949b5e25fSSatish Balay xb = x; 109049b5e25fSSatish Balay 109149b5e25fSSatish Balay for (i=0; i<mbs; i++) { 109249b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 109349b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 109449b5e25fSSatish Balay ib = aj + *ai; 1095831a3094SHong Zhang jmin = 0; 109698c9bda7SSatish Balay nonzerorow += (n>0); 10977fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 109849b5e25fSSatish Balay z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 109949b5e25fSSatish Balay z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 110049b5e25fSSatish Balay z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 110149b5e25fSSatish Balay z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 110249b5e25fSSatish Balay z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 110349b5e25fSSatish Balay z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 110449b5e25fSSatish Balay z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1105831a3094SHong Zhang v += 49; jmin++; 11067fbae186SHong Zhang } 1107444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1108444d8c10SJed Brown PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1109831a3094SHong Zhang for (j=jmin; j<n; j++) { 111049b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 111149b5e25fSSatish Balay cval = ib[j]*7; 111249b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 111349b5e25fSSatish Balay z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 111449b5e25fSSatish Balay z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 111549b5e25fSSatish Balay z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 111649b5e25fSSatish Balay z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 111749b5e25fSSatish Balay z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 111849b5e25fSSatish Balay z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 111949b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 112049b5e25fSSatish Balay z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 112149b5e25fSSatish Balay z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 112249b5e25fSSatish Balay z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 112349b5e25fSSatish Balay z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 112449b5e25fSSatish Balay z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 112549b5e25fSSatish Balay z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 112649b5e25fSSatish Balay z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 112749b5e25fSSatish Balay v += 49; 112849b5e25fSSatish Balay } 112949b5e25fSSatish Balay xb +=7; ai++; 113049b5e25fSSatish Balay } 113149b5e25fSSatish Balay 1132d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1133bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 113449b5e25fSSatish Balay 1135dc0b31edSSatish Balay ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 113649b5e25fSSatish Balay PetscFunctionReturn(0); 113749b5e25fSSatish Balay } 113849b5e25fSSatish Balay 11394a2ae208SSatish Balay #undef __FUNCT__ 11404a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N" 1141dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 114249b5e25fSSatish Balay { 114349b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1144d9ca1df4SBarry Smith PetscScalar *z,*z_ptr=0,*zb,*work,*workt; 1145d9ca1df4SBarry Smith const PetscScalar *x,*x_ptr,*xb; 1146d9ca1df4SBarry Smith const MatScalar *v; 1147dfbe8321SBarry Smith PetscErrorCode ierr; 1148d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 1149d9ca1df4SBarry Smith const PetscInt *idx,*aj,*ii; 115098c9bda7SSatish Balay PetscInt nonzerorow=0; 115149b5e25fSSatish Balay 115249b5e25fSSatish Balay PetscFunctionBegin; 1153343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1154d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); x_ptr=x; 11551ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 115649b5e25fSSatish Balay 115749b5e25fSSatish Balay aj = a->j; 115849b5e25fSSatish Balay v = a->a; 115949b5e25fSSatish Balay ii = a->i; 116049b5e25fSSatish Balay 116149b5e25fSSatish Balay if (!a->mult_work) { 1162854ce69bSBarry Smith ierr = PetscMalloc1(A->rmap->n+1,&a->mult_work);CHKERRQ(ierr); 116349b5e25fSSatish Balay } 116449b5e25fSSatish Balay work = a->mult_work; 116549b5e25fSSatish Balay 116649b5e25fSSatish Balay 116749b5e25fSSatish Balay for (i=0; i<mbs; i++) { 116849b5e25fSSatish Balay n = ii[1] - ii[0]; ncols = n*bs; 116949b5e25fSSatish Balay workt = work; idx=aj+ii[0]; 117098c9bda7SSatish Balay nonzerorow += (n>0); 117149b5e25fSSatish Balay 117249b5e25fSSatish Balay /* upper triangular part */ 117349b5e25fSSatish Balay for (j=0; j<n; j++) { 117449b5e25fSSatish Balay xb = x_ptr + bs*(*idx++); 117549b5e25fSSatish Balay for (k=0; k<bs; k++) workt[k] = xb[k]; 117649b5e25fSSatish Balay workt += bs; 117749b5e25fSSatish Balay } 117849b5e25fSSatish Balay /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 117996b95a6bSBarry Smith PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 118049b5e25fSSatish Balay 118149b5e25fSSatish Balay /* strict lower triangular part */ 1182831a3094SHong Zhang idx = aj+ii[0]; 1183831a3094SHong Zhang if (*idx == i) { 118496b9376eSHong Zhang ncols -= bs; v += bs2; idx++; n--; 1185831a3094SHong Zhang } 118649b5e25fSSatish Balay if (ncols > 0) { 118749b5e25fSSatish Balay workt = work; 118887828ca2SBarry Smith ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 118996b95a6bSBarry Smith PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 1190831a3094SHong Zhang for (j=0; j<n; j++) { 1191831a3094SHong Zhang zb = z_ptr + bs*(*idx++); 119249b5e25fSSatish Balay for (k=0; k<bs; k++) zb[k] += workt[k]; 119349b5e25fSSatish Balay workt += bs; 119449b5e25fSSatish Balay } 119549b5e25fSSatish Balay } 119649b5e25fSSatish Balay 119749b5e25fSSatish Balay x += bs; v += n*bs2; z += bs; ii++; 119849b5e25fSSatish Balay } 119949b5e25fSSatish Balay 1200d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1201bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 120249b5e25fSSatish Balay 1203dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 120449b5e25fSSatish Balay PetscFunctionReturn(0); 120549b5e25fSSatish Balay } 120649b5e25fSSatish Balay 12074a2ae208SSatish Balay #undef __FUNCT__ 12084a2ae208SSatish Balay #define __FUNCT__ "MatScale_SeqSBAIJ" 1209f4df32b1SMatthew Knepley PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha) 121049b5e25fSSatish Balay { 121149b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data; 1212f4df32b1SMatthew Knepley PetscScalar oalpha = alpha; 1213efee365bSSatish Balay PetscErrorCode ierr; 1214c5df96a5SBarry Smith PetscBLASInt one = 1,totalnz; 121549b5e25fSSatish Balay 121649b5e25fSSatish Balay PetscFunctionBegin; 1217c5df96a5SBarry Smith ierr = PetscBLASIntCast(a->bs2*a->nz,&totalnz);CHKERRQ(ierr); 12188b83055fSJed Brown PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one)); 1219efee365bSSatish Balay ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 122049b5e25fSSatish Balay PetscFunctionReturn(0); 122149b5e25fSSatish Balay } 122249b5e25fSSatish Balay 12234a2ae208SSatish Balay #undef __FUNCT__ 12244a2ae208SSatish Balay #define __FUNCT__ "MatNorm_SeqSBAIJ" 1225dfbe8321SBarry Smith PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm) 122649b5e25fSSatish Balay { 122749b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1228d9ca1df4SBarry Smith const MatScalar *v = a->a; 122949b5e25fSSatish Balay PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1230d9ca1df4SBarry Smith PetscInt i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il; 12316849ba73SBarry Smith PetscErrorCode ierr; 1232d9ca1df4SBarry Smith const PetscInt *aj=a->j,*col; 123349b5e25fSSatish Balay 123449b5e25fSSatish Balay PetscFunctionBegin; 123549b5e25fSSatish Balay if (type == NORM_FROBENIUS) { 123649b5e25fSSatish Balay for (k=0; k<mbs; k++) { 123749b5e25fSSatish Balay jmin = a->i[k]; jmax = a->i[k+1]; 1238831a3094SHong Zhang col = aj + jmin; 1239831a3094SHong Zhang if (*col == k) { /* diagonal block */ 124049b5e25fSSatish Balay for (i=0; i<bs2; i++) { 124149b5e25fSSatish Balay sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++; 124249b5e25fSSatish Balay } 1243831a3094SHong Zhang jmin++; 1244831a3094SHong Zhang } 1245831a3094SHong Zhang for (j=jmin; j<jmax; j++) { /* off-diagonal blocks */ 124649b5e25fSSatish Balay for (i=0; i<bs2; i++) { 124749b5e25fSSatish Balay sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++; 124849b5e25fSSatish Balay } 124949b5e25fSSatish Balay } 125049b5e25fSSatish Balay } 12518f1a2a5eSBarry Smith *norm = PetscSqrtReal(sum_diag + 2*sum_off); 125251f70360SJed Brown ierr = PetscLogFlops(2*bs2*a->nz);CHKERRQ(ierr); 12530b8dc8d2SHong Zhang } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 1254dcca6d9dSJed Brown ierr = PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);CHKERRQ(ierr); 12550b8dc8d2SHong Zhang for (i=0; i<mbs; i++) jl[i] = mbs; 12560b8dc8d2SHong Zhang il[0] = 0; 125749b5e25fSSatish Balay 125849b5e25fSSatish Balay *norm = 0.0; 125949b5e25fSSatish Balay for (k=0; k<mbs; k++) { /* k_th block row */ 126049b5e25fSSatish Balay for (j=0; j<bs; j++) sum[j]=0.0; 126149b5e25fSSatish Balay /*-- col sum --*/ 126249b5e25fSSatish Balay i = jl[k]; /* first |A(i,k)| to be added */ 1263831a3094SHong Zhang /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1264831a3094SHong Zhang at step k */ 126549b5e25fSSatish Balay while (i<mbs) { 126649b5e25fSSatish Balay nexti = jl[i]; /* next block row to be added */ 126749b5e25fSSatish Balay ik = il[i]; /* block index of A(i,k) in the array a */ 126849b5e25fSSatish Balay for (j=0; j<bs; j++) { 126949b5e25fSSatish Balay v = a->a + ik*bs2 + j*bs; 127049b5e25fSSatish Balay for (k1=0; k1<bs; k1++) { 127149b5e25fSSatish Balay sum[j] += PetscAbsScalar(*v); v++; 127249b5e25fSSatish Balay } 127349b5e25fSSatish Balay } 127449b5e25fSSatish Balay /* update il, jl */ 1275831a3094SHong Zhang jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1276831a3094SHong Zhang jmax = a->i[i+1]; 127749b5e25fSSatish Balay if (jmin < jmax) { 127849b5e25fSSatish Balay il[i] = jmin; 127949b5e25fSSatish Balay j = a->j[jmin]; 128049b5e25fSSatish Balay jl[i] = jl[j]; jl[j]=i; 128149b5e25fSSatish Balay } 128249b5e25fSSatish Balay i = nexti; 128349b5e25fSSatish Balay } 128449b5e25fSSatish Balay /*-- row sum --*/ 128549b5e25fSSatish Balay jmin = a->i[k]; jmax = a->i[k+1]; 128649b5e25fSSatish Balay for (i=jmin; i<jmax; i++) { 128749b5e25fSSatish Balay for (j=0; j<bs; j++) { 128849b5e25fSSatish Balay v = a->a + i*bs2 + j; 128949b5e25fSSatish Balay for (k1=0; k1<bs; k1++) { 12900b8dc8d2SHong Zhang sum[j] += PetscAbsScalar(*v); v += bs; 129149b5e25fSSatish Balay } 129249b5e25fSSatish Balay } 129349b5e25fSSatish Balay } 129449b5e25fSSatish Balay /* add k_th block row to il, jl */ 1295831a3094SHong Zhang col = aj+jmin; 1296831a3094SHong Zhang if (*col == k) jmin++; 129749b5e25fSSatish Balay if (jmin < jmax) { 129849b5e25fSSatish Balay il[k] = jmin; 12990b8dc8d2SHong Zhang j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k; 130049b5e25fSSatish Balay } 130149b5e25fSSatish Balay for (j=0; j<bs; j++) { 130249b5e25fSSatish Balay if (sum[j] > *norm) *norm = sum[j]; 130349b5e25fSSatish Balay } 130449b5e25fSSatish Balay } 130574ed9c26SBarry Smith ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr); 130651f70360SJed Brown ierr = PetscLogFlops(PetscMax(mbs*a->nz-1,0));CHKERRQ(ierr); 1307f23aa3ddSBarry Smith } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 130849b5e25fSSatish Balay PetscFunctionReturn(0); 130949b5e25fSSatish Balay } 131049b5e25fSSatish Balay 13114a2ae208SSatish Balay #undef __FUNCT__ 13124a2ae208SSatish Balay #define __FUNCT__ "MatEqual_SeqSBAIJ" 1313ace3abfcSBarry Smith PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg) 131449b5e25fSSatish Balay { 131549b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data; 1316dfbe8321SBarry Smith PetscErrorCode ierr; 131749b5e25fSSatish Balay 131849b5e25fSSatish Balay PetscFunctionBegin; 131949b5e25fSSatish Balay /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1320d0f46423SBarry Smith if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1321ef511fbeSHong Zhang *flg = PETSC_FALSE; 1322ef511fbeSHong Zhang PetscFunctionReturn(0); 132349b5e25fSSatish Balay } 132449b5e25fSSatish Balay 132549b5e25fSSatish Balay /* if the a->i are the same */ 132613f74950SBarry Smith ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 132726fbe8dcSKarl Rupp if (!*flg) PetscFunctionReturn(0); 132849b5e25fSSatish Balay 132949b5e25fSSatish Balay /* if a->j are the same */ 133013f74950SBarry Smith ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 133126fbe8dcSKarl Rupp if (!*flg) PetscFunctionReturn(0); 133226fbe8dcSKarl Rupp 133349b5e25fSSatish Balay /* if a->a are the same */ 1334d0f46423SBarry Smith ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1335935af2e7SHong Zhang PetscFunctionReturn(0); 133649b5e25fSSatish Balay } 133749b5e25fSSatish Balay 13384a2ae208SSatish Balay #undef __FUNCT__ 13394a2ae208SSatish Balay #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ" 1340dfbe8321SBarry Smith PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v) 134149b5e25fSSatish Balay { 134249b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1343dfbe8321SBarry Smith PetscErrorCode ierr; 1344d9ca1df4SBarry Smith PetscInt i,j,k,row,bs,ambs,bs2; 1345d9ca1df4SBarry Smith const PetscInt *ai,*aj; 134687828ca2SBarry Smith PetscScalar *x,zero = 0.0; 1347d9ca1df4SBarry Smith const MatScalar *aa,*aa_j; 134849b5e25fSSatish Balay 134949b5e25fSSatish Balay PetscFunctionBegin; 1350d0f46423SBarry Smith bs = A->rmap->bs; 1351e32f2f54SBarry Smith if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1"); 135282799104SHong Zhang 135349b5e25fSSatish Balay aa = a->a; 13548a0c6efdSHong Zhang ambs = a->mbs; 13558a0c6efdSHong Zhang 13568a0c6efdSHong Zhang if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) { 13578a0c6efdSHong Zhang PetscInt *diag=a->diag; 13588a0c6efdSHong Zhang aa = a->a; 13598a0c6efdSHong Zhang ambs = a->mbs; 13608a0c6efdSHong Zhang ierr = VecGetArray(v,&x);CHKERRQ(ierr); 13618a0c6efdSHong Zhang for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]]; 13628a0c6efdSHong Zhang ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 13638a0c6efdSHong Zhang PetscFunctionReturn(0); 13648a0c6efdSHong Zhang } 13658a0c6efdSHong Zhang 136649b5e25fSSatish Balay ai = a->i; 136749b5e25fSSatish Balay aj = a->j; 136849b5e25fSSatish Balay bs2 = a->bs2; 13692dcb1b2aSMatthew Knepley ierr = VecSet(v,zero);CHKERRQ(ierr); 13701ebc52fbSHong Zhang ierr = VecGetArray(v,&x);CHKERRQ(ierr); 137149b5e25fSSatish Balay for (i=0; i<ambs; i++) { 137249b5e25fSSatish Balay j=ai[i]; 137349b5e25fSSatish Balay if (aj[j] == i) { /* if this is a diagonal element */ 137449b5e25fSSatish Balay row = i*bs; 137549b5e25fSSatish Balay aa_j = aa + j*bs2; 137649b5e25fSSatish Balay for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 137749b5e25fSSatish Balay } 137849b5e25fSSatish Balay } 13791ebc52fbSHong Zhang ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 138049b5e25fSSatish Balay PetscFunctionReturn(0); 138149b5e25fSSatish Balay } 138249b5e25fSSatish Balay 13834a2ae208SSatish Balay #undef __FUNCT__ 13844a2ae208SSatish Balay #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ" 1385dfbe8321SBarry Smith PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr) 138649b5e25fSSatish Balay { 138749b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1388d9ca1df4SBarry Smith PetscScalar x; 1389d9ca1df4SBarry Smith const PetscScalar *l,*li,*ri; 139049b5e25fSSatish Balay MatScalar *aa,*v; 1391dfbe8321SBarry Smith PetscErrorCode ierr; 13925e90f9d9SHong Zhang PetscInt i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2; 1393ace3abfcSBarry Smith PetscBool flg; 139449b5e25fSSatish Balay 139549b5e25fSSatish Balay PetscFunctionBegin; 1396b3bf805bSHong Zhang if (ll != rr) { 1397b3bf805bSHong Zhang ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr); 1398e7e72b3dSBarry Smith if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n"); 1399b3bf805bSHong Zhang } 1400b3bf805bSHong Zhang if (!ll) PetscFunctionReturn(0); 140149b5e25fSSatish Balay ai = a->i; 140249b5e25fSSatish Balay aj = a->j; 140349b5e25fSSatish Balay aa = a->a; 1404d0f46423SBarry Smith m = A->rmap->N; 1405d0f46423SBarry Smith bs = A->rmap->bs; 140649b5e25fSSatish Balay mbs = a->mbs; 140749b5e25fSSatish Balay bs2 = a->bs2; 140849b5e25fSSatish Balay 1409d9ca1df4SBarry Smith ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr); 141049b5e25fSSatish Balay ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1411e32f2f54SBarry Smith if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 141249b5e25fSSatish Balay for (i=0; i<mbs; i++) { /* for each block row */ 141349b5e25fSSatish Balay M = ai[i+1] - ai[i]; 141449b5e25fSSatish Balay li = l + i*bs; 141549b5e25fSSatish Balay v = aa + bs2*ai[i]; 141649b5e25fSSatish Balay for (j=0; j<M; j++) { /* for each block */ 141749b5e25fSSatish Balay ri = l + bs*aj[ai[i]+j]; 14185e90f9d9SHong Zhang for (k=0; k<bs; k++) { 141949b5e25fSSatish Balay x = ri[k]; 142049b5e25fSSatish Balay for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x; 142149b5e25fSSatish Balay } 142249b5e25fSSatish Balay } 142349b5e25fSSatish Balay } 1424d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr); 1425dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 142649b5e25fSSatish Balay PetscFunctionReturn(0); 142749b5e25fSSatish Balay } 142849b5e25fSSatish Balay 14294a2ae208SSatish Balay #undef __FUNCT__ 14304a2ae208SSatish Balay #define __FUNCT__ "MatGetInfo_SeqSBAIJ" 1431dfbe8321SBarry Smith PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info) 143249b5e25fSSatish Balay { 143349b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 143449b5e25fSSatish Balay 143549b5e25fSSatish Balay PetscFunctionBegin; 143649b5e25fSSatish Balay info->block_size = a->bs2; 1437ceed8ce5SJed Brown info->nz_allocated = a->bs2*a->maxnz; /*num. of nonzeros in upper triangular part */ 14386c6c5352SBarry Smith info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */ 143949b5e25fSSatish Balay info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 144049b5e25fSSatish Balay info->assemblies = A->num_ass; 14418e58a170SBarry Smith info->mallocs = A->info.mallocs; 14427adad957SLisandro Dalcin info->memory = ((PetscObject)A)->mem; 1443d5f3da31SBarry Smith if (A->factortype) { 144449b5e25fSSatish Balay info->fill_ratio_given = A->info.fill_ratio_given; 144549b5e25fSSatish Balay info->fill_ratio_needed = A->info.fill_ratio_needed; 144649b5e25fSSatish Balay info->factor_mallocs = A->info.factor_mallocs; 144749b5e25fSSatish Balay } else { 144849b5e25fSSatish Balay info->fill_ratio_given = 0; 144949b5e25fSSatish Balay info->fill_ratio_needed = 0; 145049b5e25fSSatish Balay info->factor_mallocs = 0; 145149b5e25fSSatish Balay } 145249b5e25fSSatish Balay PetscFunctionReturn(0); 145349b5e25fSSatish Balay } 145449b5e25fSSatish Balay 145549b5e25fSSatish Balay 14564a2ae208SSatish Balay #undef __FUNCT__ 14574a2ae208SSatish Balay #define __FUNCT__ "MatZeroEntries_SeqSBAIJ" 1458dfbe8321SBarry Smith PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 145949b5e25fSSatish Balay { 146049b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1461dfbe8321SBarry Smith PetscErrorCode ierr; 146249b5e25fSSatish Balay 146349b5e25fSSatish Balay PetscFunctionBegin; 146449b5e25fSSatish Balay ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 146549b5e25fSSatish Balay PetscFunctionReturn(0); 146649b5e25fSSatish Balay } 1467dc354874SHong Zhang 14684a2ae208SSatish Balay #undef __FUNCT__ 1469985db425SBarry Smith #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ" 1470985db425SBarry Smith /* 1471985db425SBarry Smith This code does not work since it only checks the upper triangular part of 1472985db425SBarry Smith the matrix. Hence it is not listed in the function table. 1473985db425SBarry Smith */ 1474985db425SBarry Smith PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[]) 1475dc354874SHong Zhang { 1476dc354874SHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1477dfbe8321SBarry Smith PetscErrorCode ierr; 1478d9ca1df4SBarry Smith PetscInt i,j,n,row,col,bs,mbs; 1479d9ca1df4SBarry Smith const PetscInt *ai,*aj; 1480c3fca9a7SHong Zhang PetscReal atmp; 1481d9ca1df4SBarry Smith const MatScalar *aa; 1482985db425SBarry Smith PetscScalar *x; 148313f74950SBarry Smith PetscInt ncols,brow,bcol,krow,kcol; 1484dc354874SHong Zhang 1485dc354874SHong Zhang PetscFunctionBegin; 1486e32f2f54SBarry Smith if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov"); 1487e32f2f54SBarry Smith if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1488d0f46423SBarry Smith bs = A->rmap->bs; 1489dc354874SHong Zhang aa = a->a; 1490dc354874SHong Zhang ai = a->i; 1491dc354874SHong Zhang aj = a->j; 149244117c81SHong Zhang mbs = a->mbs; 1493dc354874SHong Zhang 1494985db425SBarry Smith ierr = VecSet(v,0.0);CHKERRQ(ierr); 14951ebc52fbSHong Zhang ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1496dc354874SHong Zhang ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1497e32f2f54SBarry Smith if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 149844117c81SHong Zhang for (i=0; i<mbs; i++) { 1499d0f6400bSHong Zhang ncols = ai[1] - ai[0]; ai++; 1500d0f6400bSHong Zhang brow = bs*i; 150144117c81SHong Zhang for (j=0; j<ncols; j++) { 1502d0f6400bSHong Zhang bcol = bs*(*aj); 150344117c81SHong Zhang for (kcol=0; kcol<bs; kcol++) { 1504d0f6400bSHong Zhang col = bcol + kcol; /* col index */ 150544117c81SHong Zhang for (krow=0; krow<bs; krow++) { 1506d0f6400bSHong Zhang atmp = PetscAbsScalar(*aa); aa++; 1507d0f6400bSHong Zhang row = brow + krow; /* row index */ 1508c3fca9a7SHong Zhang if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1509c3fca9a7SHong Zhang if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 151044117c81SHong Zhang } 151144117c81SHong Zhang } 1512d0f6400bSHong Zhang aj++; 1513dc354874SHong Zhang } 1514dc354874SHong Zhang } 15151ebc52fbSHong Zhang ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1516dc354874SHong Zhang PetscFunctionReturn(0); 1517dc354874SHong Zhang } 1518