149b5e25fSSatish Balay 2c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h> 3af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h> 4c6db04a5SJed Brown #include <petscbt.h> 5c6db04a5SJed Brown #include <../src/mat/impls/sbaij/seq/sbaij.h> 6c6db04a5SJed Brown #include <petscblaslapack.h> 749b5e25fSSatish Balay 813f74950SBarry Smith PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 949b5e25fSSatish Balay { 105eee224dSHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 116849ba73SBarry Smith PetscErrorCode ierr; 125d0c19d7SBarry Smith PetscInt brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2; 135d0c19d7SBarry Smith const PetscInt *idx; 14db41cccfSHong Zhang PetscBT table_out,table_in; 15d94109b8SHong Zhang 16d94109b8SHong Zhang PetscFunctionBegin; 17e32f2f54SBarry Smith if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 18d94109b8SHong Zhang mbs = a->mbs; 19d94109b8SHong Zhang ai = a->i; 20d94109b8SHong Zhang aj = a->j; 21d0f46423SBarry Smith bs = A->rmap->bs; 2253b8de81SBarry Smith ierr = PetscBTCreate(mbs,&table_out);CHKERRQ(ierr); 23854ce69bSBarry Smith ierr = PetscMalloc1(mbs+1,&nidx);CHKERRQ(ierr); 24854ce69bSBarry Smith ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr); 2553b8de81SBarry Smith ierr = PetscBTCreate(mbs,&table_in);CHKERRQ(ierr); 26d94109b8SHong Zhang 27d94109b8SHong Zhang for (i=0; i<is_max; i++) { /* for each is */ 28d94109b8SHong Zhang isz = 0; 29db41cccfSHong Zhang ierr = PetscBTMemzero(mbs,table_out);CHKERRQ(ierr); 30d94109b8SHong Zhang 31d94109b8SHong Zhang /* Extract the indices, assume there can be duplicate entries */ 32d94109b8SHong Zhang ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 33d94109b8SHong Zhang ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 34d94109b8SHong Zhang 35db41cccfSHong Zhang /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */ 36dbe03f88SHong Zhang bcol_max = 0; 37d94109b8SHong Zhang for (j=0; j<n; ++j) { 38d94109b8SHong Zhang brow = idx[j]/bs; /* convert the indices into block indices */ 39e32f2f54SBarry Smith if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 40db41cccfSHong Zhang if (!PetscBTLookupSet(table_out,brow)) { 41dbe03f88SHong Zhang nidx[isz++] = brow; 42dbe03f88SHong Zhang if (bcol_max < brow) bcol_max = brow; 43dbe03f88SHong Zhang } 44d94109b8SHong Zhang } 45d94109b8SHong Zhang ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 466bf464f9SBarry Smith ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 47d94109b8SHong Zhang 48d94109b8SHong Zhang k = 0; 49d94109b8SHong Zhang for (j=0; j<ov; j++) { /* for each overlap */ 50db41cccfSHong Zhang /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 51db41cccfSHong Zhang ierr = PetscBTMemzero(mbs,table_in);CHKERRQ(ierr); 52db41cccfSHong Zhang for (l=k; l<isz; l++) { ierr = PetscBTSet(table_in,nidx[l]);CHKERRQ(ierr); } 53d94109b8SHong Zhang 54d94109b8SHong Zhang n = isz; /* length of the updated is[i] */ 55d94109b8SHong Zhang for (brow=0; brow<mbs; brow++) { 56d94109b8SHong Zhang start = ai[brow]; end = ai[brow+1]; 57db41cccfSHong Zhang if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */ 58d94109b8SHong Zhang for (l = start; l<end; l++) { 59d94109b8SHong Zhang bcol = aj[l]; 60d7b97159SHong Zhang if (!PetscBTLookupSet(table_out,bcol)) { 61d7b97159SHong Zhang nidx[isz++] = bcol; 62d7b97159SHong Zhang if (bcol_max < bcol) bcol_max = bcol; 63d7b97159SHong Zhang } 64d94109b8SHong Zhang } 65d94109b8SHong Zhang k++; 66d94109b8SHong Zhang if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 67d94109b8SHong Zhang } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 68d94109b8SHong Zhang for (l = start; l<end; l++) { 69d94109b8SHong Zhang bcol = aj[l]; 70dbe03f88SHong Zhang if (bcol > bcol_max) break; 71db41cccfSHong Zhang if (PetscBTLookup(table_in,bcol)) { 7226fbe8dcSKarl Rupp if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow; 73d94109b8SHong Zhang break; /* for l = start; l<end ; l++) */ 74d94109b8SHong Zhang } 75d94109b8SHong Zhang } 76d94109b8SHong Zhang } 77d94109b8SHong Zhang } 78d94109b8SHong Zhang } /* for each overlap */ 79d94109b8SHong Zhang 80d94109b8SHong Zhang /* expand the Index Set */ 81d94109b8SHong Zhang for (j=0; j<isz; j++) { 8226fbe8dcSKarl Rupp for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k; 83d94109b8SHong Zhang } 8470b3c8c7SBarry Smith ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 85d94109b8SHong Zhang } 8694bacf5dSBarry Smith ierr = PetscBTDestroy(&table_out);CHKERRQ(ierr); 87d94109b8SHong Zhang ierr = PetscFree(nidx);CHKERRQ(ierr); 88d94109b8SHong Zhang ierr = PetscFree(nidx2);CHKERRQ(ierr); 8994bacf5dSBarry Smith ierr = PetscBTDestroy(&table_in);CHKERRQ(ierr); 905eee224dSHong Zhang PetscFunctionReturn(0); 9149b5e25fSSatish Balay } 9249b5e25fSSatish Balay 93847daeccSHong Zhang /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc. 94847daeccSHong Zhang Zero some ops' to avoid invalid usse */ 95847daeccSHong Zhang PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq) 9649b5e25fSSatish Balay { 976849ba73SBarry Smith PetscErrorCode ierr; 9849b5e25fSSatish Balay 9949b5e25fSSatish Balay PetscFunctionBegin; 100847daeccSHong Zhang ierr = MatSetOption(Bseq,MAT_SYMMETRIC,PETSC_FALSE);CHKERRQ(ierr); 101847daeccSHong Zhang Bseq->ops->mult = 0; 102847daeccSHong Zhang Bseq->ops->multadd = 0; 103847daeccSHong Zhang Bseq->ops->multtranspose = 0; 104847daeccSHong Zhang Bseq->ops->multtransposeadd = 0; 105847daeccSHong Zhang Bseq->ops->lufactor = 0; 106847daeccSHong Zhang Bseq->ops->choleskyfactor = 0; 107847daeccSHong Zhang Bseq->ops->lufactorsymbolic = 0; 108847daeccSHong Zhang Bseq->ops->choleskyfactorsymbolic = 0; 109847daeccSHong Zhang Bseq->ops->getinertia = 0; 11049b5e25fSSatish Balay PetscFunctionReturn(0); 11149b5e25fSSatish Balay } 11249b5e25fSSatish Balay 1137dae84e0SHong Zhang /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */ 1147dae84e0SHong Zhang PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 115e50a8114SHong Zhang { 116e50a8114SHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c; 117e50a8114SHong Zhang PetscErrorCode ierr; 118e50a8114SHong Zhang PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 119e50a8114SHong Zhang PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 120e50a8114SHong Zhang const PetscInt *irow,*icol; 121e50a8114SHong Zhang PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 122e50a8114SHong Zhang PetscInt *aj = a->j,*ai = a->i; 123e50a8114SHong Zhang MatScalar *mat_a; 124e50a8114SHong Zhang Mat C; 125e50a8114SHong Zhang PetscBool flag; 126e50a8114SHong Zhang 127e50a8114SHong Zhang PetscFunctionBegin; 128e50a8114SHong Zhang 129e50a8114SHong Zhang ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 130e50a8114SHong Zhang ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 131e50a8114SHong Zhang ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 132e50a8114SHong Zhang ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 133e50a8114SHong Zhang 134e50a8114SHong Zhang ierr = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr); 135e50a8114SHong Zhang ssmap = smap; 136e50a8114SHong Zhang ierr = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr); 137e50a8114SHong Zhang for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 138e50a8114SHong Zhang /* determine lens of each row */ 139e50a8114SHong Zhang for (i=0; i<nrows; i++) { 140e50a8114SHong Zhang kstart = ai[irow[i]]; 141e50a8114SHong Zhang kend = kstart + a->ilen[irow[i]]; 142e50a8114SHong Zhang lens[i] = 0; 143e50a8114SHong Zhang for (k=kstart; k<kend; k++) { 144e50a8114SHong Zhang if (ssmap[aj[k]]) lens[i]++; 145e50a8114SHong Zhang } 146e50a8114SHong Zhang } 147e50a8114SHong Zhang /* Create and fill new matrix */ 148e50a8114SHong Zhang if (scall == MAT_REUSE_MATRIX) { 149e50a8114SHong Zhang c = (Mat_SeqSBAIJ*)((*B)->data); 150e50a8114SHong Zhang 151e50a8114SHong Zhang if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 152e50a8114SHong Zhang ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 153e50a8114SHong Zhang if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 154e50a8114SHong Zhang ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 155e50a8114SHong Zhang C = *B; 156e50a8114SHong Zhang } else { 157e50a8114SHong Zhang ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr); 158e50a8114SHong Zhang ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 159e50a8114SHong Zhang ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 160367daffbSBarry Smith ierr = MatSeqSBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr); 161e50a8114SHong Zhang } 162e50a8114SHong Zhang c = (Mat_SeqSBAIJ*)(C->data); 163e50a8114SHong Zhang for (i=0; i<nrows; i++) { 164e50a8114SHong Zhang row = irow[i]; 165e50a8114SHong Zhang kstart = ai[row]; 166e50a8114SHong Zhang kend = kstart + a->ilen[row]; 167e50a8114SHong Zhang mat_i = c->i[i]; 168e50a8114SHong Zhang mat_j = c->j + mat_i; 169e50a8114SHong Zhang mat_a = c->a + mat_i*bs2; 170e50a8114SHong Zhang mat_ilen = c->ilen + i; 171e50a8114SHong Zhang for (k=kstart; k<kend; k++) { 172e50a8114SHong Zhang if ((tcol=ssmap[a->j[k]])) { 173e50a8114SHong Zhang *mat_j++ = tcol - 1; 174e50a8114SHong Zhang ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 175e50a8114SHong Zhang mat_a += bs2; 176e50a8114SHong Zhang (*mat_ilen)++; 177e50a8114SHong Zhang } 178e50a8114SHong Zhang } 179e50a8114SHong Zhang } 180e50a8114SHong Zhang /* sort */ 181e50a8114SHong Zhang { 182e50a8114SHong Zhang MatScalar *work; 183e50a8114SHong Zhang 184e50a8114SHong Zhang ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr); 185e50a8114SHong Zhang for (i=0; i<nrows; i++) { 186e50a8114SHong Zhang PetscInt ilen; 187e50a8114SHong Zhang mat_i = c->i[i]; 188e50a8114SHong Zhang mat_j = c->j + mat_i; 189e50a8114SHong Zhang mat_a = c->a + mat_i*bs2; 190e50a8114SHong Zhang ilen = c->ilen[i]; 191e50a8114SHong Zhang ierr = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr); 192e50a8114SHong Zhang } 193e50a8114SHong Zhang ierr = PetscFree(work);CHKERRQ(ierr); 194e50a8114SHong Zhang } 195e50a8114SHong Zhang 196e50a8114SHong Zhang /* Free work space */ 197e50a8114SHong Zhang ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 198e50a8114SHong Zhang ierr = PetscFree(smap);CHKERRQ(ierr); 199e50a8114SHong Zhang ierr = PetscFree(lens);CHKERRQ(ierr); 200e50a8114SHong Zhang ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 201e50a8114SHong Zhang ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 202e50a8114SHong Zhang 203e50a8114SHong Zhang ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 204e50a8114SHong Zhang *B = C; 205e50a8114SHong Zhang PetscFunctionReturn(0); 206e50a8114SHong Zhang } 207e50a8114SHong Zhang 2087dae84e0SHong Zhang PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 20949b5e25fSSatish Balay { 210e50a8114SHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 211e50a8114SHong Zhang IS is1,is2; 2126849ba73SBarry Smith PetscErrorCode ierr; 213e50a8114SHong Zhang PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs; 214e50a8114SHong Zhang const PetscInt *irow,*icol; 21549b5e25fSSatish Balay 21649b5e25fSSatish Balay PetscFunctionBegin; 217e50a8114SHong Zhang ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 218e50a8114SHong Zhang ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 219e50a8114SHong Zhang ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 220e50a8114SHong Zhang ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 221e50a8114SHong Zhang 222e50a8114SHong Zhang /* Verify if the indices corespond to each element in a block 223e50a8114SHong Zhang and form the IS with compressed IS */ 224e50a8114SHong Zhang maxmnbs = PetscMax(a->mbs,a->nbs); 225e50a8114SHong Zhang ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr); 226e50a8114SHong Zhang ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr); 227e50a8114SHong Zhang for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 228e50a8114SHong Zhang for (i=0; i<a->mbs; i++) { 229e50a8114SHong Zhang if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 230e50a8114SHong Zhang } 231e50a8114SHong Zhang count = 0; 232e50a8114SHong Zhang for (i=0; i<nrows; i++) { 233e50a8114SHong Zhang PetscInt j = irow[i] / bs; 234e50a8114SHong Zhang if ((vary[j]--)==bs) iary[count++] = j; 235e50a8114SHong Zhang } 236e50a8114SHong Zhang ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr); 237e50a8114SHong Zhang 238e50a8114SHong Zhang ierr = PetscMemzero(vary,(a->nbs)*sizeof(PetscInt));CHKERRQ(ierr); 239e50a8114SHong Zhang for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 240e50a8114SHong Zhang for (i=0; i<a->nbs; i++) { 241e50a8114SHong Zhang if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc"); 242e50a8114SHong Zhang } 243e50a8114SHong Zhang count = 0; 244e50a8114SHong Zhang for (i=0; i<ncols; i++) { 245e50a8114SHong Zhang PetscInt j = icol[i] / bs; 246e50a8114SHong Zhang if ((vary[j]--)==bs) iary[count++] = j; 247e50a8114SHong Zhang } 248e50a8114SHong Zhang ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr); 249e50a8114SHong Zhang ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 250e50a8114SHong Zhang ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 251e50a8114SHong Zhang ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 252e50a8114SHong Zhang 2537dae84e0SHong Zhang ierr = MatCreateSubMatrix_SeqSBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr); 254e50a8114SHong Zhang ierr = ISDestroy(&is1);CHKERRQ(ierr); 255e50a8114SHong Zhang ierr = ISDestroy(&is2);CHKERRQ(ierr); 256847daeccSHong Zhang 2578f46ffcaSHong Zhang if (isrow != iscol) { 2588f46ffcaSHong Zhang PetscBool isequal; 2598f46ffcaSHong Zhang ierr = ISEqual(isrow,iscol,&isequal);CHKERRQ(ierr); 260847daeccSHong Zhang if (!isequal) { 261847daeccSHong Zhang ierr = MatSeqSBAIJZeroOps_Private(*B);CHKERRQ(ierr); 2628f46ffcaSHong Zhang } 26349b5e25fSSatish Balay } 26449b5e25fSSatish Balay PetscFunctionReturn(0); 26549b5e25fSSatish Balay } 26649b5e25fSSatish Balay 2677dae84e0SHong Zhang PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 26849b5e25fSSatish Balay { 2696849ba73SBarry Smith PetscErrorCode ierr; 27013f74950SBarry Smith PetscInt i; 27149b5e25fSSatish Balay 27249b5e25fSSatish Balay PetscFunctionBegin; 273e50a8114SHong Zhang if (scall == MAT_INITIAL_MATRIX) { 274df750dc8SHong Zhang ierr = PetscCalloc1(n+1,B);CHKERRQ(ierr); 275afebec48SHong Zhang } 276e50a8114SHong Zhang 277e50a8114SHong Zhang for (i=0; i<n; i++) { 2787dae84e0SHong Zhang ierr = MatCreateSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 27949b5e25fSSatish Balay } 28049b5e25fSSatish Balay PetscFunctionReturn(0); 28149b5e25fSSatish Balay } 28249b5e25fSSatish Balay 28349b5e25fSSatish Balay /* -------------------------------------------------------*/ 28449b5e25fSSatish Balay /* Should check that shapes of vectors and matrices match */ 28549b5e25fSSatish Balay /* -------------------------------------------------------*/ 28649b5e25fSSatish Balay 287dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz) 28849b5e25fSSatish Balay { 28949b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 290d9ca1df4SBarry Smith PetscScalar *z,x1,x2,zero=0.0; 291d9ca1df4SBarry Smith const PetscScalar *x,*xb; 292d9ca1df4SBarry Smith const MatScalar *v; 2936849ba73SBarry Smith PetscErrorCode ierr; 294d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 295d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 29698c9bda7SSatish Balay PetscInt nonzerorow=0; 29749b5e25fSSatish Balay 29849b5e25fSSatish Balay PetscFunctionBegin; 2992dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 300d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 3011ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 30249b5e25fSSatish Balay 30349b5e25fSSatish Balay v = a->a; 30449b5e25fSSatish Balay xb = x; 30549b5e25fSSatish Balay 30649b5e25fSSatish Balay for (i=0; i<mbs; i++) { 30749b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 30849b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; 30949b5e25fSSatish Balay ib = aj + *ai; 310831a3094SHong Zhang jmin = 0; 31198c9bda7SSatish Balay nonzerorow += (n>0); 3127fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 31349b5e25fSSatish Balay z[2*i] += v[0]*x1 + v[2]*x2; 31449b5e25fSSatish Balay z[2*i+1] += v[2]*x1 + v[3]*x2; 315831a3094SHong Zhang v += 4; jmin++; 3167fbae186SHong Zhang } 317444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 318444d8c10SJed Brown PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 319831a3094SHong Zhang for (j=jmin; j<n; j++) { 32049b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 32149b5e25fSSatish Balay cval = ib[j]*2; 32249b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2; 32349b5e25fSSatish Balay z[cval+1] += v[2]*x1 + v[3]*x2; 32449b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 32549b5e25fSSatish Balay z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 32649b5e25fSSatish Balay z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 32749b5e25fSSatish Balay v += 4; 32849b5e25fSSatish Balay } 32949b5e25fSSatish Balay xb +=2; ai++; 33049b5e25fSSatish Balay } 33149b5e25fSSatish Balay 332d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 3331ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 334dc0b31edSSatish Balay ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 33549b5e25fSSatish Balay PetscFunctionReturn(0); 33649b5e25fSSatish Balay } 33749b5e25fSSatish Balay 338dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz) 33949b5e25fSSatish Balay { 34049b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 341d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,zero=0.0; 342d9ca1df4SBarry Smith const PetscScalar *x,*xb; 343d9ca1df4SBarry Smith const MatScalar *v; 3446849ba73SBarry Smith PetscErrorCode ierr; 345d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 346d9ca1df4SBarry Smith const PetscInt *aj = a->j,*ai = a->i,*ib; 34798c9bda7SSatish Balay PetscInt nonzerorow=0; 34849b5e25fSSatish Balay 34949b5e25fSSatish Balay PetscFunctionBegin; 3502dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 351d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 3521ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 35349b5e25fSSatish Balay 35449b5e25fSSatish Balay v = a->a; 35549b5e25fSSatish Balay xb = x; 35649b5e25fSSatish Balay 35749b5e25fSSatish Balay for (i=0; i<mbs; i++) { 35849b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 35949b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 36049b5e25fSSatish Balay ib = aj + *ai; 361831a3094SHong Zhang jmin = 0; 36298c9bda7SSatish Balay nonzerorow += (n>0); 3637fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 36449b5e25fSSatish Balay z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 36549b5e25fSSatish Balay z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 36649b5e25fSSatish Balay z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 367831a3094SHong Zhang v += 9; jmin++; 3687fbae186SHong Zhang } 369444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 370444d8c10SJed Brown PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 371831a3094SHong Zhang for (j=jmin; j<n; j++) { 37249b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 37349b5e25fSSatish Balay cval = ib[j]*3; 37449b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 37549b5e25fSSatish Balay z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 37649b5e25fSSatish Balay z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 37749b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 37849b5e25fSSatish Balay z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 37949b5e25fSSatish Balay z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 38049b5e25fSSatish Balay z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 38149b5e25fSSatish Balay v += 9; 38249b5e25fSSatish Balay } 38349b5e25fSSatish Balay xb +=3; ai++; 38449b5e25fSSatish Balay } 38549b5e25fSSatish Balay 386d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 3871ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 388dc0b31edSSatish Balay ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 38949b5e25fSSatish Balay PetscFunctionReturn(0); 39049b5e25fSSatish Balay } 39149b5e25fSSatish Balay 392dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz) 39349b5e25fSSatish Balay { 39449b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 395d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,zero=0.0; 396d9ca1df4SBarry Smith const PetscScalar *x,*xb; 397d9ca1df4SBarry Smith const MatScalar *v; 3986849ba73SBarry Smith PetscErrorCode ierr; 399d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 400d9ca1df4SBarry Smith const PetscInt *aj = a->j,*ai = a->i,*ib; 40198c9bda7SSatish Balay PetscInt nonzerorow = 0; 40249b5e25fSSatish Balay 40349b5e25fSSatish Balay PetscFunctionBegin; 4042dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 405d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 4061ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 40749b5e25fSSatish Balay 40849b5e25fSSatish Balay v = a->a; 40949b5e25fSSatish Balay xb = x; 41049b5e25fSSatish Balay 41149b5e25fSSatish Balay for (i=0; i<mbs; i++) { 41249b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 41349b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 41449b5e25fSSatish Balay ib = aj + *ai; 415831a3094SHong Zhang jmin = 0; 41698c9bda7SSatish Balay nonzerorow += (n>0); 4177fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 41849b5e25fSSatish Balay z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 41949b5e25fSSatish Balay z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 42049b5e25fSSatish Balay z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 42149b5e25fSSatish Balay z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 422831a3094SHong Zhang v += 16; jmin++; 4237fbae186SHong Zhang } 424444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 425444d8c10SJed Brown PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 426831a3094SHong Zhang for (j=jmin; j<n; j++) { 42749b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 42849b5e25fSSatish Balay cval = ib[j]*4; 42949b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 43049b5e25fSSatish Balay z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 43149b5e25fSSatish Balay z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 43249b5e25fSSatish Balay z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 43349b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 43449b5e25fSSatish Balay z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 43549b5e25fSSatish Balay z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 43649b5e25fSSatish Balay z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 43749b5e25fSSatish Balay z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 43849b5e25fSSatish Balay v += 16; 43949b5e25fSSatish Balay } 44049b5e25fSSatish Balay xb +=4; ai++; 44149b5e25fSSatish Balay } 44249b5e25fSSatish Balay 443d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 4441ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 445dc0b31edSSatish Balay ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 44649b5e25fSSatish Balay PetscFunctionReturn(0); 44749b5e25fSSatish Balay } 44849b5e25fSSatish Balay 449dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz) 45049b5e25fSSatish Balay { 45149b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 452d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,x5,zero=0.0; 453d9ca1df4SBarry Smith const PetscScalar *x,*xb; 454d9ca1df4SBarry Smith const MatScalar *v; 4556849ba73SBarry Smith PetscErrorCode ierr; 456d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 457d9ca1df4SBarry Smith const PetscInt *aj = a->j,*ai = a->i,*ib; 45898c9bda7SSatish Balay PetscInt nonzerorow=0; 45949b5e25fSSatish Balay 46049b5e25fSSatish Balay PetscFunctionBegin; 4612dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 462d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 4631ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 46449b5e25fSSatish Balay 46549b5e25fSSatish Balay v = a->a; 46649b5e25fSSatish Balay xb = x; 46749b5e25fSSatish Balay 46849b5e25fSSatish Balay for (i=0; i<mbs; i++) { 46949b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 47049b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 47149b5e25fSSatish Balay ib = aj + *ai; 472831a3094SHong Zhang jmin = 0; 47398c9bda7SSatish Balay nonzerorow += (n>0); 4747fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 47549b5e25fSSatish Balay z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 47649b5e25fSSatish Balay z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 47749b5e25fSSatish Balay z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 47849b5e25fSSatish Balay z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 47949b5e25fSSatish Balay z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 480831a3094SHong Zhang v += 25; jmin++; 4817fbae186SHong Zhang } 482444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 483444d8c10SJed Brown PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 484831a3094SHong Zhang for (j=jmin; j<n; j++) { 48549b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 48649b5e25fSSatish Balay cval = ib[j]*5; 48749b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 48849b5e25fSSatish Balay z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 48949b5e25fSSatish Balay z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 49049b5e25fSSatish Balay z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 49149b5e25fSSatish Balay z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 49249b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 49349b5e25fSSatish Balay z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 49449b5e25fSSatish Balay z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 49549b5e25fSSatish Balay z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 49649b5e25fSSatish Balay z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 49749b5e25fSSatish Balay z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 49849b5e25fSSatish Balay v += 25; 49949b5e25fSSatish Balay } 50049b5e25fSSatish Balay xb +=5; ai++; 50149b5e25fSSatish Balay } 50249b5e25fSSatish Balay 503d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 5041ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 505dc0b31edSSatish Balay ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 50649b5e25fSSatish Balay PetscFunctionReturn(0); 50749b5e25fSSatish Balay } 50849b5e25fSSatish Balay 50949b5e25fSSatish Balay 510dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz) 51149b5e25fSSatish Balay { 51249b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 513d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,x5,x6,zero=0.0; 514d9ca1df4SBarry Smith const PetscScalar *x,*xb; 515d9ca1df4SBarry Smith const MatScalar *v; 5166849ba73SBarry Smith PetscErrorCode ierr; 517d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 518d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 51998c9bda7SSatish Balay PetscInt nonzerorow=0; 52049b5e25fSSatish Balay 52149b5e25fSSatish Balay PetscFunctionBegin; 5222dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 523d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 5241ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 52549b5e25fSSatish Balay 52649b5e25fSSatish Balay v = a->a; 52749b5e25fSSatish Balay xb = x; 52849b5e25fSSatish Balay 52949b5e25fSSatish Balay for (i=0; i<mbs; i++) { 53049b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 53149b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 53249b5e25fSSatish Balay ib = aj + *ai; 533831a3094SHong Zhang jmin = 0; 53498c9bda7SSatish Balay nonzerorow += (n>0); 5357fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 53649b5e25fSSatish Balay z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 53749b5e25fSSatish Balay z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 53849b5e25fSSatish Balay z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 53949b5e25fSSatish Balay z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 54049b5e25fSSatish Balay z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 54149b5e25fSSatish Balay z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 542831a3094SHong Zhang v += 36; jmin++; 5437fbae186SHong Zhang } 544444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 545444d8c10SJed Brown PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 546831a3094SHong Zhang for (j=jmin; j<n; j++) { 54749b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 54849b5e25fSSatish Balay cval = ib[j]*6; 54949b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 55049b5e25fSSatish Balay z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 55149b5e25fSSatish Balay z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 55249b5e25fSSatish Balay z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 55349b5e25fSSatish Balay z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 55449b5e25fSSatish Balay z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 55549b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 55649b5e25fSSatish Balay z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 55749b5e25fSSatish Balay z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 55849b5e25fSSatish Balay z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 55949b5e25fSSatish Balay z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 56049b5e25fSSatish Balay z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 56149b5e25fSSatish Balay z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 56249b5e25fSSatish Balay v += 36; 56349b5e25fSSatish Balay } 56449b5e25fSSatish Balay xb +=6; ai++; 56549b5e25fSSatish Balay } 56649b5e25fSSatish Balay 567d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 5681ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 569dc0b31edSSatish Balay ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 57049b5e25fSSatish Balay PetscFunctionReturn(0); 57149b5e25fSSatish Balay } 572dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz) 57349b5e25fSSatish Balay { 57449b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 575d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0; 576d9ca1df4SBarry Smith const PetscScalar *x,*xb; 577d9ca1df4SBarry Smith const MatScalar *v; 5786849ba73SBarry Smith PetscErrorCode ierr; 579d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 580d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 58198c9bda7SSatish Balay PetscInt nonzerorow=0; 58249b5e25fSSatish Balay 58349b5e25fSSatish Balay PetscFunctionBegin; 5842dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 585d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 5861ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 58749b5e25fSSatish Balay 58849b5e25fSSatish Balay v = a->a; 58949b5e25fSSatish Balay xb = x; 59049b5e25fSSatish Balay 59149b5e25fSSatish Balay for (i=0; i<mbs; i++) { 59249b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 59349b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 59449b5e25fSSatish Balay ib = aj + *ai; 595831a3094SHong Zhang jmin = 0; 59698c9bda7SSatish Balay nonzerorow += (n>0); 5977fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 59849b5e25fSSatish Balay z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 59949b5e25fSSatish Balay z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 60049b5e25fSSatish Balay z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 60149b5e25fSSatish Balay z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 60249b5e25fSSatish Balay z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 60349b5e25fSSatish Balay z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 60449b5e25fSSatish Balay z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 605831a3094SHong Zhang v += 49; jmin++; 6067fbae186SHong Zhang } 607444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 608444d8c10SJed Brown PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 609831a3094SHong Zhang for (j=jmin; j<n; j++) { 61049b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 61149b5e25fSSatish Balay cval = ib[j]*7; 61249b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 61349b5e25fSSatish Balay z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 61449b5e25fSSatish Balay z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 61549b5e25fSSatish Balay z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 61649b5e25fSSatish Balay z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 61749b5e25fSSatish Balay z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 61849b5e25fSSatish Balay z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 61949b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 62049b5e25fSSatish Balay z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 62149b5e25fSSatish Balay z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 62249b5e25fSSatish Balay z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 62349b5e25fSSatish Balay z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 62449b5e25fSSatish Balay z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 62549b5e25fSSatish Balay z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 62649b5e25fSSatish Balay z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 62749b5e25fSSatish Balay v += 49; 62849b5e25fSSatish Balay } 62949b5e25fSSatish Balay xb +=7; ai++; 63049b5e25fSSatish Balay } 631d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 6321ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 633dc0b31edSSatish Balay ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 63449b5e25fSSatish Balay PetscFunctionReturn(0); 63549b5e25fSSatish Balay } 63649b5e25fSSatish Balay 63749b5e25fSSatish Balay /* 63849b5e25fSSatish Balay This will not work with MatScalar == float because it calls the BLAS 63949b5e25fSSatish Balay */ 640dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz) 64149b5e25fSSatish Balay { 64249b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 643d9ca1df4SBarry Smith PetscScalar *z,*z_ptr,*zb,*work,*workt,zero=0.0; 644d9ca1df4SBarry Smith const PetscScalar *x,*x_ptr,*xb; 645d9ca1df4SBarry Smith const MatScalar *v; 646dfbe8321SBarry Smith PetscErrorCode ierr; 647d9ca1df4SBarry Smith PetscInt mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 648d9ca1df4SBarry Smith const PetscInt *idx,*aj,*ii; 64998c9bda7SSatish Balay PetscInt nonzerorow=0; 65049b5e25fSSatish Balay 65149b5e25fSSatish Balay PetscFunctionBegin; 6522dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 653d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);x_ptr = x; 6541ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 65549b5e25fSSatish Balay 65649b5e25fSSatish Balay aj = a->j; 65749b5e25fSSatish Balay v = a->a; 65849b5e25fSSatish Balay ii = a->i; 65949b5e25fSSatish Balay 66049b5e25fSSatish Balay if (!a->mult_work) { 661854ce69bSBarry Smith ierr = PetscMalloc1(A->rmap->N+1,&a->mult_work);CHKERRQ(ierr); 66249b5e25fSSatish Balay } 66349b5e25fSSatish Balay work = a->mult_work; 66449b5e25fSSatish Balay 66549b5e25fSSatish Balay for (i=0; i<mbs; i++) { 66649b5e25fSSatish Balay n = ii[1] - ii[0]; ncols = n*bs; 66749b5e25fSSatish Balay workt = work; idx=aj+ii[0]; 66898c9bda7SSatish Balay nonzerorow += (n>0); 66949b5e25fSSatish Balay 67049b5e25fSSatish Balay /* upper triangular part */ 67149b5e25fSSatish Balay for (j=0; j<n; j++) { 67249b5e25fSSatish Balay xb = x_ptr + bs*(*idx++); 67349b5e25fSSatish Balay for (k=0; k<bs; k++) workt[k] = xb[k]; 67449b5e25fSSatish Balay workt += bs; 67549b5e25fSSatish Balay } 67649b5e25fSSatish Balay /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 67796b95a6bSBarry Smith PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 67849b5e25fSSatish Balay 67949b5e25fSSatish Balay /* strict lower triangular part */ 680831a3094SHong Zhang idx = aj+ii[0]; 681831a3094SHong Zhang if (*idx == i) { 68296b9376eSHong Zhang ncols -= bs; v += bs2; idx++; n--; 683831a3094SHong Zhang } 68496b9376eSHong Zhang 68549b5e25fSSatish Balay if (ncols > 0) { 68649b5e25fSSatish Balay workt = work; 68787828ca2SBarry Smith ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 68896b95a6bSBarry Smith PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 689831a3094SHong Zhang for (j=0; j<n; j++) { 690831a3094SHong Zhang zb = z_ptr + bs*(*idx++); 69149b5e25fSSatish Balay for (k=0; k<bs; k++) zb[k] += workt[k]; 69249b5e25fSSatish Balay workt += bs; 69349b5e25fSSatish Balay } 69449b5e25fSSatish Balay } 69549b5e25fSSatish Balay x += bs; v += n*bs2; z += bs; ii++; 69649b5e25fSSatish Balay } 69749b5e25fSSatish Balay 698d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 6991ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 700dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr); 70149b5e25fSSatish Balay PetscFunctionReturn(0); 70249b5e25fSSatish Balay } 70349b5e25fSSatish Balay 704dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 70549b5e25fSSatish Balay { 70649b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 707d9ca1df4SBarry Smith PetscScalar *z,x1; 708d9ca1df4SBarry Smith const PetscScalar *x,*xb; 709d9ca1df4SBarry Smith const MatScalar *v; 7106849ba73SBarry Smith PetscErrorCode ierr; 711d9ca1df4SBarry Smith PetscInt mbs =a->mbs,i,n,cval,j,jmin; 712d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 71398c9bda7SSatish Balay PetscInt nonzerorow=0; 71449b5e25fSSatish Balay 71549b5e25fSSatish Balay PetscFunctionBegin; 716343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 717d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 7181ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 71949b5e25fSSatish Balay v = a->a; 72049b5e25fSSatish Balay xb = x; 72149b5e25fSSatish Balay 72249b5e25fSSatish Balay for (i=0; i<mbs; i++) { 72349b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th row of A */ 72449b5e25fSSatish Balay x1 = xb[0]; 72549b5e25fSSatish Balay ib = aj + *ai; 726831a3094SHong Zhang jmin = 0; 72798c9bda7SSatish Balay nonzerorow += (n>0); 728831a3094SHong Zhang if (*ib == i) { /* (diag of A)*x */ 729831a3094SHong Zhang z[i] += *v++ * x[*ib++]; jmin++; 730831a3094SHong Zhang } 731831a3094SHong Zhang for (j=jmin; j<n; j++) { 73249b5e25fSSatish Balay cval = *ib; 73349b5e25fSSatish Balay z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 73449b5e25fSSatish Balay z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 73549b5e25fSSatish Balay } 73649b5e25fSSatish Balay xb++; ai++; 73749b5e25fSSatish Balay } 73849b5e25fSSatish Balay 739d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 740bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 74149b5e25fSSatish Balay 742dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 74349b5e25fSSatish Balay PetscFunctionReturn(0); 74449b5e25fSSatish Balay } 74549b5e25fSSatish Balay 746dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 74749b5e25fSSatish Balay { 74849b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 749d9ca1df4SBarry Smith PetscScalar *z,x1,x2; 750d9ca1df4SBarry Smith const PetscScalar *x,*xb; 751d9ca1df4SBarry Smith const MatScalar *v; 7526849ba73SBarry Smith PetscErrorCode ierr; 753d9ca1df4SBarry Smith PetscInt mbs =a->mbs,i,n,cval,j,jmin; 754d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 75598c9bda7SSatish Balay PetscInt nonzerorow=0; 75649b5e25fSSatish Balay 75749b5e25fSSatish Balay PetscFunctionBegin; 758343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 759d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 7601ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 76149b5e25fSSatish Balay 76249b5e25fSSatish Balay v = a->a; 76349b5e25fSSatish Balay xb = x; 76449b5e25fSSatish Balay 76549b5e25fSSatish Balay for (i=0; i<mbs; i++) { 76649b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 76749b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; 76849b5e25fSSatish Balay ib = aj + *ai; 769831a3094SHong Zhang jmin = 0; 77098c9bda7SSatish Balay nonzerorow += (n>0); 7717fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 77249b5e25fSSatish Balay z[2*i] += v[0]*x1 + v[2]*x2; 77349b5e25fSSatish Balay z[2*i+1] += v[2]*x1 + v[3]*x2; 774831a3094SHong Zhang v += 4; jmin++; 7757fbae186SHong Zhang } 776444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 777444d8c10SJed Brown PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 778831a3094SHong Zhang for (j=jmin; j<n; j++) { 77949b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 78049b5e25fSSatish Balay cval = ib[j]*2; 78149b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2; 78249b5e25fSSatish Balay z[cval+1] += v[2]*x1 + v[3]*x2; 78349b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 78449b5e25fSSatish Balay z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 78549b5e25fSSatish Balay z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 78649b5e25fSSatish Balay v += 4; 78749b5e25fSSatish Balay } 78849b5e25fSSatish Balay xb +=2; ai++; 78949b5e25fSSatish Balay } 790d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 791bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 79249b5e25fSSatish Balay 793dc0b31edSSatish Balay ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 79449b5e25fSSatish Balay PetscFunctionReturn(0); 79549b5e25fSSatish Balay } 79649b5e25fSSatish Balay 797dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 79849b5e25fSSatish Balay { 79949b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 800d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3; 801d9ca1df4SBarry Smith const PetscScalar *x,*xb; 802d9ca1df4SBarry Smith const MatScalar *v; 8036849ba73SBarry Smith PetscErrorCode ierr; 804d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 805d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 80698c9bda7SSatish Balay PetscInt nonzerorow=0; 80749b5e25fSSatish Balay 80849b5e25fSSatish Balay PetscFunctionBegin; 809343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 810d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 8111ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 81249b5e25fSSatish Balay 81349b5e25fSSatish Balay v = a->a; 81449b5e25fSSatish Balay xb = x; 81549b5e25fSSatish Balay 81649b5e25fSSatish Balay for (i=0; i<mbs; i++) { 81749b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 81849b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 81949b5e25fSSatish Balay ib = aj + *ai; 820831a3094SHong Zhang jmin = 0; 82198c9bda7SSatish Balay nonzerorow += (n>0); 8227fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 82349b5e25fSSatish Balay z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 82449b5e25fSSatish Balay z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 82549b5e25fSSatish Balay z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 826831a3094SHong Zhang v += 9; jmin++; 8277fbae186SHong Zhang } 828444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 829444d8c10SJed Brown PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 830831a3094SHong Zhang for (j=jmin; j<n; j++) { 83149b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 83249b5e25fSSatish Balay cval = ib[j]*3; 83349b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 83449b5e25fSSatish Balay z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 83549b5e25fSSatish Balay z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 83649b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 83749b5e25fSSatish Balay z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 83849b5e25fSSatish Balay z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 83949b5e25fSSatish Balay z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 84049b5e25fSSatish Balay v += 9; 84149b5e25fSSatish Balay } 84249b5e25fSSatish Balay xb +=3; ai++; 84349b5e25fSSatish Balay } 84449b5e25fSSatish Balay 845d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 846bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 84749b5e25fSSatish Balay 848dc0b31edSSatish Balay ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 84949b5e25fSSatish Balay PetscFunctionReturn(0); 85049b5e25fSSatish Balay } 85149b5e25fSSatish Balay 852dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 85349b5e25fSSatish Balay { 85449b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 855d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4; 856d9ca1df4SBarry Smith const PetscScalar *x,*xb; 857d9ca1df4SBarry Smith const MatScalar *v; 8586849ba73SBarry Smith PetscErrorCode ierr; 859d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 860d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 86198c9bda7SSatish Balay PetscInt nonzerorow=0; 86249b5e25fSSatish Balay 86349b5e25fSSatish Balay PetscFunctionBegin; 864343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 865d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 8661ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 86749b5e25fSSatish Balay 86849b5e25fSSatish Balay v = a->a; 86949b5e25fSSatish Balay xb = x; 87049b5e25fSSatish Balay 87149b5e25fSSatish Balay for (i=0; i<mbs; i++) { 87249b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 87349b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 87449b5e25fSSatish Balay ib = aj + *ai; 875831a3094SHong Zhang jmin = 0; 87698c9bda7SSatish Balay nonzerorow += (n>0); 8777fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 87849b5e25fSSatish Balay z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 87949b5e25fSSatish Balay z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 88049b5e25fSSatish Balay z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 88149b5e25fSSatish Balay z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 882831a3094SHong Zhang v += 16; jmin++; 8837fbae186SHong Zhang } 884444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 885444d8c10SJed Brown PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 886831a3094SHong Zhang for (j=jmin; j<n; j++) { 88749b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 88849b5e25fSSatish Balay cval = ib[j]*4; 88949b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 89049b5e25fSSatish Balay z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 89149b5e25fSSatish Balay z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 89249b5e25fSSatish Balay z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 89349b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 89449b5e25fSSatish Balay z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 89549b5e25fSSatish Balay z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 89649b5e25fSSatish Balay z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 89749b5e25fSSatish Balay z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 89849b5e25fSSatish Balay v += 16; 89949b5e25fSSatish Balay } 90049b5e25fSSatish Balay xb +=4; ai++; 90149b5e25fSSatish Balay } 90249b5e25fSSatish Balay 903d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 904bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 90549b5e25fSSatish Balay 906dc0b31edSSatish Balay ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 90749b5e25fSSatish Balay PetscFunctionReturn(0); 90849b5e25fSSatish Balay } 90949b5e25fSSatish Balay 910dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 91149b5e25fSSatish Balay { 91249b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 913d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,x5; 914d9ca1df4SBarry Smith const PetscScalar *x,*xb; 915d9ca1df4SBarry Smith const MatScalar *v; 9166849ba73SBarry Smith PetscErrorCode ierr; 917d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 918d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 91998c9bda7SSatish Balay PetscInt nonzerorow=0; 92049b5e25fSSatish Balay 92149b5e25fSSatish Balay PetscFunctionBegin; 922343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 923d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 9241ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 92549b5e25fSSatish Balay 92649b5e25fSSatish Balay v = a->a; 92749b5e25fSSatish Balay xb = x; 92849b5e25fSSatish Balay 92949b5e25fSSatish Balay for (i=0; i<mbs; i++) { 93049b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 93149b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 93249b5e25fSSatish Balay ib = aj + *ai; 933831a3094SHong Zhang jmin = 0; 93498c9bda7SSatish Balay nonzerorow += (n>0); 9357fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 93649b5e25fSSatish Balay z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 93749b5e25fSSatish Balay z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 93849b5e25fSSatish Balay z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 93949b5e25fSSatish Balay z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 94049b5e25fSSatish Balay z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 941831a3094SHong Zhang v += 25; jmin++; 9427fbae186SHong Zhang } 943444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 944444d8c10SJed Brown PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 945831a3094SHong Zhang for (j=jmin; j<n; j++) { 94649b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 94749b5e25fSSatish Balay cval = ib[j]*5; 94849b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 94949b5e25fSSatish Balay z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 95049b5e25fSSatish Balay z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 95149b5e25fSSatish Balay z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 95249b5e25fSSatish Balay z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 95349b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 95449b5e25fSSatish Balay z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 95549b5e25fSSatish Balay z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 95649b5e25fSSatish Balay z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 95749b5e25fSSatish Balay z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 95849b5e25fSSatish Balay z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 95949b5e25fSSatish Balay v += 25; 96049b5e25fSSatish Balay } 96149b5e25fSSatish Balay xb +=5; ai++; 96249b5e25fSSatish Balay } 96349b5e25fSSatish Balay 964d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 965bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 96649b5e25fSSatish Balay 967dc0b31edSSatish Balay ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 96849b5e25fSSatish Balay PetscFunctionReturn(0); 96949b5e25fSSatish Balay } 970dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 97149b5e25fSSatish Balay { 97249b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 973d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,x5,x6; 974d9ca1df4SBarry Smith const PetscScalar *x,*xb; 975d9ca1df4SBarry Smith const MatScalar *v; 9766849ba73SBarry Smith PetscErrorCode ierr; 977d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 978d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 97998c9bda7SSatish Balay PetscInt nonzerorow=0; 98049b5e25fSSatish Balay 98149b5e25fSSatish Balay PetscFunctionBegin; 982343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 983d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 9841ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 98549b5e25fSSatish Balay 98649b5e25fSSatish Balay v = a->a; 98749b5e25fSSatish Balay xb = x; 98849b5e25fSSatish Balay 98949b5e25fSSatish Balay for (i=0; i<mbs; i++) { 99049b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 99149b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 99249b5e25fSSatish Balay ib = aj + *ai; 993831a3094SHong Zhang jmin = 0; 99498c9bda7SSatish Balay nonzerorow += (n>0); 9957fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 99649b5e25fSSatish Balay z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 99749b5e25fSSatish Balay z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 99849b5e25fSSatish Balay z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 99949b5e25fSSatish Balay z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 100049b5e25fSSatish Balay z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 100149b5e25fSSatish Balay z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 1002831a3094SHong Zhang v += 36; jmin++; 10037fbae186SHong Zhang } 1004444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1005444d8c10SJed Brown PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1006831a3094SHong Zhang for (j=jmin; j<n; j++) { 100749b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 100849b5e25fSSatish Balay cval = ib[j]*6; 100949b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 101049b5e25fSSatish Balay z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 101149b5e25fSSatish Balay z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 101249b5e25fSSatish Balay z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 101349b5e25fSSatish Balay z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 101449b5e25fSSatish Balay z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 101549b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 101649b5e25fSSatish Balay z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 101749b5e25fSSatish Balay z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 101849b5e25fSSatish Balay z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 101949b5e25fSSatish Balay z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 102049b5e25fSSatish Balay z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 102149b5e25fSSatish Balay z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 102249b5e25fSSatish Balay v += 36; 102349b5e25fSSatish Balay } 102449b5e25fSSatish Balay xb +=6; ai++; 102549b5e25fSSatish Balay } 102649b5e25fSSatish Balay 1027d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1028bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 102949b5e25fSSatish Balay 1030dc0b31edSSatish Balay ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 103149b5e25fSSatish Balay PetscFunctionReturn(0); 103249b5e25fSSatish Balay } 103349b5e25fSSatish Balay 1034dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 103549b5e25fSSatish Balay { 103649b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1037d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,x5,x6,x7; 1038d9ca1df4SBarry Smith const PetscScalar *x,*xb; 1039d9ca1df4SBarry Smith const MatScalar *v; 10406849ba73SBarry Smith PetscErrorCode ierr; 1041d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 1042d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 104398c9bda7SSatish Balay PetscInt nonzerorow=0; 104449b5e25fSSatish Balay 104549b5e25fSSatish Balay PetscFunctionBegin; 1046343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1047d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 10481ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 104949b5e25fSSatish Balay 105049b5e25fSSatish Balay v = a->a; 105149b5e25fSSatish Balay xb = x; 105249b5e25fSSatish Balay 105349b5e25fSSatish Balay for (i=0; i<mbs; i++) { 105449b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 105549b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 105649b5e25fSSatish Balay ib = aj + *ai; 1057831a3094SHong Zhang jmin = 0; 105898c9bda7SSatish Balay nonzerorow += (n>0); 10597fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 106049b5e25fSSatish Balay z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 106149b5e25fSSatish Balay z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 106249b5e25fSSatish Balay z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 106349b5e25fSSatish Balay z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 106449b5e25fSSatish Balay z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 106549b5e25fSSatish Balay z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 106649b5e25fSSatish Balay z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1067831a3094SHong Zhang v += 49; jmin++; 10687fbae186SHong Zhang } 1069444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1070444d8c10SJed Brown PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1071831a3094SHong Zhang for (j=jmin; j<n; j++) { 107249b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 107349b5e25fSSatish Balay cval = ib[j]*7; 107449b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 107549b5e25fSSatish Balay z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 107649b5e25fSSatish Balay z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 107749b5e25fSSatish Balay z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 107849b5e25fSSatish Balay z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 107949b5e25fSSatish Balay z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 108049b5e25fSSatish Balay z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 108149b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 108249b5e25fSSatish Balay z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 108349b5e25fSSatish Balay z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 108449b5e25fSSatish Balay z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 108549b5e25fSSatish Balay z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 108649b5e25fSSatish Balay z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 108749b5e25fSSatish Balay z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 108849b5e25fSSatish Balay z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 108949b5e25fSSatish Balay v += 49; 109049b5e25fSSatish Balay } 109149b5e25fSSatish Balay xb +=7; ai++; 109249b5e25fSSatish Balay } 109349b5e25fSSatish Balay 1094d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1095bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 109649b5e25fSSatish Balay 1097dc0b31edSSatish Balay ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 109849b5e25fSSatish Balay PetscFunctionReturn(0); 109949b5e25fSSatish Balay } 110049b5e25fSSatish Balay 1101dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 110249b5e25fSSatish Balay { 110349b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1104d9ca1df4SBarry Smith PetscScalar *z,*z_ptr=0,*zb,*work,*workt; 1105d9ca1df4SBarry Smith const PetscScalar *x,*x_ptr,*xb; 1106d9ca1df4SBarry Smith const MatScalar *v; 1107dfbe8321SBarry Smith PetscErrorCode ierr; 1108d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 1109d9ca1df4SBarry Smith const PetscInt *idx,*aj,*ii; 111098c9bda7SSatish Balay PetscInt nonzerorow=0; 111149b5e25fSSatish Balay 111249b5e25fSSatish Balay PetscFunctionBegin; 1113343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1114d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); x_ptr=x; 11151ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 111649b5e25fSSatish Balay 111749b5e25fSSatish Balay aj = a->j; 111849b5e25fSSatish Balay v = a->a; 111949b5e25fSSatish Balay ii = a->i; 112049b5e25fSSatish Balay 112149b5e25fSSatish Balay if (!a->mult_work) { 1122854ce69bSBarry Smith ierr = PetscMalloc1(A->rmap->n+1,&a->mult_work);CHKERRQ(ierr); 112349b5e25fSSatish Balay } 112449b5e25fSSatish Balay work = a->mult_work; 112549b5e25fSSatish Balay 112649b5e25fSSatish Balay 112749b5e25fSSatish Balay for (i=0; i<mbs; i++) { 112849b5e25fSSatish Balay n = ii[1] - ii[0]; ncols = n*bs; 112949b5e25fSSatish Balay workt = work; idx=aj+ii[0]; 113098c9bda7SSatish Balay nonzerorow += (n>0); 113149b5e25fSSatish Balay 113249b5e25fSSatish Balay /* upper triangular part */ 113349b5e25fSSatish Balay for (j=0; j<n; j++) { 113449b5e25fSSatish Balay xb = x_ptr + bs*(*idx++); 113549b5e25fSSatish Balay for (k=0; k<bs; k++) workt[k] = xb[k]; 113649b5e25fSSatish Balay workt += bs; 113749b5e25fSSatish Balay } 113849b5e25fSSatish Balay /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 113996b95a6bSBarry Smith PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 114049b5e25fSSatish Balay 114149b5e25fSSatish Balay /* strict lower triangular part */ 1142831a3094SHong Zhang idx = aj+ii[0]; 1143831a3094SHong Zhang if (*idx == i) { 114496b9376eSHong Zhang ncols -= bs; v += bs2; idx++; n--; 1145831a3094SHong Zhang } 114649b5e25fSSatish Balay if (ncols > 0) { 114749b5e25fSSatish Balay workt = work; 114887828ca2SBarry Smith ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 114996b95a6bSBarry Smith PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 1150831a3094SHong Zhang for (j=0; j<n; j++) { 1151831a3094SHong Zhang zb = z_ptr + bs*(*idx++); 115249b5e25fSSatish Balay for (k=0; k<bs; k++) zb[k] += workt[k]; 115349b5e25fSSatish Balay workt += bs; 115449b5e25fSSatish Balay } 115549b5e25fSSatish Balay } 115649b5e25fSSatish Balay 115749b5e25fSSatish Balay x += bs; v += n*bs2; z += bs; ii++; 115849b5e25fSSatish Balay } 115949b5e25fSSatish Balay 1160d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1161bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 116249b5e25fSSatish Balay 1163dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 116449b5e25fSSatish Balay PetscFunctionReturn(0); 116549b5e25fSSatish Balay } 116649b5e25fSSatish Balay 1167f4df32b1SMatthew Knepley PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha) 116849b5e25fSSatish Balay { 116949b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data; 1170f4df32b1SMatthew Knepley PetscScalar oalpha = alpha; 1171efee365bSSatish Balay PetscErrorCode ierr; 1172c5df96a5SBarry Smith PetscBLASInt one = 1,totalnz; 117349b5e25fSSatish Balay 117449b5e25fSSatish Balay PetscFunctionBegin; 1175c5df96a5SBarry Smith ierr = PetscBLASIntCast(a->bs2*a->nz,&totalnz);CHKERRQ(ierr); 11768b83055fSJed Brown PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one)); 1177efee365bSSatish Balay ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 117849b5e25fSSatish Balay PetscFunctionReturn(0); 117949b5e25fSSatish Balay } 118049b5e25fSSatish Balay 1181dfbe8321SBarry Smith PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm) 118249b5e25fSSatish Balay { 118349b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1184d9ca1df4SBarry Smith const MatScalar *v = a->a; 118549b5e25fSSatish Balay PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1186d9ca1df4SBarry Smith PetscInt i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il; 11876849ba73SBarry Smith PetscErrorCode ierr; 1188d9ca1df4SBarry Smith const PetscInt *aj=a->j,*col; 118949b5e25fSSatish Balay 119049b5e25fSSatish Balay PetscFunctionBegin; 119149b5e25fSSatish Balay if (type == NORM_FROBENIUS) { 119249b5e25fSSatish Balay for (k=0; k<mbs; k++) { 119349b5e25fSSatish Balay jmin = a->i[k]; jmax = a->i[k+1]; 1194831a3094SHong Zhang col = aj + jmin; 1195831a3094SHong Zhang if (*col == k) { /* diagonal block */ 119649b5e25fSSatish Balay for (i=0; i<bs2; i++) { 119749b5e25fSSatish Balay sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++; 119849b5e25fSSatish Balay } 1199831a3094SHong Zhang jmin++; 1200831a3094SHong Zhang } 1201831a3094SHong Zhang for (j=jmin; j<jmax; j++) { /* off-diagonal blocks */ 120249b5e25fSSatish Balay for (i=0; i<bs2; i++) { 120349b5e25fSSatish Balay sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++; 120449b5e25fSSatish Balay } 120549b5e25fSSatish Balay } 120649b5e25fSSatish Balay } 12078f1a2a5eSBarry Smith *norm = PetscSqrtReal(sum_diag + 2*sum_off); 120851f70360SJed Brown ierr = PetscLogFlops(2*bs2*a->nz);CHKERRQ(ierr); 12090b8dc8d2SHong Zhang } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 1210dcca6d9dSJed Brown ierr = PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);CHKERRQ(ierr); 12110b8dc8d2SHong Zhang for (i=0; i<mbs; i++) jl[i] = mbs; 12120b8dc8d2SHong Zhang il[0] = 0; 121349b5e25fSSatish Balay 121449b5e25fSSatish Balay *norm = 0.0; 121549b5e25fSSatish Balay for (k=0; k<mbs; k++) { /* k_th block row */ 121649b5e25fSSatish Balay for (j=0; j<bs; j++) sum[j]=0.0; 121749b5e25fSSatish Balay /*-- col sum --*/ 121849b5e25fSSatish Balay i = jl[k]; /* first |A(i,k)| to be added */ 1219831a3094SHong Zhang /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1220831a3094SHong Zhang at step k */ 122149b5e25fSSatish Balay while (i<mbs) { 122249b5e25fSSatish Balay nexti = jl[i]; /* next block row to be added */ 122349b5e25fSSatish Balay ik = il[i]; /* block index of A(i,k) in the array a */ 122449b5e25fSSatish Balay for (j=0; j<bs; j++) { 122549b5e25fSSatish Balay v = a->a + ik*bs2 + j*bs; 122649b5e25fSSatish Balay for (k1=0; k1<bs; k1++) { 122749b5e25fSSatish Balay sum[j] += PetscAbsScalar(*v); v++; 122849b5e25fSSatish Balay } 122949b5e25fSSatish Balay } 123049b5e25fSSatish Balay /* update il, jl */ 1231831a3094SHong Zhang jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1232831a3094SHong Zhang jmax = a->i[i+1]; 123349b5e25fSSatish Balay if (jmin < jmax) { 123449b5e25fSSatish Balay il[i] = jmin; 123549b5e25fSSatish Balay j = a->j[jmin]; 123649b5e25fSSatish Balay jl[i] = jl[j]; jl[j]=i; 123749b5e25fSSatish Balay } 123849b5e25fSSatish Balay i = nexti; 123949b5e25fSSatish Balay } 124049b5e25fSSatish Balay /*-- row sum --*/ 124149b5e25fSSatish Balay jmin = a->i[k]; jmax = a->i[k+1]; 124249b5e25fSSatish Balay for (i=jmin; i<jmax; i++) { 124349b5e25fSSatish Balay for (j=0; j<bs; j++) { 124449b5e25fSSatish Balay v = a->a + i*bs2 + j; 124549b5e25fSSatish Balay for (k1=0; k1<bs; k1++) { 12460b8dc8d2SHong Zhang sum[j] += PetscAbsScalar(*v); v += bs; 124749b5e25fSSatish Balay } 124849b5e25fSSatish Balay } 124949b5e25fSSatish Balay } 125049b5e25fSSatish Balay /* add k_th block row to il, jl */ 1251831a3094SHong Zhang col = aj+jmin; 1252831a3094SHong Zhang if (*col == k) jmin++; 125349b5e25fSSatish Balay if (jmin < jmax) { 125449b5e25fSSatish Balay il[k] = jmin; 12550b8dc8d2SHong Zhang j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k; 125649b5e25fSSatish Balay } 125749b5e25fSSatish Balay for (j=0; j<bs; j++) { 125849b5e25fSSatish Balay if (sum[j] > *norm) *norm = sum[j]; 125949b5e25fSSatish Balay } 126049b5e25fSSatish Balay } 126174ed9c26SBarry Smith ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr); 126251f70360SJed Brown ierr = PetscLogFlops(PetscMax(mbs*a->nz-1,0));CHKERRQ(ierr); 1263f23aa3ddSBarry Smith } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 126449b5e25fSSatish Balay PetscFunctionReturn(0); 126549b5e25fSSatish Balay } 126649b5e25fSSatish Balay 1267ace3abfcSBarry Smith PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg) 126849b5e25fSSatish Balay { 126949b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data; 1270dfbe8321SBarry Smith PetscErrorCode ierr; 127149b5e25fSSatish Balay 127249b5e25fSSatish Balay PetscFunctionBegin; 127349b5e25fSSatish Balay /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1274d0f46423SBarry Smith if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1275ef511fbeSHong Zhang *flg = PETSC_FALSE; 1276ef511fbeSHong Zhang PetscFunctionReturn(0); 127749b5e25fSSatish Balay } 127849b5e25fSSatish Balay 127949b5e25fSSatish Balay /* if the a->i are the same */ 128013f74950SBarry Smith ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 128126fbe8dcSKarl Rupp if (!*flg) PetscFunctionReturn(0); 128249b5e25fSSatish Balay 128349b5e25fSSatish Balay /* if a->j are the same */ 128413f74950SBarry Smith ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 128526fbe8dcSKarl Rupp if (!*flg) PetscFunctionReturn(0); 128626fbe8dcSKarl Rupp 128749b5e25fSSatish Balay /* if a->a are the same */ 1288d0f46423SBarry Smith ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1289935af2e7SHong Zhang PetscFunctionReturn(0); 129049b5e25fSSatish Balay } 129149b5e25fSSatish Balay 1292dfbe8321SBarry Smith PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v) 129349b5e25fSSatish Balay { 129449b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1295dfbe8321SBarry Smith PetscErrorCode ierr; 1296d9ca1df4SBarry Smith PetscInt i,j,k,row,bs,ambs,bs2; 1297d9ca1df4SBarry Smith const PetscInt *ai,*aj; 129887828ca2SBarry Smith PetscScalar *x,zero = 0.0; 1299d9ca1df4SBarry Smith const MatScalar *aa,*aa_j; 130049b5e25fSSatish Balay 130149b5e25fSSatish Balay PetscFunctionBegin; 1302d0f46423SBarry Smith bs = A->rmap->bs; 1303e32f2f54SBarry Smith if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1"); 130482799104SHong Zhang 130549b5e25fSSatish Balay aa = a->a; 13068a0c6efdSHong Zhang ambs = a->mbs; 13078a0c6efdSHong Zhang 13088a0c6efdSHong Zhang if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) { 13098a0c6efdSHong Zhang PetscInt *diag=a->diag; 13108a0c6efdSHong Zhang aa = a->a; 13118a0c6efdSHong Zhang ambs = a->mbs; 13128a0c6efdSHong Zhang ierr = VecGetArray(v,&x);CHKERRQ(ierr); 13138a0c6efdSHong Zhang for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]]; 13148a0c6efdSHong Zhang ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 13158a0c6efdSHong Zhang PetscFunctionReturn(0); 13168a0c6efdSHong Zhang } 13178a0c6efdSHong Zhang 131849b5e25fSSatish Balay ai = a->i; 131949b5e25fSSatish Balay aj = a->j; 132049b5e25fSSatish Balay bs2 = a->bs2; 13212dcb1b2aSMatthew Knepley ierr = VecSet(v,zero);CHKERRQ(ierr); 13221ebc52fbSHong Zhang ierr = VecGetArray(v,&x);CHKERRQ(ierr); 132349b5e25fSSatish Balay for (i=0; i<ambs; i++) { 132449b5e25fSSatish Balay j=ai[i]; 132549b5e25fSSatish Balay if (aj[j] == i) { /* if this is a diagonal element */ 132649b5e25fSSatish Balay row = i*bs; 132749b5e25fSSatish Balay aa_j = aa + j*bs2; 132849b5e25fSSatish Balay for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 132949b5e25fSSatish Balay } 133049b5e25fSSatish Balay } 13311ebc52fbSHong Zhang ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 133249b5e25fSSatish Balay PetscFunctionReturn(0); 133349b5e25fSSatish Balay } 133449b5e25fSSatish Balay 1335dfbe8321SBarry Smith PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr) 133649b5e25fSSatish Balay { 133749b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1338d9ca1df4SBarry Smith PetscScalar x; 1339d9ca1df4SBarry Smith const PetscScalar *l,*li,*ri; 134049b5e25fSSatish Balay MatScalar *aa,*v; 1341dfbe8321SBarry Smith PetscErrorCode ierr; 1342*fff8e43fSBarry Smith PetscInt i,j,k,lm,M,m,mbs,tmp,bs,bs2; 1343*fff8e43fSBarry Smith const PetscInt *ai,*aj; 1344ace3abfcSBarry Smith PetscBool flg; 134549b5e25fSSatish Balay 134649b5e25fSSatish Balay PetscFunctionBegin; 1347b3bf805bSHong Zhang if (ll != rr) { 1348b3bf805bSHong Zhang ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr); 1349e7e72b3dSBarry Smith if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n"); 1350b3bf805bSHong Zhang } 1351b3bf805bSHong Zhang if (!ll) PetscFunctionReturn(0); 135249b5e25fSSatish Balay ai = a->i; 135349b5e25fSSatish Balay aj = a->j; 135449b5e25fSSatish Balay aa = a->a; 1355d0f46423SBarry Smith m = A->rmap->N; 1356d0f46423SBarry Smith bs = A->rmap->bs; 135749b5e25fSSatish Balay mbs = a->mbs; 135849b5e25fSSatish Balay bs2 = a->bs2; 135949b5e25fSSatish Balay 1360d9ca1df4SBarry Smith ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr); 136149b5e25fSSatish Balay ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1362e32f2f54SBarry Smith if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 136349b5e25fSSatish Balay for (i=0; i<mbs; i++) { /* for each block row */ 136449b5e25fSSatish Balay M = ai[i+1] - ai[i]; 136549b5e25fSSatish Balay li = l + i*bs; 136649b5e25fSSatish Balay v = aa + bs2*ai[i]; 136749b5e25fSSatish Balay for (j=0; j<M; j++) { /* for each block */ 136849b5e25fSSatish Balay ri = l + bs*aj[ai[i]+j]; 13695e90f9d9SHong Zhang for (k=0; k<bs; k++) { 137049b5e25fSSatish Balay x = ri[k]; 137149b5e25fSSatish Balay for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x; 137249b5e25fSSatish Balay } 137349b5e25fSSatish Balay } 137449b5e25fSSatish Balay } 1375d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr); 1376dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 137749b5e25fSSatish Balay PetscFunctionReturn(0); 137849b5e25fSSatish Balay } 137949b5e25fSSatish Balay 1380dfbe8321SBarry Smith PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info) 138149b5e25fSSatish Balay { 138249b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 138349b5e25fSSatish Balay 138449b5e25fSSatish Balay PetscFunctionBegin; 138549b5e25fSSatish Balay info->block_size = a->bs2; 1386ceed8ce5SJed Brown info->nz_allocated = a->bs2*a->maxnz; /*num. of nonzeros in upper triangular part */ 13876c6c5352SBarry Smith info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */ 138849b5e25fSSatish Balay info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 138949b5e25fSSatish Balay info->assemblies = A->num_ass; 13908e58a170SBarry Smith info->mallocs = A->info.mallocs; 13917adad957SLisandro Dalcin info->memory = ((PetscObject)A)->mem; 1392d5f3da31SBarry Smith if (A->factortype) { 139349b5e25fSSatish Balay info->fill_ratio_given = A->info.fill_ratio_given; 139449b5e25fSSatish Balay info->fill_ratio_needed = A->info.fill_ratio_needed; 139549b5e25fSSatish Balay info->factor_mallocs = A->info.factor_mallocs; 139649b5e25fSSatish Balay } else { 139749b5e25fSSatish Balay info->fill_ratio_given = 0; 139849b5e25fSSatish Balay info->fill_ratio_needed = 0; 139949b5e25fSSatish Balay info->factor_mallocs = 0; 140049b5e25fSSatish Balay } 140149b5e25fSSatish Balay PetscFunctionReturn(0); 140249b5e25fSSatish Balay } 140349b5e25fSSatish Balay 140449b5e25fSSatish Balay 1405dfbe8321SBarry Smith PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 140649b5e25fSSatish Balay { 140749b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1408dfbe8321SBarry Smith PetscErrorCode ierr; 140949b5e25fSSatish Balay 141049b5e25fSSatish Balay PetscFunctionBegin; 141149b5e25fSSatish Balay ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 141249b5e25fSSatish Balay PetscFunctionReturn(0); 141349b5e25fSSatish Balay } 1414dc354874SHong Zhang 1415985db425SBarry Smith /* 1416985db425SBarry Smith This code does not work since it only checks the upper triangular part of 1417985db425SBarry Smith the matrix. Hence it is not listed in the function table. 1418985db425SBarry Smith */ 1419985db425SBarry Smith PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[]) 1420dc354874SHong Zhang { 1421dc354874SHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1422dfbe8321SBarry Smith PetscErrorCode ierr; 1423d9ca1df4SBarry Smith PetscInt i,j,n,row,col,bs,mbs; 1424d9ca1df4SBarry Smith const PetscInt *ai,*aj; 1425c3fca9a7SHong Zhang PetscReal atmp; 1426d9ca1df4SBarry Smith const MatScalar *aa; 1427985db425SBarry Smith PetscScalar *x; 142813f74950SBarry Smith PetscInt ncols,brow,bcol,krow,kcol; 1429dc354874SHong Zhang 1430dc354874SHong Zhang PetscFunctionBegin; 1431e32f2f54SBarry Smith if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov"); 1432e32f2f54SBarry Smith if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1433d0f46423SBarry Smith bs = A->rmap->bs; 1434dc354874SHong Zhang aa = a->a; 1435dc354874SHong Zhang ai = a->i; 1436dc354874SHong Zhang aj = a->j; 143744117c81SHong Zhang mbs = a->mbs; 1438dc354874SHong Zhang 1439985db425SBarry Smith ierr = VecSet(v,0.0);CHKERRQ(ierr); 14401ebc52fbSHong Zhang ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1441dc354874SHong Zhang ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1442e32f2f54SBarry Smith if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 144344117c81SHong Zhang for (i=0; i<mbs; i++) { 1444d0f6400bSHong Zhang ncols = ai[1] - ai[0]; ai++; 1445d0f6400bSHong Zhang brow = bs*i; 144644117c81SHong Zhang for (j=0; j<ncols; j++) { 1447d0f6400bSHong Zhang bcol = bs*(*aj); 144844117c81SHong Zhang for (kcol=0; kcol<bs; kcol++) { 1449d0f6400bSHong Zhang col = bcol + kcol; /* col index */ 145044117c81SHong Zhang for (krow=0; krow<bs; krow++) { 1451d0f6400bSHong Zhang atmp = PetscAbsScalar(*aa); aa++; 1452d0f6400bSHong Zhang row = brow + krow; /* row index */ 1453c3fca9a7SHong Zhang if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1454c3fca9a7SHong Zhang if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 145544117c81SHong Zhang } 145644117c81SHong Zhang } 1457d0f6400bSHong Zhang aj++; 1458dc354874SHong Zhang } 1459dc354874SHong Zhang } 14601ebc52fbSHong Zhang ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1461dc354874SHong Zhang PetscFunctionReturn(0); 1462dc354874SHong Zhang } 1463