149b5e25fSSatish Balay 2c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h> 3c2916339SPierre Jolivet #include <../src/mat/impls/dense/seq/dense.h> 4c2916339SPierre Jolivet #include <../src/mat/impls/sbaij/seq/sbaij.h> 5af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h> 6c6db04a5SJed Brown #include <petscbt.h> 7c6db04a5SJed Brown #include <petscblaslapack.h> 849b5e25fSSatish Balay 913f74950SBarry Smith PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 1049b5e25fSSatish Balay { 115eee224dSHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 126849ba73SBarry Smith PetscErrorCode ierr; 135d0c19d7SBarry Smith PetscInt brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2; 145d0c19d7SBarry Smith const PetscInt *idx; 15db41cccfSHong Zhang PetscBT table_out,table_in; 16d94109b8SHong Zhang 17d94109b8SHong Zhang PetscFunctionBegin; 18e32f2f54SBarry Smith if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 19d94109b8SHong Zhang mbs = a->mbs; 20d94109b8SHong Zhang ai = a->i; 21d94109b8SHong Zhang aj = a->j; 22d0f46423SBarry Smith bs = A->rmap->bs; 2353b8de81SBarry Smith ierr = PetscBTCreate(mbs,&table_out);CHKERRQ(ierr); 24854ce69bSBarry Smith ierr = PetscMalloc1(mbs+1,&nidx);CHKERRQ(ierr); 25854ce69bSBarry Smith ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr); 2653b8de81SBarry Smith ierr = PetscBTCreate(mbs,&table_in);CHKERRQ(ierr); 27d94109b8SHong Zhang 28d94109b8SHong Zhang for (i=0; i<is_max; i++) { /* for each is */ 29d94109b8SHong Zhang isz = 0; 30db41cccfSHong Zhang ierr = PetscBTMemzero(mbs,table_out);CHKERRQ(ierr); 31d94109b8SHong Zhang 32d94109b8SHong Zhang /* Extract the indices, assume there can be duplicate entries */ 33d94109b8SHong Zhang ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 34d94109b8SHong Zhang ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 35d94109b8SHong Zhang 36db41cccfSHong Zhang /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */ 37dbe03f88SHong Zhang bcol_max = 0; 38d94109b8SHong Zhang for (j=0; j<n; ++j) { 39d94109b8SHong Zhang brow = idx[j]/bs; /* convert the indices into block indices */ 40e32f2f54SBarry Smith if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 41db41cccfSHong Zhang if (!PetscBTLookupSet(table_out,brow)) { 42dbe03f88SHong Zhang nidx[isz++] = brow; 43dbe03f88SHong Zhang if (bcol_max < brow) bcol_max = brow; 44dbe03f88SHong Zhang } 45d94109b8SHong Zhang } 46d94109b8SHong Zhang ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 476bf464f9SBarry Smith ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 48d94109b8SHong Zhang 49d94109b8SHong Zhang k = 0; 50d94109b8SHong Zhang for (j=0; j<ov; j++) { /* for each overlap */ 51db41cccfSHong Zhang /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 52db41cccfSHong Zhang ierr = PetscBTMemzero(mbs,table_in);CHKERRQ(ierr); 53db41cccfSHong Zhang for (l=k; l<isz; l++) { ierr = PetscBTSet(table_in,nidx[l]);CHKERRQ(ierr); } 54d94109b8SHong Zhang 55d94109b8SHong Zhang n = isz; /* length of the updated is[i] */ 56d94109b8SHong Zhang for (brow=0; brow<mbs; brow++) { 57d94109b8SHong Zhang start = ai[brow]; end = ai[brow+1]; 58db41cccfSHong Zhang if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */ 59d94109b8SHong Zhang for (l = start; l<end; l++) { 60d94109b8SHong Zhang bcol = aj[l]; 61d7b97159SHong Zhang if (!PetscBTLookupSet(table_out,bcol)) { 62d7b97159SHong Zhang nidx[isz++] = bcol; 63d7b97159SHong Zhang if (bcol_max < bcol) bcol_max = bcol; 64d7b97159SHong Zhang } 65d94109b8SHong Zhang } 66d94109b8SHong Zhang k++; 67d94109b8SHong Zhang if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 68d94109b8SHong Zhang } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 69d94109b8SHong Zhang for (l = start; l<end; l++) { 70d94109b8SHong Zhang bcol = aj[l]; 71dbe03f88SHong Zhang if (bcol > bcol_max) break; 72db41cccfSHong Zhang if (PetscBTLookup(table_in,bcol)) { 7326fbe8dcSKarl Rupp if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow; 74d94109b8SHong Zhang break; /* for l = start; l<end ; l++) */ 75d94109b8SHong Zhang } 76d94109b8SHong Zhang } 77d94109b8SHong Zhang } 78d94109b8SHong Zhang } 79d94109b8SHong Zhang } /* for each overlap */ 80d94109b8SHong Zhang 81d94109b8SHong Zhang /* expand the Index Set */ 82d94109b8SHong Zhang for (j=0; j<isz; j++) { 8326fbe8dcSKarl Rupp for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k; 84d94109b8SHong Zhang } 8570b3c8c7SBarry Smith ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 86d94109b8SHong Zhang } 8794bacf5dSBarry Smith ierr = PetscBTDestroy(&table_out);CHKERRQ(ierr); 88d94109b8SHong Zhang ierr = PetscFree(nidx);CHKERRQ(ierr); 89d94109b8SHong Zhang ierr = PetscFree(nidx2);CHKERRQ(ierr); 9094bacf5dSBarry Smith ierr = PetscBTDestroy(&table_in);CHKERRQ(ierr); 915eee224dSHong Zhang PetscFunctionReturn(0); 9249b5e25fSSatish Balay } 9349b5e25fSSatish Balay 94847daeccSHong Zhang /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc. 95847daeccSHong Zhang Zero some ops' to avoid invalid usse */ 96847daeccSHong Zhang PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq) 9749b5e25fSSatish Balay { 986849ba73SBarry Smith PetscErrorCode ierr; 9949b5e25fSSatish Balay 10049b5e25fSSatish Balay PetscFunctionBegin; 101847daeccSHong Zhang ierr = MatSetOption(Bseq,MAT_SYMMETRIC,PETSC_FALSE);CHKERRQ(ierr); 102847daeccSHong Zhang Bseq->ops->mult = 0; 103847daeccSHong Zhang Bseq->ops->multadd = 0; 104847daeccSHong Zhang Bseq->ops->multtranspose = 0; 105847daeccSHong Zhang Bseq->ops->multtransposeadd = 0; 106847daeccSHong Zhang Bseq->ops->lufactor = 0; 107847daeccSHong Zhang Bseq->ops->choleskyfactor = 0; 108847daeccSHong Zhang Bseq->ops->lufactorsymbolic = 0; 109847daeccSHong Zhang Bseq->ops->choleskyfactorsymbolic = 0; 110847daeccSHong Zhang Bseq->ops->getinertia = 0; 11149b5e25fSSatish Balay PetscFunctionReturn(0); 11249b5e25fSSatish Balay } 11349b5e25fSSatish Balay 1147dae84e0SHong Zhang /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */ 1157dae84e0SHong Zhang PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 116e50a8114SHong Zhang { 117e50a8114SHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c; 118e50a8114SHong Zhang PetscErrorCode ierr; 119e50a8114SHong Zhang PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 120e50a8114SHong Zhang PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 121e50a8114SHong Zhang const PetscInt *irow,*icol; 122e50a8114SHong Zhang PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 123e50a8114SHong Zhang PetscInt *aj = a->j,*ai = a->i; 124e50a8114SHong Zhang MatScalar *mat_a; 125e50a8114SHong Zhang Mat C; 126e50a8114SHong Zhang PetscBool flag; 127e50a8114SHong Zhang 128e50a8114SHong Zhang PetscFunctionBegin; 129e50a8114SHong Zhang 130e50a8114SHong Zhang ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 131e50a8114SHong Zhang ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 132e50a8114SHong Zhang ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 133e50a8114SHong Zhang ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 134e50a8114SHong Zhang 135e50a8114SHong Zhang ierr = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr); 136e50a8114SHong Zhang ssmap = smap; 137e50a8114SHong Zhang ierr = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr); 138e50a8114SHong Zhang for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 139e50a8114SHong Zhang /* determine lens of each row */ 140e50a8114SHong Zhang for (i=0; i<nrows; i++) { 141e50a8114SHong Zhang kstart = ai[irow[i]]; 142e50a8114SHong Zhang kend = kstart + a->ilen[irow[i]]; 143e50a8114SHong Zhang lens[i] = 0; 144e50a8114SHong Zhang for (k=kstart; k<kend; k++) { 145e50a8114SHong Zhang if (ssmap[aj[k]]) lens[i]++; 146e50a8114SHong Zhang } 147e50a8114SHong Zhang } 148e50a8114SHong Zhang /* Create and fill new matrix */ 149e50a8114SHong Zhang if (scall == MAT_REUSE_MATRIX) { 150e50a8114SHong Zhang c = (Mat_SeqSBAIJ*)((*B)->data); 151e50a8114SHong Zhang 152e50a8114SHong Zhang if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 153580bdb30SBarry Smith ierr = PetscArraycmp(c->ilen,lens,c->mbs,&flag);CHKERRQ(ierr); 154e50a8114SHong Zhang if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 155580bdb30SBarry Smith ierr = PetscArrayzero(c->ilen,c->mbs);CHKERRQ(ierr); 156e50a8114SHong Zhang C = *B; 157e50a8114SHong Zhang } else { 158e50a8114SHong Zhang ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr); 159e50a8114SHong Zhang ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 160e50a8114SHong Zhang ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 161367daffbSBarry Smith ierr = MatSeqSBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr); 162e50a8114SHong Zhang } 163e50a8114SHong Zhang c = (Mat_SeqSBAIJ*)(C->data); 164e50a8114SHong Zhang for (i=0; i<nrows; i++) { 165e50a8114SHong Zhang row = irow[i]; 166e50a8114SHong Zhang kstart = ai[row]; 167e50a8114SHong Zhang kend = kstart + a->ilen[row]; 168e50a8114SHong Zhang mat_i = c->i[i]; 169e50a8114SHong Zhang mat_j = c->j + mat_i; 170e50a8114SHong Zhang mat_a = c->a + mat_i*bs2; 171e50a8114SHong Zhang mat_ilen = c->ilen + i; 172e50a8114SHong Zhang for (k=kstart; k<kend; k++) { 173e50a8114SHong Zhang if ((tcol=ssmap[a->j[k]])) { 174e50a8114SHong Zhang *mat_j++ = tcol - 1; 175580bdb30SBarry Smith ierr = PetscArraycpy(mat_a,a->a+k*bs2,bs2);CHKERRQ(ierr); 176e50a8114SHong Zhang mat_a += bs2; 177e50a8114SHong Zhang (*mat_ilen)++; 178e50a8114SHong Zhang } 179e50a8114SHong Zhang } 180e50a8114SHong Zhang } 181e50a8114SHong Zhang /* sort */ 182e50a8114SHong Zhang { 183e50a8114SHong Zhang MatScalar *work; 184e50a8114SHong Zhang 185e50a8114SHong Zhang ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr); 186e50a8114SHong Zhang for (i=0; i<nrows; i++) { 187e50a8114SHong Zhang PetscInt ilen; 188e50a8114SHong Zhang mat_i = c->i[i]; 189e50a8114SHong Zhang mat_j = c->j + mat_i; 190e50a8114SHong Zhang mat_a = c->a + mat_i*bs2; 191e50a8114SHong Zhang ilen = c->ilen[i]; 192e50a8114SHong Zhang ierr = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr); 193e50a8114SHong Zhang } 194e50a8114SHong Zhang ierr = PetscFree(work);CHKERRQ(ierr); 195e50a8114SHong Zhang } 196e50a8114SHong Zhang 197e50a8114SHong Zhang /* Free work space */ 198e50a8114SHong Zhang ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 199e50a8114SHong Zhang ierr = PetscFree(smap);CHKERRQ(ierr); 200e50a8114SHong Zhang ierr = PetscFree(lens);CHKERRQ(ierr); 201e50a8114SHong Zhang ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 202e50a8114SHong Zhang ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 203e50a8114SHong Zhang 204e50a8114SHong Zhang ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 205e50a8114SHong Zhang *B = C; 206e50a8114SHong Zhang PetscFunctionReturn(0); 207e50a8114SHong Zhang } 208e50a8114SHong Zhang 2097dae84e0SHong Zhang PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 21049b5e25fSSatish Balay { 211e50a8114SHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 212e50a8114SHong Zhang IS is1,is2; 2136849ba73SBarry Smith PetscErrorCode ierr; 214e50a8114SHong Zhang PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs; 215e50a8114SHong Zhang const PetscInt *irow,*icol; 21649b5e25fSSatish Balay 21749b5e25fSSatish Balay PetscFunctionBegin; 218e50a8114SHong Zhang ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 219e50a8114SHong Zhang ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 220e50a8114SHong Zhang ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 221e50a8114SHong Zhang ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 222e50a8114SHong Zhang 223e50a8114SHong Zhang /* Verify if the indices corespond to each element in a block 224e50a8114SHong Zhang and form the IS with compressed IS */ 225e50a8114SHong Zhang maxmnbs = PetscMax(a->mbs,a->nbs); 226e50a8114SHong Zhang ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr); 227580bdb30SBarry Smith ierr = PetscArrayzero(vary,a->mbs);CHKERRQ(ierr); 228e50a8114SHong Zhang for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 229e50a8114SHong Zhang for (i=0; i<a->mbs; i++) { 230e50a8114SHong Zhang if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 231e50a8114SHong Zhang } 232e50a8114SHong Zhang count = 0; 233e50a8114SHong Zhang for (i=0; i<nrows; i++) { 234e50a8114SHong Zhang PetscInt j = irow[i] / bs; 235e50a8114SHong Zhang if ((vary[j]--)==bs) iary[count++] = j; 236e50a8114SHong Zhang } 237e50a8114SHong Zhang ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr); 238e50a8114SHong Zhang 239580bdb30SBarry Smith ierr = PetscArrayzero(vary,a->nbs);CHKERRQ(ierr); 240e50a8114SHong Zhang for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 241e50a8114SHong Zhang for (i=0; i<a->nbs; i++) { 242e50a8114SHong Zhang if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc"); 243e50a8114SHong Zhang } 244e50a8114SHong Zhang count = 0; 245e50a8114SHong Zhang for (i=0; i<ncols; i++) { 246e50a8114SHong Zhang PetscInt j = icol[i] / bs; 247e50a8114SHong Zhang if ((vary[j]--)==bs) iary[count++] = j; 248e50a8114SHong Zhang } 249e50a8114SHong Zhang ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr); 250e50a8114SHong Zhang ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 251e50a8114SHong Zhang ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 252e50a8114SHong Zhang ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 253e50a8114SHong Zhang 2547dae84e0SHong Zhang ierr = MatCreateSubMatrix_SeqSBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr); 255e50a8114SHong Zhang ierr = ISDestroy(&is1);CHKERRQ(ierr); 256e50a8114SHong Zhang ierr = ISDestroy(&is2);CHKERRQ(ierr); 257847daeccSHong Zhang 2588f46ffcaSHong Zhang if (isrow != iscol) { 2598f46ffcaSHong Zhang PetscBool isequal; 2608f46ffcaSHong Zhang ierr = ISEqual(isrow,iscol,&isequal);CHKERRQ(ierr); 261847daeccSHong Zhang if (!isequal) { 262847daeccSHong Zhang ierr = MatSeqSBAIJZeroOps_Private(*B);CHKERRQ(ierr); 2638f46ffcaSHong Zhang } 26449b5e25fSSatish Balay } 26549b5e25fSSatish Balay PetscFunctionReturn(0); 26649b5e25fSSatish Balay } 26749b5e25fSSatish Balay 2687dae84e0SHong Zhang PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 26949b5e25fSSatish Balay { 2706849ba73SBarry Smith PetscErrorCode ierr; 27113f74950SBarry Smith PetscInt i; 27249b5e25fSSatish Balay 27349b5e25fSSatish Balay PetscFunctionBegin; 274e50a8114SHong Zhang if (scall == MAT_INITIAL_MATRIX) { 275df750dc8SHong Zhang ierr = PetscCalloc1(n+1,B);CHKERRQ(ierr); 276afebec48SHong Zhang } 277e50a8114SHong Zhang 278e50a8114SHong Zhang for (i=0; i<n; i++) { 2797dae84e0SHong Zhang ierr = MatCreateSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 28049b5e25fSSatish Balay } 28149b5e25fSSatish Balay PetscFunctionReturn(0); 28249b5e25fSSatish Balay } 28349b5e25fSSatish Balay 28449b5e25fSSatish Balay /* -------------------------------------------------------*/ 28549b5e25fSSatish Balay /* Should check that shapes of vectors and matrices match */ 28649b5e25fSSatish Balay /* -------------------------------------------------------*/ 28749b5e25fSSatish Balay 288dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz) 28949b5e25fSSatish Balay { 29049b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 291d9ca1df4SBarry Smith PetscScalar *z,x1,x2,zero=0.0; 292d9ca1df4SBarry Smith const PetscScalar *x,*xb; 293d9ca1df4SBarry Smith const MatScalar *v; 2946849ba73SBarry Smith PetscErrorCode ierr; 295d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 296d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 29798c9bda7SSatish Balay PetscInt nonzerorow=0; 29849b5e25fSSatish Balay 29949b5e25fSSatish Balay PetscFunctionBegin; 3002dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 301c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 302d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 3031ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 30449b5e25fSSatish Balay 30549b5e25fSSatish Balay v = a->a; 30649b5e25fSSatish Balay xb = x; 30749b5e25fSSatish Balay 30849b5e25fSSatish Balay for (i=0; i<mbs; i++) { 30949b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 31049b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; 31149b5e25fSSatish Balay ib = aj + *ai; 312831a3094SHong Zhang jmin = 0; 31398c9bda7SSatish Balay nonzerorow += (n>0); 3147fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 31549b5e25fSSatish Balay z[2*i] += v[0]*x1 + v[2]*x2; 31649b5e25fSSatish Balay z[2*i+1] += v[2]*x1 + v[3]*x2; 317831a3094SHong Zhang v += 4; jmin++; 3187fbae186SHong Zhang } 319444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 320444d8c10SJed Brown PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 321831a3094SHong Zhang for (j=jmin; j<n; j++) { 32249b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 32349b5e25fSSatish Balay cval = ib[j]*2; 32449b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2; 32549b5e25fSSatish Balay z[cval+1] += v[2]*x1 + v[3]*x2; 32649b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 32749b5e25fSSatish Balay z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 32849b5e25fSSatish Balay z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 32949b5e25fSSatish Balay v += 4; 33049b5e25fSSatish Balay } 33149b5e25fSSatish Balay xb +=2; ai++; 33249b5e25fSSatish Balay } 33349b5e25fSSatish Balay 334d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 3351ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 336dc0b31edSSatish Balay ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 33749b5e25fSSatish Balay PetscFunctionReturn(0); 33849b5e25fSSatish Balay } 33949b5e25fSSatish Balay 340dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz) 34149b5e25fSSatish Balay { 34249b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 343d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,zero=0.0; 344d9ca1df4SBarry Smith const PetscScalar *x,*xb; 345d9ca1df4SBarry Smith const MatScalar *v; 3466849ba73SBarry Smith PetscErrorCode ierr; 347d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 348d9ca1df4SBarry Smith const PetscInt *aj = a->j,*ai = a->i,*ib; 34998c9bda7SSatish Balay PetscInt nonzerorow=0; 35049b5e25fSSatish Balay 35149b5e25fSSatish Balay PetscFunctionBegin; 3522dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 353c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 354d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 3551ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 35649b5e25fSSatish Balay 35749b5e25fSSatish Balay v = a->a; 35849b5e25fSSatish Balay xb = x; 35949b5e25fSSatish Balay 36049b5e25fSSatish Balay for (i=0; i<mbs; i++) { 36149b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 36249b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 36349b5e25fSSatish Balay ib = aj + *ai; 364831a3094SHong Zhang jmin = 0; 36598c9bda7SSatish Balay nonzerorow += (n>0); 3667fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 36749b5e25fSSatish Balay z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 36849b5e25fSSatish Balay z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 36949b5e25fSSatish Balay z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 370831a3094SHong Zhang v += 9; jmin++; 3717fbae186SHong Zhang } 372444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 373444d8c10SJed Brown PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 374831a3094SHong Zhang for (j=jmin; j<n; j++) { 37549b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 37649b5e25fSSatish Balay cval = ib[j]*3; 37749b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 37849b5e25fSSatish Balay z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 37949b5e25fSSatish Balay z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 38049b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 38149b5e25fSSatish Balay z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 38249b5e25fSSatish Balay z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 38349b5e25fSSatish Balay z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 38449b5e25fSSatish Balay v += 9; 38549b5e25fSSatish Balay } 38649b5e25fSSatish Balay xb +=3; ai++; 38749b5e25fSSatish Balay } 38849b5e25fSSatish Balay 389d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 3901ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 391dc0b31edSSatish Balay ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 39249b5e25fSSatish Balay PetscFunctionReturn(0); 39349b5e25fSSatish Balay } 39449b5e25fSSatish Balay 395dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz) 39649b5e25fSSatish Balay { 39749b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 398d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,zero=0.0; 399d9ca1df4SBarry Smith const PetscScalar *x,*xb; 400d9ca1df4SBarry Smith const MatScalar *v; 4016849ba73SBarry Smith PetscErrorCode ierr; 402d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 403d9ca1df4SBarry Smith const PetscInt *aj = a->j,*ai = a->i,*ib; 40498c9bda7SSatish Balay PetscInt nonzerorow = 0; 40549b5e25fSSatish Balay 40649b5e25fSSatish Balay PetscFunctionBegin; 4072dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 408c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 409d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 4101ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 41149b5e25fSSatish Balay 41249b5e25fSSatish Balay v = a->a; 41349b5e25fSSatish Balay xb = x; 41449b5e25fSSatish Balay 41549b5e25fSSatish Balay for (i=0; i<mbs; i++) { 41649b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 41749b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 41849b5e25fSSatish Balay ib = aj + *ai; 419831a3094SHong Zhang jmin = 0; 42098c9bda7SSatish Balay nonzerorow += (n>0); 4217fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 42249b5e25fSSatish Balay z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 42349b5e25fSSatish Balay z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 42449b5e25fSSatish Balay z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 42549b5e25fSSatish Balay z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 426831a3094SHong Zhang v += 16; jmin++; 4277fbae186SHong Zhang } 428444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 429444d8c10SJed Brown PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 430831a3094SHong Zhang for (j=jmin; j<n; j++) { 43149b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 43249b5e25fSSatish Balay cval = ib[j]*4; 43349b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 43449b5e25fSSatish Balay z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 43549b5e25fSSatish Balay z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 43649b5e25fSSatish Balay z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 43749b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 43849b5e25fSSatish Balay z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 43949b5e25fSSatish Balay z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 44049b5e25fSSatish Balay z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 44149b5e25fSSatish Balay z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 44249b5e25fSSatish Balay v += 16; 44349b5e25fSSatish Balay } 44449b5e25fSSatish Balay xb +=4; ai++; 44549b5e25fSSatish Balay } 44649b5e25fSSatish Balay 447d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 4481ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 449dc0b31edSSatish Balay ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 45049b5e25fSSatish Balay PetscFunctionReturn(0); 45149b5e25fSSatish Balay } 45249b5e25fSSatish Balay 453dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz) 45449b5e25fSSatish Balay { 45549b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 456d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,x5,zero=0.0; 457d9ca1df4SBarry Smith const PetscScalar *x,*xb; 458d9ca1df4SBarry Smith const MatScalar *v; 4596849ba73SBarry Smith PetscErrorCode ierr; 460d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 461d9ca1df4SBarry Smith const PetscInt *aj = a->j,*ai = a->i,*ib; 46298c9bda7SSatish Balay PetscInt nonzerorow=0; 46349b5e25fSSatish Balay 46449b5e25fSSatish Balay PetscFunctionBegin; 4652dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 466c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 467d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 4681ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 46949b5e25fSSatish Balay 47049b5e25fSSatish Balay v = a->a; 47149b5e25fSSatish Balay xb = x; 47249b5e25fSSatish Balay 47349b5e25fSSatish Balay for (i=0; i<mbs; i++) { 47449b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 47549b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 47649b5e25fSSatish Balay ib = aj + *ai; 477831a3094SHong Zhang jmin = 0; 47898c9bda7SSatish Balay nonzerorow += (n>0); 4797fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 48049b5e25fSSatish Balay z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 48149b5e25fSSatish Balay z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 48249b5e25fSSatish Balay z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 48349b5e25fSSatish Balay z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 48449b5e25fSSatish Balay z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 485831a3094SHong Zhang v += 25; jmin++; 4867fbae186SHong Zhang } 487444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 488444d8c10SJed Brown PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 489831a3094SHong Zhang for (j=jmin; j<n; j++) { 49049b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 49149b5e25fSSatish Balay cval = ib[j]*5; 49249b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 49349b5e25fSSatish Balay z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 49449b5e25fSSatish Balay z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 49549b5e25fSSatish Balay z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 49649b5e25fSSatish Balay z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 49749b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 49849b5e25fSSatish Balay z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 49949b5e25fSSatish Balay z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 50049b5e25fSSatish Balay z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 50149b5e25fSSatish Balay z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 50249b5e25fSSatish Balay z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 50349b5e25fSSatish Balay v += 25; 50449b5e25fSSatish Balay } 50549b5e25fSSatish Balay xb +=5; ai++; 50649b5e25fSSatish Balay } 50749b5e25fSSatish Balay 508d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 5091ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 510dc0b31edSSatish Balay ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 51149b5e25fSSatish Balay PetscFunctionReturn(0); 51249b5e25fSSatish Balay } 51349b5e25fSSatish Balay 514dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz) 51549b5e25fSSatish Balay { 51649b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 517d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,x5,x6,zero=0.0; 518d9ca1df4SBarry Smith const PetscScalar *x,*xb; 519d9ca1df4SBarry Smith const MatScalar *v; 5206849ba73SBarry Smith PetscErrorCode ierr; 521d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 522d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 52398c9bda7SSatish Balay PetscInt nonzerorow=0; 52449b5e25fSSatish Balay 52549b5e25fSSatish Balay PetscFunctionBegin; 5262dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 527c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 528d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 5291ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 53049b5e25fSSatish Balay 53149b5e25fSSatish Balay v = a->a; 53249b5e25fSSatish Balay xb = x; 53349b5e25fSSatish Balay 53449b5e25fSSatish Balay for (i=0; i<mbs; i++) { 53549b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 53649b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 53749b5e25fSSatish Balay ib = aj + *ai; 538831a3094SHong Zhang jmin = 0; 53998c9bda7SSatish Balay nonzerorow += (n>0); 5407fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 54149b5e25fSSatish Balay z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 54249b5e25fSSatish Balay z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 54349b5e25fSSatish Balay z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 54449b5e25fSSatish Balay z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 54549b5e25fSSatish Balay z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 54649b5e25fSSatish Balay z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 547831a3094SHong Zhang v += 36; jmin++; 5487fbae186SHong Zhang } 549444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 550444d8c10SJed Brown PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 551831a3094SHong Zhang for (j=jmin; j<n; j++) { 55249b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 55349b5e25fSSatish Balay cval = ib[j]*6; 55449b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 55549b5e25fSSatish Balay z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 55649b5e25fSSatish Balay z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 55749b5e25fSSatish Balay z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 55849b5e25fSSatish Balay z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 55949b5e25fSSatish Balay z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 56049b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 56149b5e25fSSatish Balay z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 56249b5e25fSSatish Balay z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 56349b5e25fSSatish Balay z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 56449b5e25fSSatish Balay z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 56549b5e25fSSatish Balay z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 56649b5e25fSSatish Balay z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 56749b5e25fSSatish Balay v += 36; 56849b5e25fSSatish Balay } 56949b5e25fSSatish Balay xb +=6; ai++; 57049b5e25fSSatish Balay } 57149b5e25fSSatish Balay 572d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 5731ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 574dc0b31edSSatish Balay ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 57549b5e25fSSatish Balay PetscFunctionReturn(0); 57649b5e25fSSatish Balay } 577c2916339SPierre Jolivet 578dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz) 57949b5e25fSSatish Balay { 58049b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 581d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0; 582d9ca1df4SBarry Smith const PetscScalar *x,*xb; 583d9ca1df4SBarry Smith const MatScalar *v; 5846849ba73SBarry Smith PetscErrorCode ierr; 585d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 586d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 58798c9bda7SSatish Balay PetscInt nonzerorow=0; 58849b5e25fSSatish Balay 58949b5e25fSSatish Balay PetscFunctionBegin; 5902dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 591c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 592d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 5931ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 59449b5e25fSSatish Balay 59549b5e25fSSatish Balay v = a->a; 59649b5e25fSSatish Balay xb = x; 59749b5e25fSSatish Balay 59849b5e25fSSatish Balay for (i=0; i<mbs; i++) { 59949b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 60049b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 60149b5e25fSSatish Balay ib = aj + *ai; 602831a3094SHong Zhang jmin = 0; 60398c9bda7SSatish Balay nonzerorow += (n>0); 6047fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 60549b5e25fSSatish Balay z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 60649b5e25fSSatish Balay z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 60749b5e25fSSatish Balay z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 60849b5e25fSSatish Balay z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 60949b5e25fSSatish Balay z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 61049b5e25fSSatish Balay z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 61149b5e25fSSatish Balay z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 612831a3094SHong Zhang v += 49; jmin++; 6137fbae186SHong Zhang } 614444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 615444d8c10SJed Brown PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 616831a3094SHong Zhang for (j=jmin; j<n; j++) { 61749b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 61849b5e25fSSatish Balay cval = ib[j]*7; 61949b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 62049b5e25fSSatish Balay z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 62149b5e25fSSatish Balay z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 62249b5e25fSSatish Balay z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 62349b5e25fSSatish Balay z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 62449b5e25fSSatish Balay z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 62549b5e25fSSatish Balay z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 62649b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 62749b5e25fSSatish Balay z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 62849b5e25fSSatish Balay z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 62949b5e25fSSatish Balay z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 63049b5e25fSSatish Balay z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 63149b5e25fSSatish Balay z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 63249b5e25fSSatish Balay z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 63349b5e25fSSatish Balay z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 63449b5e25fSSatish Balay v += 49; 63549b5e25fSSatish Balay } 63649b5e25fSSatish Balay xb +=7; ai++; 63749b5e25fSSatish Balay } 638d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 6391ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 640dc0b31edSSatish Balay ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 64149b5e25fSSatish Balay PetscFunctionReturn(0); 64249b5e25fSSatish Balay } 64349b5e25fSSatish Balay 64449b5e25fSSatish Balay /* 64549b5e25fSSatish Balay This will not work with MatScalar == float because it calls the BLAS 64649b5e25fSSatish Balay */ 647dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz) 64849b5e25fSSatish Balay { 64949b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 650d9ca1df4SBarry Smith PetscScalar *z,*z_ptr,*zb,*work,*workt,zero=0.0; 651d9ca1df4SBarry Smith const PetscScalar *x,*x_ptr,*xb; 652d9ca1df4SBarry Smith const MatScalar *v; 653dfbe8321SBarry Smith PetscErrorCode ierr; 654d9ca1df4SBarry Smith PetscInt mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 655d9ca1df4SBarry Smith const PetscInt *idx,*aj,*ii; 65698c9bda7SSatish Balay PetscInt nonzerorow=0; 65749b5e25fSSatish Balay 65849b5e25fSSatish Balay PetscFunctionBegin; 6592dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 660c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 661d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);x_ptr = x; 6621ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 66349b5e25fSSatish Balay 66449b5e25fSSatish Balay aj = a->j; 66549b5e25fSSatish Balay v = a->a; 66649b5e25fSSatish Balay ii = a->i; 66749b5e25fSSatish Balay 66849b5e25fSSatish Balay if (!a->mult_work) { 669854ce69bSBarry Smith ierr = PetscMalloc1(A->rmap->N+1,&a->mult_work);CHKERRQ(ierr); 67049b5e25fSSatish Balay } 67149b5e25fSSatish Balay work = a->mult_work; 67249b5e25fSSatish Balay 67349b5e25fSSatish Balay for (i=0; i<mbs; i++) { 67449b5e25fSSatish Balay n = ii[1] - ii[0]; ncols = n*bs; 67549b5e25fSSatish Balay workt = work; idx=aj+ii[0]; 67698c9bda7SSatish Balay nonzerorow += (n>0); 67749b5e25fSSatish Balay 67849b5e25fSSatish Balay /* upper triangular part */ 67949b5e25fSSatish Balay for (j=0; j<n; j++) { 68049b5e25fSSatish Balay xb = x_ptr + bs*(*idx++); 68149b5e25fSSatish Balay for (k=0; k<bs; k++) workt[k] = xb[k]; 68249b5e25fSSatish Balay workt += bs; 68349b5e25fSSatish Balay } 68449b5e25fSSatish Balay /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 68596b95a6bSBarry Smith PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 68649b5e25fSSatish Balay 68749b5e25fSSatish Balay /* strict lower triangular part */ 688831a3094SHong Zhang idx = aj+ii[0]; 689831a3094SHong Zhang if (*idx == i) { 69096b9376eSHong Zhang ncols -= bs; v += bs2; idx++; n--; 691831a3094SHong Zhang } 69296b9376eSHong Zhang 69349b5e25fSSatish Balay if (ncols > 0) { 69449b5e25fSSatish Balay workt = work; 695580bdb30SBarry Smith ierr = PetscArrayzero(workt,ncols);CHKERRQ(ierr); 69696b95a6bSBarry Smith PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 697831a3094SHong Zhang for (j=0; j<n; j++) { 698831a3094SHong Zhang zb = z_ptr + bs*(*idx++); 69949b5e25fSSatish Balay for (k=0; k<bs; k++) zb[k] += workt[k]; 70049b5e25fSSatish Balay workt += bs; 70149b5e25fSSatish Balay } 70249b5e25fSSatish Balay } 70349b5e25fSSatish Balay x += bs; v += n*bs2; z += bs; ii++; 70449b5e25fSSatish Balay } 70549b5e25fSSatish Balay 706d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 7071ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 708dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr); 70949b5e25fSSatish Balay PetscFunctionReturn(0); 71049b5e25fSSatish Balay } 71149b5e25fSSatish Balay 712dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 71349b5e25fSSatish Balay { 71449b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 715d9ca1df4SBarry Smith PetscScalar *z,x1; 716d9ca1df4SBarry Smith const PetscScalar *x,*xb; 717d9ca1df4SBarry Smith const MatScalar *v; 7186849ba73SBarry Smith PetscErrorCode ierr; 719d9ca1df4SBarry Smith PetscInt mbs =a->mbs,i,n,cval,j,jmin; 720d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 72198c9bda7SSatish Balay PetscInt nonzerorow=0; 722eb1ec7c1SStefano Zampini #if defined(PETSC_USE_COMPLEX) 723eb1ec7c1SStefano Zampini const int aconj = A->hermitian; 724eb1ec7c1SStefano Zampini #else 725eb1ec7c1SStefano Zampini const int aconj = 0; 726eb1ec7c1SStefano Zampini #endif 72749b5e25fSSatish Balay 72849b5e25fSSatish Balay PetscFunctionBegin; 729343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 730c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 731d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 7321ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 73349b5e25fSSatish Balay v = a->a; 73449b5e25fSSatish Balay xb = x; 73549b5e25fSSatish Balay 73649b5e25fSSatish Balay for (i=0; i<mbs; i++) { 73749b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th row of A */ 73849b5e25fSSatish Balay x1 = xb[0]; 73949b5e25fSSatish Balay ib = aj + *ai; 740831a3094SHong Zhang jmin = 0; 74198c9bda7SSatish Balay nonzerorow += (n>0); 7423d9ade75SStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 743831a3094SHong Zhang z[i] += *v++ * x[*ib++]; jmin++; 744831a3094SHong Zhang } 745eb1ec7c1SStefano Zampini if (aconj) { 746eb1ec7c1SStefano Zampini for (j=jmin; j<n; j++) { 747eb1ec7c1SStefano Zampini cval = *ib; 748eb1ec7c1SStefano Zampini z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x */ 749eb1ec7c1SStefano Zampini z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 750eb1ec7c1SStefano Zampini } 751eb1ec7c1SStefano Zampini } else { 752831a3094SHong Zhang for (j=jmin; j<n; j++) { 75349b5e25fSSatish Balay cval = *ib; 75449b5e25fSSatish Balay z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 75549b5e25fSSatish Balay z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 75649b5e25fSSatish Balay } 757eb1ec7c1SStefano Zampini } 75849b5e25fSSatish Balay xb++; ai++; 75949b5e25fSSatish Balay } 76049b5e25fSSatish Balay 761d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 762bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 76349b5e25fSSatish Balay 764dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 76549b5e25fSSatish Balay PetscFunctionReturn(0); 76649b5e25fSSatish Balay } 76749b5e25fSSatish Balay 768dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 76949b5e25fSSatish Balay { 77049b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 771d9ca1df4SBarry Smith PetscScalar *z,x1,x2; 772d9ca1df4SBarry Smith const PetscScalar *x,*xb; 773d9ca1df4SBarry Smith const MatScalar *v; 7746849ba73SBarry Smith PetscErrorCode ierr; 775d9ca1df4SBarry Smith PetscInt mbs =a->mbs,i,n,cval,j,jmin; 776d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 77798c9bda7SSatish Balay PetscInt nonzerorow=0; 77849b5e25fSSatish Balay 77949b5e25fSSatish Balay PetscFunctionBegin; 780343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 781c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 782d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 7831ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 78449b5e25fSSatish Balay 78549b5e25fSSatish Balay v = a->a; 78649b5e25fSSatish Balay xb = x; 78749b5e25fSSatish Balay 78849b5e25fSSatish Balay for (i=0; i<mbs; i++) { 78949b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 79049b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; 79149b5e25fSSatish Balay ib = aj + *ai; 792831a3094SHong Zhang jmin = 0; 79398c9bda7SSatish Balay nonzerorow += (n>0); 7947fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 79549b5e25fSSatish Balay z[2*i] += v[0]*x1 + v[2]*x2; 79649b5e25fSSatish Balay z[2*i+1] += v[2]*x1 + v[3]*x2; 797831a3094SHong Zhang v += 4; jmin++; 7987fbae186SHong Zhang } 799444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 800444d8c10SJed Brown PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 801831a3094SHong Zhang for (j=jmin; j<n; j++) { 80249b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 80349b5e25fSSatish Balay cval = ib[j]*2; 80449b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2; 80549b5e25fSSatish Balay z[cval+1] += v[2]*x1 + v[3]*x2; 80649b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 80749b5e25fSSatish Balay z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 80849b5e25fSSatish Balay z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 80949b5e25fSSatish Balay v += 4; 81049b5e25fSSatish Balay } 81149b5e25fSSatish Balay xb +=2; ai++; 81249b5e25fSSatish Balay } 813d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 814bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 81549b5e25fSSatish Balay 816dc0b31edSSatish Balay ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 81749b5e25fSSatish Balay PetscFunctionReturn(0); 81849b5e25fSSatish Balay } 81949b5e25fSSatish Balay 820dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 82149b5e25fSSatish Balay { 82249b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 823d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3; 824d9ca1df4SBarry Smith const PetscScalar *x,*xb; 825d9ca1df4SBarry Smith const MatScalar *v; 8266849ba73SBarry Smith PetscErrorCode ierr; 827d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 828d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 82998c9bda7SSatish Balay PetscInt nonzerorow=0; 83049b5e25fSSatish Balay 83149b5e25fSSatish Balay PetscFunctionBegin; 832343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 833c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 834d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 8351ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 83649b5e25fSSatish Balay 83749b5e25fSSatish Balay v = a->a; 83849b5e25fSSatish Balay xb = x; 83949b5e25fSSatish Balay 84049b5e25fSSatish Balay for (i=0; i<mbs; i++) { 84149b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 84249b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 84349b5e25fSSatish Balay ib = aj + *ai; 844831a3094SHong Zhang jmin = 0; 84598c9bda7SSatish Balay nonzerorow += (n>0); 8467fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 84749b5e25fSSatish Balay z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 84849b5e25fSSatish Balay z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 84949b5e25fSSatish Balay z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 850831a3094SHong Zhang v += 9; jmin++; 8517fbae186SHong Zhang } 852444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 853444d8c10SJed Brown PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 854831a3094SHong Zhang for (j=jmin; j<n; j++) { 85549b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 85649b5e25fSSatish Balay cval = ib[j]*3; 85749b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 85849b5e25fSSatish Balay z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 85949b5e25fSSatish Balay z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 86049b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 86149b5e25fSSatish Balay z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 86249b5e25fSSatish Balay z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 86349b5e25fSSatish Balay z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 86449b5e25fSSatish Balay v += 9; 86549b5e25fSSatish Balay } 86649b5e25fSSatish Balay xb +=3; ai++; 86749b5e25fSSatish Balay } 86849b5e25fSSatish Balay 869d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 870bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 87149b5e25fSSatish Balay 872dc0b31edSSatish Balay ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 87349b5e25fSSatish Balay PetscFunctionReturn(0); 87449b5e25fSSatish Balay } 87549b5e25fSSatish Balay 876dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 87749b5e25fSSatish Balay { 87849b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 879d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4; 880d9ca1df4SBarry Smith const PetscScalar *x,*xb; 881d9ca1df4SBarry Smith const MatScalar *v; 8826849ba73SBarry Smith PetscErrorCode ierr; 883d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 884d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 88598c9bda7SSatish Balay PetscInt nonzerorow=0; 88649b5e25fSSatish Balay 88749b5e25fSSatish Balay PetscFunctionBegin; 888343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 889c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 890d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 8911ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 89249b5e25fSSatish Balay 89349b5e25fSSatish Balay v = a->a; 89449b5e25fSSatish Balay xb = x; 89549b5e25fSSatish Balay 89649b5e25fSSatish Balay for (i=0; i<mbs; i++) { 89749b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 89849b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 89949b5e25fSSatish Balay ib = aj + *ai; 900831a3094SHong Zhang jmin = 0; 90198c9bda7SSatish Balay nonzerorow += (n>0); 9027fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 90349b5e25fSSatish Balay z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 90449b5e25fSSatish Balay z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 90549b5e25fSSatish Balay z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 90649b5e25fSSatish Balay z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 907831a3094SHong Zhang v += 16; jmin++; 9087fbae186SHong Zhang } 909444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 910444d8c10SJed Brown PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 911831a3094SHong Zhang for (j=jmin; j<n; j++) { 91249b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 91349b5e25fSSatish Balay cval = ib[j]*4; 91449b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 91549b5e25fSSatish Balay z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 91649b5e25fSSatish Balay z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 91749b5e25fSSatish Balay z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 91849b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 91949b5e25fSSatish Balay z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 92049b5e25fSSatish Balay z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 92149b5e25fSSatish Balay z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 92249b5e25fSSatish Balay z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 92349b5e25fSSatish Balay v += 16; 92449b5e25fSSatish Balay } 92549b5e25fSSatish Balay xb +=4; ai++; 92649b5e25fSSatish Balay } 92749b5e25fSSatish Balay 928d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 929bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 93049b5e25fSSatish Balay 931dc0b31edSSatish Balay ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 93249b5e25fSSatish Balay PetscFunctionReturn(0); 93349b5e25fSSatish Balay } 93449b5e25fSSatish Balay 935dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 93649b5e25fSSatish Balay { 93749b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 938d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,x5; 939d9ca1df4SBarry Smith const PetscScalar *x,*xb; 940d9ca1df4SBarry Smith const MatScalar *v; 9416849ba73SBarry Smith PetscErrorCode ierr; 942d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 943d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 94498c9bda7SSatish Balay PetscInt nonzerorow=0; 94549b5e25fSSatish Balay 94649b5e25fSSatish Balay PetscFunctionBegin; 947343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 948c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 949d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 9501ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 95149b5e25fSSatish Balay 95249b5e25fSSatish Balay v = a->a; 95349b5e25fSSatish Balay xb = x; 95449b5e25fSSatish Balay 95549b5e25fSSatish Balay for (i=0; i<mbs; i++) { 95649b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 95749b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 95849b5e25fSSatish Balay ib = aj + *ai; 959831a3094SHong Zhang jmin = 0; 96098c9bda7SSatish Balay nonzerorow += (n>0); 9617fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 96249b5e25fSSatish Balay z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 96349b5e25fSSatish Balay z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 96449b5e25fSSatish Balay z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 96549b5e25fSSatish Balay z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 96649b5e25fSSatish Balay z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 967831a3094SHong Zhang v += 25; jmin++; 9687fbae186SHong Zhang } 969444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 970444d8c10SJed Brown PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 971831a3094SHong Zhang for (j=jmin; j<n; j++) { 97249b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 97349b5e25fSSatish Balay cval = ib[j]*5; 97449b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 97549b5e25fSSatish Balay z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 97649b5e25fSSatish Balay z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 97749b5e25fSSatish Balay z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 97849b5e25fSSatish Balay z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 97949b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 98049b5e25fSSatish Balay z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 98149b5e25fSSatish Balay z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 98249b5e25fSSatish Balay z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 98349b5e25fSSatish Balay z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 98449b5e25fSSatish Balay z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 98549b5e25fSSatish Balay v += 25; 98649b5e25fSSatish Balay } 98749b5e25fSSatish Balay xb +=5; ai++; 98849b5e25fSSatish Balay } 98949b5e25fSSatish Balay 990d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 991bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 99249b5e25fSSatish Balay 993dc0b31edSSatish Balay ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 99449b5e25fSSatish Balay PetscFunctionReturn(0); 99549b5e25fSSatish Balay } 996c2916339SPierre Jolivet 997dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 99849b5e25fSSatish Balay { 99949b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1000d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,x5,x6; 1001d9ca1df4SBarry Smith const PetscScalar *x,*xb; 1002d9ca1df4SBarry Smith const MatScalar *v; 10036849ba73SBarry Smith PetscErrorCode ierr; 1004d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 1005d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 100698c9bda7SSatish Balay PetscInt nonzerorow=0; 100749b5e25fSSatish Balay 100849b5e25fSSatish Balay PetscFunctionBegin; 1009343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1010c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 1011d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 10121ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 101349b5e25fSSatish Balay 101449b5e25fSSatish Balay v = a->a; 101549b5e25fSSatish Balay xb = x; 101649b5e25fSSatish Balay 101749b5e25fSSatish Balay for (i=0; i<mbs; i++) { 101849b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 101949b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 102049b5e25fSSatish Balay ib = aj + *ai; 1021831a3094SHong Zhang jmin = 0; 102298c9bda7SSatish Balay nonzerorow += (n>0); 10237fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 102449b5e25fSSatish Balay z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 102549b5e25fSSatish Balay z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 102649b5e25fSSatish Balay z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 102749b5e25fSSatish Balay z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 102849b5e25fSSatish Balay z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 102949b5e25fSSatish Balay z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 1030831a3094SHong Zhang v += 36; jmin++; 10317fbae186SHong Zhang } 1032444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1033444d8c10SJed Brown PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1034831a3094SHong Zhang for (j=jmin; j<n; j++) { 103549b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 103649b5e25fSSatish Balay cval = ib[j]*6; 103749b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 103849b5e25fSSatish Balay z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 103949b5e25fSSatish Balay z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 104049b5e25fSSatish Balay z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 104149b5e25fSSatish Balay z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 104249b5e25fSSatish Balay z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 104349b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 104449b5e25fSSatish Balay z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 104549b5e25fSSatish Balay z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 104649b5e25fSSatish Balay z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 104749b5e25fSSatish Balay z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 104849b5e25fSSatish Balay z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 104949b5e25fSSatish Balay z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 105049b5e25fSSatish Balay v += 36; 105149b5e25fSSatish Balay } 105249b5e25fSSatish Balay xb +=6; ai++; 105349b5e25fSSatish Balay } 105449b5e25fSSatish Balay 1055d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1056bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 105749b5e25fSSatish Balay 1058dc0b31edSSatish Balay ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 105949b5e25fSSatish Balay PetscFunctionReturn(0); 106049b5e25fSSatish Balay } 106149b5e25fSSatish Balay 1062dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 106349b5e25fSSatish Balay { 106449b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1065d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,x5,x6,x7; 1066d9ca1df4SBarry Smith const PetscScalar *x,*xb; 1067d9ca1df4SBarry Smith const MatScalar *v; 10686849ba73SBarry Smith PetscErrorCode ierr; 1069d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 1070d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 107198c9bda7SSatish Balay PetscInt nonzerorow=0; 107249b5e25fSSatish Balay 107349b5e25fSSatish Balay PetscFunctionBegin; 1074343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1075c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 1076d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 10771ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 107849b5e25fSSatish Balay 107949b5e25fSSatish Balay v = a->a; 108049b5e25fSSatish Balay xb = x; 108149b5e25fSSatish Balay 108249b5e25fSSatish Balay for (i=0; i<mbs; i++) { 108349b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 108449b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 108549b5e25fSSatish Balay ib = aj + *ai; 1086831a3094SHong Zhang jmin = 0; 108798c9bda7SSatish Balay nonzerorow += (n>0); 10887fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 108949b5e25fSSatish Balay z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 109049b5e25fSSatish Balay z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 109149b5e25fSSatish Balay z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 109249b5e25fSSatish Balay z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 109349b5e25fSSatish Balay z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 109449b5e25fSSatish Balay z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 109549b5e25fSSatish Balay z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1096831a3094SHong Zhang v += 49; jmin++; 10977fbae186SHong Zhang } 1098444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1099444d8c10SJed Brown PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1100831a3094SHong Zhang for (j=jmin; j<n; j++) { 110149b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 110249b5e25fSSatish Balay cval = ib[j]*7; 110349b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 110449b5e25fSSatish Balay z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 110549b5e25fSSatish Balay z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 110649b5e25fSSatish Balay z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 110749b5e25fSSatish Balay z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 110849b5e25fSSatish Balay z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 110949b5e25fSSatish Balay z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 111049b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 111149b5e25fSSatish Balay z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 111249b5e25fSSatish Balay z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 111349b5e25fSSatish Balay z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 111449b5e25fSSatish Balay z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 111549b5e25fSSatish Balay z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 111649b5e25fSSatish Balay z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 111749b5e25fSSatish Balay z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 111849b5e25fSSatish Balay v += 49; 111949b5e25fSSatish Balay } 112049b5e25fSSatish Balay xb +=7; ai++; 112149b5e25fSSatish Balay } 112249b5e25fSSatish Balay 1123d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1124bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 112549b5e25fSSatish Balay 1126dc0b31edSSatish Balay ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 112749b5e25fSSatish Balay PetscFunctionReturn(0); 112849b5e25fSSatish Balay } 112949b5e25fSSatish Balay 1130dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 113149b5e25fSSatish Balay { 113249b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1133d9ca1df4SBarry Smith PetscScalar *z,*z_ptr=0,*zb,*work,*workt; 1134d9ca1df4SBarry Smith const PetscScalar *x,*x_ptr,*xb; 1135d9ca1df4SBarry Smith const MatScalar *v; 1136dfbe8321SBarry Smith PetscErrorCode ierr; 1137d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 1138d9ca1df4SBarry Smith const PetscInt *idx,*aj,*ii; 113998c9bda7SSatish Balay PetscInt nonzerorow=0; 114049b5e25fSSatish Balay 114149b5e25fSSatish Balay PetscFunctionBegin; 1142343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1143c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 1144d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); x_ptr=x; 11451ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 114649b5e25fSSatish Balay 114749b5e25fSSatish Balay aj = a->j; 114849b5e25fSSatish Balay v = a->a; 114949b5e25fSSatish Balay ii = a->i; 115049b5e25fSSatish Balay 115149b5e25fSSatish Balay if (!a->mult_work) { 1152854ce69bSBarry Smith ierr = PetscMalloc1(A->rmap->n+1,&a->mult_work);CHKERRQ(ierr); 115349b5e25fSSatish Balay } 115449b5e25fSSatish Balay work = a->mult_work; 115549b5e25fSSatish Balay 115649b5e25fSSatish Balay 115749b5e25fSSatish Balay for (i=0; i<mbs; i++) { 115849b5e25fSSatish Balay n = ii[1] - ii[0]; ncols = n*bs; 115949b5e25fSSatish Balay workt = work; idx=aj+ii[0]; 116098c9bda7SSatish Balay nonzerorow += (n>0); 116149b5e25fSSatish Balay 116249b5e25fSSatish Balay /* upper triangular part */ 116349b5e25fSSatish Balay for (j=0; j<n; j++) { 116449b5e25fSSatish Balay xb = x_ptr + bs*(*idx++); 116549b5e25fSSatish Balay for (k=0; k<bs; k++) workt[k] = xb[k]; 116649b5e25fSSatish Balay workt += bs; 116749b5e25fSSatish Balay } 116849b5e25fSSatish Balay /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 116996b95a6bSBarry Smith PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 117049b5e25fSSatish Balay 117149b5e25fSSatish Balay /* strict lower triangular part */ 1172831a3094SHong Zhang idx = aj+ii[0]; 1173831a3094SHong Zhang if (*idx == i) { 117496b9376eSHong Zhang ncols -= bs; v += bs2; idx++; n--; 1175831a3094SHong Zhang } 117649b5e25fSSatish Balay if (ncols > 0) { 117749b5e25fSSatish Balay workt = work; 1178580bdb30SBarry Smith ierr = PetscArrayzero(workt,ncols);CHKERRQ(ierr); 117996b95a6bSBarry Smith PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 1180831a3094SHong Zhang for (j=0; j<n; j++) { 1181831a3094SHong Zhang zb = z_ptr + bs*(*idx++); 118249b5e25fSSatish Balay for (k=0; k<bs; k++) zb[k] += workt[k]; 118349b5e25fSSatish Balay workt += bs; 118449b5e25fSSatish Balay } 118549b5e25fSSatish Balay } 118649b5e25fSSatish Balay 118749b5e25fSSatish Balay x += bs; v += n*bs2; z += bs; ii++; 118849b5e25fSSatish Balay } 118949b5e25fSSatish Balay 1190d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1191bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 119249b5e25fSSatish Balay 1193dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 119449b5e25fSSatish Balay PetscFunctionReturn(0); 119549b5e25fSSatish Balay } 119649b5e25fSSatish Balay 1197f4df32b1SMatthew Knepley PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha) 119849b5e25fSSatish Balay { 119949b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data; 1200f4df32b1SMatthew Knepley PetscScalar oalpha = alpha; 1201efee365bSSatish Balay PetscErrorCode ierr; 1202c5df96a5SBarry Smith PetscBLASInt one = 1,totalnz; 120349b5e25fSSatish Balay 120449b5e25fSSatish Balay PetscFunctionBegin; 1205c5df96a5SBarry Smith ierr = PetscBLASIntCast(a->bs2*a->nz,&totalnz);CHKERRQ(ierr); 12068b83055fSJed Brown PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one)); 1207efee365bSSatish Balay ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 120849b5e25fSSatish Balay PetscFunctionReturn(0); 120949b5e25fSSatish Balay } 121049b5e25fSSatish Balay 1211dfbe8321SBarry Smith PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm) 121249b5e25fSSatish Balay { 121349b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1214d9ca1df4SBarry Smith const MatScalar *v = a->a; 121549b5e25fSSatish Balay PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1216d9ca1df4SBarry Smith PetscInt i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il; 12176849ba73SBarry Smith PetscErrorCode ierr; 1218d9ca1df4SBarry Smith const PetscInt *aj=a->j,*col; 121949b5e25fSSatish Balay 122049b5e25fSSatish Balay PetscFunctionBegin; 1221c40ae873SPierre Jolivet if (!a->nz) { 1222c40ae873SPierre Jolivet *norm = 0.0; 1223c40ae873SPierre Jolivet PetscFunctionReturn(0); 1224c40ae873SPierre Jolivet } 122549b5e25fSSatish Balay if (type == NORM_FROBENIUS) { 122649b5e25fSSatish Balay for (k=0; k<mbs; k++) { 122749b5e25fSSatish Balay jmin = a->i[k]; jmax = a->i[k+1]; 1228831a3094SHong Zhang col = aj + jmin; 1229831a3094SHong Zhang if (*col == k) { /* diagonal block */ 123049b5e25fSSatish Balay for (i=0; i<bs2; i++) { 123149b5e25fSSatish Balay sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++; 123249b5e25fSSatish Balay } 1233831a3094SHong Zhang jmin++; 1234831a3094SHong Zhang } 1235831a3094SHong Zhang for (j=jmin; j<jmax; j++) { /* off-diagonal blocks */ 123649b5e25fSSatish Balay for (i=0; i<bs2; i++) { 123749b5e25fSSatish Balay sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++; 123849b5e25fSSatish Balay } 123949b5e25fSSatish Balay } 124049b5e25fSSatish Balay } 12418f1a2a5eSBarry Smith *norm = PetscSqrtReal(sum_diag + 2*sum_off); 124251f70360SJed Brown ierr = PetscLogFlops(2*bs2*a->nz);CHKERRQ(ierr); 12430b8dc8d2SHong Zhang } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 1244dcca6d9dSJed Brown ierr = PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);CHKERRQ(ierr); 12450b8dc8d2SHong Zhang for (i=0; i<mbs; i++) jl[i] = mbs; 12460b8dc8d2SHong Zhang il[0] = 0; 124749b5e25fSSatish Balay 124849b5e25fSSatish Balay *norm = 0.0; 124949b5e25fSSatish Balay for (k=0; k<mbs; k++) { /* k_th block row */ 125049b5e25fSSatish Balay for (j=0; j<bs; j++) sum[j]=0.0; 125149b5e25fSSatish Balay /*-- col sum --*/ 125249b5e25fSSatish Balay i = jl[k]; /* first |A(i,k)| to be added */ 1253831a3094SHong Zhang /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1254831a3094SHong Zhang at step k */ 125549b5e25fSSatish Balay while (i<mbs) { 125649b5e25fSSatish Balay nexti = jl[i]; /* next block row to be added */ 125749b5e25fSSatish Balay ik = il[i]; /* block index of A(i,k) in the array a */ 125849b5e25fSSatish Balay for (j=0; j<bs; j++) { 125949b5e25fSSatish Balay v = a->a + ik*bs2 + j*bs; 126049b5e25fSSatish Balay for (k1=0; k1<bs; k1++) { 126149b5e25fSSatish Balay sum[j] += PetscAbsScalar(*v); v++; 126249b5e25fSSatish Balay } 126349b5e25fSSatish Balay } 126449b5e25fSSatish Balay /* update il, jl */ 1265831a3094SHong Zhang jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1266831a3094SHong Zhang jmax = a->i[i+1]; 126749b5e25fSSatish Balay if (jmin < jmax) { 126849b5e25fSSatish Balay il[i] = jmin; 126949b5e25fSSatish Balay j = a->j[jmin]; 127049b5e25fSSatish Balay jl[i] = jl[j]; jl[j]=i; 127149b5e25fSSatish Balay } 127249b5e25fSSatish Balay i = nexti; 127349b5e25fSSatish Balay } 127449b5e25fSSatish Balay /*-- row sum --*/ 127549b5e25fSSatish Balay jmin = a->i[k]; jmax = a->i[k+1]; 127649b5e25fSSatish Balay for (i=jmin; i<jmax; i++) { 127749b5e25fSSatish Balay for (j=0; j<bs; j++) { 127849b5e25fSSatish Balay v = a->a + i*bs2 + j; 127949b5e25fSSatish Balay for (k1=0; k1<bs; k1++) { 12800b8dc8d2SHong Zhang sum[j] += PetscAbsScalar(*v); v += bs; 128149b5e25fSSatish Balay } 128249b5e25fSSatish Balay } 128349b5e25fSSatish Balay } 128449b5e25fSSatish Balay /* add k_th block row to il, jl */ 1285831a3094SHong Zhang col = aj+jmin; 1286831a3094SHong Zhang if (*col == k) jmin++; 128749b5e25fSSatish Balay if (jmin < jmax) { 128849b5e25fSSatish Balay il[k] = jmin; 12890b8dc8d2SHong Zhang j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k; 129049b5e25fSSatish Balay } 129149b5e25fSSatish Balay for (j=0; j<bs; j++) { 129249b5e25fSSatish Balay if (sum[j] > *norm) *norm = sum[j]; 129349b5e25fSSatish Balay } 129449b5e25fSSatish Balay } 129574ed9c26SBarry Smith ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr); 129651f70360SJed Brown ierr = PetscLogFlops(PetscMax(mbs*a->nz-1,0));CHKERRQ(ierr); 1297f23aa3ddSBarry Smith } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 129849b5e25fSSatish Balay PetscFunctionReturn(0); 129949b5e25fSSatish Balay } 130049b5e25fSSatish Balay 1301ace3abfcSBarry Smith PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg) 130249b5e25fSSatish Balay { 130349b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data; 1304dfbe8321SBarry Smith PetscErrorCode ierr; 130549b5e25fSSatish Balay 130649b5e25fSSatish Balay PetscFunctionBegin; 130749b5e25fSSatish Balay /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1308d0f46423SBarry Smith if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1309ef511fbeSHong Zhang *flg = PETSC_FALSE; 1310ef511fbeSHong Zhang PetscFunctionReturn(0); 131149b5e25fSSatish Balay } 131249b5e25fSSatish Balay 131349b5e25fSSatish Balay /* if the a->i are the same */ 1314580bdb30SBarry Smith ierr = PetscArraycmp(a->i,b->i,a->mbs+1,flg);CHKERRQ(ierr); 131526fbe8dcSKarl Rupp if (!*flg) PetscFunctionReturn(0); 131649b5e25fSSatish Balay 131749b5e25fSSatish Balay /* if a->j are the same */ 1318580bdb30SBarry Smith ierr = PetscArraycmp(a->j,b->j,a->nz,flg);CHKERRQ(ierr); 131926fbe8dcSKarl Rupp if (!*flg) PetscFunctionReturn(0); 132026fbe8dcSKarl Rupp 132149b5e25fSSatish Balay /* if a->a are the same */ 1322580bdb30SBarry Smith ierr = PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs),flg);CHKERRQ(ierr); 1323935af2e7SHong Zhang PetscFunctionReturn(0); 132449b5e25fSSatish Balay } 132549b5e25fSSatish Balay 1326dfbe8321SBarry Smith PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v) 132749b5e25fSSatish Balay { 132849b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1329dfbe8321SBarry Smith PetscErrorCode ierr; 1330d9ca1df4SBarry Smith PetscInt i,j,k,row,bs,ambs,bs2; 1331d9ca1df4SBarry Smith const PetscInt *ai,*aj; 133287828ca2SBarry Smith PetscScalar *x,zero = 0.0; 1333d9ca1df4SBarry Smith const MatScalar *aa,*aa_j; 133449b5e25fSSatish Balay 133549b5e25fSSatish Balay PetscFunctionBegin; 1336d0f46423SBarry Smith bs = A->rmap->bs; 1337e32f2f54SBarry Smith if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1"); 133882799104SHong Zhang 133949b5e25fSSatish Balay aa = a->a; 13408a0c6efdSHong Zhang ambs = a->mbs; 13418a0c6efdSHong Zhang 13428a0c6efdSHong Zhang if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) { 13438a0c6efdSHong Zhang PetscInt *diag=a->diag; 13448a0c6efdSHong Zhang aa = a->a; 13458a0c6efdSHong Zhang ambs = a->mbs; 13468a0c6efdSHong Zhang ierr = VecGetArray(v,&x);CHKERRQ(ierr); 13478a0c6efdSHong Zhang for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]]; 13488a0c6efdSHong Zhang ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 13498a0c6efdSHong Zhang PetscFunctionReturn(0); 13508a0c6efdSHong Zhang } 13518a0c6efdSHong Zhang 135249b5e25fSSatish Balay ai = a->i; 135349b5e25fSSatish Balay aj = a->j; 135449b5e25fSSatish Balay bs2 = a->bs2; 13552dcb1b2aSMatthew Knepley ierr = VecSet(v,zero);CHKERRQ(ierr); 1356c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 13571ebc52fbSHong Zhang ierr = VecGetArray(v,&x);CHKERRQ(ierr); 135849b5e25fSSatish Balay for (i=0; i<ambs; i++) { 135949b5e25fSSatish Balay j=ai[i]; 136049b5e25fSSatish Balay if (aj[j] == i) { /* if this is a diagonal element */ 136149b5e25fSSatish Balay row = i*bs; 136249b5e25fSSatish Balay aa_j = aa + j*bs2; 136349b5e25fSSatish Balay for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 136449b5e25fSSatish Balay } 136549b5e25fSSatish Balay } 13661ebc52fbSHong Zhang ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 136749b5e25fSSatish Balay PetscFunctionReturn(0); 136849b5e25fSSatish Balay } 136949b5e25fSSatish Balay 1370dfbe8321SBarry Smith PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr) 137149b5e25fSSatish Balay { 137249b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1373d9ca1df4SBarry Smith PetscScalar x; 1374d9ca1df4SBarry Smith const PetscScalar *l,*li,*ri; 137549b5e25fSSatish Balay MatScalar *aa,*v; 1376dfbe8321SBarry Smith PetscErrorCode ierr; 1377fff8e43fSBarry Smith PetscInt i,j,k,lm,M,m,mbs,tmp,bs,bs2; 1378fff8e43fSBarry Smith const PetscInt *ai,*aj; 1379ace3abfcSBarry Smith PetscBool flg; 138049b5e25fSSatish Balay 138149b5e25fSSatish Balay PetscFunctionBegin; 1382b3bf805bSHong Zhang if (ll != rr) { 1383b3bf805bSHong Zhang ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr); 1384e7e72b3dSBarry Smith if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n"); 1385b3bf805bSHong Zhang } 1386b3bf805bSHong Zhang if (!ll) PetscFunctionReturn(0); 138749b5e25fSSatish Balay ai = a->i; 138849b5e25fSSatish Balay aj = a->j; 138949b5e25fSSatish Balay aa = a->a; 1390d0f46423SBarry Smith m = A->rmap->N; 1391d0f46423SBarry Smith bs = A->rmap->bs; 139249b5e25fSSatish Balay mbs = a->mbs; 139349b5e25fSSatish Balay bs2 = a->bs2; 139449b5e25fSSatish Balay 1395d9ca1df4SBarry Smith ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr); 139649b5e25fSSatish Balay ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1397e32f2f54SBarry Smith if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 139849b5e25fSSatish Balay for (i=0; i<mbs; i++) { /* for each block row */ 139949b5e25fSSatish Balay M = ai[i+1] - ai[i]; 140049b5e25fSSatish Balay li = l + i*bs; 140149b5e25fSSatish Balay v = aa + bs2*ai[i]; 140249b5e25fSSatish Balay for (j=0; j<M; j++) { /* for each block */ 140349b5e25fSSatish Balay ri = l + bs*aj[ai[i]+j]; 14045e90f9d9SHong Zhang for (k=0; k<bs; k++) { 140549b5e25fSSatish Balay x = ri[k]; 140649b5e25fSSatish Balay for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x; 140749b5e25fSSatish Balay } 140849b5e25fSSatish Balay } 140949b5e25fSSatish Balay } 1410d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr); 1411dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 141249b5e25fSSatish Balay PetscFunctionReturn(0); 141349b5e25fSSatish Balay } 141449b5e25fSSatish Balay 1415dfbe8321SBarry Smith PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info) 141649b5e25fSSatish Balay { 141749b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 141849b5e25fSSatish Balay 141949b5e25fSSatish Balay PetscFunctionBegin; 142049b5e25fSSatish Balay info->block_size = a->bs2; 1421ceed8ce5SJed Brown info->nz_allocated = a->bs2*a->maxnz; /*num. of nonzeros in upper triangular part */ 14226c6c5352SBarry Smith info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */ 14233966268fSBarry Smith info->nz_unneeded = info->nz_allocated - info->nz_used; 142449b5e25fSSatish Balay info->assemblies = A->num_ass; 14258e58a170SBarry Smith info->mallocs = A->info.mallocs; 14267adad957SLisandro Dalcin info->memory = ((PetscObject)A)->mem; 1427d5f3da31SBarry Smith if (A->factortype) { 142849b5e25fSSatish Balay info->fill_ratio_given = A->info.fill_ratio_given; 142949b5e25fSSatish Balay info->fill_ratio_needed = A->info.fill_ratio_needed; 143049b5e25fSSatish Balay info->factor_mallocs = A->info.factor_mallocs; 143149b5e25fSSatish Balay } else { 143249b5e25fSSatish Balay info->fill_ratio_given = 0; 143349b5e25fSSatish Balay info->fill_ratio_needed = 0; 143449b5e25fSSatish Balay info->factor_mallocs = 0; 143549b5e25fSSatish Balay } 143649b5e25fSSatish Balay PetscFunctionReturn(0); 143749b5e25fSSatish Balay } 143849b5e25fSSatish Balay 1439dfbe8321SBarry Smith PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 144049b5e25fSSatish Balay { 144149b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1442dfbe8321SBarry Smith PetscErrorCode ierr; 144349b5e25fSSatish Balay 144449b5e25fSSatish Balay PetscFunctionBegin; 1445580bdb30SBarry Smith ierr = PetscArrayzero(a->a,a->bs2*a->i[a->mbs]);CHKERRQ(ierr); 144649b5e25fSSatish Balay PetscFunctionReturn(0); 144749b5e25fSSatish Balay } 1448dc354874SHong Zhang 1449985db425SBarry Smith /* 1450985db425SBarry Smith This code does not work since it only checks the upper triangular part of 1451985db425SBarry Smith the matrix. Hence it is not listed in the function table. 1452985db425SBarry Smith */ 1453985db425SBarry Smith PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[]) 1454dc354874SHong Zhang { 1455dc354874SHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1456dfbe8321SBarry Smith PetscErrorCode ierr; 1457d9ca1df4SBarry Smith PetscInt i,j,n,row,col,bs,mbs; 1458d9ca1df4SBarry Smith const PetscInt *ai,*aj; 1459c3fca9a7SHong Zhang PetscReal atmp; 1460d9ca1df4SBarry Smith const MatScalar *aa; 1461985db425SBarry Smith PetscScalar *x; 146213f74950SBarry Smith PetscInt ncols,brow,bcol,krow,kcol; 1463dc354874SHong Zhang 1464dc354874SHong Zhang PetscFunctionBegin; 1465e32f2f54SBarry Smith if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov"); 1466e32f2f54SBarry Smith if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1467d0f46423SBarry Smith bs = A->rmap->bs; 1468dc354874SHong Zhang aa = a->a; 1469dc354874SHong Zhang ai = a->i; 1470dc354874SHong Zhang aj = a->j; 147144117c81SHong Zhang mbs = a->mbs; 1472dc354874SHong Zhang 1473985db425SBarry Smith ierr = VecSet(v,0.0);CHKERRQ(ierr); 14741ebc52fbSHong Zhang ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1475dc354874SHong Zhang ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1476e32f2f54SBarry Smith if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 147744117c81SHong Zhang for (i=0; i<mbs; i++) { 1478d0f6400bSHong Zhang ncols = ai[1] - ai[0]; ai++; 1479d0f6400bSHong Zhang brow = bs*i; 148044117c81SHong Zhang for (j=0; j<ncols; j++) { 1481d0f6400bSHong Zhang bcol = bs*(*aj); 148244117c81SHong Zhang for (kcol=0; kcol<bs; kcol++) { 1483d0f6400bSHong Zhang col = bcol + kcol; /* col index */ 148444117c81SHong Zhang for (krow=0; krow<bs; krow++) { 1485d0f6400bSHong Zhang atmp = PetscAbsScalar(*aa); aa++; 1486d0f6400bSHong Zhang row = brow + krow; /* row index */ 1487c3fca9a7SHong Zhang if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1488c3fca9a7SHong Zhang if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 148944117c81SHong Zhang } 149044117c81SHong Zhang } 1491d0f6400bSHong Zhang aj++; 1492dc354874SHong Zhang } 1493dc354874SHong Zhang } 14941ebc52fbSHong Zhang ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1495dc354874SHong Zhang PetscFunctionReturn(0); 1496dc354874SHong Zhang } 1497c2916339SPierre Jolivet 14984222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C) 1499c2916339SPierre Jolivet { 1500c2916339SPierre Jolivet PetscErrorCode ierr; 1501c2916339SPierre Jolivet 1502c2916339SPierre Jolivet PetscFunctionBegin; 1503c2916339SPierre Jolivet ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr); 15044222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense; 1505c2916339SPierre Jolivet PetscFunctionReturn(0); 1506c2916339SPierre Jolivet } 1507c2916339SPierre Jolivet 1508c2916339SPierre Jolivet PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 1509c2916339SPierre Jolivet { 1510c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1511c2916339SPierre Jolivet PetscScalar *z = c; 1512c2916339SPierre Jolivet const PetscScalar *xb; 1513c2916339SPierre Jolivet PetscScalar x1; 1514c2916339SPierre Jolivet const MatScalar *v = a->a,*vv; 1515c2916339SPierre Jolivet PetscInt mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k; 1516*3c2e41e1SStefano Zampini #if defined(PETSC_USE_COMPLEX) 1517*3c2e41e1SStefano Zampini const int aconj = A->hermitian; 1518*3c2e41e1SStefano Zampini #else 1519*3c2e41e1SStefano Zampini const int aconj = 0; 1520*3c2e41e1SStefano Zampini #endif 1521c2916339SPierre Jolivet 1522c2916339SPierre Jolivet PetscFunctionBegin; 1523c2916339SPierre Jolivet for (i=0; i<mbs; i++) { 1524c2916339SPierre Jolivet n = ii[1] - ii[0]; ii++; 1525c2916339SPierre Jolivet PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1526c2916339SPierre Jolivet PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1527c2916339SPierre Jolivet jj = idx; 1528c2916339SPierre Jolivet vv = v; 1529c2916339SPierre Jolivet for (k=0; k<cn; k++) { 1530c2916339SPierre Jolivet idx = jj; 1531c2916339SPierre Jolivet v = vv; 1532c2916339SPierre Jolivet for (j=0; j<n; j++) { 1533c2916339SPierre Jolivet xb = b + (*idx); x1 = xb[0+k*bm]; 1534c2916339SPierre Jolivet z[0+k*cm] += v[0]*x1; 1535*3c2e41e1SStefano Zampini if (*idx != i) c[(*idx)+k*cm] += (aconj ? PetscConj(v[0]) : v[0])*b[i+k*bm]; 1536c2916339SPierre Jolivet v += 1; 1537c2916339SPierre Jolivet ++idx; 1538c2916339SPierre Jolivet } 1539c2916339SPierre Jolivet } 1540c2916339SPierre Jolivet z += 1; 1541c2916339SPierre Jolivet } 1542c2916339SPierre Jolivet PetscFunctionReturn(0); 1543c2916339SPierre Jolivet } 1544c2916339SPierre Jolivet 1545c2916339SPierre Jolivet PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 1546c2916339SPierre Jolivet { 1547c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1548c2916339SPierre Jolivet PetscScalar *z = c; 1549c2916339SPierre Jolivet const PetscScalar *xb; 1550c2916339SPierre Jolivet PetscScalar x1,x2; 1551c2916339SPierre Jolivet const MatScalar *v = a->a,*vv; 1552c2916339SPierre Jolivet PetscInt mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k; 1553c2916339SPierre Jolivet 1554c2916339SPierre Jolivet PetscFunctionBegin; 1555c2916339SPierre Jolivet for (i=0; i<mbs; i++) { 1556c2916339SPierre Jolivet n = ii[1] - ii[0]; ii++; 1557c2916339SPierre Jolivet PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1558c2916339SPierre Jolivet PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1559c2916339SPierre Jolivet jj = idx; 1560c2916339SPierre Jolivet vv = v; 1561c2916339SPierre Jolivet for (k=0; k<cn; k++) { 1562c2916339SPierre Jolivet idx = jj; 1563c2916339SPierre Jolivet v = vv; 1564c2916339SPierre Jolivet for (j=0; j<n; j++) { 1565c2916339SPierre Jolivet xb = b + 2*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; 1566c2916339SPierre Jolivet z[0+k*cm] += v[0]*x1 + v[2]*x2; 1567c2916339SPierre Jolivet z[1+k*cm] += v[1]*x1 + v[3]*x2; 1568c2916339SPierre Jolivet if (*idx != i) { 1569c2916339SPierre Jolivet c[2*(*idx)+0+k*cm] += v[0]*b[2*i+k*bm] + v[1]*b[2*i+1+k*bm]; 1570c2916339SPierre Jolivet c[2*(*idx)+1+k*cm] += v[2]*b[2*i+k*bm] + v[3]*b[2*i+1+k*bm]; 1571c2916339SPierre Jolivet } 1572c2916339SPierre Jolivet v += 4; 1573c2916339SPierre Jolivet ++idx; 1574c2916339SPierre Jolivet } 1575c2916339SPierre Jolivet } 1576c2916339SPierre Jolivet z += 2; 1577c2916339SPierre Jolivet } 1578c2916339SPierre Jolivet PetscFunctionReturn(0); 1579c2916339SPierre Jolivet } 1580c2916339SPierre Jolivet 1581c2916339SPierre Jolivet PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 1582c2916339SPierre Jolivet { 1583c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1584c2916339SPierre Jolivet PetscScalar *z = c; 1585c2916339SPierre Jolivet const PetscScalar *xb; 1586c2916339SPierre Jolivet PetscScalar x1,x2,x3; 1587c2916339SPierre Jolivet const MatScalar *v = a->a,*vv; 1588c2916339SPierre Jolivet PetscInt mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k; 1589c2916339SPierre Jolivet 1590c2916339SPierre Jolivet PetscFunctionBegin; 1591c2916339SPierre Jolivet for (i=0; i<mbs; i++) { 1592c2916339SPierre Jolivet n = ii[1] - ii[0]; ii++; 1593c2916339SPierre Jolivet PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1594c2916339SPierre Jolivet PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1595c2916339SPierre Jolivet jj = idx; 1596c2916339SPierre Jolivet vv = v; 1597c2916339SPierre Jolivet for (k=0; k<cn; k++) { 1598c2916339SPierre Jolivet idx = jj; 1599c2916339SPierre Jolivet v = vv; 1600c2916339SPierre Jolivet for (j=0; j<n; j++) { 1601c2916339SPierre Jolivet xb = b + 3*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; 1602c2916339SPierre Jolivet z[0+k*cm] += v[0]*x1 + v[3]*x2 + v[6]*x3; 1603c2916339SPierre Jolivet z[1+k*cm] += v[1]*x1 + v[4]*x2 + v[7]*x3; 1604c2916339SPierre Jolivet z[2+k*cm] += v[2]*x1 + v[5]*x2 + v[8]*x3; 1605c2916339SPierre Jolivet if (*idx != i) { 1606c2916339SPierre Jolivet c[3*(*idx)+0+k*cm] += v[0]*b[3*i+k*bm] + v[3]*b[3*i+1+k*bm] + v[6]*b[3*i+2+k*bm]; 1607c2916339SPierre Jolivet c[3*(*idx)+1+k*cm] += v[1]*b[3*i+k*bm] + v[4]*b[3*i+1+k*bm] + v[7]*b[3*i+2+k*bm]; 1608c2916339SPierre Jolivet c[3*(*idx)+2+k*cm] += v[2]*b[3*i+k*bm] + v[5]*b[3*i+1+k*bm] + v[8]*b[3*i+2+k*bm]; 1609c2916339SPierre Jolivet } 1610c2916339SPierre Jolivet v += 9; 1611c2916339SPierre Jolivet ++idx; 1612c2916339SPierre Jolivet } 1613c2916339SPierre Jolivet } 1614c2916339SPierre Jolivet z += 3; 1615c2916339SPierre Jolivet } 1616c2916339SPierre Jolivet PetscFunctionReturn(0); 1617c2916339SPierre Jolivet } 1618c2916339SPierre Jolivet 1619c2916339SPierre Jolivet PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 1620c2916339SPierre Jolivet { 1621c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1622c2916339SPierre Jolivet PetscScalar *z = c; 1623c2916339SPierre Jolivet const PetscScalar *xb; 1624c2916339SPierre Jolivet PetscScalar x1,x2,x3,x4; 1625c2916339SPierre Jolivet const MatScalar *v = a->a,*vv; 1626c2916339SPierre Jolivet PetscInt mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k; 1627c2916339SPierre Jolivet 1628c2916339SPierre Jolivet PetscFunctionBegin; 1629c2916339SPierre Jolivet for (i=0; i<mbs; i++) { 1630c2916339SPierre Jolivet n = ii[1] - ii[0]; ii++; 1631c2916339SPierre Jolivet PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1632c2916339SPierre Jolivet PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1633c2916339SPierre Jolivet jj = idx; 1634c2916339SPierre Jolivet vv = v; 1635c2916339SPierre Jolivet for (k=0; k<cn; k++) { 1636c2916339SPierre Jolivet idx = jj; 1637c2916339SPierre Jolivet v = vv; 1638c2916339SPierre Jolivet for (j=0; j<n; j++) { 1639c2916339SPierre Jolivet xb = b + 4*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; 1640c2916339SPierre Jolivet z[0+k*cm] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1641c2916339SPierre Jolivet z[1+k*cm] += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1642c2916339SPierre Jolivet z[2+k*cm] += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1643c2916339SPierre Jolivet z[3+k*cm] += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1644c2916339SPierre Jolivet if (*idx != i) { 1645c2916339SPierre Jolivet c[4*(*idx)+0+k*cm] += v[0]*b[4*i+k*bm] + v[4]*b[4*i+1+k*bm] + v[8]*b[4*i+2+k*bm] + v[12]*b[4*i+3+k*bm]; 1646c2916339SPierre Jolivet c[4*(*idx)+1+k*cm] += v[1]*b[4*i+k*bm] + v[5]*b[4*i+1+k*bm] + v[9]*b[4*i+2+k*bm] + v[13]*b[4*i+3+k*bm]; 1647c2916339SPierre Jolivet c[4*(*idx)+2+k*cm] += v[2]*b[4*i+k*bm] + v[6]*b[4*i+1+k*bm] + v[10]*b[4*i+2+k*bm] + v[14]*b[4*i+3+k*bm]; 1648c2916339SPierre Jolivet c[4*(*idx)+3+k*cm] += v[3]*b[4*i+k*bm] + v[7]*b[4*i+1+k*bm] + v[11]*b[4*i+2+k*bm] + v[15]*b[4*i+3+k*bm]; 1649c2916339SPierre Jolivet } 1650c2916339SPierre Jolivet v += 16; 1651c2916339SPierre Jolivet ++idx; 1652c2916339SPierre Jolivet } 1653c2916339SPierre Jolivet } 1654c2916339SPierre Jolivet z += 4; 1655c2916339SPierre Jolivet } 1656c2916339SPierre Jolivet PetscFunctionReturn(0); 1657c2916339SPierre Jolivet } 1658c2916339SPierre Jolivet 1659c2916339SPierre Jolivet PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 1660c2916339SPierre Jolivet { 1661c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1662c2916339SPierre Jolivet PetscScalar *z = c; 1663c2916339SPierre Jolivet const PetscScalar *xb; 1664c2916339SPierre Jolivet PetscScalar x1,x2,x3,x4,x5; 1665c2916339SPierre Jolivet const MatScalar *v = a->a,*vv; 1666c2916339SPierre Jolivet PetscInt mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k; 1667c2916339SPierre Jolivet 1668c2916339SPierre Jolivet PetscFunctionBegin; 1669c2916339SPierre Jolivet for (i=0; i<mbs; i++) { 1670c2916339SPierre Jolivet n = ii[1] - ii[0]; ii++; 1671c2916339SPierre Jolivet PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1672c2916339SPierre Jolivet PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1673c2916339SPierre Jolivet jj = idx; 1674c2916339SPierre Jolivet vv = v; 1675c2916339SPierre Jolivet for (k=0; k<cn; k++) { 1676c2916339SPierre Jolivet idx = jj; 1677c2916339SPierre Jolivet v = vv; 1678c2916339SPierre Jolivet for (j=0; j<n; j++) { 1679c2916339SPierre Jolivet xb = b + 5*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; x5 = xb[4+k*cm]; 1680c2916339SPierre Jolivet z[0+k*cm] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1681c2916339SPierre Jolivet z[1+k*cm] += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1682c2916339SPierre Jolivet z[2+k*cm] += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1683c2916339SPierre Jolivet z[3+k*cm] += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1684c2916339SPierre Jolivet z[4+k*cm] += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1685c2916339SPierre Jolivet if (*idx != i) { 1686c2916339SPierre Jolivet c[5*(*idx)+0+k*cm] += v[0]*b[5*i+k*bm] + v[5]*b[5*i+1+k*bm] + v[10]*b[5*i+2+k*bm] + v[15]*b[5*i+3+k*bm] + v[20]*b[5*i+4+k*bm]; 1687c2916339SPierre Jolivet c[5*(*idx)+1+k*cm] += v[1]*b[5*i+k*bm] + v[6]*b[5*i+1+k*bm] + v[11]*b[5*i+2+k*bm] + v[16]*b[5*i+3+k*bm] + v[21]*b[5*i+4+k*bm]; 1688c2916339SPierre Jolivet c[5*(*idx)+2+k*cm] += v[2]*b[5*i+k*bm] + v[7]*b[5*i+1+k*bm] + v[12]*b[5*i+2+k*bm] + v[17]*b[5*i+3+k*bm] + v[22]*b[5*i+4+k*bm]; 1689c2916339SPierre Jolivet c[5*(*idx)+3+k*cm] += v[3]*b[5*i+k*bm] + v[8]*b[5*i+1+k*bm] + v[13]*b[5*i+2+k*bm] + v[18]*b[5*i+3+k*bm] + v[23]*b[5*i+4+k*bm]; 1690c2916339SPierre Jolivet c[5*(*idx)+4+k*cm] += v[4]*b[5*i+k*bm] + v[9]*b[5*i+1+k*bm] + v[14]*b[5*i+2+k*bm] + v[19]*b[5*i+3+k*bm] + v[24]*b[5*i+4+k*bm]; 1691c2916339SPierre Jolivet } 1692c2916339SPierre Jolivet v += 25; 1693c2916339SPierre Jolivet ++idx; 1694c2916339SPierre Jolivet } 1695c2916339SPierre Jolivet } 1696c2916339SPierre Jolivet z += 5; 1697c2916339SPierre Jolivet } 1698c2916339SPierre Jolivet PetscFunctionReturn(0); 1699c2916339SPierre Jolivet } 1700c2916339SPierre Jolivet 1701c2916339SPierre Jolivet PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A,Mat B,Mat C) 1702c2916339SPierre Jolivet { 1703c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1704c2916339SPierre Jolivet Mat_SeqDense *bd = (Mat_SeqDense*)B->data; 1705c2916339SPierre Jolivet Mat_SeqDense *cd = (Mat_SeqDense*)B->data; 1706c2916339SPierre Jolivet PetscInt cm=cd->lda,cn=B->cmap->n,bm=bd->lda; 1707c2916339SPierre Jolivet PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 1708c2916339SPierre Jolivet PetscBLASInt bbs,bcn,bbm,bcm; 1709c2916339SPierre Jolivet PetscScalar *z = 0; 1710c2916339SPierre Jolivet PetscScalar *c,*b; 1711c2916339SPierre Jolivet const MatScalar *v; 1712c2916339SPierre Jolivet const PetscInt *idx,*ii; 1713c2916339SPierre Jolivet PetscScalar _DOne=1.0; 1714c2916339SPierre Jolivet PetscErrorCode ierr; 1715c2916339SPierre Jolivet 1716c2916339SPierre Jolivet PetscFunctionBegin; 1717c2916339SPierre Jolivet if (!cm || !cn) PetscFunctionReturn(0); 1718c2916339SPierre Jolivet if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n); 1719c2916339SPierre Jolivet if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n); 1720c2916339SPierre Jolivet if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n); 1721c2916339SPierre Jolivet b = bd->v; 1722c2916339SPierre Jolivet ierr = MatZeroEntries(C);CHKERRQ(ierr); 1723c2916339SPierre Jolivet ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr); 1724c2916339SPierre Jolivet switch (bs) { 1725c2916339SPierre Jolivet case 1: 1726c2916339SPierre Jolivet ierr = MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn); 1727c2916339SPierre Jolivet break; 1728c2916339SPierre Jolivet case 2: 1729c2916339SPierre Jolivet ierr = MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn); 1730c2916339SPierre Jolivet break; 1731c2916339SPierre Jolivet case 3: 1732c2916339SPierre Jolivet ierr = MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn); 1733c2916339SPierre Jolivet break; 1734c2916339SPierre Jolivet case 4: 1735c2916339SPierre Jolivet ierr = MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn); 1736c2916339SPierre Jolivet break; 1737c2916339SPierre Jolivet case 5: 1738c2916339SPierre Jolivet ierr = MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn); 1739c2916339SPierre Jolivet break; 1740c2916339SPierre Jolivet default: /* block sizes larger than 5 by 5 are handled by BLAS */ 1741c2916339SPierre Jolivet ierr = PetscBLASIntCast(bs,&bbs);CHKERRQ(ierr); 1742c2916339SPierre Jolivet ierr = PetscBLASIntCast(cn,&bcn);CHKERRQ(ierr); 1743c2916339SPierre Jolivet ierr = PetscBLASIntCast(bm,&bbm);CHKERRQ(ierr); 1744c2916339SPierre Jolivet ierr = PetscBLASIntCast(cm,&bcm);CHKERRQ(ierr); 1745c2916339SPierre Jolivet idx = a->j; 1746c2916339SPierre Jolivet v = a->a; 1747c2916339SPierre Jolivet mbs = a->mbs; 1748c2916339SPierre Jolivet ii = a->i; 1749c2916339SPierre Jolivet z = c; 1750c2916339SPierre Jolivet for (i=0; i<mbs; i++) { 1751c2916339SPierre Jolivet n = ii[1] - ii[0]; ii++; 1752c2916339SPierre Jolivet for (j=0; j<n; j++) { 1753c2916339SPierre Jolivet if (*idx != i) 1754c2916339SPierre Jolivet PetscStackCallBLAS("BLASgemm",BLASgemm_("T","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*i,&bbm,&_DOne,c+bs*(*idx),&bcm)); 1755c2916339SPierre Jolivet PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm)); 1756c2916339SPierre Jolivet v += bs2; 1757c2916339SPierre Jolivet } 1758c2916339SPierre Jolivet z += bs; 1759c2916339SPierre Jolivet } 1760c2916339SPierre Jolivet } 1761c2916339SPierre Jolivet ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr); 1762c2916339SPierre Jolivet ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1763c2916339SPierre Jolivet ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1764c2916339SPierre Jolivet ierr = PetscLogFlops((2.0*(a->nz*2.0 - a->nonzerorowcnt)*bs2 - a->nonzerorowcnt)*cn);CHKERRQ(ierr); 1765c2916339SPierre Jolivet PetscFunctionReturn(0); 1766c2916339SPierre Jolivet } 1767