149b5e25fSSatish Balay 2c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h> 3c2916339SPierre Jolivet #include <../src/mat/impls/dense/seq/dense.h> 4c2916339SPierre Jolivet #include <../src/mat/impls/sbaij/seq/sbaij.h> 5af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h> 6c6db04a5SJed Brown #include <petscbt.h> 7c6db04a5SJed Brown #include <petscblaslapack.h> 849b5e25fSSatish Balay 913f74950SBarry Smith PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 1049b5e25fSSatish Balay { 115eee224dSHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 126849ba73SBarry Smith PetscErrorCode ierr; 135d0c19d7SBarry Smith PetscInt brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2; 145d0c19d7SBarry Smith const PetscInt *idx; 15db41cccfSHong Zhang PetscBT table_out,table_in; 16d94109b8SHong Zhang 17d94109b8SHong Zhang PetscFunctionBegin; 18*2c71b3e2SJacob Faibussowitsch PetscCheckFalse(ov < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 19d94109b8SHong Zhang mbs = a->mbs; 20d94109b8SHong Zhang ai = a->i; 21d94109b8SHong Zhang aj = a->j; 22d0f46423SBarry Smith bs = A->rmap->bs; 2353b8de81SBarry Smith ierr = PetscBTCreate(mbs,&table_out);CHKERRQ(ierr); 24854ce69bSBarry Smith ierr = PetscMalloc1(mbs+1,&nidx);CHKERRQ(ierr); 25854ce69bSBarry Smith ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr); 2653b8de81SBarry Smith ierr = PetscBTCreate(mbs,&table_in);CHKERRQ(ierr); 27d94109b8SHong Zhang 28d94109b8SHong Zhang for (i=0; i<is_max; i++) { /* for each is */ 29d94109b8SHong Zhang isz = 0; 30db41cccfSHong Zhang ierr = PetscBTMemzero(mbs,table_out);CHKERRQ(ierr); 31d94109b8SHong Zhang 32d94109b8SHong Zhang /* Extract the indices, assume there can be duplicate entries */ 33d94109b8SHong Zhang ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 34d94109b8SHong Zhang ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 35d94109b8SHong Zhang 36db41cccfSHong Zhang /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */ 37dbe03f88SHong Zhang bcol_max = 0; 38d94109b8SHong Zhang for (j=0; j<n; ++j) { 39d94109b8SHong Zhang brow = idx[j]/bs; /* convert the indices into block indices */ 40*2c71b3e2SJacob Faibussowitsch PetscCheckFalse(brow >= mbs,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 41db41cccfSHong Zhang if (!PetscBTLookupSet(table_out,brow)) { 42dbe03f88SHong Zhang nidx[isz++] = brow; 43dbe03f88SHong Zhang if (bcol_max < brow) bcol_max = brow; 44dbe03f88SHong Zhang } 45d94109b8SHong Zhang } 46d94109b8SHong Zhang ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 476bf464f9SBarry Smith ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 48d94109b8SHong Zhang 49d94109b8SHong Zhang k = 0; 50d94109b8SHong Zhang for (j=0; j<ov; j++) { /* for each overlap */ 51db41cccfSHong Zhang /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 52db41cccfSHong Zhang ierr = PetscBTMemzero(mbs,table_in);CHKERRQ(ierr); 53db41cccfSHong Zhang for (l=k; l<isz; l++) { ierr = PetscBTSet(table_in,nidx[l]);CHKERRQ(ierr); } 54d94109b8SHong Zhang 55d94109b8SHong Zhang n = isz; /* length of the updated is[i] */ 56d94109b8SHong Zhang for (brow=0; brow<mbs; brow++) { 57d94109b8SHong Zhang start = ai[brow]; end = ai[brow+1]; 58db41cccfSHong Zhang if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */ 59d94109b8SHong Zhang for (l = start; l<end; l++) { 60d94109b8SHong Zhang bcol = aj[l]; 61d7b97159SHong Zhang if (!PetscBTLookupSet(table_out,bcol)) { 62d7b97159SHong Zhang nidx[isz++] = bcol; 63d7b97159SHong Zhang if (bcol_max < bcol) bcol_max = bcol; 64d7b97159SHong Zhang } 65d94109b8SHong Zhang } 66d94109b8SHong Zhang k++; 67d94109b8SHong Zhang if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 68d94109b8SHong Zhang } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 69d94109b8SHong Zhang for (l = start; l<end; l++) { 70d94109b8SHong Zhang bcol = aj[l]; 71dbe03f88SHong Zhang if (bcol > bcol_max) break; 72db41cccfSHong Zhang if (PetscBTLookup(table_in,bcol)) { 7326fbe8dcSKarl Rupp if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow; 74d94109b8SHong Zhang break; /* for l = start; l<end ; l++) */ 75d94109b8SHong Zhang } 76d94109b8SHong Zhang } 77d94109b8SHong Zhang } 78d94109b8SHong Zhang } 79d94109b8SHong Zhang } /* for each overlap */ 80d94109b8SHong Zhang 81d94109b8SHong Zhang /* expand the Index Set */ 82d94109b8SHong Zhang for (j=0; j<isz; j++) { 8326fbe8dcSKarl Rupp for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k; 84d94109b8SHong Zhang } 8570b3c8c7SBarry Smith ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 86d94109b8SHong Zhang } 8794bacf5dSBarry Smith ierr = PetscBTDestroy(&table_out);CHKERRQ(ierr); 88d94109b8SHong Zhang ierr = PetscFree(nidx);CHKERRQ(ierr); 89d94109b8SHong Zhang ierr = PetscFree(nidx2);CHKERRQ(ierr); 9094bacf5dSBarry Smith ierr = PetscBTDestroy(&table_in);CHKERRQ(ierr); 915eee224dSHong Zhang PetscFunctionReturn(0); 9249b5e25fSSatish Balay } 9349b5e25fSSatish Balay 94847daeccSHong Zhang /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc. 95847daeccSHong Zhang Zero some ops' to avoid invalid usse */ 96847daeccSHong Zhang PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq) 9749b5e25fSSatish Balay { 986849ba73SBarry Smith PetscErrorCode ierr; 9949b5e25fSSatish Balay 10049b5e25fSSatish Balay PetscFunctionBegin; 101847daeccSHong Zhang ierr = MatSetOption(Bseq,MAT_SYMMETRIC,PETSC_FALSE);CHKERRQ(ierr); 102f4259b30SLisandro Dalcin Bseq->ops->mult = NULL; 103f4259b30SLisandro Dalcin Bseq->ops->multadd = NULL; 104f4259b30SLisandro Dalcin Bseq->ops->multtranspose = NULL; 105f4259b30SLisandro Dalcin Bseq->ops->multtransposeadd = NULL; 106f4259b30SLisandro Dalcin Bseq->ops->lufactor = NULL; 107f4259b30SLisandro Dalcin Bseq->ops->choleskyfactor = NULL; 108f4259b30SLisandro Dalcin Bseq->ops->lufactorsymbolic = NULL; 109f4259b30SLisandro Dalcin Bseq->ops->choleskyfactorsymbolic = NULL; 110f4259b30SLisandro Dalcin Bseq->ops->getinertia = NULL; 11149b5e25fSSatish Balay PetscFunctionReturn(0); 11249b5e25fSSatish Balay } 11349b5e25fSSatish Balay 1147dae84e0SHong Zhang /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */ 1157dae84e0SHong Zhang PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 116e50a8114SHong Zhang { 117e50a8114SHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c; 118e50a8114SHong Zhang PetscErrorCode ierr; 119e50a8114SHong Zhang PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 120e50a8114SHong Zhang PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 121e50a8114SHong Zhang const PetscInt *irow,*icol; 122e50a8114SHong Zhang PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 123e50a8114SHong Zhang PetscInt *aj = a->j,*ai = a->i; 124e50a8114SHong Zhang MatScalar *mat_a; 125e50a8114SHong Zhang Mat C; 126e50a8114SHong Zhang PetscBool flag; 127e50a8114SHong Zhang 128e50a8114SHong Zhang PetscFunctionBegin; 129e50a8114SHong Zhang 130e50a8114SHong Zhang ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 131e50a8114SHong Zhang ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 132e50a8114SHong Zhang ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 133e50a8114SHong Zhang ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 134e50a8114SHong Zhang 135e50a8114SHong Zhang ierr = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr); 136e50a8114SHong Zhang ssmap = smap; 137e50a8114SHong Zhang ierr = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr); 138e50a8114SHong Zhang for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 139e50a8114SHong Zhang /* determine lens of each row */ 140e50a8114SHong Zhang for (i=0; i<nrows; i++) { 141e50a8114SHong Zhang kstart = ai[irow[i]]; 142e50a8114SHong Zhang kend = kstart + a->ilen[irow[i]]; 143e50a8114SHong Zhang lens[i] = 0; 144e50a8114SHong Zhang for (k=kstart; k<kend; k++) { 145e50a8114SHong Zhang if (ssmap[aj[k]]) lens[i]++; 146e50a8114SHong Zhang } 147e50a8114SHong Zhang } 148e50a8114SHong Zhang /* Create and fill new matrix */ 149e50a8114SHong Zhang if (scall == MAT_REUSE_MATRIX) { 150e50a8114SHong Zhang c = (Mat_SeqSBAIJ*)((*B)->data); 151e50a8114SHong Zhang 152*2c71b3e2SJacob Faibussowitsch PetscCheckFalse(c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 153580bdb30SBarry Smith ierr = PetscArraycmp(c->ilen,lens,c->mbs,&flag);CHKERRQ(ierr); 154*2c71b3e2SJacob Faibussowitsch PetscCheckFalse(!flag,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 155580bdb30SBarry Smith ierr = PetscArrayzero(c->ilen,c->mbs);CHKERRQ(ierr); 156e50a8114SHong Zhang C = *B; 157e50a8114SHong Zhang } else { 158e50a8114SHong Zhang ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr); 159e50a8114SHong Zhang ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 160e50a8114SHong Zhang ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 161367daffbSBarry Smith ierr = MatSeqSBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr); 162e50a8114SHong Zhang } 163e50a8114SHong Zhang c = (Mat_SeqSBAIJ*)(C->data); 164e50a8114SHong Zhang for (i=0; i<nrows; i++) { 165e50a8114SHong Zhang row = irow[i]; 166e50a8114SHong Zhang kstart = ai[row]; 167e50a8114SHong Zhang kend = kstart + a->ilen[row]; 168e50a8114SHong Zhang mat_i = c->i[i]; 169e50a8114SHong Zhang mat_j = c->j + mat_i; 170e50a8114SHong Zhang mat_a = c->a + mat_i*bs2; 171e50a8114SHong Zhang mat_ilen = c->ilen + i; 172e50a8114SHong Zhang for (k=kstart; k<kend; k++) { 173e50a8114SHong Zhang if ((tcol=ssmap[a->j[k]])) { 174e50a8114SHong Zhang *mat_j++ = tcol - 1; 175580bdb30SBarry Smith ierr = PetscArraycpy(mat_a,a->a+k*bs2,bs2);CHKERRQ(ierr); 176e50a8114SHong Zhang mat_a += bs2; 177e50a8114SHong Zhang (*mat_ilen)++; 178e50a8114SHong Zhang } 179e50a8114SHong Zhang } 180e50a8114SHong Zhang } 181e50a8114SHong Zhang /* sort */ 182e50a8114SHong Zhang { 183e50a8114SHong Zhang MatScalar *work; 184e50a8114SHong Zhang 185e50a8114SHong Zhang ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr); 186e50a8114SHong Zhang for (i=0; i<nrows; i++) { 187e50a8114SHong Zhang PetscInt ilen; 188e50a8114SHong Zhang mat_i = c->i[i]; 189e50a8114SHong Zhang mat_j = c->j + mat_i; 190e50a8114SHong Zhang mat_a = c->a + mat_i*bs2; 191e50a8114SHong Zhang ilen = c->ilen[i]; 192e50a8114SHong Zhang ierr = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr); 193e50a8114SHong Zhang } 194e50a8114SHong Zhang ierr = PetscFree(work);CHKERRQ(ierr); 195e50a8114SHong Zhang } 196e50a8114SHong Zhang 197e50a8114SHong Zhang /* Free work space */ 198e50a8114SHong Zhang ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 199e50a8114SHong Zhang ierr = PetscFree(smap);CHKERRQ(ierr); 200e50a8114SHong Zhang ierr = PetscFree(lens);CHKERRQ(ierr); 201e50a8114SHong Zhang ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 202e50a8114SHong Zhang ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 203e50a8114SHong Zhang 204e50a8114SHong Zhang ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 205e50a8114SHong Zhang *B = C; 206e50a8114SHong Zhang PetscFunctionReturn(0); 207e50a8114SHong Zhang } 208e50a8114SHong Zhang 2097dae84e0SHong Zhang PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 21049b5e25fSSatish Balay { 211e50a8114SHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 212e50a8114SHong Zhang IS is1,is2; 2136849ba73SBarry Smith PetscErrorCode ierr; 214e50a8114SHong Zhang PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs; 215e50a8114SHong Zhang const PetscInt *irow,*icol; 21649b5e25fSSatish Balay 21749b5e25fSSatish Balay PetscFunctionBegin; 218e50a8114SHong Zhang ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 219e50a8114SHong Zhang ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 220e50a8114SHong Zhang ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 221e50a8114SHong Zhang ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 222e50a8114SHong Zhang 223e50a8114SHong Zhang /* Verify if the indices corespond to each element in a block 224e50a8114SHong Zhang and form the IS with compressed IS */ 225e50a8114SHong Zhang maxmnbs = PetscMax(a->mbs,a->nbs); 226e50a8114SHong Zhang ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr); 227580bdb30SBarry Smith ierr = PetscArrayzero(vary,a->mbs);CHKERRQ(ierr); 228e50a8114SHong Zhang for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 229e50a8114SHong Zhang for (i=0; i<a->mbs; i++) { 230*2c71b3e2SJacob Faibussowitsch PetscCheckFalse(vary[i]!=0 && vary[i]!=bs,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 231e50a8114SHong Zhang } 232e50a8114SHong Zhang count = 0; 233e50a8114SHong Zhang for (i=0; i<nrows; i++) { 234e50a8114SHong Zhang PetscInt j = irow[i] / bs; 235e50a8114SHong Zhang if ((vary[j]--)==bs) iary[count++] = j; 236e50a8114SHong Zhang } 237e50a8114SHong Zhang ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr); 238e50a8114SHong Zhang 239580bdb30SBarry Smith ierr = PetscArrayzero(vary,a->nbs);CHKERRQ(ierr); 240e50a8114SHong Zhang for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 241e50a8114SHong Zhang for (i=0; i<a->nbs; i++) { 242*2c71b3e2SJacob Faibussowitsch PetscCheckFalse(vary[i]!=0 && vary[i]!=bs,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc"); 243e50a8114SHong Zhang } 244e50a8114SHong Zhang count = 0; 245e50a8114SHong Zhang for (i=0; i<ncols; i++) { 246e50a8114SHong Zhang PetscInt j = icol[i] / bs; 247e50a8114SHong Zhang if ((vary[j]--)==bs) iary[count++] = j; 248e50a8114SHong Zhang } 249e50a8114SHong Zhang ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr); 250e50a8114SHong Zhang ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 251e50a8114SHong Zhang ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 252e50a8114SHong Zhang ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 253e50a8114SHong Zhang 2547dae84e0SHong Zhang ierr = MatCreateSubMatrix_SeqSBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr); 255e50a8114SHong Zhang ierr = ISDestroy(&is1);CHKERRQ(ierr); 256e50a8114SHong Zhang ierr = ISDestroy(&is2);CHKERRQ(ierr); 257847daeccSHong Zhang 2588f46ffcaSHong Zhang if (isrow != iscol) { 2598f46ffcaSHong Zhang PetscBool isequal; 2608f46ffcaSHong Zhang ierr = ISEqual(isrow,iscol,&isequal);CHKERRQ(ierr); 261847daeccSHong Zhang if (!isequal) { 262847daeccSHong Zhang ierr = MatSeqSBAIJZeroOps_Private(*B);CHKERRQ(ierr); 2638f46ffcaSHong Zhang } 26449b5e25fSSatish Balay } 26549b5e25fSSatish Balay PetscFunctionReturn(0); 26649b5e25fSSatish Balay } 26749b5e25fSSatish Balay 2687dae84e0SHong Zhang PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 26949b5e25fSSatish Balay { 2706849ba73SBarry Smith PetscErrorCode ierr; 27113f74950SBarry Smith PetscInt i; 27249b5e25fSSatish Balay 27349b5e25fSSatish Balay PetscFunctionBegin; 274e50a8114SHong Zhang if (scall == MAT_INITIAL_MATRIX) { 275df750dc8SHong Zhang ierr = PetscCalloc1(n+1,B);CHKERRQ(ierr); 276afebec48SHong Zhang } 277e50a8114SHong Zhang 278e50a8114SHong Zhang for (i=0; i<n; i++) { 2797dae84e0SHong Zhang ierr = MatCreateSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 28049b5e25fSSatish Balay } 28149b5e25fSSatish Balay PetscFunctionReturn(0); 28249b5e25fSSatish Balay } 28349b5e25fSSatish Balay 28449b5e25fSSatish Balay /* -------------------------------------------------------*/ 28549b5e25fSSatish Balay /* Should check that shapes of vectors and matrices match */ 28649b5e25fSSatish Balay /* -------------------------------------------------------*/ 28749b5e25fSSatish Balay 288dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz) 28949b5e25fSSatish Balay { 29049b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 291d9ca1df4SBarry Smith PetscScalar *z,x1,x2,zero=0.0; 292d9ca1df4SBarry Smith const PetscScalar *x,*xb; 293d9ca1df4SBarry Smith const MatScalar *v; 2946849ba73SBarry Smith PetscErrorCode ierr; 295d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 296d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 29798c9bda7SSatish Balay PetscInt nonzerorow=0; 29849b5e25fSSatish Balay 29949b5e25fSSatish Balay PetscFunctionBegin; 3002dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 301c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 302d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 3031ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 30449b5e25fSSatish Balay 30549b5e25fSSatish Balay v = a->a; 30649b5e25fSSatish Balay xb = x; 30749b5e25fSSatish Balay 30849b5e25fSSatish Balay for (i=0; i<mbs; i++) { 30949b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 31049b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; 31149b5e25fSSatish Balay ib = aj + *ai; 312831a3094SHong Zhang jmin = 0; 31398c9bda7SSatish Balay nonzerorow += (n>0); 3147fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 31549b5e25fSSatish Balay z[2*i] += v[0]*x1 + v[2]*x2; 31649b5e25fSSatish Balay z[2*i+1] += v[2]*x1 + v[3]*x2; 317831a3094SHong Zhang v += 4; jmin++; 3187fbae186SHong Zhang } 319444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 320444d8c10SJed Brown PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 321831a3094SHong Zhang for (j=jmin; j<n; j++) { 32249b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 32349b5e25fSSatish Balay cval = ib[j]*2; 32449b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2; 32549b5e25fSSatish Balay z[cval+1] += v[2]*x1 + v[3]*x2; 32649b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 32749b5e25fSSatish Balay z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 32849b5e25fSSatish Balay z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 32949b5e25fSSatish Balay v += 4; 33049b5e25fSSatish Balay } 33149b5e25fSSatish Balay xb +=2; ai++; 33249b5e25fSSatish Balay } 33349b5e25fSSatish Balay 334d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 3351ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 336dc0b31edSSatish Balay ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 33749b5e25fSSatish Balay PetscFunctionReturn(0); 33849b5e25fSSatish Balay } 33949b5e25fSSatish Balay 340dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz) 34149b5e25fSSatish Balay { 34249b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 343d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,zero=0.0; 344d9ca1df4SBarry Smith const PetscScalar *x,*xb; 345d9ca1df4SBarry Smith const MatScalar *v; 3466849ba73SBarry Smith PetscErrorCode ierr; 347d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 348d9ca1df4SBarry Smith const PetscInt *aj = a->j,*ai = a->i,*ib; 34998c9bda7SSatish Balay PetscInt nonzerorow=0; 35049b5e25fSSatish Balay 35149b5e25fSSatish Balay PetscFunctionBegin; 3522dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 353c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 354d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 3551ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 35649b5e25fSSatish Balay 35749b5e25fSSatish Balay v = a->a; 35849b5e25fSSatish Balay xb = x; 35949b5e25fSSatish Balay 36049b5e25fSSatish Balay for (i=0; i<mbs; i++) { 36149b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 36249b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 36349b5e25fSSatish Balay ib = aj + *ai; 364831a3094SHong Zhang jmin = 0; 36598c9bda7SSatish Balay nonzerorow += (n>0); 3667fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 36749b5e25fSSatish Balay z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 36849b5e25fSSatish Balay z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 36949b5e25fSSatish Balay z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 370831a3094SHong Zhang v += 9; jmin++; 3717fbae186SHong Zhang } 372444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 373444d8c10SJed Brown PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 374831a3094SHong Zhang for (j=jmin; j<n; j++) { 37549b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 37649b5e25fSSatish Balay cval = ib[j]*3; 37749b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 37849b5e25fSSatish Balay z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 37949b5e25fSSatish Balay z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 38049b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 38149b5e25fSSatish Balay z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 38249b5e25fSSatish Balay z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 38349b5e25fSSatish Balay z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 38449b5e25fSSatish Balay v += 9; 38549b5e25fSSatish Balay } 38649b5e25fSSatish Balay xb +=3; ai++; 38749b5e25fSSatish Balay } 38849b5e25fSSatish Balay 389d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 3901ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 391dc0b31edSSatish Balay ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 39249b5e25fSSatish Balay PetscFunctionReturn(0); 39349b5e25fSSatish Balay } 39449b5e25fSSatish Balay 395dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz) 39649b5e25fSSatish Balay { 39749b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 398d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,zero=0.0; 399d9ca1df4SBarry Smith const PetscScalar *x,*xb; 400d9ca1df4SBarry Smith const MatScalar *v; 4016849ba73SBarry Smith PetscErrorCode ierr; 402d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 403d9ca1df4SBarry Smith const PetscInt *aj = a->j,*ai = a->i,*ib; 40498c9bda7SSatish Balay PetscInt nonzerorow = 0; 40549b5e25fSSatish Balay 40649b5e25fSSatish Balay PetscFunctionBegin; 4072dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 408c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 409d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 4101ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 41149b5e25fSSatish Balay 41249b5e25fSSatish Balay v = a->a; 41349b5e25fSSatish Balay xb = x; 41449b5e25fSSatish Balay 41549b5e25fSSatish Balay for (i=0; i<mbs; i++) { 41649b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 41749b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 41849b5e25fSSatish Balay ib = aj + *ai; 419831a3094SHong Zhang jmin = 0; 42098c9bda7SSatish Balay nonzerorow += (n>0); 4217fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 42249b5e25fSSatish Balay z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 42349b5e25fSSatish Balay z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 42449b5e25fSSatish Balay z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 42549b5e25fSSatish Balay z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 426831a3094SHong Zhang v += 16; jmin++; 4277fbae186SHong Zhang } 428444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 429444d8c10SJed Brown PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 430831a3094SHong Zhang for (j=jmin; j<n; j++) { 43149b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 43249b5e25fSSatish Balay cval = ib[j]*4; 43349b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 43449b5e25fSSatish Balay z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 43549b5e25fSSatish Balay z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 43649b5e25fSSatish Balay z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 43749b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 43849b5e25fSSatish Balay z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 43949b5e25fSSatish Balay z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 44049b5e25fSSatish Balay z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 44149b5e25fSSatish Balay z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 44249b5e25fSSatish Balay v += 16; 44349b5e25fSSatish Balay } 44449b5e25fSSatish Balay xb +=4; ai++; 44549b5e25fSSatish Balay } 44649b5e25fSSatish Balay 447d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 4481ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 449dc0b31edSSatish Balay ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 45049b5e25fSSatish Balay PetscFunctionReturn(0); 45149b5e25fSSatish Balay } 45249b5e25fSSatish Balay 453dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz) 45449b5e25fSSatish Balay { 45549b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 456d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,x5,zero=0.0; 457d9ca1df4SBarry Smith const PetscScalar *x,*xb; 458d9ca1df4SBarry Smith const MatScalar *v; 4596849ba73SBarry Smith PetscErrorCode ierr; 460d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 461d9ca1df4SBarry Smith const PetscInt *aj = a->j,*ai = a->i,*ib; 46298c9bda7SSatish Balay PetscInt nonzerorow=0; 46349b5e25fSSatish Balay 46449b5e25fSSatish Balay PetscFunctionBegin; 4652dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 466c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 467d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 4681ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 46949b5e25fSSatish Balay 47049b5e25fSSatish Balay v = a->a; 47149b5e25fSSatish Balay xb = x; 47249b5e25fSSatish Balay 47349b5e25fSSatish Balay for (i=0; i<mbs; i++) { 47449b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 47549b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 47649b5e25fSSatish Balay ib = aj + *ai; 477831a3094SHong Zhang jmin = 0; 47898c9bda7SSatish Balay nonzerorow += (n>0); 4797fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 48049b5e25fSSatish Balay z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 48149b5e25fSSatish Balay z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 48249b5e25fSSatish Balay z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 48349b5e25fSSatish Balay z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 48449b5e25fSSatish Balay z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 485831a3094SHong Zhang v += 25; jmin++; 4867fbae186SHong Zhang } 487444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 488444d8c10SJed Brown PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 489831a3094SHong Zhang for (j=jmin; j<n; j++) { 49049b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 49149b5e25fSSatish Balay cval = ib[j]*5; 49249b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 49349b5e25fSSatish Balay z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 49449b5e25fSSatish Balay z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 49549b5e25fSSatish Balay z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 49649b5e25fSSatish Balay z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 49749b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 49849b5e25fSSatish Balay z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 49949b5e25fSSatish Balay z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 50049b5e25fSSatish Balay z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 50149b5e25fSSatish Balay z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 50249b5e25fSSatish Balay z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 50349b5e25fSSatish Balay v += 25; 50449b5e25fSSatish Balay } 50549b5e25fSSatish Balay xb +=5; ai++; 50649b5e25fSSatish Balay } 50749b5e25fSSatish Balay 508d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 5091ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 510dc0b31edSSatish Balay ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 51149b5e25fSSatish Balay PetscFunctionReturn(0); 51249b5e25fSSatish Balay } 51349b5e25fSSatish Balay 514dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz) 51549b5e25fSSatish Balay { 51649b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 517d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,x5,x6,zero=0.0; 518d9ca1df4SBarry Smith const PetscScalar *x,*xb; 519d9ca1df4SBarry Smith const MatScalar *v; 5206849ba73SBarry Smith PetscErrorCode ierr; 521d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 522d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 52398c9bda7SSatish Balay PetscInt nonzerorow=0; 52449b5e25fSSatish Balay 52549b5e25fSSatish Balay PetscFunctionBegin; 5262dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 527c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 528d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 5291ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 53049b5e25fSSatish Balay 53149b5e25fSSatish Balay v = a->a; 53249b5e25fSSatish Balay xb = x; 53349b5e25fSSatish Balay 53449b5e25fSSatish Balay for (i=0; i<mbs; i++) { 53549b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 53649b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 53749b5e25fSSatish Balay ib = aj + *ai; 538831a3094SHong Zhang jmin = 0; 53998c9bda7SSatish Balay nonzerorow += (n>0); 5407fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 54149b5e25fSSatish Balay z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 54249b5e25fSSatish Balay z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 54349b5e25fSSatish Balay z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 54449b5e25fSSatish Balay z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 54549b5e25fSSatish Balay z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 54649b5e25fSSatish Balay z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 547831a3094SHong Zhang v += 36; jmin++; 5487fbae186SHong Zhang } 549444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 550444d8c10SJed Brown PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 551831a3094SHong Zhang for (j=jmin; j<n; j++) { 55249b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 55349b5e25fSSatish Balay cval = ib[j]*6; 55449b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 55549b5e25fSSatish Balay z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 55649b5e25fSSatish Balay z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 55749b5e25fSSatish Balay z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 55849b5e25fSSatish Balay z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 55949b5e25fSSatish Balay z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 56049b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 56149b5e25fSSatish Balay z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 56249b5e25fSSatish Balay z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 56349b5e25fSSatish Balay z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 56449b5e25fSSatish Balay z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 56549b5e25fSSatish Balay z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 56649b5e25fSSatish Balay z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 56749b5e25fSSatish Balay v += 36; 56849b5e25fSSatish Balay } 56949b5e25fSSatish Balay xb +=6; ai++; 57049b5e25fSSatish Balay } 57149b5e25fSSatish Balay 572d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 5731ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 574dc0b31edSSatish Balay ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 57549b5e25fSSatish Balay PetscFunctionReturn(0); 57649b5e25fSSatish Balay } 577c2916339SPierre Jolivet 578dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz) 57949b5e25fSSatish Balay { 58049b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 581d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0; 582d9ca1df4SBarry Smith const PetscScalar *x,*xb; 583d9ca1df4SBarry Smith const MatScalar *v; 5846849ba73SBarry Smith PetscErrorCode ierr; 585d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 586d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 58798c9bda7SSatish Balay PetscInt nonzerorow=0; 58849b5e25fSSatish Balay 58949b5e25fSSatish Balay PetscFunctionBegin; 5902dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 591c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 592d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 5931ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 59449b5e25fSSatish Balay 59549b5e25fSSatish Balay v = a->a; 59649b5e25fSSatish Balay xb = x; 59749b5e25fSSatish Balay 59849b5e25fSSatish Balay for (i=0; i<mbs; i++) { 59949b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 60049b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 60149b5e25fSSatish Balay ib = aj + *ai; 602831a3094SHong Zhang jmin = 0; 60398c9bda7SSatish Balay nonzerorow += (n>0); 6047fbae186SHong Zhang if (*ib == i) { /* (diag of A)*x */ 60549b5e25fSSatish Balay z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 60649b5e25fSSatish Balay z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 60749b5e25fSSatish Balay z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 60849b5e25fSSatish Balay z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 60949b5e25fSSatish Balay z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 61049b5e25fSSatish Balay z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 61149b5e25fSSatish Balay z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 612831a3094SHong Zhang v += 49; jmin++; 6137fbae186SHong Zhang } 614444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 615444d8c10SJed Brown PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 616831a3094SHong Zhang for (j=jmin; j<n; j++) { 61749b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 61849b5e25fSSatish Balay cval = ib[j]*7; 61949b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 62049b5e25fSSatish Balay z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 62149b5e25fSSatish Balay z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 62249b5e25fSSatish Balay z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 62349b5e25fSSatish Balay z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 62449b5e25fSSatish Balay z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 62549b5e25fSSatish Balay z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 62649b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 62749b5e25fSSatish Balay z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 62849b5e25fSSatish Balay z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 62949b5e25fSSatish Balay z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 63049b5e25fSSatish Balay z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 63149b5e25fSSatish Balay z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 63249b5e25fSSatish Balay z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 63349b5e25fSSatish Balay z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 63449b5e25fSSatish Balay v += 49; 63549b5e25fSSatish Balay } 63649b5e25fSSatish Balay xb +=7; ai++; 63749b5e25fSSatish Balay } 638d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 6391ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 640dc0b31edSSatish Balay ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 64149b5e25fSSatish Balay PetscFunctionReturn(0); 64249b5e25fSSatish Balay } 64349b5e25fSSatish Balay 64449b5e25fSSatish Balay /* 64549b5e25fSSatish Balay This will not work with MatScalar == float because it calls the BLAS 64649b5e25fSSatish Balay */ 647dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz) 64849b5e25fSSatish Balay { 64949b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 650d9ca1df4SBarry Smith PetscScalar *z,*z_ptr,*zb,*work,*workt,zero=0.0; 651d9ca1df4SBarry Smith const PetscScalar *x,*x_ptr,*xb; 652d9ca1df4SBarry Smith const MatScalar *v; 653dfbe8321SBarry Smith PetscErrorCode ierr; 654d9ca1df4SBarry Smith PetscInt mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 655d9ca1df4SBarry Smith const PetscInt *idx,*aj,*ii; 65698c9bda7SSatish Balay PetscInt nonzerorow=0; 65749b5e25fSSatish Balay 65849b5e25fSSatish Balay PetscFunctionBegin; 6592dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 660c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 66159689ffbSStefano Zampini ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 66259689ffbSStefano Zampini ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 66359689ffbSStefano Zampini 66459689ffbSStefano Zampini x_ptr = x; 66559689ffbSStefano Zampini z_ptr = z; 66649b5e25fSSatish Balay 66749b5e25fSSatish Balay aj = a->j; 66849b5e25fSSatish Balay v = a->a; 66949b5e25fSSatish Balay ii = a->i; 67049b5e25fSSatish Balay 67149b5e25fSSatish Balay if (!a->mult_work) { 672854ce69bSBarry Smith ierr = PetscMalloc1(A->rmap->N+1,&a->mult_work);CHKERRQ(ierr); 67349b5e25fSSatish Balay } 67449b5e25fSSatish Balay work = a->mult_work; 67549b5e25fSSatish Balay 67649b5e25fSSatish Balay for (i=0; i<mbs; i++) { 67749b5e25fSSatish Balay n = ii[1] - ii[0]; ncols = n*bs; 67849b5e25fSSatish Balay workt = work; idx=aj+ii[0]; 67998c9bda7SSatish Balay nonzerorow += (n>0); 68049b5e25fSSatish Balay 68149b5e25fSSatish Balay /* upper triangular part */ 68249b5e25fSSatish Balay for (j=0; j<n; j++) { 68349b5e25fSSatish Balay xb = x_ptr + bs*(*idx++); 68449b5e25fSSatish Balay for (k=0; k<bs; k++) workt[k] = xb[k]; 68549b5e25fSSatish Balay workt += bs; 68649b5e25fSSatish Balay } 68749b5e25fSSatish Balay /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 68896b95a6bSBarry Smith PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 68949b5e25fSSatish Balay 69049b5e25fSSatish Balay /* strict lower triangular part */ 691831a3094SHong Zhang idx = aj+ii[0]; 69259689ffbSStefano Zampini if (n && *idx == i) { 69396b9376eSHong Zhang ncols -= bs; v += bs2; idx++; n--; 694831a3094SHong Zhang } 69596b9376eSHong Zhang 69649b5e25fSSatish Balay if (ncols > 0) { 69749b5e25fSSatish Balay workt = work; 698580bdb30SBarry Smith ierr = PetscArrayzero(workt,ncols);CHKERRQ(ierr); 69996b95a6bSBarry Smith PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 700831a3094SHong Zhang for (j=0; j<n; j++) { 701831a3094SHong Zhang zb = z_ptr + bs*(*idx++); 70249b5e25fSSatish Balay for (k=0; k<bs; k++) zb[k] += workt[k]; 70349b5e25fSSatish Balay workt += bs; 70449b5e25fSSatish Balay } 70549b5e25fSSatish Balay } 70649b5e25fSSatish Balay x += bs; v += n*bs2; z += bs; ii++; 70749b5e25fSSatish Balay } 70849b5e25fSSatish Balay 709d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 7101ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 711dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr); 71249b5e25fSSatish Balay PetscFunctionReturn(0); 71349b5e25fSSatish Balay } 71449b5e25fSSatish Balay 715dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 71649b5e25fSSatish Balay { 71749b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 718d9ca1df4SBarry Smith PetscScalar *z,x1; 719d9ca1df4SBarry Smith const PetscScalar *x,*xb; 720d9ca1df4SBarry Smith const MatScalar *v; 7216849ba73SBarry Smith PetscErrorCode ierr; 722d9ca1df4SBarry Smith PetscInt mbs =a->mbs,i,n,cval,j,jmin; 723d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 72498c9bda7SSatish Balay PetscInt nonzerorow=0; 725eb1ec7c1SStefano Zampini #if defined(PETSC_USE_COMPLEX) 726eb1ec7c1SStefano Zampini const int aconj = A->hermitian; 727eb1ec7c1SStefano Zampini #else 728eb1ec7c1SStefano Zampini const int aconj = 0; 729eb1ec7c1SStefano Zampini #endif 73049b5e25fSSatish Balay 73149b5e25fSSatish Balay PetscFunctionBegin; 732343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 733c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 734d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 7351ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 73649b5e25fSSatish Balay v = a->a; 73749b5e25fSSatish Balay xb = x; 73849b5e25fSSatish Balay 73949b5e25fSSatish Balay for (i=0; i<mbs; i++) { 74049b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th row of A */ 74149b5e25fSSatish Balay x1 = xb[0]; 74249b5e25fSSatish Balay ib = aj + *ai; 743831a3094SHong Zhang jmin = 0; 74498c9bda7SSatish Balay nonzerorow += (n>0); 7453d9ade75SStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 746831a3094SHong Zhang z[i] += *v++ * x[*ib++]; jmin++; 747831a3094SHong Zhang } 748eb1ec7c1SStefano Zampini if (aconj) { 749eb1ec7c1SStefano Zampini for (j=jmin; j<n; j++) { 750eb1ec7c1SStefano Zampini cval = *ib; 751eb1ec7c1SStefano Zampini z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x */ 752eb1ec7c1SStefano Zampini z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 753eb1ec7c1SStefano Zampini } 754eb1ec7c1SStefano Zampini } else { 755831a3094SHong Zhang for (j=jmin; j<n; j++) { 75649b5e25fSSatish Balay cval = *ib; 75749b5e25fSSatish Balay z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 75849b5e25fSSatish Balay z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 75949b5e25fSSatish Balay } 760eb1ec7c1SStefano Zampini } 76149b5e25fSSatish Balay xb++; ai++; 76249b5e25fSSatish Balay } 76349b5e25fSSatish Balay 764d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 765bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 76649b5e25fSSatish Balay 767dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 76849b5e25fSSatish Balay PetscFunctionReturn(0); 76949b5e25fSSatish Balay } 77049b5e25fSSatish Balay 771dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 77249b5e25fSSatish Balay { 77349b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 774d9ca1df4SBarry Smith PetscScalar *z,x1,x2; 775d9ca1df4SBarry Smith const PetscScalar *x,*xb; 776d9ca1df4SBarry Smith const MatScalar *v; 7776849ba73SBarry Smith PetscErrorCode ierr; 778d9ca1df4SBarry Smith PetscInt mbs =a->mbs,i,n,cval,j,jmin; 779d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 78098c9bda7SSatish Balay PetscInt nonzerorow=0; 78149b5e25fSSatish Balay 78249b5e25fSSatish Balay PetscFunctionBegin; 783343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 784c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 785d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 7861ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 78749b5e25fSSatish Balay 78849b5e25fSSatish Balay v = a->a; 78949b5e25fSSatish Balay xb = x; 79049b5e25fSSatish Balay 79149b5e25fSSatish Balay for (i=0; i<mbs; i++) { 79249b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 79349b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; 79449b5e25fSSatish Balay ib = aj + *ai; 795831a3094SHong Zhang jmin = 0; 79698c9bda7SSatish Balay nonzerorow += (n>0); 79759689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 79849b5e25fSSatish Balay z[2*i] += v[0]*x1 + v[2]*x2; 79949b5e25fSSatish Balay z[2*i+1] += v[2]*x1 + v[3]*x2; 800831a3094SHong Zhang v += 4; jmin++; 8017fbae186SHong Zhang } 802444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 803444d8c10SJed Brown PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 804831a3094SHong Zhang for (j=jmin; j<n; j++) { 80549b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 80649b5e25fSSatish Balay cval = ib[j]*2; 80749b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2; 80849b5e25fSSatish Balay z[cval+1] += v[2]*x1 + v[3]*x2; 80949b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 81049b5e25fSSatish Balay z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 81149b5e25fSSatish Balay z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 81249b5e25fSSatish Balay v += 4; 81349b5e25fSSatish Balay } 81449b5e25fSSatish Balay xb +=2; ai++; 81549b5e25fSSatish Balay } 816d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 817bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 81849b5e25fSSatish Balay 819dc0b31edSSatish Balay ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 82049b5e25fSSatish Balay PetscFunctionReturn(0); 82149b5e25fSSatish Balay } 82249b5e25fSSatish Balay 823dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 82449b5e25fSSatish Balay { 82549b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 826d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3; 827d9ca1df4SBarry Smith const PetscScalar *x,*xb; 828d9ca1df4SBarry Smith const MatScalar *v; 8296849ba73SBarry Smith PetscErrorCode ierr; 830d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 831d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 83298c9bda7SSatish Balay PetscInt nonzerorow=0; 83349b5e25fSSatish Balay 83449b5e25fSSatish Balay PetscFunctionBegin; 835343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 836c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 837d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 8381ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 83949b5e25fSSatish Balay 84049b5e25fSSatish Balay v = a->a; 84149b5e25fSSatish Balay xb = x; 84249b5e25fSSatish Balay 84349b5e25fSSatish Balay for (i=0; i<mbs; i++) { 84449b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 84549b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 84649b5e25fSSatish Balay ib = aj + *ai; 847831a3094SHong Zhang jmin = 0; 84898c9bda7SSatish Balay nonzerorow += (n>0); 84959689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 85049b5e25fSSatish Balay z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 85149b5e25fSSatish Balay z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 85249b5e25fSSatish Balay z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 853831a3094SHong Zhang v += 9; jmin++; 8547fbae186SHong Zhang } 855444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 856444d8c10SJed Brown PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 857831a3094SHong Zhang for (j=jmin; j<n; j++) { 85849b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 85949b5e25fSSatish Balay cval = ib[j]*3; 86049b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 86149b5e25fSSatish Balay z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 86249b5e25fSSatish Balay z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 86349b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 86449b5e25fSSatish Balay z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 86549b5e25fSSatish Balay z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 86649b5e25fSSatish Balay z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 86749b5e25fSSatish Balay v += 9; 86849b5e25fSSatish Balay } 86949b5e25fSSatish Balay xb +=3; ai++; 87049b5e25fSSatish Balay } 87149b5e25fSSatish Balay 872d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 873bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 87449b5e25fSSatish Balay 875dc0b31edSSatish Balay ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 87649b5e25fSSatish Balay PetscFunctionReturn(0); 87749b5e25fSSatish Balay } 87849b5e25fSSatish Balay 879dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 88049b5e25fSSatish Balay { 88149b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 882d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4; 883d9ca1df4SBarry Smith const PetscScalar *x,*xb; 884d9ca1df4SBarry Smith const MatScalar *v; 8856849ba73SBarry Smith PetscErrorCode ierr; 886d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 887d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 88898c9bda7SSatish Balay PetscInt nonzerorow=0; 88949b5e25fSSatish Balay 89049b5e25fSSatish Balay PetscFunctionBegin; 891343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 892c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 893d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 8941ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 89549b5e25fSSatish Balay 89649b5e25fSSatish Balay v = a->a; 89749b5e25fSSatish Balay xb = x; 89849b5e25fSSatish Balay 89949b5e25fSSatish Balay for (i=0; i<mbs; i++) { 90049b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 90149b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 90249b5e25fSSatish Balay ib = aj + *ai; 903831a3094SHong Zhang jmin = 0; 90498c9bda7SSatish Balay nonzerorow += (n>0); 90559689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 90649b5e25fSSatish Balay z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 90749b5e25fSSatish Balay z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 90849b5e25fSSatish Balay z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 90949b5e25fSSatish Balay z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 910831a3094SHong Zhang v += 16; jmin++; 9117fbae186SHong Zhang } 912444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 913444d8c10SJed Brown PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 914831a3094SHong Zhang for (j=jmin; j<n; j++) { 91549b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 91649b5e25fSSatish Balay cval = ib[j]*4; 91749b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 91849b5e25fSSatish Balay z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 91949b5e25fSSatish Balay z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 92049b5e25fSSatish Balay z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 92149b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 92249b5e25fSSatish Balay z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 92349b5e25fSSatish Balay z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 92449b5e25fSSatish Balay z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 92549b5e25fSSatish Balay z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 92649b5e25fSSatish Balay v += 16; 92749b5e25fSSatish Balay } 92849b5e25fSSatish Balay xb +=4; ai++; 92949b5e25fSSatish Balay } 93049b5e25fSSatish Balay 931d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 932bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 93349b5e25fSSatish Balay 934dc0b31edSSatish Balay ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 93549b5e25fSSatish Balay PetscFunctionReturn(0); 93649b5e25fSSatish Balay } 93749b5e25fSSatish Balay 938dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 93949b5e25fSSatish Balay { 94049b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 941d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,x5; 942d9ca1df4SBarry Smith const PetscScalar *x,*xb; 943d9ca1df4SBarry Smith const MatScalar *v; 9446849ba73SBarry Smith PetscErrorCode ierr; 945d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 946d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 94798c9bda7SSatish Balay PetscInt nonzerorow=0; 94849b5e25fSSatish Balay 94949b5e25fSSatish Balay PetscFunctionBegin; 950343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 951c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 952d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 9531ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 95449b5e25fSSatish Balay 95549b5e25fSSatish Balay v = a->a; 95649b5e25fSSatish Balay xb = x; 95749b5e25fSSatish Balay 95849b5e25fSSatish Balay for (i=0; i<mbs; i++) { 95949b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 96049b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 96149b5e25fSSatish Balay ib = aj + *ai; 962831a3094SHong Zhang jmin = 0; 96398c9bda7SSatish Balay nonzerorow += (n>0); 96459689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 96549b5e25fSSatish Balay z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 96649b5e25fSSatish Balay z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 96749b5e25fSSatish Balay z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 96849b5e25fSSatish Balay z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 96949b5e25fSSatish Balay z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 970831a3094SHong Zhang v += 25; jmin++; 9717fbae186SHong Zhang } 972444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 973444d8c10SJed Brown PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 974831a3094SHong Zhang for (j=jmin; j<n; j++) { 97549b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 97649b5e25fSSatish Balay cval = ib[j]*5; 97749b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 97849b5e25fSSatish Balay z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 97949b5e25fSSatish Balay z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 98049b5e25fSSatish Balay z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 98149b5e25fSSatish Balay z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 98249b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 98349b5e25fSSatish Balay z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 98449b5e25fSSatish Balay z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 98549b5e25fSSatish Balay z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 98649b5e25fSSatish Balay z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 98749b5e25fSSatish Balay z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 98849b5e25fSSatish Balay v += 25; 98949b5e25fSSatish Balay } 99049b5e25fSSatish Balay xb +=5; ai++; 99149b5e25fSSatish Balay } 99249b5e25fSSatish Balay 993d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 994bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 99549b5e25fSSatish Balay 996dc0b31edSSatish Balay ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 99749b5e25fSSatish Balay PetscFunctionReturn(0); 99849b5e25fSSatish Balay } 999c2916339SPierre Jolivet 1000dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 100149b5e25fSSatish Balay { 100249b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1003d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,x5,x6; 1004d9ca1df4SBarry Smith const PetscScalar *x,*xb; 1005d9ca1df4SBarry Smith const MatScalar *v; 10066849ba73SBarry Smith PetscErrorCode ierr; 1007d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 1008d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 100998c9bda7SSatish Balay PetscInt nonzerorow=0; 101049b5e25fSSatish Balay 101149b5e25fSSatish Balay PetscFunctionBegin; 1012343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1013c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 1014d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 10151ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 101649b5e25fSSatish Balay 101749b5e25fSSatish Balay v = a->a; 101849b5e25fSSatish Balay xb = x; 101949b5e25fSSatish Balay 102049b5e25fSSatish Balay for (i=0; i<mbs; i++) { 102149b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 102249b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 102349b5e25fSSatish Balay ib = aj + *ai; 1024831a3094SHong Zhang jmin = 0; 102598c9bda7SSatish Balay nonzerorow += (n>0); 102659689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 102749b5e25fSSatish Balay z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 102849b5e25fSSatish Balay z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 102949b5e25fSSatish Balay z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 103049b5e25fSSatish Balay z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 103149b5e25fSSatish Balay z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 103249b5e25fSSatish Balay z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 1033831a3094SHong Zhang v += 36; jmin++; 10347fbae186SHong Zhang } 1035444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1036444d8c10SJed Brown PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1037831a3094SHong Zhang for (j=jmin; j<n; j++) { 103849b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 103949b5e25fSSatish Balay cval = ib[j]*6; 104049b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 104149b5e25fSSatish Balay z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 104249b5e25fSSatish Balay z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 104349b5e25fSSatish Balay z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 104449b5e25fSSatish Balay z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 104549b5e25fSSatish Balay z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 104649b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 104749b5e25fSSatish Balay z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 104849b5e25fSSatish Balay z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 104949b5e25fSSatish Balay z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 105049b5e25fSSatish Balay z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 105149b5e25fSSatish Balay z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 105249b5e25fSSatish Balay z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 105349b5e25fSSatish Balay v += 36; 105449b5e25fSSatish Balay } 105549b5e25fSSatish Balay xb +=6; ai++; 105649b5e25fSSatish Balay } 105749b5e25fSSatish Balay 1058d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1059bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 106049b5e25fSSatish Balay 1061dc0b31edSSatish Balay ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 106249b5e25fSSatish Balay PetscFunctionReturn(0); 106349b5e25fSSatish Balay } 106449b5e25fSSatish Balay 1065dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 106649b5e25fSSatish Balay { 106749b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1068d9ca1df4SBarry Smith PetscScalar *z,x1,x2,x3,x4,x5,x6,x7; 1069d9ca1df4SBarry Smith const PetscScalar *x,*xb; 1070d9ca1df4SBarry Smith const MatScalar *v; 10716849ba73SBarry Smith PetscErrorCode ierr; 1072d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,n,cval,j,jmin; 1073d9ca1df4SBarry Smith const PetscInt *aj=a->j,*ai=a->i,*ib; 107498c9bda7SSatish Balay PetscInt nonzerorow=0; 107549b5e25fSSatish Balay 107649b5e25fSSatish Balay PetscFunctionBegin; 1077343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1078c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 1079d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 10801ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 108149b5e25fSSatish Balay 108249b5e25fSSatish Balay v = a->a; 108349b5e25fSSatish Balay xb = x; 108449b5e25fSSatish Balay 108549b5e25fSSatish Balay for (i=0; i<mbs; i++) { 108649b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 108749b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 108849b5e25fSSatish Balay ib = aj + *ai; 1089831a3094SHong Zhang jmin = 0; 109098c9bda7SSatish Balay nonzerorow += (n>0); 109159689ffbSStefano Zampini if (n && *ib == i) { /* (diag of A)*x */ 109249b5e25fSSatish Balay z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 109349b5e25fSSatish Balay z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 109449b5e25fSSatish Balay z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 109549b5e25fSSatish Balay z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 109649b5e25fSSatish Balay z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 109749b5e25fSSatish Balay z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 109849b5e25fSSatish Balay z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1099831a3094SHong Zhang v += 49; jmin++; 11007fbae186SHong Zhang } 1101444d8c10SJed Brown PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1102444d8c10SJed Brown PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1103831a3094SHong Zhang for (j=jmin; j<n; j++) { 110449b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 110549b5e25fSSatish Balay cval = ib[j]*7; 110649b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 110749b5e25fSSatish Balay z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 110849b5e25fSSatish Balay z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 110949b5e25fSSatish Balay z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 111049b5e25fSSatish Balay z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 111149b5e25fSSatish Balay z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 111249b5e25fSSatish Balay z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 111349b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 111449b5e25fSSatish Balay z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 111549b5e25fSSatish Balay z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 111649b5e25fSSatish Balay z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 111749b5e25fSSatish Balay z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 111849b5e25fSSatish Balay z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 111949b5e25fSSatish Balay z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 112049b5e25fSSatish Balay z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 112149b5e25fSSatish Balay v += 49; 112249b5e25fSSatish Balay } 112349b5e25fSSatish Balay xb +=7; ai++; 112449b5e25fSSatish Balay } 112549b5e25fSSatish Balay 1126d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1127bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 112849b5e25fSSatish Balay 1129dc0b31edSSatish Balay ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 113049b5e25fSSatish Balay PetscFunctionReturn(0); 113149b5e25fSSatish Balay } 113249b5e25fSSatish Balay 1133dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 113449b5e25fSSatish Balay { 113549b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1136f4259b30SLisandro Dalcin PetscScalar *z,*z_ptr=NULL,*zb,*work,*workt; 1137d9ca1df4SBarry Smith const PetscScalar *x,*x_ptr,*xb; 1138d9ca1df4SBarry Smith const MatScalar *v; 1139dfbe8321SBarry Smith PetscErrorCode ierr; 1140d9ca1df4SBarry Smith PetscInt mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 1141d9ca1df4SBarry Smith const PetscInt *idx,*aj,*ii; 114298c9bda7SSatish Balay PetscInt nonzerorow=0; 114349b5e25fSSatish Balay 114449b5e25fSSatish Balay PetscFunctionBegin; 1145343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1146c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 1147d9ca1df4SBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); x_ptr=x; 11481ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 114949b5e25fSSatish Balay 115049b5e25fSSatish Balay aj = a->j; 115149b5e25fSSatish Balay v = a->a; 115249b5e25fSSatish Balay ii = a->i; 115349b5e25fSSatish Balay 115449b5e25fSSatish Balay if (!a->mult_work) { 1155854ce69bSBarry Smith ierr = PetscMalloc1(A->rmap->n+1,&a->mult_work);CHKERRQ(ierr); 115649b5e25fSSatish Balay } 115749b5e25fSSatish Balay work = a->mult_work; 115849b5e25fSSatish Balay 115949b5e25fSSatish Balay for (i=0; i<mbs; i++) { 116049b5e25fSSatish Balay n = ii[1] - ii[0]; ncols = n*bs; 116149b5e25fSSatish Balay workt = work; idx=aj+ii[0]; 116298c9bda7SSatish Balay nonzerorow += (n>0); 116349b5e25fSSatish Balay 116449b5e25fSSatish Balay /* upper triangular part */ 116549b5e25fSSatish Balay for (j=0; j<n; j++) { 116649b5e25fSSatish Balay xb = x_ptr + bs*(*idx++); 116749b5e25fSSatish Balay for (k=0; k<bs; k++) workt[k] = xb[k]; 116849b5e25fSSatish Balay workt += bs; 116949b5e25fSSatish Balay } 117049b5e25fSSatish Balay /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 117196b95a6bSBarry Smith PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 117249b5e25fSSatish Balay 117349b5e25fSSatish Balay /* strict lower triangular part */ 1174831a3094SHong Zhang idx = aj+ii[0]; 117559689ffbSStefano Zampini if (n && *idx == i) { 117696b9376eSHong Zhang ncols -= bs; v += bs2; idx++; n--; 1177831a3094SHong Zhang } 117849b5e25fSSatish Balay if (ncols > 0) { 117949b5e25fSSatish Balay workt = work; 1180580bdb30SBarry Smith ierr = PetscArrayzero(workt,ncols);CHKERRQ(ierr); 118196b95a6bSBarry Smith PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 1182831a3094SHong Zhang for (j=0; j<n; j++) { 1183831a3094SHong Zhang zb = z_ptr + bs*(*idx++); 118449b5e25fSSatish Balay for (k=0; k<bs; k++) zb[k] += workt[k]; 118549b5e25fSSatish Balay workt += bs; 118649b5e25fSSatish Balay } 118749b5e25fSSatish Balay } 118849b5e25fSSatish Balay 118949b5e25fSSatish Balay x += bs; v += n*bs2; z += bs; ii++; 119049b5e25fSSatish Balay } 119149b5e25fSSatish Balay 1192d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1193bba805e6SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 119449b5e25fSSatish Balay 1195dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 119649b5e25fSSatish Balay PetscFunctionReturn(0); 119749b5e25fSSatish Balay } 119849b5e25fSSatish Balay 1199f4df32b1SMatthew Knepley PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha) 120049b5e25fSSatish Balay { 120149b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data; 1202f4df32b1SMatthew Knepley PetscScalar oalpha = alpha; 1203efee365bSSatish Balay PetscErrorCode ierr; 1204c5df96a5SBarry Smith PetscBLASInt one = 1,totalnz; 120549b5e25fSSatish Balay 120649b5e25fSSatish Balay PetscFunctionBegin; 1207c5df96a5SBarry Smith ierr = PetscBLASIntCast(a->bs2*a->nz,&totalnz);CHKERRQ(ierr); 12088b83055fSJed Brown PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one)); 1209efee365bSSatish Balay ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 121049b5e25fSSatish Balay PetscFunctionReturn(0); 121149b5e25fSSatish Balay } 121249b5e25fSSatish Balay 1213dfbe8321SBarry Smith PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm) 121449b5e25fSSatish Balay { 121549b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1216d9ca1df4SBarry Smith const MatScalar *v = a->a; 121749b5e25fSSatish Balay PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1218d9ca1df4SBarry Smith PetscInt i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il; 12196849ba73SBarry Smith PetscErrorCode ierr; 1220d9ca1df4SBarry Smith const PetscInt *aj=a->j,*col; 122149b5e25fSSatish Balay 122249b5e25fSSatish Balay PetscFunctionBegin; 1223c40ae873SPierre Jolivet if (!a->nz) { 1224c40ae873SPierre Jolivet *norm = 0.0; 1225c40ae873SPierre Jolivet PetscFunctionReturn(0); 1226c40ae873SPierre Jolivet } 122749b5e25fSSatish Balay if (type == NORM_FROBENIUS) { 122849b5e25fSSatish Balay for (k=0; k<mbs; k++) { 122959689ffbSStefano Zampini jmin = a->i[k]; 123059689ffbSStefano Zampini jmax = a->i[k+1]; 1231831a3094SHong Zhang col = aj + jmin; 123259689ffbSStefano Zampini if (jmax-jmin > 0 && *col == k) { /* diagonal block */ 123349b5e25fSSatish Balay for (i=0; i<bs2; i++) { 123449b5e25fSSatish Balay sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++; 123549b5e25fSSatish Balay } 1236831a3094SHong Zhang jmin++; 1237831a3094SHong Zhang } 1238831a3094SHong Zhang for (j=jmin; j<jmax; j++) { /* off-diagonal blocks */ 123949b5e25fSSatish Balay for (i=0; i<bs2; i++) { 124049b5e25fSSatish Balay sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++; 124149b5e25fSSatish Balay } 124249b5e25fSSatish Balay } 124349b5e25fSSatish Balay } 12448f1a2a5eSBarry Smith *norm = PetscSqrtReal(sum_diag + 2*sum_off); 1245ca0c957dSBarry Smith ierr = PetscLogFlops(2.0*bs2*a->nz);CHKERRQ(ierr); 12460b8dc8d2SHong Zhang } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 1247dcca6d9dSJed Brown ierr = PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);CHKERRQ(ierr); 12480b8dc8d2SHong Zhang for (i=0; i<mbs; i++) jl[i] = mbs; 12490b8dc8d2SHong Zhang il[0] = 0; 125049b5e25fSSatish Balay 125149b5e25fSSatish Balay *norm = 0.0; 125249b5e25fSSatish Balay for (k=0; k<mbs; k++) { /* k_th block row */ 125349b5e25fSSatish Balay for (j=0; j<bs; j++) sum[j]=0.0; 125449b5e25fSSatish Balay /*-- col sum --*/ 125549b5e25fSSatish Balay i = jl[k]; /* first |A(i,k)| to be added */ 1256831a3094SHong Zhang /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1257831a3094SHong Zhang at step k */ 125849b5e25fSSatish Balay while (i<mbs) { 125949b5e25fSSatish Balay nexti = jl[i]; /* next block row to be added */ 126049b5e25fSSatish Balay ik = il[i]; /* block index of A(i,k) in the array a */ 126149b5e25fSSatish Balay for (j=0; j<bs; j++) { 126249b5e25fSSatish Balay v = a->a + ik*bs2 + j*bs; 126349b5e25fSSatish Balay for (k1=0; k1<bs; k1++) { 126449b5e25fSSatish Balay sum[j] += PetscAbsScalar(*v); v++; 126549b5e25fSSatish Balay } 126649b5e25fSSatish Balay } 126749b5e25fSSatish Balay /* update il, jl */ 1268831a3094SHong Zhang jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1269831a3094SHong Zhang jmax = a->i[i+1]; 127049b5e25fSSatish Balay if (jmin < jmax) { 127149b5e25fSSatish Balay il[i] = jmin; 127249b5e25fSSatish Balay j = a->j[jmin]; 127349b5e25fSSatish Balay jl[i] = jl[j]; jl[j]=i; 127449b5e25fSSatish Balay } 127549b5e25fSSatish Balay i = nexti; 127649b5e25fSSatish Balay } 127749b5e25fSSatish Balay /*-- row sum --*/ 127859689ffbSStefano Zampini jmin = a->i[k]; 127959689ffbSStefano Zampini jmax = a->i[k+1]; 128049b5e25fSSatish Balay for (i=jmin; i<jmax; i++) { 128149b5e25fSSatish Balay for (j=0; j<bs; j++) { 128249b5e25fSSatish Balay v = a->a + i*bs2 + j; 128349b5e25fSSatish Balay for (k1=0; k1<bs; k1++) { 12840b8dc8d2SHong Zhang sum[j] += PetscAbsScalar(*v); v += bs; 128549b5e25fSSatish Balay } 128649b5e25fSSatish Balay } 128749b5e25fSSatish Balay } 128849b5e25fSSatish Balay /* add k_th block row to il, jl */ 1289831a3094SHong Zhang col = aj+jmin; 129059689ffbSStefano Zampini if (jmax - jmin > 0 && *col == k) jmin++; 129149b5e25fSSatish Balay if (jmin < jmax) { 129249b5e25fSSatish Balay il[k] = jmin; 12930b8dc8d2SHong Zhang j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k; 129449b5e25fSSatish Balay } 129549b5e25fSSatish Balay for (j=0; j<bs; j++) { 129649b5e25fSSatish Balay if (sum[j] > *norm) *norm = sum[j]; 129749b5e25fSSatish Balay } 129849b5e25fSSatish Balay } 129974ed9c26SBarry Smith ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr); 130051f70360SJed Brown ierr = PetscLogFlops(PetscMax(mbs*a->nz-1,0));CHKERRQ(ierr); 1301f23aa3ddSBarry Smith } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 130249b5e25fSSatish Balay PetscFunctionReturn(0); 130349b5e25fSSatish Balay } 130449b5e25fSSatish Balay 1305ace3abfcSBarry Smith PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg) 130649b5e25fSSatish Balay { 130749b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data; 1308dfbe8321SBarry Smith PetscErrorCode ierr; 130949b5e25fSSatish Balay 131049b5e25fSSatish Balay PetscFunctionBegin; 131149b5e25fSSatish Balay /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1312d0f46423SBarry Smith if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1313ef511fbeSHong Zhang *flg = PETSC_FALSE; 1314ef511fbeSHong Zhang PetscFunctionReturn(0); 131549b5e25fSSatish Balay } 131649b5e25fSSatish Balay 131749b5e25fSSatish Balay /* if the a->i are the same */ 1318580bdb30SBarry Smith ierr = PetscArraycmp(a->i,b->i,a->mbs+1,flg);CHKERRQ(ierr); 131926fbe8dcSKarl Rupp if (!*flg) PetscFunctionReturn(0); 132049b5e25fSSatish Balay 132149b5e25fSSatish Balay /* if a->j are the same */ 1322580bdb30SBarry Smith ierr = PetscArraycmp(a->j,b->j,a->nz,flg);CHKERRQ(ierr); 132326fbe8dcSKarl Rupp if (!*flg) PetscFunctionReturn(0); 132426fbe8dcSKarl Rupp 132549b5e25fSSatish Balay /* if a->a are the same */ 1326580bdb30SBarry Smith ierr = PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs),flg);CHKERRQ(ierr); 1327935af2e7SHong Zhang PetscFunctionReturn(0); 132849b5e25fSSatish Balay } 132949b5e25fSSatish Balay 1330dfbe8321SBarry Smith PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v) 133149b5e25fSSatish Balay { 133249b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1333dfbe8321SBarry Smith PetscErrorCode ierr; 1334d9ca1df4SBarry Smith PetscInt i,j,k,row,bs,ambs,bs2; 1335d9ca1df4SBarry Smith const PetscInt *ai,*aj; 133687828ca2SBarry Smith PetscScalar *x,zero = 0.0; 1337d9ca1df4SBarry Smith const MatScalar *aa,*aa_j; 133849b5e25fSSatish Balay 133949b5e25fSSatish Balay PetscFunctionBegin; 1340d0f46423SBarry Smith bs = A->rmap->bs; 1341*2c71b3e2SJacob Faibussowitsch PetscCheckFalse(A->factortype && bs>1,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1"); 134282799104SHong Zhang 134349b5e25fSSatish Balay aa = a->a; 13448a0c6efdSHong Zhang ambs = a->mbs; 13458a0c6efdSHong Zhang 13468a0c6efdSHong Zhang if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) { 13478a0c6efdSHong Zhang PetscInt *diag=a->diag; 13488a0c6efdSHong Zhang aa = a->a; 13498a0c6efdSHong Zhang ambs = a->mbs; 13508a0c6efdSHong Zhang ierr = VecGetArray(v,&x);CHKERRQ(ierr); 13518a0c6efdSHong Zhang for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]]; 13528a0c6efdSHong Zhang ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 13538a0c6efdSHong Zhang PetscFunctionReturn(0); 13548a0c6efdSHong Zhang } 13558a0c6efdSHong Zhang 135649b5e25fSSatish Balay ai = a->i; 135749b5e25fSSatish Balay aj = a->j; 135849b5e25fSSatish Balay bs2 = a->bs2; 13592dcb1b2aSMatthew Knepley ierr = VecSet(v,zero);CHKERRQ(ierr); 1360c40ae873SPierre Jolivet if (!a->nz) PetscFunctionReturn(0); 13611ebc52fbSHong Zhang ierr = VecGetArray(v,&x);CHKERRQ(ierr); 136249b5e25fSSatish Balay for (i=0; i<ambs; i++) { 136349b5e25fSSatish Balay j = ai[i]; 136449b5e25fSSatish Balay if (aj[j] == i) { /* if this is a diagonal element */ 136549b5e25fSSatish Balay row = i*bs; 136649b5e25fSSatish Balay aa_j = aa + j*bs2; 136749b5e25fSSatish Balay for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 136849b5e25fSSatish Balay } 136949b5e25fSSatish Balay } 13701ebc52fbSHong Zhang ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 137149b5e25fSSatish Balay PetscFunctionReturn(0); 137249b5e25fSSatish Balay } 137349b5e25fSSatish Balay 1374dfbe8321SBarry Smith PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr) 137549b5e25fSSatish Balay { 137649b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1377d9ca1df4SBarry Smith PetscScalar x; 1378d9ca1df4SBarry Smith const PetscScalar *l,*li,*ri; 137949b5e25fSSatish Balay MatScalar *aa,*v; 1380dfbe8321SBarry Smith PetscErrorCode ierr; 1381fff8e43fSBarry Smith PetscInt i,j,k,lm,M,m,mbs,tmp,bs,bs2; 1382fff8e43fSBarry Smith const PetscInt *ai,*aj; 1383ace3abfcSBarry Smith PetscBool flg; 138449b5e25fSSatish Balay 138549b5e25fSSatish Balay PetscFunctionBegin; 1386b3bf805bSHong Zhang if (ll != rr) { 1387b3bf805bSHong Zhang ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr); 1388*2c71b3e2SJacob Faibussowitsch PetscCheckFalse(!flg,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same"); 1389b3bf805bSHong Zhang } 1390b3bf805bSHong Zhang if (!ll) PetscFunctionReturn(0); 139149b5e25fSSatish Balay ai = a->i; 139249b5e25fSSatish Balay aj = a->j; 139349b5e25fSSatish Balay aa = a->a; 1394d0f46423SBarry Smith m = A->rmap->N; 1395d0f46423SBarry Smith bs = A->rmap->bs; 139649b5e25fSSatish Balay mbs = a->mbs; 139749b5e25fSSatish Balay bs2 = a->bs2; 139849b5e25fSSatish Balay 1399d9ca1df4SBarry Smith ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr); 140049b5e25fSSatish Balay ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1401*2c71b3e2SJacob Faibussowitsch PetscCheckFalse(lm != m,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 140249b5e25fSSatish Balay for (i=0; i<mbs; i++) { /* for each block row */ 140349b5e25fSSatish Balay M = ai[i+1] - ai[i]; 140449b5e25fSSatish Balay li = l + i*bs; 140549b5e25fSSatish Balay v = aa + bs2*ai[i]; 140649b5e25fSSatish Balay for (j=0; j<M; j++) { /* for each block */ 140749b5e25fSSatish Balay ri = l + bs*aj[ai[i]+j]; 14085e90f9d9SHong Zhang for (k=0; k<bs; k++) { 140949b5e25fSSatish Balay x = ri[k]; 141049b5e25fSSatish Balay for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x; 141149b5e25fSSatish Balay } 141249b5e25fSSatish Balay } 141349b5e25fSSatish Balay } 1414d9ca1df4SBarry Smith ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr); 1415dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 141649b5e25fSSatish Balay PetscFunctionReturn(0); 141749b5e25fSSatish Balay } 141849b5e25fSSatish Balay 1419dfbe8321SBarry Smith PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info) 142049b5e25fSSatish Balay { 142149b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 142249b5e25fSSatish Balay 142349b5e25fSSatish Balay PetscFunctionBegin; 142449b5e25fSSatish Balay info->block_size = a->bs2; 1425ceed8ce5SJed Brown info->nz_allocated = a->bs2*a->maxnz; /*num. of nonzeros in upper triangular part */ 14266c6c5352SBarry Smith info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */ 14273966268fSBarry Smith info->nz_unneeded = info->nz_allocated - info->nz_used; 142849b5e25fSSatish Balay info->assemblies = A->num_ass; 14298e58a170SBarry Smith info->mallocs = A->info.mallocs; 14307adad957SLisandro Dalcin info->memory = ((PetscObject)A)->mem; 1431d5f3da31SBarry Smith if (A->factortype) { 143249b5e25fSSatish Balay info->fill_ratio_given = A->info.fill_ratio_given; 143349b5e25fSSatish Balay info->fill_ratio_needed = A->info.fill_ratio_needed; 143449b5e25fSSatish Balay info->factor_mallocs = A->info.factor_mallocs; 143549b5e25fSSatish Balay } else { 143649b5e25fSSatish Balay info->fill_ratio_given = 0; 143749b5e25fSSatish Balay info->fill_ratio_needed = 0; 143849b5e25fSSatish Balay info->factor_mallocs = 0; 143949b5e25fSSatish Balay } 144049b5e25fSSatish Balay PetscFunctionReturn(0); 144149b5e25fSSatish Balay } 144249b5e25fSSatish Balay 1443dfbe8321SBarry Smith PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 144449b5e25fSSatish Balay { 144549b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1446dfbe8321SBarry Smith PetscErrorCode ierr; 144749b5e25fSSatish Balay 144849b5e25fSSatish Balay PetscFunctionBegin; 1449580bdb30SBarry Smith ierr = PetscArrayzero(a->a,a->bs2*a->i[a->mbs]);CHKERRQ(ierr); 145049b5e25fSSatish Balay PetscFunctionReturn(0); 145149b5e25fSSatish Balay } 1452dc354874SHong Zhang 1453985db425SBarry Smith PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[]) 1454dc354874SHong Zhang { 1455dc354874SHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1456dfbe8321SBarry Smith PetscErrorCode ierr; 1457d9ca1df4SBarry Smith PetscInt i,j,n,row,col,bs,mbs; 1458d9ca1df4SBarry Smith const PetscInt *ai,*aj; 1459c3fca9a7SHong Zhang PetscReal atmp; 1460d9ca1df4SBarry Smith const MatScalar *aa; 1461985db425SBarry Smith PetscScalar *x; 146213f74950SBarry Smith PetscInt ncols,brow,bcol,krow,kcol; 1463dc354874SHong Zhang 1464dc354874SHong Zhang PetscFunctionBegin; 1465*2c71b3e2SJacob Faibussowitsch PetscCheckFalse(idx,PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov"); 1466*2c71b3e2SJacob Faibussowitsch PetscCheckFalse(A->factortype,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1467d0f46423SBarry Smith bs = A->rmap->bs; 1468dc354874SHong Zhang aa = a->a; 1469dc354874SHong Zhang ai = a->i; 1470dc354874SHong Zhang aj = a->j; 147144117c81SHong Zhang mbs = a->mbs; 1472dc354874SHong Zhang 1473985db425SBarry Smith ierr = VecSet(v,0.0);CHKERRQ(ierr); 14741ebc52fbSHong Zhang ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1475dc354874SHong Zhang ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1476*2c71b3e2SJacob Faibussowitsch PetscCheckFalse(n != A->rmap->N,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 147744117c81SHong Zhang for (i=0; i<mbs; i++) { 1478d0f6400bSHong Zhang ncols = ai[1] - ai[0]; ai++; 1479d0f6400bSHong Zhang brow = bs*i; 148044117c81SHong Zhang for (j=0; j<ncols; j++) { 1481d0f6400bSHong Zhang bcol = bs*(*aj); 148244117c81SHong Zhang for (kcol=0; kcol<bs; kcol++) { 1483d0f6400bSHong Zhang col = bcol + kcol; /* col index */ 148444117c81SHong Zhang for (krow=0; krow<bs; krow++) { 1485d0f6400bSHong Zhang atmp = PetscAbsScalar(*aa); aa++; 1486d0f6400bSHong Zhang row = brow + krow; /* row index */ 1487c3fca9a7SHong Zhang if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1488c3fca9a7SHong Zhang if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 148944117c81SHong Zhang } 149044117c81SHong Zhang } 1491d0f6400bSHong Zhang aj++; 1492dc354874SHong Zhang } 1493dc354874SHong Zhang } 14941ebc52fbSHong Zhang ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1495dc354874SHong Zhang PetscFunctionReturn(0); 1496dc354874SHong Zhang } 1497c2916339SPierre Jolivet 14984222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C) 1499c2916339SPierre Jolivet { 1500c2916339SPierre Jolivet PetscErrorCode ierr; 1501c2916339SPierre Jolivet 1502c2916339SPierre Jolivet PetscFunctionBegin; 1503c2916339SPierre Jolivet ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr); 15044222ddf1SHong Zhang C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense; 1505c2916339SPierre Jolivet PetscFunctionReturn(0); 1506c2916339SPierre Jolivet } 1507c2916339SPierre Jolivet 1508c2916339SPierre Jolivet PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 1509c2916339SPierre Jolivet { 1510c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1511c2916339SPierre Jolivet PetscScalar *z = c; 1512c2916339SPierre Jolivet const PetscScalar *xb; 1513c2916339SPierre Jolivet PetscScalar x1; 1514c2916339SPierre Jolivet const MatScalar *v = a->a,*vv; 1515c2916339SPierre Jolivet PetscInt mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k; 15163c2e41e1SStefano Zampini #if defined(PETSC_USE_COMPLEX) 15173c2e41e1SStefano Zampini const int aconj = A->hermitian; 15183c2e41e1SStefano Zampini #else 15193c2e41e1SStefano Zampini const int aconj = 0; 15203c2e41e1SStefano Zampini #endif 1521c2916339SPierre Jolivet 1522c2916339SPierre Jolivet PetscFunctionBegin; 1523c2916339SPierre Jolivet for (i=0; i<mbs; i++) { 1524c2916339SPierre Jolivet n = ii[1] - ii[0]; ii++; 1525c2916339SPierre Jolivet PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1526c2916339SPierre Jolivet PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1527c2916339SPierre Jolivet jj = idx; 1528c2916339SPierre Jolivet vv = v; 1529c2916339SPierre Jolivet for (k=0; k<cn; k++) { 1530c2916339SPierre Jolivet idx = jj; 1531c2916339SPierre Jolivet v = vv; 1532c2916339SPierre Jolivet for (j=0; j<n; j++) { 1533c2916339SPierre Jolivet xb = b + (*idx); x1 = xb[0+k*bm]; 1534c2916339SPierre Jolivet z[0+k*cm] += v[0]*x1; 15353c2e41e1SStefano Zampini if (*idx != i) c[(*idx)+k*cm] += (aconj ? PetscConj(v[0]) : v[0])*b[i+k*bm]; 1536c2916339SPierre Jolivet v += 1; 1537c2916339SPierre Jolivet ++idx; 1538c2916339SPierre Jolivet } 1539c2916339SPierre Jolivet } 1540c2916339SPierre Jolivet z += 1; 1541c2916339SPierre Jolivet } 1542c2916339SPierre Jolivet PetscFunctionReturn(0); 1543c2916339SPierre Jolivet } 1544c2916339SPierre Jolivet 1545c2916339SPierre Jolivet PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 1546c2916339SPierre Jolivet { 1547c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1548c2916339SPierre Jolivet PetscScalar *z = c; 1549c2916339SPierre Jolivet const PetscScalar *xb; 1550c2916339SPierre Jolivet PetscScalar x1,x2; 1551c2916339SPierre Jolivet const MatScalar *v = a->a,*vv; 1552c2916339SPierre Jolivet PetscInt mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k; 1553c2916339SPierre Jolivet 1554c2916339SPierre Jolivet PetscFunctionBegin; 1555c2916339SPierre Jolivet for (i=0; i<mbs; i++) { 1556c2916339SPierre Jolivet n = ii[1] - ii[0]; ii++; 1557c2916339SPierre Jolivet PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1558c2916339SPierre Jolivet PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1559c2916339SPierre Jolivet jj = idx; 1560c2916339SPierre Jolivet vv = v; 1561c2916339SPierre Jolivet for (k=0; k<cn; k++) { 1562c2916339SPierre Jolivet idx = jj; 1563c2916339SPierre Jolivet v = vv; 1564c2916339SPierre Jolivet for (j=0; j<n; j++) { 1565c2916339SPierre Jolivet xb = b + 2*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; 1566c2916339SPierre Jolivet z[0+k*cm] += v[0]*x1 + v[2]*x2; 1567c2916339SPierre Jolivet z[1+k*cm] += v[1]*x1 + v[3]*x2; 1568c2916339SPierre Jolivet if (*idx != i) { 1569c2916339SPierre Jolivet c[2*(*idx)+0+k*cm] += v[0]*b[2*i+k*bm] + v[1]*b[2*i+1+k*bm]; 1570c2916339SPierre Jolivet c[2*(*idx)+1+k*cm] += v[2]*b[2*i+k*bm] + v[3]*b[2*i+1+k*bm]; 1571c2916339SPierre Jolivet } 1572c2916339SPierre Jolivet v += 4; 1573c2916339SPierre Jolivet ++idx; 1574c2916339SPierre Jolivet } 1575c2916339SPierre Jolivet } 1576c2916339SPierre Jolivet z += 2; 1577c2916339SPierre Jolivet } 1578c2916339SPierre Jolivet PetscFunctionReturn(0); 1579c2916339SPierre Jolivet } 1580c2916339SPierre Jolivet 1581c2916339SPierre Jolivet PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 1582c2916339SPierre Jolivet { 1583c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1584c2916339SPierre Jolivet PetscScalar *z = c; 1585c2916339SPierre Jolivet const PetscScalar *xb; 1586c2916339SPierre Jolivet PetscScalar x1,x2,x3; 1587c2916339SPierre Jolivet const MatScalar *v = a->a,*vv; 1588c2916339SPierre Jolivet PetscInt mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k; 1589c2916339SPierre Jolivet 1590c2916339SPierre Jolivet PetscFunctionBegin; 1591c2916339SPierre Jolivet for (i=0; i<mbs; i++) { 1592c2916339SPierre Jolivet n = ii[1] - ii[0]; ii++; 1593c2916339SPierre Jolivet PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1594c2916339SPierre Jolivet PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1595c2916339SPierre Jolivet jj = idx; 1596c2916339SPierre Jolivet vv = v; 1597c2916339SPierre Jolivet for (k=0; k<cn; k++) { 1598c2916339SPierre Jolivet idx = jj; 1599c2916339SPierre Jolivet v = vv; 1600c2916339SPierre Jolivet for (j=0; j<n; j++) { 1601c2916339SPierre Jolivet xb = b + 3*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; 1602c2916339SPierre Jolivet z[0+k*cm] += v[0]*x1 + v[3]*x2 + v[6]*x3; 1603c2916339SPierre Jolivet z[1+k*cm] += v[1]*x1 + v[4]*x2 + v[7]*x3; 1604c2916339SPierre Jolivet z[2+k*cm] += v[2]*x1 + v[5]*x2 + v[8]*x3; 1605c2916339SPierre Jolivet if (*idx != i) { 1606c2916339SPierre Jolivet c[3*(*idx)+0+k*cm] += v[0]*b[3*i+k*bm] + v[3]*b[3*i+1+k*bm] + v[6]*b[3*i+2+k*bm]; 1607c2916339SPierre Jolivet c[3*(*idx)+1+k*cm] += v[1]*b[3*i+k*bm] + v[4]*b[3*i+1+k*bm] + v[7]*b[3*i+2+k*bm]; 1608c2916339SPierre Jolivet c[3*(*idx)+2+k*cm] += v[2]*b[3*i+k*bm] + v[5]*b[3*i+1+k*bm] + v[8]*b[3*i+2+k*bm]; 1609c2916339SPierre Jolivet } 1610c2916339SPierre Jolivet v += 9; 1611c2916339SPierre Jolivet ++idx; 1612c2916339SPierre Jolivet } 1613c2916339SPierre Jolivet } 1614c2916339SPierre Jolivet z += 3; 1615c2916339SPierre Jolivet } 1616c2916339SPierre Jolivet PetscFunctionReturn(0); 1617c2916339SPierre Jolivet } 1618c2916339SPierre Jolivet 1619c2916339SPierre Jolivet PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 1620c2916339SPierre Jolivet { 1621c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1622c2916339SPierre Jolivet PetscScalar *z = c; 1623c2916339SPierre Jolivet const PetscScalar *xb; 1624c2916339SPierre Jolivet PetscScalar x1,x2,x3,x4; 1625c2916339SPierre Jolivet const MatScalar *v = a->a,*vv; 1626c2916339SPierre Jolivet PetscInt mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k; 1627c2916339SPierre Jolivet 1628c2916339SPierre Jolivet PetscFunctionBegin; 1629c2916339SPierre Jolivet for (i=0; i<mbs; i++) { 1630c2916339SPierre Jolivet n = ii[1] - ii[0]; ii++; 1631c2916339SPierre Jolivet PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1632c2916339SPierre Jolivet PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1633c2916339SPierre Jolivet jj = idx; 1634c2916339SPierre Jolivet vv = v; 1635c2916339SPierre Jolivet for (k=0; k<cn; k++) { 1636c2916339SPierre Jolivet idx = jj; 1637c2916339SPierre Jolivet v = vv; 1638c2916339SPierre Jolivet for (j=0; j<n; j++) { 1639c2916339SPierre Jolivet xb = b + 4*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; 1640c2916339SPierre Jolivet z[0+k*cm] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1641c2916339SPierre Jolivet z[1+k*cm] += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1642c2916339SPierre Jolivet z[2+k*cm] += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1643c2916339SPierre Jolivet z[3+k*cm] += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1644c2916339SPierre Jolivet if (*idx != i) { 1645c2916339SPierre Jolivet c[4*(*idx)+0+k*cm] += v[0]*b[4*i+k*bm] + v[4]*b[4*i+1+k*bm] + v[8]*b[4*i+2+k*bm] + v[12]*b[4*i+3+k*bm]; 1646c2916339SPierre Jolivet c[4*(*idx)+1+k*cm] += v[1]*b[4*i+k*bm] + v[5]*b[4*i+1+k*bm] + v[9]*b[4*i+2+k*bm] + v[13]*b[4*i+3+k*bm]; 1647c2916339SPierre Jolivet c[4*(*idx)+2+k*cm] += v[2]*b[4*i+k*bm] + v[6]*b[4*i+1+k*bm] + v[10]*b[4*i+2+k*bm] + v[14]*b[4*i+3+k*bm]; 1648c2916339SPierre Jolivet c[4*(*idx)+3+k*cm] += v[3]*b[4*i+k*bm] + v[7]*b[4*i+1+k*bm] + v[11]*b[4*i+2+k*bm] + v[15]*b[4*i+3+k*bm]; 1649c2916339SPierre Jolivet } 1650c2916339SPierre Jolivet v += 16; 1651c2916339SPierre Jolivet ++idx; 1652c2916339SPierre Jolivet } 1653c2916339SPierre Jolivet } 1654c2916339SPierre Jolivet z += 4; 1655c2916339SPierre Jolivet } 1656c2916339SPierre Jolivet PetscFunctionReturn(0); 1657c2916339SPierre Jolivet } 1658c2916339SPierre Jolivet 1659c2916339SPierre Jolivet PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 1660c2916339SPierre Jolivet { 1661c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1662c2916339SPierre Jolivet PetscScalar *z = c; 1663c2916339SPierre Jolivet const PetscScalar *xb; 1664c2916339SPierre Jolivet PetscScalar x1,x2,x3,x4,x5; 1665c2916339SPierre Jolivet const MatScalar *v = a->a,*vv; 1666c2916339SPierre Jolivet PetscInt mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k; 1667c2916339SPierre Jolivet 1668c2916339SPierre Jolivet PetscFunctionBegin; 1669c2916339SPierre Jolivet for (i=0; i<mbs; i++) { 1670c2916339SPierre Jolivet n = ii[1] - ii[0]; ii++; 1671c2916339SPierre Jolivet PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1672c2916339SPierre Jolivet PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1673c2916339SPierre Jolivet jj = idx; 1674c2916339SPierre Jolivet vv = v; 1675c2916339SPierre Jolivet for (k=0; k<cn; k++) { 1676c2916339SPierre Jolivet idx = jj; 1677c2916339SPierre Jolivet v = vv; 1678c2916339SPierre Jolivet for (j=0; j<n; j++) { 1679c2916339SPierre Jolivet xb = b + 5*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; x5 = xb[4+k*cm]; 1680c2916339SPierre Jolivet z[0+k*cm] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1681c2916339SPierre Jolivet z[1+k*cm] += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1682c2916339SPierre Jolivet z[2+k*cm] += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1683c2916339SPierre Jolivet z[3+k*cm] += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1684c2916339SPierre Jolivet z[4+k*cm] += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1685c2916339SPierre Jolivet if (*idx != i) { 1686c2916339SPierre Jolivet c[5*(*idx)+0+k*cm] += v[0]*b[5*i+k*bm] + v[5]*b[5*i+1+k*bm] + v[10]*b[5*i+2+k*bm] + v[15]*b[5*i+3+k*bm] + v[20]*b[5*i+4+k*bm]; 1687c2916339SPierre Jolivet c[5*(*idx)+1+k*cm] += v[1]*b[5*i+k*bm] + v[6]*b[5*i+1+k*bm] + v[11]*b[5*i+2+k*bm] + v[16]*b[5*i+3+k*bm] + v[21]*b[5*i+4+k*bm]; 1688c2916339SPierre Jolivet c[5*(*idx)+2+k*cm] += v[2]*b[5*i+k*bm] + v[7]*b[5*i+1+k*bm] + v[12]*b[5*i+2+k*bm] + v[17]*b[5*i+3+k*bm] + v[22]*b[5*i+4+k*bm]; 1689c2916339SPierre Jolivet c[5*(*idx)+3+k*cm] += v[3]*b[5*i+k*bm] + v[8]*b[5*i+1+k*bm] + v[13]*b[5*i+2+k*bm] + v[18]*b[5*i+3+k*bm] + v[23]*b[5*i+4+k*bm]; 1690c2916339SPierre Jolivet c[5*(*idx)+4+k*cm] += v[4]*b[5*i+k*bm] + v[9]*b[5*i+1+k*bm] + v[14]*b[5*i+2+k*bm] + v[19]*b[5*i+3+k*bm] + v[24]*b[5*i+4+k*bm]; 1691c2916339SPierre Jolivet } 1692c2916339SPierre Jolivet v += 25; 1693c2916339SPierre Jolivet ++idx; 1694c2916339SPierre Jolivet } 1695c2916339SPierre Jolivet } 1696c2916339SPierre Jolivet z += 5; 1697c2916339SPierre Jolivet } 1698c2916339SPierre Jolivet PetscFunctionReturn(0); 1699c2916339SPierre Jolivet } 1700c2916339SPierre Jolivet 1701c2916339SPierre Jolivet PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A,Mat B,Mat C) 1702c2916339SPierre Jolivet { 1703c2916339SPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1704c2916339SPierre Jolivet Mat_SeqDense *bd = (Mat_SeqDense*)B->data; 1705281439baSStefano Zampini Mat_SeqDense *cd = (Mat_SeqDense*)C->data; 1706c2916339SPierre Jolivet PetscInt cm=cd->lda,cn=B->cmap->n,bm=bd->lda; 1707c2916339SPierre Jolivet PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 1708c2916339SPierre Jolivet PetscBLASInt bbs,bcn,bbm,bcm; 1709f4259b30SLisandro Dalcin PetscScalar *z = NULL; 1710c2916339SPierre Jolivet PetscScalar *c,*b; 1711c2916339SPierre Jolivet const MatScalar *v; 1712c2916339SPierre Jolivet const PetscInt *idx,*ii; 1713c2916339SPierre Jolivet PetscScalar _DOne=1.0; 1714c2916339SPierre Jolivet PetscErrorCode ierr; 1715c2916339SPierre Jolivet 1716c2916339SPierre Jolivet PetscFunctionBegin; 1717c2916339SPierre Jolivet if (!cm || !cn) PetscFunctionReturn(0); 1718*2c71b3e2SJacob Faibussowitsch PetscCheckFalse(B->rmap->n != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT,A->cmap->n,B->rmap->n); 1719*2c71b3e2SJacob Faibussowitsch PetscCheckFalse(A->rmap->n != C->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT,C->rmap->n,A->rmap->n); 1720*2c71b3e2SJacob Faibussowitsch PetscCheckFalse(B->cmap->n != C->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT,B->cmap->n,C->cmap->n); 1721c2916339SPierre Jolivet b = bd->v; 1722c2916339SPierre Jolivet ierr = MatZeroEntries(C);CHKERRQ(ierr); 1723c2916339SPierre Jolivet ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr); 1724c2916339SPierre Jolivet switch (bs) { 1725c2916339SPierre Jolivet case 1: 17261e1ea65dSPierre Jolivet ierr = MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn);CHKERRQ(ierr); 1727c2916339SPierre Jolivet break; 1728c2916339SPierre Jolivet case 2: 17291e1ea65dSPierre Jolivet ierr = MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn);CHKERRQ(ierr); 1730c2916339SPierre Jolivet break; 1731c2916339SPierre Jolivet case 3: 17321e1ea65dSPierre Jolivet ierr = MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn);CHKERRQ(ierr); 1733c2916339SPierre Jolivet break; 1734c2916339SPierre Jolivet case 4: 17351e1ea65dSPierre Jolivet ierr = MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn);CHKERRQ(ierr); 1736c2916339SPierre Jolivet break; 1737c2916339SPierre Jolivet case 5: 17381e1ea65dSPierre Jolivet ierr = MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn);CHKERRQ(ierr); 1739c2916339SPierre Jolivet break; 1740c2916339SPierre Jolivet default: /* block sizes larger than 5 by 5 are handled by BLAS */ 1741c2916339SPierre Jolivet ierr = PetscBLASIntCast(bs,&bbs);CHKERRQ(ierr); 1742c2916339SPierre Jolivet ierr = PetscBLASIntCast(cn,&bcn);CHKERRQ(ierr); 1743c2916339SPierre Jolivet ierr = PetscBLASIntCast(bm,&bbm);CHKERRQ(ierr); 1744c2916339SPierre Jolivet ierr = PetscBLASIntCast(cm,&bcm);CHKERRQ(ierr); 1745c2916339SPierre Jolivet idx = a->j; 1746c2916339SPierre Jolivet v = a->a; 1747c2916339SPierre Jolivet mbs = a->mbs; 1748c2916339SPierre Jolivet ii = a->i; 1749c2916339SPierre Jolivet z = c; 1750c2916339SPierre Jolivet for (i=0; i<mbs; i++) { 1751c2916339SPierre Jolivet n = ii[1] - ii[0]; ii++; 1752c2916339SPierre Jolivet for (j=0; j<n; j++) { 17536718818eSStefano Zampini if (*idx != i) PetscStackCallBLAS("BLASgemm",BLASgemm_("T","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*i,&bbm,&_DOne,c+bs*(*idx),&bcm)); 1754c2916339SPierre Jolivet PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm)); 1755c2916339SPierre Jolivet v += bs2; 1756c2916339SPierre Jolivet } 1757c2916339SPierre Jolivet z += bs; 1758c2916339SPierre Jolivet } 1759c2916339SPierre Jolivet } 1760c2916339SPierre Jolivet ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr); 1761c2916339SPierre Jolivet ierr = PetscLogFlops((2.0*(a->nz*2.0 - a->nonzerorowcnt)*bs2 - a->nonzerorowcnt)*cn);CHKERRQ(ierr); 1762c2916339SPierre Jolivet PetscFunctionReturn(0); 1763c2916339SPierre Jolivet } 1764