1cac129eeSSatish Balay 2c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h> 306873bf2SBarry Smith #include <petsc-private/kernels/blockinvert.h> 4c6db04a5SJed Brown #include <petscbt.h> 5c6db04a5SJed Brown #include <petscblaslapack.h> 6cac129eeSSatish Balay 74a2ae208SSatish Balay #undef __FUNCT__ 84a2ae208SSatish Balay #define __FUNCT__ "MatIncreaseOverlap_SeqBAIJ" 9690b6cddSBarry Smith PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 10a3192f15SSatish Balay { 11a3192f15SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 126849ba73SBarry Smith PetscErrorCode ierr; 135d0c19d7SBarry Smith PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival; 145d0c19d7SBarry Smith const PetscInt *idx; 15690b6cddSBarry Smith PetscInt start,end,*ai,*aj,bs,*nidx2; 16f1af5d2fSBarry Smith PetscBT table; 17a3192f15SSatish Balay 183a40ed3dSBarry Smith PetscFunctionBegin; 19a3192f15SSatish Balay m = a->mbs; 20a3192f15SSatish Balay ai = a->i; 21a3192f15SSatish Balay aj = a->j; 22d0f46423SBarry Smith bs = A->rmap->bs; 23a3192f15SSatish Balay 24e32f2f54SBarry Smith if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 25a3192f15SSatish Balay 2653b8de81SBarry Smith ierr = PetscBTCreate(m,&table);CHKERRQ(ierr); 27690b6cddSBarry Smith ierr = PetscMalloc((m+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr); 28d0f46423SBarry Smith ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr); 29a3192f15SSatish Balay 30a3192f15SSatish Balay for (i=0; i<is_max; i++) { 31a3192f15SSatish Balay /* Initialise the two local arrays */ 32a3192f15SSatish Balay isz = 0; 336831982aSBarry Smith ierr = PetscBTMemzero(m,table);CHKERRQ(ierr); 34a3192f15SSatish Balay 35a3192f15SSatish Balay /* Extract the indices, assume there can be duplicate entries */ 36a3192f15SSatish Balay ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 37b9b97703SBarry Smith ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 38a3192f15SSatish Balay 39a3192f15SSatish Balay /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 40a3192f15SSatish Balay for (j=0; j<n; ++j) { 41218c64b6SSatish Balay ival = idx[j]/bs; /* convert the indices into block indices */ 42e32f2f54SBarry Smith if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 4326fbe8dcSKarl Rupp if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival; 44a3192f15SSatish Balay } 45a3192f15SSatish Balay ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 466bf464f9SBarry Smith ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 47a3192f15SSatish Balay 48a3192f15SSatish Balay k = 0; 49a3192f15SSatish Balay for (j=0; j<ov; j++) { /* for each overlap*/ 50a3192f15SSatish Balay n = isz; 51a3192f15SSatish Balay for (; k<n; k++) { /* do only those rows in nidx[k], which are not done yet */ 52a3192f15SSatish Balay row = nidx[k]; 53a3192f15SSatish Balay start = ai[row]; 54a3192f15SSatish Balay end = ai[row+1]; 55a3192f15SSatish Balay for (l = start; l<end; l++) { 56a3192f15SSatish Balay val = aj[l]; 5726fbe8dcSKarl Rupp if (!PetscBTLookupSet(table,val)) nidx[isz++] = val; 58a3192f15SSatish Balay } 59a3192f15SSatish Balay } 60a3192f15SSatish Balay } 61218c64b6SSatish Balay /* expand the Index Set */ 62218c64b6SSatish Balay for (j=0; j<isz; j++) { 6326fbe8dcSKarl Rupp for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k; 64218c64b6SSatish Balay } 6570b3c8c7SBarry Smith ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 66a3192f15SSatish Balay } 6794bacf5dSBarry Smith ierr = PetscBTDestroy(&table);CHKERRQ(ierr); 68606d414cSSatish Balay ierr = PetscFree(nidx);CHKERRQ(ierr); 69606d414cSSatish Balay ierr = PetscFree(nidx2);CHKERRQ(ierr); 703a40ed3dSBarry Smith PetscFunctionReturn(0); 71a3192f15SSatish Balay } 721c351548SSatish Balay 734a2ae208SSatish Balay #undef __FUNCT__ 744a2ae208SSatish Balay #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ_Private" 754aa3045dSJed Brown PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 76736121d4SSatish Balay { 77736121d4SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c; 786849ba73SBarry Smith PetscErrorCode ierr; 79690b6cddSBarry Smith PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 80690b6cddSBarry Smith PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 815d0c19d7SBarry Smith const PetscInt *irow,*icol; 825d0c19d7SBarry Smith PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 83690b6cddSBarry Smith PetscInt *aj = a->j,*ai = a->i; 843f1db9ecSBarry Smith MatScalar *mat_a; 85736121d4SSatish Balay Mat C; 86ace3abfcSBarry Smith PetscBool flag,sorted; 87736121d4SSatish Balay 883a40ed3dSBarry Smith PetscFunctionBegin; 8914ca34e6SBarry Smith ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr); 90e32f2f54SBarry Smith if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 91736121d4SSatish Balay 92736121d4SSatish Balay ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 93218c64b6SSatish Balay ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 94b9b97703SBarry Smith ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 95b9b97703SBarry Smith ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 96736121d4SSatish Balay 97690b6cddSBarry Smith ierr = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr); 98736121d4SSatish Balay ssmap = smap; 99690b6cddSBarry Smith ierr = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr); 100690b6cddSBarry Smith ierr = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr); 101736121d4SSatish Balay for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 102736121d4SSatish Balay /* determine lens of each row */ 103736121d4SSatish Balay for (i=0; i<nrows; i++) { 104736121d4SSatish Balay kstart = ai[irow[i]]; 105736121d4SSatish Balay kend = kstart + a->ilen[irow[i]]; 106736121d4SSatish Balay lens[i] = 0; 107736121d4SSatish Balay for (k=kstart; k<kend; k++) { 10826fbe8dcSKarl Rupp if (ssmap[aj[k]]) lens[i]++; 109736121d4SSatish Balay } 110736121d4SSatish Balay } 111736121d4SSatish Balay /* Create and fill new matrix */ 112736121d4SSatish Balay if (scall == MAT_REUSE_MATRIX) { 113736121d4SSatish Balay c = (Mat_SeqBAIJ*)((*B)->data); 114736121d4SSatish Balay 115e32f2f54SBarry Smith if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 116690b6cddSBarry Smith ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 117e7e72b3dSBarry Smith if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 118690b6cddSBarry Smith ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 119736121d4SSatish Balay C = *B; 1203a40ed3dSBarry Smith } else { 121ce94432eSBarry Smith ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr); 122f69a0ea3SMatthew Knepley ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1237adad957SLisandro Dalcin ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 124ab93d7beSBarry Smith ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);CHKERRQ(ierr); 125736121d4SSatish Balay } 126736121d4SSatish Balay c = (Mat_SeqBAIJ*)(C->data); 127736121d4SSatish Balay for (i=0; i<nrows; i++) { 128736121d4SSatish Balay row = irow[i]; 129736121d4SSatish Balay kstart = ai[row]; 130736121d4SSatish Balay kend = kstart + a->ilen[row]; 131736121d4SSatish Balay mat_i = c->i[i]; 132736121d4SSatish Balay mat_j = c->j + mat_i; 133218c64b6SSatish Balay mat_a = c->a + mat_i*bs2; 134736121d4SSatish Balay mat_ilen = c->ilen + i; 135736121d4SSatish Balay for (k=kstart; k<kend; k++) { 136736121d4SSatish Balay if ((tcol=ssmap[a->j[k]])) { 137736121d4SSatish Balay *mat_j++ = tcol - 1; 138549d3d68SSatish Balay ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 139549d3d68SSatish Balay mat_a += bs2; 140736121d4SSatish Balay (*mat_ilen)++; 141736121d4SSatish Balay } 142736121d4SSatish Balay } 143736121d4SSatish Balay } 144218c64b6SSatish Balay 145736121d4SSatish Balay /* Free work space */ 146736121d4SSatish Balay ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 147606d414cSSatish Balay ierr = PetscFree(smap);CHKERRQ(ierr); 148606d414cSSatish Balay ierr = PetscFree(lens);CHKERRQ(ierr); 1496d4a8577SBarry Smith ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1506d4a8577SBarry Smith ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 151736121d4SSatish Balay 152736121d4SSatish Balay ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 153736121d4SSatish Balay *B = C; 1543a40ed3dSBarry Smith PetscFunctionReturn(0); 155736121d4SSatish Balay } 156736121d4SSatish Balay 1574a2ae208SSatish Balay #undef __FUNCT__ 1584a2ae208SSatish Balay #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ" 1594aa3045dSJed Brown PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 160218c64b6SSatish Balay { 161218c64b6SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 162218c64b6SSatish Balay IS is1,is2; 1636849ba73SBarry Smith PetscErrorCode ierr; 1645d0c19d7SBarry Smith PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count; 1655d0c19d7SBarry Smith const PetscInt *irow,*icol; 166218c64b6SSatish Balay 1673a40ed3dSBarry Smith PetscFunctionBegin; 168218c64b6SSatish Balay ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 169218c64b6SSatish Balay ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 170b9b97703SBarry Smith ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 171b9b97703SBarry Smith ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 172218c64b6SSatish Balay 173218c64b6SSatish Balay /* Verify if the indices corespond to each element in a block 174218c64b6SSatish Balay and form the IS with compressed IS */ 175fca92195SBarry Smith ierr = PetscMalloc2(a->mbs,PetscInt,&vary,a->mbs,PetscInt,&iary);CHKERRQ(ierr); 176fca92195SBarry Smith ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr); 177218c64b6SSatish Balay for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 178218c64b6SSatish Balay count = 0; 179218c64b6SSatish Balay for (i=0; i<a->mbs; i++) { 180e32f2f54SBarry Smith if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 181218c64b6SSatish Balay if (vary[i]==bs) iary[count++] = i; 182218c64b6SSatish Balay } 18370b3c8c7SBarry Smith ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr); 184218c64b6SSatish Balay 185690b6cddSBarry Smith ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr); 186218c64b6SSatish Balay for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 187218c64b6SSatish Balay count = 0; 188218c64b6SSatish Balay for (i=0; i<a->mbs; i++) { 189e32f2f54SBarry Smith if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc"); 190218c64b6SSatish Balay if (vary[i]==bs) iary[count++] = i; 191218c64b6SSatish Balay } 19270b3c8c7SBarry Smith ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr); 193218c64b6SSatish Balay ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 194218c64b6SSatish Balay ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 195fca92195SBarry Smith ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 196218c64b6SSatish Balay 1974aa3045dSJed Brown ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr); 1986bf464f9SBarry Smith ierr = ISDestroy(&is1);CHKERRQ(ierr); 1996bf464f9SBarry Smith ierr = ISDestroy(&is2);CHKERRQ(ierr); 2003a40ed3dSBarry Smith PetscFunctionReturn(0); 201218c64b6SSatish Balay } 202218c64b6SSatish Balay 2034a2ae208SSatish Balay #undef __FUNCT__ 2044a2ae208SSatish Balay #define __FUNCT__ "MatGetSubMatrices_SeqBAIJ" 205690b6cddSBarry Smith PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 206736121d4SSatish Balay { 2076849ba73SBarry Smith PetscErrorCode ierr; 208690b6cddSBarry Smith PetscInt i; 209736121d4SSatish Balay 2103a40ed3dSBarry Smith PetscFunctionBegin; 211736121d4SSatish Balay if (scall == MAT_INITIAL_MATRIX) { 21282502324SSatish Balay ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr); 213736121d4SSatish Balay } 214736121d4SSatish Balay 215736121d4SSatish Balay for (i=0; i<n; i++) { 2164aa3045dSJed Brown ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 217736121d4SSatish Balay } 2183a40ed3dSBarry Smith PetscFunctionReturn(0); 219736121d4SSatish Balay } 220218c64b6SSatish Balay 221218c64b6SSatish Balay 2222d61bbb3SSatish Balay /* -------------------------------------------------------*/ 2232d61bbb3SSatish Balay /* Should check that shapes of vectors and matrices match */ 2242d61bbb3SSatish Balay /* -------------------------------------------------------*/ 2252d61bbb3SSatish Balay 2264a2ae208SSatish Balay #undef __FUNCT__ 2274a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqBAIJ_1" 228dfbe8321SBarry Smith PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz) 2292d61bbb3SSatish Balay { 2302d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 231d9fead3dSBarry Smith PetscScalar *z,sum; 232d9fead3dSBarry Smith const PetscScalar *x; 233d9fead3dSBarry Smith const MatScalar *v; 2346849ba73SBarry Smith PetscErrorCode ierr; 235*7c565772SBarry Smith PetscInt mbs,i,n; 2360298fd71SBarry Smith const PetscInt *idx,*ii,*ridx=NULL; 237ace3abfcSBarry Smith PetscBool usecprow=a->compressedrow.use; 2382d61bbb3SSatish Balay 2392d61bbb3SSatish Balay PetscFunctionBegin; 2403649974fSBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2411ebc52fbSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 2422d61bbb3SSatish Balay 24326e093fcSHong Zhang if (usecprow) { 24426e093fcSHong Zhang mbs = a->compressedrow.nrows; 24526e093fcSHong Zhang ii = a->compressedrow.i; 2467b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 247a1c3900fSBarry Smith ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 24826e093fcSHong Zhang } else { 24926e093fcSHong Zhang mbs = a->mbs; 2502d61bbb3SSatish Balay ii = a->i; 25126e093fcSHong Zhang } 2522d61bbb3SSatish Balay 2532d61bbb3SSatish Balay for (i=0; i<mbs; i++) { 254ee54c7eeSHong Zhang n = ii[1] - ii[0]; 255ee54c7eeSHong Zhang v = a->a + ii[0]; 256ee54c7eeSHong Zhang idx = a->j + ii[0]; 257ee54c7eeSHong Zhang ii++; 258444d8c10SJed Brown PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 259444d8c10SJed Brown PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2602d61bbb3SSatish Balay sum = 0.0; 2612162cab8SBarry Smith PetscSparseDensePlusDot(sum,x,v,idx,n); 26226e093fcSHong Zhang if (usecprow) { 2637b2bb3b9SHong Zhang z[ridx[i]] = sum; 26426e093fcSHong Zhang } else { 2652d61bbb3SSatish Balay z[i] = sum; 2662d61bbb3SSatish Balay } 26726e093fcSHong Zhang } 2683649974fSBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2691ebc52fbSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 270*7c565772SBarry Smith ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr); 2712d61bbb3SSatish Balay PetscFunctionReturn(0); 2722d61bbb3SSatish Balay } 2732d61bbb3SSatish Balay 2744a2ae208SSatish Balay #undef __FUNCT__ 2754a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqBAIJ_2" 276dfbe8321SBarry Smith PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) 2772d61bbb3SSatish Balay { 2782d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 279d9fead3dSBarry Smith PetscScalar *z = 0,sum1,sum2,*zarray; 280d9fead3dSBarry Smith const PetscScalar *x,*xb; 28187828ca2SBarry Smith PetscScalar x1,x2; 282d9fead3dSBarry Smith const MatScalar *v; 283dfbe8321SBarry Smith PetscErrorCode ierr; 284*7c565772SBarry Smith PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 285ace3abfcSBarry Smith PetscBool usecprow=a->compressedrow.use; 2862d61bbb3SSatish Balay 2872d61bbb3SSatish Balay PetscFunctionBegin; 2883649974fSBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 28926e093fcSHong Zhang ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 2902d61bbb3SSatish Balay 2912d61bbb3SSatish Balay idx = a->j; 2922d61bbb3SSatish Balay v = a->a; 29326e093fcSHong Zhang if (usecprow) { 29426e093fcSHong Zhang mbs = a->compressedrow.nrows; 29526e093fcSHong Zhang ii = a->compressedrow.i; 2967b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 29726e093fcSHong Zhang } else { 29826e093fcSHong Zhang mbs = a->mbs; 2992d61bbb3SSatish Balay ii = a->i; 30026e093fcSHong Zhang z = zarray; 30126e093fcSHong Zhang } 3022d61bbb3SSatish Balay 3032d61bbb3SSatish Balay for (i=0; i<mbs; i++) { 3042d61bbb3SSatish Balay n = ii[1] - ii[0]; ii++; 3052d61bbb3SSatish Balay sum1 = 0.0; sum2 = 0.0; 306444d8c10SJed Brown PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 307444d8c10SJed Brown PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3082d61bbb3SSatish Balay for (j=0; j<n; j++) { 3092d61bbb3SSatish Balay xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 3102d61bbb3SSatish Balay sum1 += v[0]*x1 + v[2]*x2; 3112d61bbb3SSatish Balay sum2 += v[1]*x1 + v[3]*x2; 3122d61bbb3SSatish Balay v += 4; 3132d61bbb3SSatish Balay } 3147b2bb3b9SHong Zhang if (usecprow) z = zarray + 2*ridx[i]; 3152d61bbb3SSatish Balay z[0] = sum1; z[1] = sum2; 31626e093fcSHong Zhang if (!usecprow) z += 2; 3172d61bbb3SSatish Balay } 3183649974fSBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 31926e093fcSHong Zhang ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 320*7c565772SBarry Smith ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr); 3212d61bbb3SSatish Balay PetscFunctionReturn(0); 3222d61bbb3SSatish Balay } 3232d61bbb3SSatish Balay 3244a2ae208SSatish Balay #undef __FUNCT__ 3254a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqBAIJ_3" 326dfbe8321SBarry Smith PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) 3272d61bbb3SSatish Balay { 3282d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 329d9fead3dSBarry Smith PetscScalar *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray; 330d9fead3dSBarry Smith const PetscScalar *x,*xb; 331d9fead3dSBarry Smith const MatScalar *v; 332dfbe8321SBarry Smith PetscErrorCode ierr; 333*7c565772SBarry Smith PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 334ace3abfcSBarry Smith PetscBool usecprow=a->compressedrow.use; 33526e093fcSHong Zhang 3362d61bbb3SSatish Balay 337b6410449SSatish Balay #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 338fee21e36SBarry Smith #pragma disjoint(*v,*z,*xb) 339fee21e36SBarry Smith #endif 340fee21e36SBarry Smith 3412d61bbb3SSatish Balay PetscFunctionBegin; 3423649974fSBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 34326e093fcSHong Zhang ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 3442d61bbb3SSatish Balay 3452d61bbb3SSatish Balay idx = a->j; 3462d61bbb3SSatish Balay v = a->a; 34726e093fcSHong Zhang if (usecprow) { 34826e093fcSHong Zhang mbs = a->compressedrow.nrows; 34926e093fcSHong Zhang ii = a->compressedrow.i; 3507b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 35126e093fcSHong Zhang } else { 35226e093fcSHong Zhang mbs = a->mbs; 3532d61bbb3SSatish Balay ii = a->i; 35426e093fcSHong Zhang z = zarray; 35526e093fcSHong Zhang } 3562d61bbb3SSatish Balay 3572d61bbb3SSatish Balay for (i=0; i<mbs; i++) { 3582d61bbb3SSatish Balay n = ii[1] - ii[0]; ii++; 3592d61bbb3SSatish Balay sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 360444d8c10SJed Brown PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 361444d8c10SJed Brown PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3622d61bbb3SSatish Balay for (j=0; j<n; j++) { 36326fbe8dcSKarl Rupp xb = x + 3*(*idx++); 36426fbe8dcSKarl Rupp x1 = xb[0]; 36526fbe8dcSKarl Rupp x2 = xb[1]; 36626fbe8dcSKarl Rupp x3 = xb[2]; 36726fbe8dcSKarl Rupp 3682d61bbb3SSatish Balay sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 3692d61bbb3SSatish Balay sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 3702d61bbb3SSatish Balay sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 3712d61bbb3SSatish Balay v += 9; 3722d61bbb3SSatish Balay } 3737b2bb3b9SHong Zhang if (usecprow) z = zarray + 3*ridx[i]; 3742d61bbb3SSatish Balay z[0] = sum1; z[1] = sum2; z[2] = sum3; 37526e093fcSHong Zhang if (!usecprow) z += 3; 3762d61bbb3SSatish Balay } 3773649974fSBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 37826e093fcSHong Zhang ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 379*7c565772SBarry Smith ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr); 3802d61bbb3SSatish Balay PetscFunctionReturn(0); 3812d61bbb3SSatish Balay } 3822d61bbb3SSatish Balay 3834a2ae208SSatish Balay #undef __FUNCT__ 3844a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqBAIJ_4" 385dfbe8321SBarry Smith PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) 3862d61bbb3SSatish Balay { 3872d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 388d9fead3dSBarry Smith PetscScalar *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray; 389d9fead3dSBarry Smith const PetscScalar *x,*xb; 390d9fead3dSBarry Smith const MatScalar *v; 391dfbe8321SBarry Smith PetscErrorCode ierr; 392*7c565772SBarry Smith PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 393ace3abfcSBarry Smith PetscBool usecprow=a->compressedrow.use; 3942d61bbb3SSatish Balay 3952d61bbb3SSatish Balay PetscFunctionBegin; 3963649974fSBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 39726e093fcSHong Zhang ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 3982d61bbb3SSatish Balay 3992d61bbb3SSatish Balay idx = a->j; 4002d61bbb3SSatish Balay v = a->a; 40126e093fcSHong Zhang if (usecprow) { 40226e093fcSHong Zhang mbs = a->compressedrow.nrows; 40326e093fcSHong Zhang ii = a->compressedrow.i; 4047b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 40526e093fcSHong Zhang } else { 40626e093fcSHong Zhang mbs = a->mbs; 4072d61bbb3SSatish Balay ii = a->i; 40826e093fcSHong Zhang z = zarray; 40926e093fcSHong Zhang } 4102d61bbb3SSatish Balay 4112d61bbb3SSatish Balay for (i=0; i<mbs; i++) { 41226fbe8dcSKarl Rupp n = ii[1] - ii[0]; 41326fbe8dcSKarl Rupp ii++; 41426fbe8dcSKarl Rupp sum1 = 0.0; 41526fbe8dcSKarl Rupp sum2 = 0.0; 41626fbe8dcSKarl Rupp sum3 = 0.0; 41726fbe8dcSKarl Rupp sum4 = 0.0; 41826fbe8dcSKarl Rupp 419444d8c10SJed Brown PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 420444d8c10SJed Brown PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 4212d61bbb3SSatish Balay for (j=0; j<n; j++) { 4222d61bbb3SSatish Balay xb = x + 4*(*idx++); 4232d61bbb3SSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 4242d61bbb3SSatish Balay sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4252d61bbb3SSatish Balay sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4262d61bbb3SSatish Balay sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4272d61bbb3SSatish Balay sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4282d61bbb3SSatish Balay v += 16; 4292d61bbb3SSatish Balay } 4307b2bb3b9SHong Zhang if (usecprow) z = zarray + 4*ridx[i]; 4312d61bbb3SSatish Balay z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 43226e093fcSHong Zhang if (!usecprow) z += 4; 4332d61bbb3SSatish Balay } 4343649974fSBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 43526e093fcSHong Zhang ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 436*7c565772SBarry Smith ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr); 4372d61bbb3SSatish Balay PetscFunctionReturn(0); 4382d61bbb3SSatish Balay } 4392d61bbb3SSatish Balay 4404a2ae208SSatish Balay #undef __FUNCT__ 4414a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqBAIJ_5" 442dfbe8321SBarry Smith PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) 4432d61bbb3SSatish Balay { 4442d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 445d9fead3dSBarry Smith PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray; 446d9fead3dSBarry Smith const PetscScalar *xb,*x; 447d9fead3dSBarry Smith const MatScalar *v; 448dfbe8321SBarry Smith PetscErrorCode ierr; 4490298fd71SBarry Smith const PetscInt *idx,*ii,*ridx=NULL; 450*7c565772SBarry Smith PetscInt mbs,i,j,n; 451ace3abfcSBarry Smith PetscBool usecprow=a->compressedrow.use; 4522d61bbb3SSatish Balay 453433994e6SBarry Smith PetscFunctionBegin; 4543649974fSBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 45526e093fcSHong Zhang ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 4562d61bbb3SSatish Balay 4572d61bbb3SSatish Balay idx = a->j; 4582d61bbb3SSatish Balay v = a->a; 45926e093fcSHong Zhang if (usecprow) { 46026e093fcSHong Zhang mbs = a->compressedrow.nrows; 46126e093fcSHong Zhang ii = a->compressedrow.i; 4627b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 46326e093fcSHong Zhang } else { 46426e093fcSHong Zhang mbs = a->mbs; 4652d61bbb3SSatish Balay ii = a->i; 46626e093fcSHong Zhang z = zarray; 46726e093fcSHong Zhang } 4682d61bbb3SSatish Balay 4692d61bbb3SSatish Balay for (i=0; i<mbs; i++) { 4702d61bbb3SSatish Balay n = ii[1] - ii[0]; ii++; 4712d61bbb3SSatish Balay sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 472444d8c10SJed Brown PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 473444d8c10SJed Brown PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 4742d61bbb3SSatish Balay for (j=0; j<n; j++) { 4752d61bbb3SSatish Balay xb = x + 5*(*idx++); 4762d61bbb3SSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 4772d61bbb3SSatish Balay sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 4782d61bbb3SSatish Balay sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 4792d61bbb3SSatish Balay sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 4802d61bbb3SSatish Balay sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 4812d61bbb3SSatish Balay sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 4822d61bbb3SSatish Balay v += 25; 4832d61bbb3SSatish Balay } 4847b2bb3b9SHong Zhang if (usecprow) z = zarray + 5*ridx[i]; 4852d61bbb3SSatish Balay z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 48626e093fcSHong Zhang if (!usecprow) z += 5; 4872d61bbb3SSatish Balay } 4883649974fSBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 48926e093fcSHong Zhang ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 490*7c565772SBarry Smith ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr); 4912d61bbb3SSatish Balay PetscFunctionReturn(0); 4922d61bbb3SSatish Balay } 4932d61bbb3SSatish Balay 49415091d37SBarry Smith 4954a2ae208SSatish Balay #undef __FUNCT__ 4964a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqBAIJ_6" 497dfbe8321SBarry Smith PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz) 49815091d37SBarry Smith { 49915091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 500d9fead3dSBarry Smith PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6; 501d9fead3dSBarry Smith const PetscScalar *x,*xb; 50226e093fcSHong Zhang PetscScalar x1,x2,x3,x4,x5,x6,*zarray; 503d9fead3dSBarry Smith const MatScalar *v; 504dfbe8321SBarry Smith PetscErrorCode ierr; 505*7c565772SBarry Smith PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 506ace3abfcSBarry Smith PetscBool usecprow=a->compressedrow.use; 50715091d37SBarry Smith 508433994e6SBarry Smith PetscFunctionBegin; 5093649974fSBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 51026e093fcSHong Zhang ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 51115091d37SBarry Smith 51215091d37SBarry Smith idx = a->j; 51315091d37SBarry Smith v = a->a; 51426e093fcSHong Zhang if (usecprow) { 51526e093fcSHong Zhang mbs = a->compressedrow.nrows; 51626e093fcSHong Zhang ii = a->compressedrow.i; 5177b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 51826e093fcSHong Zhang } else { 51926e093fcSHong Zhang mbs = a->mbs; 52015091d37SBarry Smith ii = a->i; 52126e093fcSHong Zhang z = zarray; 52226e093fcSHong Zhang } 52315091d37SBarry Smith 52415091d37SBarry Smith for (i=0; i<mbs; i++) { 52526fbe8dcSKarl Rupp n = ii[1] - ii[0]; 52626fbe8dcSKarl Rupp ii++; 52726fbe8dcSKarl Rupp sum1 = 0.0; 52826fbe8dcSKarl Rupp sum2 = 0.0; 52926fbe8dcSKarl Rupp sum3 = 0.0; 53026fbe8dcSKarl Rupp sum4 = 0.0; 53126fbe8dcSKarl Rupp sum5 = 0.0; 53226fbe8dcSKarl Rupp sum6 = 0.0; 53326fbe8dcSKarl Rupp 534444d8c10SJed Brown PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 535444d8c10SJed Brown PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 53615091d37SBarry Smith for (j=0; j<n; j++) { 53715091d37SBarry Smith xb = x + 6*(*idx++); 53815091d37SBarry Smith x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 53915091d37SBarry Smith sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 54015091d37SBarry Smith sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 54115091d37SBarry Smith sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 54215091d37SBarry Smith sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 54315091d37SBarry Smith sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 54415091d37SBarry Smith sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 54515091d37SBarry Smith v += 36; 54615091d37SBarry Smith } 5477b2bb3b9SHong Zhang if (usecprow) z = zarray + 6*ridx[i]; 54815091d37SBarry Smith z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 54926e093fcSHong Zhang if (!usecprow) z += 6; 55015091d37SBarry Smith } 55115091d37SBarry Smith 5523649974fSBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 55326e093fcSHong Zhang ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 554*7c565772SBarry Smith ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr); 55515091d37SBarry Smith PetscFunctionReturn(0); 55615091d37SBarry Smith } 5578ab949d8SShri Abhyankar 5584a2ae208SSatish Balay #undef __FUNCT__ 5594a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqBAIJ_7" 560dfbe8321SBarry Smith PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) 5612d61bbb3SSatish Balay { 5622d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 563d9fead3dSBarry Smith PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 564d9fead3dSBarry Smith const PetscScalar *x,*xb; 56526e093fcSHong Zhang PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray; 566d9fead3dSBarry Smith const MatScalar *v; 567dfbe8321SBarry Smith PetscErrorCode ierr; 568*7c565772SBarry Smith PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 569ace3abfcSBarry Smith PetscBool usecprow=a->compressedrow.use; 5702d61bbb3SSatish Balay 571433994e6SBarry Smith PetscFunctionBegin; 5723649974fSBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 57326e093fcSHong Zhang ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 5742d61bbb3SSatish Balay 5752d61bbb3SSatish Balay idx = a->j; 5762d61bbb3SSatish Balay v = a->a; 57726e093fcSHong Zhang if (usecprow) { 57826e093fcSHong Zhang mbs = a->compressedrow.nrows; 57926e093fcSHong Zhang ii = a->compressedrow.i; 5807b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 58126e093fcSHong Zhang } else { 58226e093fcSHong Zhang mbs = a->mbs; 5832d61bbb3SSatish Balay ii = a->i; 58426e093fcSHong Zhang z = zarray; 58526e093fcSHong Zhang } 5862d61bbb3SSatish Balay 5872d61bbb3SSatish Balay for (i=0; i<mbs; i++) { 58826fbe8dcSKarl Rupp n = ii[1] - ii[0]; 58926fbe8dcSKarl Rupp ii++; 59026fbe8dcSKarl Rupp sum1 = 0.0; 59126fbe8dcSKarl Rupp sum2 = 0.0; 59226fbe8dcSKarl Rupp sum3 = 0.0; 59326fbe8dcSKarl Rupp sum4 = 0.0; 59426fbe8dcSKarl Rupp sum5 = 0.0; 59526fbe8dcSKarl Rupp sum6 = 0.0; 59626fbe8dcSKarl Rupp sum7 = 0.0; 59726fbe8dcSKarl Rupp 598444d8c10SJed Brown PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 599444d8c10SJed Brown PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 6002d61bbb3SSatish Balay for (j=0; j<n; j++) { 6012d61bbb3SSatish Balay xb = x + 7*(*idx++); 6022d61bbb3SSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 6032d61bbb3SSatish Balay sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 6042d61bbb3SSatish Balay sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 6052d61bbb3SSatish Balay sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 6062d61bbb3SSatish Balay sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 6072d61bbb3SSatish Balay sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 6082d61bbb3SSatish Balay sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 6092d61bbb3SSatish Balay sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 6102d61bbb3SSatish Balay v += 49; 6112d61bbb3SSatish Balay } 6127b2bb3b9SHong Zhang if (usecprow) z = zarray + 7*ridx[i]; 6132d61bbb3SSatish Balay z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 61426e093fcSHong Zhang if (!usecprow) z += 7; 6152d61bbb3SSatish Balay } 6162d61bbb3SSatish Balay 6173649974fSBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 61826e093fcSHong Zhang ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 619*7c565772SBarry Smith ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr); 6202d61bbb3SSatish Balay PetscFunctionReturn(0); 6212d61bbb3SSatish Balay } 6222d61bbb3SSatish Balay 6238ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 624832cc040SShri Abhyankar /* Default MatMult for block size 15 */ 6258ab949d8SShri Abhyankar 6268ab949d8SShri Abhyankar #undef __FUNCT__ 6278ab949d8SShri Abhyankar #define __FUNCT__ "MatMult_SeqBAIJ_15_ver1" 6288ab949d8SShri Abhyankar PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz) 6298ab949d8SShri Abhyankar { 6308ab949d8SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 6318ab949d8SShri Abhyankar PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 6328ab949d8SShri Abhyankar const PetscScalar *x,*xb; 63353ef36baSBarry Smith PetscScalar *zarray,xv; 6348ab949d8SShri Abhyankar const MatScalar *v; 6358ab949d8SShri Abhyankar PetscErrorCode ierr; 6368ab949d8SShri Abhyankar const PetscInt *ii,*ij=a->j,*idx; 637*7c565772SBarry Smith PetscInt mbs,i,j,k,n,*ridx=NULL; 638ace3abfcSBarry Smith PetscBool usecprow=a->compressedrow.use; 6398ab949d8SShri Abhyankar 6408ab949d8SShri Abhyankar PetscFunctionBegin; 6413649974fSBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 6428ab949d8SShri Abhyankar ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 6438ab949d8SShri Abhyankar 6448ab949d8SShri Abhyankar v = a->a; 6458ab949d8SShri Abhyankar if (usecprow) { 6468ab949d8SShri Abhyankar mbs = a->compressedrow.nrows; 6478ab949d8SShri Abhyankar ii = a->compressedrow.i; 6488ab949d8SShri Abhyankar ridx = a->compressedrow.rindex; 6498ab949d8SShri Abhyankar } else { 6508ab949d8SShri Abhyankar mbs = a->mbs; 6518ab949d8SShri Abhyankar ii = a->i; 6528ab949d8SShri Abhyankar z = zarray; 6538ab949d8SShri Abhyankar } 6548ab949d8SShri Abhyankar 6558ab949d8SShri Abhyankar for (i=0; i<mbs; i++) { 6568ab949d8SShri Abhyankar n = ii[i+1] - ii[i]; 6578ab949d8SShri Abhyankar idx = ij + ii[i]; 6588ab949d8SShri Abhyankar sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 6598ab949d8SShri Abhyankar sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 6608ab949d8SShri Abhyankar 6618ab949d8SShri Abhyankar for (j=0; j<n; j++) { 6628ab949d8SShri Abhyankar xb = x + 15*(idx[j]); 6638ab949d8SShri Abhyankar 6648ab949d8SShri Abhyankar for (k=0; k<15; k++) { 66553ef36baSBarry Smith xv = xb[k]; 66653ef36baSBarry Smith sum1 += v[0]*xv; 66753ef36baSBarry Smith sum2 += v[1]*xv; 66853ef36baSBarry Smith sum3 += v[2]*xv; 66953ef36baSBarry Smith sum4 += v[3]*xv; 67053ef36baSBarry Smith sum5 += v[4]*xv; 67153ef36baSBarry Smith sum6 += v[5]*xv; 67253ef36baSBarry Smith sum7 += v[6]*xv; 67353ef36baSBarry Smith sum8 += v[7]*xv; 67453ef36baSBarry Smith sum9 += v[8]*xv; 67553ef36baSBarry Smith sum10 += v[9]*xv; 67653ef36baSBarry Smith sum11 += v[10]*xv; 67753ef36baSBarry Smith sum12 += v[11]*xv; 67853ef36baSBarry Smith sum13 += v[12]*xv; 67953ef36baSBarry Smith sum14 += v[13]*xv; 68053ef36baSBarry Smith sum15 += v[14]*xv; 6818ab949d8SShri Abhyankar v += 15; 6828ab949d8SShri Abhyankar } 6838ab949d8SShri Abhyankar } 6848ab949d8SShri Abhyankar if (usecprow) z = zarray + 15*ridx[i]; 6858ab949d8SShri Abhyankar z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 6868ab949d8SShri Abhyankar z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 6878ab949d8SShri Abhyankar 6888ab949d8SShri Abhyankar if (!usecprow) z += 15; 6898ab949d8SShri Abhyankar } 6908ab949d8SShri Abhyankar 6913649974fSBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 6928ab949d8SShri Abhyankar ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 693*7c565772SBarry Smith ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 6948ab949d8SShri Abhyankar PetscFunctionReturn(0); 6958ab949d8SShri Abhyankar } 6968ab949d8SShri Abhyankar 6978ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 6988ab949d8SShri Abhyankar #undef __FUNCT__ 6998ab949d8SShri Abhyankar #define __FUNCT__ "MatMult_SeqBAIJ_15_ver2" 7008ab949d8SShri Abhyankar PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz) 7018ab949d8SShri Abhyankar { 7028ab949d8SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 7038ab949d8SShri Abhyankar PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 7048ab949d8SShri Abhyankar const PetscScalar *x,*xb; 7050b8f6341SShri Abhyankar PetscScalar x1,x2,x3,x4,*zarray; 7068ab949d8SShri Abhyankar const MatScalar *v; 7078ab949d8SShri Abhyankar PetscErrorCode ierr; 7088ab949d8SShri Abhyankar const PetscInt *ii,*ij=a->j,*idx; 709*7c565772SBarry Smith PetscInt mbs,i,j,n,*ridx=NULL; 710ace3abfcSBarry Smith PetscBool usecprow=a->compressedrow.use; 7118ab949d8SShri Abhyankar 7128ab949d8SShri Abhyankar PetscFunctionBegin; 7133649974fSBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 7148ab949d8SShri Abhyankar ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 7158ab949d8SShri Abhyankar 7168ab949d8SShri Abhyankar v = a->a; 7178ab949d8SShri Abhyankar if (usecprow) { 7188ab949d8SShri Abhyankar mbs = a->compressedrow.nrows; 7198ab949d8SShri Abhyankar ii = a->compressedrow.i; 7208ab949d8SShri Abhyankar ridx = a->compressedrow.rindex; 7218ab949d8SShri Abhyankar } else { 7228ab949d8SShri Abhyankar mbs = a->mbs; 7238ab949d8SShri Abhyankar ii = a->i; 7248ab949d8SShri Abhyankar z = zarray; 7258ab949d8SShri Abhyankar } 7268ab949d8SShri Abhyankar 7278ab949d8SShri Abhyankar for (i=0; i<mbs; i++) { 7288ab949d8SShri Abhyankar n = ii[i+1] - ii[i]; 7298ab949d8SShri Abhyankar idx = ij + ii[i]; 7308ab949d8SShri Abhyankar sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 7318ab949d8SShri Abhyankar sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 7328ab949d8SShri Abhyankar 7338ab949d8SShri Abhyankar for (j=0; j<n; j++) { 7348ab949d8SShri Abhyankar xb = x + 15*(idx[j]); 7350b8f6341SShri Abhyankar x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 7368ab949d8SShri Abhyankar 7378ab949d8SShri Abhyankar sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 7388ab949d8SShri Abhyankar sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 7398ab949d8SShri Abhyankar sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 7408ab949d8SShri Abhyankar sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 7418ab949d8SShri Abhyankar sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 7428ab949d8SShri Abhyankar sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 7438ab949d8SShri Abhyankar sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 7448ab949d8SShri Abhyankar sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 7458ab949d8SShri Abhyankar sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 7468ab949d8SShri Abhyankar sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 7478ab949d8SShri Abhyankar sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 7488ab949d8SShri Abhyankar sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 7498ab949d8SShri Abhyankar sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 7508ab949d8SShri Abhyankar sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 7518ab949d8SShri Abhyankar sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 7528ab949d8SShri Abhyankar 7538ab949d8SShri Abhyankar v += 60; 7548ab949d8SShri Abhyankar 7550b8f6341SShri Abhyankar x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 7568ab949d8SShri Abhyankar 7578ab949d8SShri Abhyankar sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 7588ab949d8SShri Abhyankar sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 7598ab949d8SShri Abhyankar sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 7608ab949d8SShri Abhyankar sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 7618ab949d8SShri Abhyankar sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 7628ab949d8SShri Abhyankar sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 7638ab949d8SShri Abhyankar sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 7648ab949d8SShri Abhyankar sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 7658ab949d8SShri Abhyankar sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 7668ab949d8SShri Abhyankar sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 7678ab949d8SShri Abhyankar sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 7688ab949d8SShri Abhyankar sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 7698ab949d8SShri Abhyankar sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 7708ab949d8SShri Abhyankar sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 7718ab949d8SShri Abhyankar sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 7728ab949d8SShri Abhyankar v += 60; 7738ab949d8SShri Abhyankar 7740b8f6341SShri Abhyankar x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 7750b8f6341SShri Abhyankar sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 7760b8f6341SShri Abhyankar sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 7770b8f6341SShri Abhyankar sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 7780b8f6341SShri Abhyankar sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 7790b8f6341SShri Abhyankar sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 7800b8f6341SShri Abhyankar sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 7810b8f6341SShri Abhyankar sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 7820b8f6341SShri Abhyankar sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 7830b8f6341SShri Abhyankar sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 7840b8f6341SShri Abhyankar sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 7850b8f6341SShri Abhyankar sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 7860b8f6341SShri Abhyankar sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 7870b8f6341SShri Abhyankar sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 7880b8f6341SShri Abhyankar sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 7890b8f6341SShri Abhyankar sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 7900b8f6341SShri Abhyankar v += 60; 7910b8f6341SShri Abhyankar 7920b8f6341SShri Abhyankar x1 = xb[12]; x2 = xb[13]; x3 = xb[14]; 7938ab949d8SShri Abhyankar sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3; 7948ab949d8SShri Abhyankar sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3; 7958ab949d8SShri Abhyankar sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3; 7968ab949d8SShri Abhyankar sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3; 7978ab949d8SShri Abhyankar sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3; 7988ab949d8SShri Abhyankar sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3; 7998ab949d8SShri Abhyankar sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3; 8008ab949d8SShri Abhyankar sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3; 8018ab949d8SShri Abhyankar sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3; 8028ab949d8SShri Abhyankar sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3; 8038ab949d8SShri Abhyankar sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3; 8048ab949d8SShri Abhyankar sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3; 8058ab949d8SShri Abhyankar sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3; 8068ab949d8SShri Abhyankar sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3; 8078ab949d8SShri Abhyankar sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3; 8088ab949d8SShri Abhyankar v += 45; 8098ab949d8SShri Abhyankar } 8108ab949d8SShri Abhyankar if (usecprow) z = zarray + 15*ridx[i]; 8118ab949d8SShri Abhyankar z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 8128ab949d8SShri Abhyankar z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 8138ab949d8SShri Abhyankar 8148ab949d8SShri Abhyankar if (!usecprow) z += 15; 8158ab949d8SShri Abhyankar } 8168ab949d8SShri Abhyankar 8173649974fSBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 8188ab949d8SShri Abhyankar ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 819*7c565772SBarry Smith ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 8208ab949d8SShri Abhyankar PetscFunctionReturn(0); 8218ab949d8SShri Abhyankar } 8228ab949d8SShri Abhyankar 8238ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 8248ab949d8SShri Abhyankar #undef __FUNCT__ 8258ab949d8SShri Abhyankar #define __FUNCT__ "MatMult_SeqBAIJ_15_ver3" 8268ab949d8SShri Abhyankar PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz) 8278ab949d8SShri Abhyankar { 8288ab949d8SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 8298ab949d8SShri Abhyankar PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 8308ab949d8SShri Abhyankar const PetscScalar *x,*xb; 8310b8f6341SShri Abhyankar PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray; 8328ab949d8SShri Abhyankar const MatScalar *v; 8338ab949d8SShri Abhyankar PetscErrorCode ierr; 8348ab949d8SShri Abhyankar const PetscInt *ii,*ij=a->j,*idx; 835*7c565772SBarry Smith PetscInt mbs,i,j,n,*ridx=NULL; 836ace3abfcSBarry Smith PetscBool usecprow=a->compressedrow.use; 8378ab949d8SShri Abhyankar 8388ab949d8SShri Abhyankar PetscFunctionBegin; 8393649974fSBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 8408ab949d8SShri Abhyankar ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 8418ab949d8SShri Abhyankar 8428ab949d8SShri Abhyankar v = a->a; 8438ab949d8SShri Abhyankar if (usecprow) { 8448ab949d8SShri Abhyankar mbs = a->compressedrow.nrows; 8458ab949d8SShri Abhyankar ii = a->compressedrow.i; 8468ab949d8SShri Abhyankar ridx = a->compressedrow.rindex; 8478ab949d8SShri Abhyankar } else { 8488ab949d8SShri Abhyankar mbs = a->mbs; 8498ab949d8SShri Abhyankar ii = a->i; 8508ab949d8SShri Abhyankar z = zarray; 8518ab949d8SShri Abhyankar } 8528ab949d8SShri Abhyankar 8538ab949d8SShri Abhyankar for (i=0; i<mbs; i++) { 8548ab949d8SShri Abhyankar n = ii[i+1] - ii[i]; 8558ab949d8SShri Abhyankar idx = ij + ii[i]; 8568ab949d8SShri Abhyankar sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 8578ab949d8SShri Abhyankar sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 8588ab949d8SShri Abhyankar 8598ab949d8SShri Abhyankar for (j=0; j<n; j++) { 8608ab949d8SShri Abhyankar xb = x + 15*(idx[j]); 8618ab949d8SShri Abhyankar x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 8620b8f6341SShri Abhyankar x8 = xb[7]; 8638ab949d8SShri Abhyankar 8648ab949d8SShri Abhyankar sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8; 8658ab949d8SShri Abhyankar sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8; 8668ab949d8SShri Abhyankar sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8; 8678ab949d8SShri Abhyankar sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8; 8688ab949d8SShri Abhyankar sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8; 8698ab949d8SShri Abhyankar sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8; 8708ab949d8SShri Abhyankar sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8; 8718ab949d8SShri Abhyankar sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8; 8728ab949d8SShri Abhyankar sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8; 8738ab949d8SShri Abhyankar sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8; 8748ab949d8SShri Abhyankar sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8; 8758ab949d8SShri Abhyankar sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8; 8768ab949d8SShri Abhyankar sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8; 8778ab949d8SShri Abhyankar sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8; 8788ab949d8SShri Abhyankar sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8; 8798ab949d8SShri Abhyankar v += 120; 8808ab949d8SShri Abhyankar 8810b8f6341SShri Abhyankar x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14]; 8820b8f6341SShri Abhyankar 8838ab949d8SShri Abhyankar sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7; 8848ab949d8SShri Abhyankar sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7; 8858ab949d8SShri Abhyankar sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7; 8868ab949d8SShri Abhyankar sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7; 8878ab949d8SShri Abhyankar sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7; 8888ab949d8SShri Abhyankar sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7; 8898ab949d8SShri Abhyankar sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7; 8908ab949d8SShri Abhyankar sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7; 8918ab949d8SShri Abhyankar sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7; 8928ab949d8SShri Abhyankar sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7; 8938ab949d8SShri Abhyankar sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7; 8948ab949d8SShri Abhyankar sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7; 8958ab949d8SShri Abhyankar sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7; 8968ab949d8SShri Abhyankar sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7; 8978ab949d8SShri Abhyankar sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7; 8988ab949d8SShri Abhyankar v += 105; 8998ab949d8SShri Abhyankar } 9008ab949d8SShri Abhyankar if (usecprow) z = zarray + 15*ridx[i]; 9018ab949d8SShri Abhyankar z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 9028ab949d8SShri Abhyankar z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 9038ab949d8SShri Abhyankar 9048ab949d8SShri Abhyankar if (!usecprow) z += 15; 9058ab949d8SShri Abhyankar } 9068ab949d8SShri Abhyankar 9073649974fSBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 9088ab949d8SShri Abhyankar ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 909*7c565772SBarry Smith ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 9108ab949d8SShri Abhyankar PetscFunctionReturn(0); 9118ab949d8SShri Abhyankar } 9128ab949d8SShri Abhyankar 9138ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ 9148ab949d8SShri Abhyankar 9158ab949d8SShri Abhyankar #undef __FUNCT__ 9168ab949d8SShri Abhyankar #define __FUNCT__ "MatMult_SeqBAIJ_15_ver4" 9178ab949d8SShri Abhyankar PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz) 9188ab949d8SShri Abhyankar { 9198ab949d8SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 9208ab949d8SShri Abhyankar PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 9218ab949d8SShri Abhyankar const PetscScalar *x,*xb; 9228ab949d8SShri Abhyankar PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray; 9238ab949d8SShri Abhyankar const MatScalar *v; 9248ab949d8SShri Abhyankar PetscErrorCode ierr; 9258ab949d8SShri Abhyankar const PetscInt *ii,*ij=a->j,*idx; 926*7c565772SBarry Smith PetscInt mbs,i,j,n,*ridx=NULL; 927ace3abfcSBarry Smith PetscBool usecprow=a->compressedrow.use; 9288ab949d8SShri Abhyankar 9298ab949d8SShri Abhyankar PetscFunctionBegin; 9303649974fSBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 9318ab949d8SShri Abhyankar ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 9328ab949d8SShri Abhyankar 9338ab949d8SShri Abhyankar v = a->a; 9348ab949d8SShri Abhyankar if (usecprow) { 9358ab949d8SShri Abhyankar mbs = a->compressedrow.nrows; 9368ab949d8SShri Abhyankar ii = a->compressedrow.i; 9378ab949d8SShri Abhyankar ridx = a->compressedrow.rindex; 9388ab949d8SShri Abhyankar } else { 9398ab949d8SShri Abhyankar mbs = a->mbs; 9408ab949d8SShri Abhyankar ii = a->i; 9418ab949d8SShri Abhyankar z = zarray; 9428ab949d8SShri Abhyankar } 9438ab949d8SShri Abhyankar 9448ab949d8SShri Abhyankar for (i=0; i<mbs; i++) { 9458ab949d8SShri Abhyankar n = ii[i+1] - ii[i]; 9468ab949d8SShri Abhyankar idx = ij + ii[i]; 9478ab949d8SShri Abhyankar sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 9488ab949d8SShri Abhyankar sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 9498ab949d8SShri Abhyankar 9508ab949d8SShri Abhyankar for (j=0; j<n; j++) { 9518ab949d8SShri Abhyankar xb = x + 15*(idx[j]); 9528ab949d8SShri Abhyankar x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 9538ab949d8SShri Abhyankar x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14]; 9548ab949d8SShri Abhyankar 9558ab949d8SShri Abhyankar sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 9568ab949d8SShri Abhyankar sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 9578ab949d8SShri Abhyankar sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 9588ab949d8SShri Abhyankar sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 9598ab949d8SShri Abhyankar sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 9608ab949d8SShri Abhyankar sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 9618ab949d8SShri Abhyankar sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 9628ab949d8SShri Abhyankar sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 9638ab949d8SShri Abhyankar sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 9648ab949d8SShri Abhyankar sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 9658ab949d8SShri Abhyankar sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 9668ab949d8SShri Abhyankar sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 9678ab949d8SShri Abhyankar sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 9688ab949d8SShri Abhyankar sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 9698ab949d8SShri Abhyankar sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 9708ab949d8SShri Abhyankar v += 225; 9718ab949d8SShri Abhyankar } 9728ab949d8SShri Abhyankar if (usecprow) z = zarray + 15*ridx[i]; 9738ab949d8SShri Abhyankar z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 9748ab949d8SShri Abhyankar z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 9758ab949d8SShri Abhyankar 9768ab949d8SShri Abhyankar if (!usecprow) z += 15; 9778ab949d8SShri Abhyankar } 9788ab949d8SShri Abhyankar 9793649974fSBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 9808ab949d8SShri Abhyankar ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 981*7c565772SBarry Smith ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 9828ab949d8SShri Abhyankar PetscFunctionReturn(0); 9838ab949d8SShri Abhyankar } 9848ab949d8SShri Abhyankar 9858ab949d8SShri Abhyankar 9863f1db9ecSBarry Smith /* 9873f1db9ecSBarry Smith This will not work with MatScalar == float because it calls the BLAS 9883f1db9ecSBarry Smith */ 9894a2ae208SSatish Balay #undef __FUNCT__ 9904a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqBAIJ_N" 991dfbe8321SBarry Smith PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz) 9922d61bbb3SSatish Balay { 9932d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 994dcf5cc72SBarry Smith PetscScalar *x,*z = 0,*xb,*work,*workt,*zarray; 9953f1db9ecSBarry Smith MatScalar *v; 996dfbe8321SBarry Smith PetscErrorCode ierr; 997a2ea699eSBarry Smith PetscInt mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2; 998*7c565772SBarry Smith PetscInt ncols,k,*ridx=NULL; 999ace3abfcSBarry Smith PetscBool usecprow=a->compressedrow.use; 10002d61bbb3SSatish Balay 10012d61bbb3SSatish Balay PetscFunctionBegin; 10021ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 100326e093fcSHong Zhang ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 10042d61bbb3SSatish Balay 10052d61bbb3SSatish Balay idx = a->j; 10062d61bbb3SSatish Balay v = a->a; 100726e093fcSHong Zhang if (usecprow) { 100826e093fcSHong Zhang mbs = a->compressedrow.nrows; 100926e093fcSHong Zhang ii = a->compressedrow.i; 10107b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 101126e093fcSHong Zhang } else { 101226e093fcSHong Zhang mbs = a->mbs; 10132d61bbb3SSatish Balay ii = a->i; 101426e093fcSHong Zhang z = zarray; 101526e093fcSHong Zhang } 1016218c64b6SSatish Balay 10172d61bbb3SSatish Balay if (!a->mult_work) { 1018d0f46423SBarry Smith k = PetscMax(A->rmap->n,A->cmap->n); 101987828ca2SBarry Smith ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 10202d61bbb3SSatish Balay } 10212d61bbb3SSatish Balay work = a->mult_work; 10222d61bbb3SSatish Balay for (i=0; i<mbs; i++) { 10232d61bbb3SSatish Balay n = ii[1] - ii[0]; ii++; 10242d61bbb3SSatish Balay ncols = n*bs; 10252d61bbb3SSatish Balay workt = work; 10262d61bbb3SSatish Balay for (j=0; j<n; j++) { 10272d61bbb3SSatish Balay xb = x + bs*(*idx++); 10282d61bbb3SSatish Balay for (k=0; k<bs; k++) workt[k] = xb[k]; 10292d61bbb3SSatish Balay workt += bs; 10302d61bbb3SSatish Balay } 10317b2bb3b9SHong Zhang if (usecprow) z = zarray + bs*ridx[i]; 103296b95a6bSBarry Smith PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z); 103371044d3cSBarry Smith /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */ 10342d61bbb3SSatish Balay v += n*bs2; 103526e093fcSHong Zhang if (!usecprow) z += bs; 10362d61bbb3SSatish Balay } 10371ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 103826e093fcSHong Zhang ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1039*7c565772SBarry Smith ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr); 10402d61bbb3SSatish Balay PetscFunctionReturn(0); 10412d61bbb3SSatish Balay } 10422d61bbb3SSatish Balay 10434a2ae208SSatish Balay #undef __FUNCT__ 10444a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqBAIJ_1" 1045dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 10462d61bbb3SSatish Balay { 10472d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1048122f12eaSBarry Smith const PetscScalar *x; 1049122f12eaSBarry Smith PetscScalar *y,*z,sum; 1050122f12eaSBarry Smith const MatScalar *v; 1051dfbe8321SBarry Smith PetscErrorCode ierr; 1052*7c565772SBarry Smith PetscInt mbs=a->mbs,i,n,*ridx=NULL; 1053122f12eaSBarry Smith const PetscInt *idx,*ii; 1054ace3abfcSBarry Smith PetscBool usecprow=a->compressedrow.use; 10552d61bbb3SSatish Balay 10562d61bbb3SSatish Balay PetscFunctionBegin; 10573649974fSBarry Smith ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1058122f12eaSBarry Smith ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 10592e8a6d31SBarry Smith if (zz != yy) { 1060122f12eaSBarry Smith ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 10612e8a6d31SBarry Smith } else { 1062122f12eaSBarry Smith z = y; 10632e8a6d31SBarry Smith } 10642d61bbb3SSatish Balay 10652d61bbb3SSatish Balay idx = a->j; 10662d61bbb3SSatish Balay v = a->a; 106726e093fcSHong Zhang if (usecprow) { 10684eb6d288SHong Zhang if (zz != yy) { 1069122f12eaSBarry Smith ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 10704eb6d288SHong Zhang } 107126e093fcSHong Zhang mbs = a->compressedrow.nrows; 107226e093fcSHong Zhang ii = a->compressedrow.i; 10737b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 107426e093fcSHong Zhang } else { 10752d61bbb3SSatish Balay ii = a->i; 107626e093fcSHong Zhang } 10772d61bbb3SSatish Balay 10782d61bbb3SSatish Balay for (i=0; i<mbs; i++) { 1079122f12eaSBarry Smith n = ii[1] - ii[0]; 1080122f12eaSBarry Smith ii++; 108126e093fcSHong Zhang if (!usecprow) { 1082122f12eaSBarry Smith sum = y[i]; 1083122f12eaSBarry Smith } else { 1084122f12eaSBarry Smith sum = y[ridx[i]]; 1085122f12eaSBarry Smith } 1086444d8c10SJed Brown PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1087444d8c10SJed Brown PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1088122f12eaSBarry Smith PetscSparseDensePlusDot(sum,x,v,idx,n); 1089122f12eaSBarry Smith v += n; 1090122f12eaSBarry Smith idx += n; 1091122f12eaSBarry Smith if (usecprow) { 1092122f12eaSBarry Smith z[ridx[i]] = sum; 1093122f12eaSBarry Smith } else { 1094122f12eaSBarry Smith z[i] = sum; 109526e093fcSHong Zhang } 10962d61bbb3SSatish Balay } 10973649974fSBarry Smith ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1098122f12eaSBarry Smith ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 10992e8a6d31SBarry Smith if (zz != yy) { 1100122f12eaSBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 11012e8a6d31SBarry Smith } 1102*7c565772SBarry Smith ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 11032d61bbb3SSatish Balay PetscFunctionReturn(0); 11042d61bbb3SSatish Balay } 11052d61bbb3SSatish Balay 11064a2ae208SSatish Balay #undef __FUNCT__ 11074a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqBAIJ_2" 1108dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 11092d61bbb3SSatish Balay { 11102d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1111dcf5cc72SBarry Smith PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2; 111226e093fcSHong Zhang PetscScalar x1,x2,*yarray,*zarray; 11133f1db9ecSBarry Smith MatScalar *v; 1114dfbe8321SBarry Smith PetscErrorCode ierr; 11150298fd71SBarry Smith PetscInt mbs =a->mbs,i,*idx,*ii,j,n,*ridx=NULL; 1116ace3abfcSBarry Smith PetscBool usecprow=a->compressedrow.use; 11172d61bbb3SSatish Balay 11182d61bbb3SSatish Balay PetscFunctionBegin; 11191ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 112026e093fcSHong Zhang ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 11212e8a6d31SBarry Smith if (zz != yy) { 112226e093fcSHong Zhang ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 11232e8a6d31SBarry Smith } else { 112426e093fcSHong Zhang zarray = yarray; 11252e8a6d31SBarry Smith } 11262d61bbb3SSatish Balay 11272d61bbb3SSatish Balay idx = a->j; 11282d61bbb3SSatish Balay v = a->a; 112926e093fcSHong Zhang if (usecprow) { 11304eb6d288SHong Zhang if (zz != yy) { 11314eb6d288SHong Zhang ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 11324eb6d288SHong Zhang } 113326e093fcSHong Zhang mbs = a->compressedrow.nrows; 113426e093fcSHong Zhang ii = a->compressedrow.i; 11357b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 11364eb6d288SHong Zhang if (zz != yy) { 11374eb6d288SHong Zhang ierr = PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 11384eb6d288SHong Zhang } 113926e093fcSHong Zhang } else { 11402d61bbb3SSatish Balay ii = a->i; 114126e093fcSHong Zhang y = yarray; 114226e093fcSHong Zhang z = zarray; 114326e093fcSHong Zhang } 11442d61bbb3SSatish Balay 11452d61bbb3SSatish Balay for (i=0; i<mbs; i++) { 11462d61bbb3SSatish Balay n = ii[1] - ii[0]; ii++; 114726e093fcSHong Zhang if (usecprow) { 11487b2bb3b9SHong Zhang z = zarray + 2*ridx[i]; 11497b2bb3b9SHong Zhang y = yarray + 2*ridx[i]; 115026e093fcSHong Zhang } 11512d61bbb3SSatish Balay sum1 = y[0]; sum2 = y[1]; 1152444d8c10SJed Brown PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1153444d8c10SJed Brown PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 11542d61bbb3SSatish Balay for (j=0; j<n; j++) { 115526fbe8dcSKarl Rupp xb = x + 2*(*idx++); 115626fbe8dcSKarl Rupp x1 = xb[0]; 115726fbe8dcSKarl Rupp x2 = xb[1]; 115826fbe8dcSKarl Rupp 11592d61bbb3SSatish Balay sum1 += v[0]*x1 + v[2]*x2; 11602d61bbb3SSatish Balay sum2 += v[1]*x1 + v[3]*x2; 11612d61bbb3SSatish Balay v += 4; 11622d61bbb3SSatish Balay } 11632d61bbb3SSatish Balay z[0] = sum1; z[1] = sum2; 116426e093fcSHong Zhang if (!usecprow) { 11652d61bbb3SSatish Balay z += 2; y += 2; 11662d61bbb3SSatish Balay } 116726e093fcSHong Zhang } 11681ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 116926e093fcSHong Zhang ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 11702e8a6d31SBarry Smith if (zz != yy) { 117126e093fcSHong Zhang ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 11722e8a6d31SBarry Smith } 1173dc0b31edSSatish Balay ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr); 11742d61bbb3SSatish Balay PetscFunctionReturn(0); 11752d61bbb3SSatish Balay } 11762d61bbb3SSatish Balay 11774a2ae208SSatish Balay #undef __FUNCT__ 11784a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqBAIJ_3" 1179dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 11802d61bbb3SSatish Balay { 11812d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1182dcf5cc72SBarry Smith PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray; 11833f1db9ecSBarry Smith MatScalar *v; 1184dfbe8321SBarry Smith PetscErrorCode ierr; 11850298fd71SBarry Smith PetscInt mbs =a->mbs,i,*idx,*ii,j,n,*ridx=NULL; 1186ace3abfcSBarry Smith PetscBool usecprow=a->compressedrow.use; 11872d61bbb3SSatish Balay 11882d61bbb3SSatish Balay PetscFunctionBegin; 11891ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 119026e093fcSHong Zhang ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 11912e8a6d31SBarry Smith if (zz != yy) { 119226e093fcSHong Zhang ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 11932e8a6d31SBarry Smith } else { 119426e093fcSHong Zhang zarray = yarray; 11952e8a6d31SBarry Smith } 11962d61bbb3SSatish Balay 11972d61bbb3SSatish Balay idx = a->j; 11982d61bbb3SSatish Balay v = a->a; 119926e093fcSHong Zhang if (usecprow) { 12004eb6d288SHong Zhang if (zz != yy) { 12014eb6d288SHong Zhang ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 12024eb6d288SHong Zhang } 120326e093fcSHong Zhang mbs = a->compressedrow.nrows; 120426e093fcSHong Zhang ii = a->compressedrow.i; 12057b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 120626e093fcSHong Zhang } else { 12072d61bbb3SSatish Balay ii = a->i; 120826e093fcSHong Zhang y = yarray; 120926e093fcSHong Zhang z = zarray; 121026e093fcSHong Zhang } 12112d61bbb3SSatish Balay 12122d61bbb3SSatish Balay for (i=0; i<mbs; i++) { 12132d61bbb3SSatish Balay n = ii[1] - ii[0]; ii++; 121426e093fcSHong Zhang if (usecprow) { 12157b2bb3b9SHong Zhang z = zarray + 3*ridx[i]; 12167b2bb3b9SHong Zhang y = yarray + 3*ridx[i]; 121726e093fcSHong Zhang } 12182d61bbb3SSatish Balay sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; 1219444d8c10SJed Brown PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1220444d8c10SJed Brown PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 12212d61bbb3SSatish Balay for (j=0; j<n; j++) { 12222d61bbb3SSatish Balay xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 12232d61bbb3SSatish Balay sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 12242d61bbb3SSatish Balay sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 12252d61bbb3SSatish Balay sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 12262d61bbb3SSatish Balay v += 9; 12272d61bbb3SSatish Balay } 12282d61bbb3SSatish Balay z[0] = sum1; z[1] = sum2; z[2] = sum3; 122926e093fcSHong Zhang if (!usecprow) { 12302d61bbb3SSatish Balay z += 3; y += 3; 12312d61bbb3SSatish Balay } 123226e093fcSHong Zhang } 12331ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 123426e093fcSHong Zhang ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 12352e8a6d31SBarry Smith if (zz != yy) { 123626e093fcSHong Zhang ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 12372e8a6d31SBarry Smith } 1238dc0b31edSSatish Balay ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr); 12392d61bbb3SSatish Balay PetscFunctionReturn(0); 12402d61bbb3SSatish Balay } 12412d61bbb3SSatish Balay 12424a2ae208SSatish Balay #undef __FUNCT__ 12434a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqBAIJ_4" 1244dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 12452d61bbb3SSatish Balay { 12462d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1247dcf5cc72SBarry Smith PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray; 12483f1db9ecSBarry Smith MatScalar *v; 1249dfbe8321SBarry Smith PetscErrorCode ierr; 12500298fd71SBarry Smith PetscInt mbs =a->mbs,i,*idx,*ii,j,n,*ridx=NULL; 1251ace3abfcSBarry Smith PetscBool usecprow=a->compressedrow.use; 12522d61bbb3SSatish Balay 12532d61bbb3SSatish Balay PetscFunctionBegin; 12541ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 125526e093fcSHong Zhang ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 12562e8a6d31SBarry Smith if (zz != yy) { 125726e093fcSHong Zhang ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 12582e8a6d31SBarry Smith } else { 125926e093fcSHong Zhang zarray = yarray; 12602e8a6d31SBarry Smith } 12612d61bbb3SSatish Balay 12622d61bbb3SSatish Balay idx = a->j; 12632d61bbb3SSatish Balay v = a->a; 126426e093fcSHong Zhang if (usecprow) { 12654eb6d288SHong Zhang if (zz != yy) { 12664eb6d288SHong Zhang ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 12674eb6d288SHong Zhang } 126826e093fcSHong Zhang mbs = a->compressedrow.nrows; 126926e093fcSHong Zhang ii = a->compressedrow.i; 12707b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 127126e093fcSHong Zhang } else { 12722d61bbb3SSatish Balay ii = a->i; 127326e093fcSHong Zhang y = yarray; 127426e093fcSHong Zhang z = zarray; 127526e093fcSHong Zhang } 12762d61bbb3SSatish Balay 12772d61bbb3SSatish Balay for (i=0; i<mbs; i++) { 12782d61bbb3SSatish Balay n = ii[1] - ii[0]; ii++; 127926e093fcSHong Zhang if (usecprow) { 12807b2bb3b9SHong Zhang z = zarray + 4*ridx[i]; 12817b2bb3b9SHong Zhang y = yarray + 4*ridx[i]; 128226e093fcSHong Zhang } 12832d61bbb3SSatish Balay sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; 1284444d8c10SJed Brown PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1285444d8c10SJed Brown PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 12862d61bbb3SSatish Balay for (j=0; j<n; j++) { 12872d61bbb3SSatish Balay xb = x + 4*(*idx++); 12882d61bbb3SSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 12892d61bbb3SSatish Balay sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 12902d61bbb3SSatish Balay sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 12912d61bbb3SSatish Balay sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 12922d61bbb3SSatish Balay sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 12932d61bbb3SSatish Balay v += 16; 12942d61bbb3SSatish Balay } 12952d61bbb3SSatish Balay z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 129626e093fcSHong Zhang if (!usecprow) { 12972d61bbb3SSatish Balay z += 4; y += 4; 12982d61bbb3SSatish Balay } 129926e093fcSHong Zhang } 13001ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 130126e093fcSHong Zhang ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 13022e8a6d31SBarry Smith if (zz != yy) { 130326e093fcSHong Zhang ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 13042e8a6d31SBarry Smith } 1305dc0b31edSSatish Balay ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr); 13062d61bbb3SSatish Balay PetscFunctionReturn(0); 13072d61bbb3SSatish Balay } 13082d61bbb3SSatish Balay 13094a2ae208SSatish Balay #undef __FUNCT__ 13104a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqBAIJ_5" 1311dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 13122d61bbb3SSatish Balay { 13132d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1314dcf5cc72SBarry Smith PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5; 131526e093fcSHong Zhang PetscScalar *yarray,*zarray; 13163f1db9ecSBarry Smith MatScalar *v; 1317dfbe8321SBarry Smith PetscErrorCode ierr; 13180298fd71SBarry Smith PetscInt mbs =a->mbs,i,*idx,*ii,j,n,*ridx=NULL; 1319ace3abfcSBarry Smith PetscBool usecprow=a->compressedrow.use; 13202d61bbb3SSatish Balay 13212d61bbb3SSatish Balay PetscFunctionBegin; 13221ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 132326e093fcSHong Zhang ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 13242e8a6d31SBarry Smith if (zz != yy) { 132526e093fcSHong Zhang ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 13262e8a6d31SBarry Smith } else { 132726e093fcSHong Zhang zarray = yarray; 13282e8a6d31SBarry Smith } 13292d61bbb3SSatish Balay 13302d61bbb3SSatish Balay idx = a->j; 13312d61bbb3SSatish Balay v = a->a; 133226e093fcSHong Zhang if (usecprow) { 13334eb6d288SHong Zhang if (zz != yy) { 13344eb6d288SHong Zhang ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 13354eb6d288SHong Zhang } 133626e093fcSHong Zhang mbs = a->compressedrow.nrows; 133726e093fcSHong Zhang ii = a->compressedrow.i; 13387b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 133926e093fcSHong Zhang } else { 13402d61bbb3SSatish Balay ii = a->i; 134126e093fcSHong Zhang y = yarray; 134226e093fcSHong Zhang z = zarray; 134326e093fcSHong Zhang } 13442d61bbb3SSatish Balay 13452d61bbb3SSatish Balay for (i=0; i<mbs; i++) { 13462d61bbb3SSatish Balay n = ii[1] - ii[0]; ii++; 134726e093fcSHong Zhang if (usecprow) { 13487b2bb3b9SHong Zhang z = zarray + 5*ridx[i]; 13497b2bb3b9SHong Zhang y = yarray + 5*ridx[i]; 135026e093fcSHong Zhang } 13512d61bbb3SSatish Balay sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; 1352444d8c10SJed Brown PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1353444d8c10SJed Brown PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 13542d61bbb3SSatish Balay for (j=0; j<n; j++) { 13552d61bbb3SSatish Balay xb = x + 5*(*idx++); 13562d61bbb3SSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 13572d61bbb3SSatish Balay sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 13582d61bbb3SSatish Balay sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 13592d61bbb3SSatish Balay sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 13602d61bbb3SSatish Balay sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 13612d61bbb3SSatish Balay sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 13622d61bbb3SSatish Balay v += 25; 13632d61bbb3SSatish Balay } 13642d61bbb3SSatish Balay z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 136526e093fcSHong Zhang if (!usecprow) { 13662d61bbb3SSatish Balay z += 5; y += 5; 13672d61bbb3SSatish Balay } 136826e093fcSHong Zhang } 13691ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 137026e093fcSHong Zhang ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 13712e8a6d31SBarry Smith if (zz != yy) { 137226e093fcSHong Zhang ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 13732e8a6d31SBarry Smith } 1374dc0b31edSSatish Balay ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr); 13752d61bbb3SSatish Balay PetscFunctionReturn(0); 13762d61bbb3SSatish Balay } 13774a2ae208SSatish Balay #undef __FUNCT__ 13784a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqBAIJ_6" 1379dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 138015091d37SBarry Smith { 138115091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1382dcf5cc72SBarry Smith PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6; 138326e093fcSHong Zhang PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray; 138415091d37SBarry Smith MatScalar *v; 1385dfbe8321SBarry Smith PetscErrorCode ierr; 13860298fd71SBarry Smith PetscInt mbs =a->mbs,i,*idx,*ii,j,n,*ridx=NULL; 1387ace3abfcSBarry Smith PetscBool usecprow=a->compressedrow.use; 138815091d37SBarry Smith 138915091d37SBarry Smith PetscFunctionBegin; 13901ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 139126e093fcSHong Zhang ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 139215091d37SBarry Smith if (zz != yy) { 139326e093fcSHong Zhang ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 139415091d37SBarry Smith } else { 139526e093fcSHong Zhang zarray = yarray; 139615091d37SBarry Smith } 139715091d37SBarry Smith 139815091d37SBarry Smith idx = a->j; 139915091d37SBarry Smith v = a->a; 140026e093fcSHong Zhang if (usecprow) { 14014eb6d288SHong Zhang if (zz != yy) { 14024eb6d288SHong Zhang ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 14034eb6d288SHong Zhang } 140426e093fcSHong Zhang mbs = a->compressedrow.nrows; 140526e093fcSHong Zhang ii = a->compressedrow.i; 14067b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 140726e093fcSHong Zhang } else { 140815091d37SBarry Smith ii = a->i; 140926e093fcSHong Zhang y = yarray; 141026e093fcSHong Zhang z = zarray; 141126e093fcSHong Zhang } 141215091d37SBarry Smith 141315091d37SBarry Smith for (i=0; i<mbs; i++) { 141415091d37SBarry Smith n = ii[1] - ii[0]; ii++; 141526e093fcSHong Zhang if (usecprow) { 14167b2bb3b9SHong Zhang z = zarray + 6*ridx[i]; 14177b2bb3b9SHong Zhang y = yarray + 6*ridx[i]; 141826e093fcSHong Zhang } 141915091d37SBarry Smith sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; 1420444d8c10SJed Brown PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1421444d8c10SJed Brown PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 142215091d37SBarry Smith for (j=0; j<n; j++) { 14233b95cb0eSSatish Balay xb = x + 6*(*idx++); 142415091d37SBarry Smith x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 142515091d37SBarry Smith sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 142615091d37SBarry Smith sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 142715091d37SBarry Smith sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 142815091d37SBarry Smith sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 142915091d37SBarry Smith sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 143015091d37SBarry Smith sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 143115091d37SBarry Smith v += 36; 143215091d37SBarry Smith } 143315091d37SBarry Smith z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 143426e093fcSHong Zhang if (!usecprow) { 143515091d37SBarry Smith z += 6; y += 6; 143615091d37SBarry Smith } 143726e093fcSHong Zhang } 14381ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 143926e093fcSHong Zhang ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 144015091d37SBarry Smith if (zz != yy) { 144126e093fcSHong Zhang ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 144215091d37SBarry Smith } 1443dc0b31edSSatish Balay ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr); 144415091d37SBarry Smith PetscFunctionReturn(0); 144515091d37SBarry Smith } 14462d61bbb3SSatish Balay 14474a2ae208SSatish Balay #undef __FUNCT__ 14484a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqBAIJ_7" 1449dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 14502d61bbb3SSatish Balay { 14512d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1452dcf5cc72SBarry Smith PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 145326e093fcSHong Zhang PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray; 14543f1db9ecSBarry Smith MatScalar *v; 1455dfbe8321SBarry Smith PetscErrorCode ierr; 14560298fd71SBarry Smith PetscInt mbs =a->mbs,i,*idx,*ii,j,n,*ridx=NULL; 1457ace3abfcSBarry Smith PetscBool usecprow=a->compressedrow.use; 14582d61bbb3SSatish Balay 14592d61bbb3SSatish Balay PetscFunctionBegin; 14601ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 146126e093fcSHong Zhang ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 14622e8a6d31SBarry Smith if (zz != yy) { 146326e093fcSHong Zhang ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 14642e8a6d31SBarry Smith } else { 146526e093fcSHong Zhang zarray = yarray; 14662e8a6d31SBarry Smith } 14672d61bbb3SSatish Balay 14682d61bbb3SSatish Balay idx = a->j; 14692d61bbb3SSatish Balay v = a->a; 147026e093fcSHong Zhang if (usecprow) { 14714eb6d288SHong Zhang if (zz != yy) { 14724eb6d288SHong Zhang ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 14734eb6d288SHong Zhang } 147426e093fcSHong Zhang mbs = a->compressedrow.nrows; 147526e093fcSHong Zhang ii = a->compressedrow.i; 14767b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 147726e093fcSHong Zhang } else { 14782d61bbb3SSatish Balay ii = a->i; 147926e093fcSHong Zhang y = yarray; 148026e093fcSHong Zhang z = zarray; 148126e093fcSHong Zhang } 14822d61bbb3SSatish Balay 14832d61bbb3SSatish Balay for (i=0; i<mbs; i++) { 14842d61bbb3SSatish Balay n = ii[1] - ii[0]; ii++; 148526e093fcSHong Zhang if (usecprow) { 14867b2bb3b9SHong Zhang z = zarray + 7*ridx[i]; 14877b2bb3b9SHong Zhang y = yarray + 7*ridx[i]; 148826e093fcSHong Zhang } 14892d61bbb3SSatish Balay sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 1490444d8c10SJed Brown PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1491444d8c10SJed Brown PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 14922d61bbb3SSatish Balay for (j=0; j<n; j++) { 14932d61bbb3SSatish Balay xb = x + 7*(*idx++); 14942d61bbb3SSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 14952d61bbb3SSatish Balay sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 14962d61bbb3SSatish Balay sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 14972d61bbb3SSatish Balay sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 14982d61bbb3SSatish Balay sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 14992d61bbb3SSatish Balay sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 15002d61bbb3SSatish Balay sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 15012d61bbb3SSatish Balay sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 15022d61bbb3SSatish Balay v += 49; 15032d61bbb3SSatish Balay } 15042d61bbb3SSatish Balay z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 150526e093fcSHong Zhang if (!usecprow) { 15062d61bbb3SSatish Balay z += 7; y += 7; 15072d61bbb3SSatish Balay } 150826e093fcSHong Zhang } 15091ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 151026e093fcSHong Zhang ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 15112e8a6d31SBarry Smith if (zz != yy) { 151226e093fcSHong Zhang ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 15132e8a6d31SBarry Smith } 1514dc0b31edSSatish Balay ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr); 15152d61bbb3SSatish Balay PetscFunctionReturn(0); 15162d61bbb3SSatish Balay } 1517218c64b6SSatish Balay 15184a2ae208SSatish Balay #undef __FUNCT__ 15194a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqBAIJ_N" 1520dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 15212d61bbb3SSatish Balay { 15222d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1523bba805e6SBarry Smith PetscScalar *x,*z = 0,*xb,*work,*workt,*zarray; 15243f1db9ecSBarry Smith MatScalar *v; 15256849ba73SBarry Smith PetscErrorCode ierr; 1526d0f46423SBarry Smith PetscInt mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2; 15270298fd71SBarry Smith PetscInt ncols,k,*ridx=NULL; 1528ace3abfcSBarry Smith PetscBool usecprow=a->compressedrow.use; 1529218c64b6SSatish Balay 15302d61bbb3SSatish Balay PetscFunctionBegin; 1531343551c4SBarry Smith ierr = VecCopy(yy,zz);CHKERRQ(ierr); 15321ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 153326e093fcSHong Zhang ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 15342d61bbb3SSatish Balay 15352d61bbb3SSatish Balay idx = a->j; 15362d61bbb3SSatish Balay v = a->a; 153726e093fcSHong Zhang if (usecprow) { 153826e093fcSHong Zhang mbs = a->compressedrow.nrows; 153926e093fcSHong Zhang ii = a->compressedrow.i; 15407b2bb3b9SHong Zhang ridx = a->compressedrow.rindex; 154126e093fcSHong Zhang } else { 154226e093fcSHong Zhang mbs = a->mbs; 15432d61bbb3SSatish Balay ii = a->i; 154426e093fcSHong Zhang z = zarray; 154526e093fcSHong Zhang } 15462d61bbb3SSatish Balay 15472d61bbb3SSatish Balay if (!a->mult_work) { 1548d0f46423SBarry Smith k = PetscMax(A->rmap->n,A->cmap->n); 154987828ca2SBarry Smith ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 15502d61bbb3SSatish Balay } 15512d61bbb3SSatish Balay work = a->mult_work; 15522d61bbb3SSatish Balay for (i=0; i<mbs; i++) { 15532d61bbb3SSatish Balay n = ii[1] - ii[0]; ii++; 15542d61bbb3SSatish Balay ncols = n*bs; 15552d61bbb3SSatish Balay workt = work; 15562d61bbb3SSatish Balay for (j=0; j<n; j++) { 15572d61bbb3SSatish Balay xb = x + bs*(*idx++); 15582d61bbb3SSatish Balay for (k=0; k<bs; k++) workt[k] = xb[k]; 15592d61bbb3SSatish Balay workt += bs; 15602d61bbb3SSatish Balay } 15617b2bb3b9SHong Zhang if (usecprow) z = zarray + bs*ridx[i]; 156296b95a6bSBarry Smith PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 156371044d3cSBarry Smith /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */ 15642d61bbb3SSatish Balay v += n*bs2; 156526fbe8dcSKarl Rupp if (!usecprow) z += bs; 156626e093fcSHong Zhang } 15671ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 156826e093fcSHong Zhang ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1569dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr); 15702d61bbb3SSatish Balay PetscFunctionReturn(0); 15712d61bbb3SSatish Balay } 15722d61bbb3SSatish Balay 15734a2ae208SSatish Balay #undef __FUNCT__ 1574547795f9SHong Zhang #define __FUNCT__ "MatMultHermitianTranspose_SeqBAIJ" 1575547795f9SHong Zhang PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1576547795f9SHong Zhang { 1577547795f9SHong Zhang PetscScalar zero = 0.0; 1578547795f9SHong Zhang PetscErrorCode ierr; 1579547795f9SHong Zhang 1580547795f9SHong Zhang PetscFunctionBegin; 1581547795f9SHong Zhang ierr = VecSet(zz,zero);CHKERRQ(ierr); 1582547795f9SHong Zhang ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1583547795f9SHong Zhang PetscFunctionReturn(0); 1584547795f9SHong Zhang } 1585547795f9SHong Zhang 1586547795f9SHong Zhang #undef __FUNCT__ 15874a2ae208SSatish Balay #define __FUNCT__ "MatMultTranspose_SeqBAIJ" 1588dfbe8321SBarry Smith PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 15892d61bbb3SSatish Balay { 15903447b6efSHong Zhang PetscScalar zero = 0.0; 15916849ba73SBarry Smith PetscErrorCode ierr; 15922d61bbb3SSatish Balay 15932d61bbb3SSatish Balay PetscFunctionBegin; 15942dcb1b2aSMatthew Knepley ierr = VecSet(zz,zero);CHKERRQ(ierr); 15953447b6efSHong Zhang ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 15962d61bbb3SSatish Balay PetscFunctionReturn(0); 15972d61bbb3SSatish Balay } 15982d61bbb3SSatish Balay 15994a2ae208SSatish Balay #undef __FUNCT__ 1600547795f9SHong Zhang #define __FUNCT__ "MatMultHermitianTransposeAdd_SeqBAIJ" 1601547795f9SHong Zhang PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1602547795f9SHong Zhang { 1603547795f9SHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1604547795f9SHong Zhang PetscScalar *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5; 1605547795f9SHong Zhang MatScalar *v; 1606547795f9SHong Zhang PetscErrorCode ierr; 16070298fd71SBarry Smith PetscInt mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=NULL; 1608547795f9SHong Zhang Mat_CompressedRow cprow = a->compressedrow; 1609ace3abfcSBarry Smith PetscBool usecprow=cprow.use; 1610547795f9SHong Zhang 1611547795f9SHong Zhang PetscFunctionBegin; 1612343551c4SBarry Smith if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 1613547795f9SHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1614547795f9SHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1615547795f9SHong Zhang 1616547795f9SHong Zhang idx = a->j; 1617547795f9SHong Zhang v = a->a; 1618547795f9SHong Zhang if (usecprow) { 1619547795f9SHong Zhang mbs = cprow.nrows; 1620547795f9SHong Zhang ii = cprow.i; 1621547795f9SHong Zhang ridx = cprow.rindex; 1622547795f9SHong Zhang } else { 1623547795f9SHong Zhang mbs=a->mbs; 1624547795f9SHong Zhang ii = a->i; 1625547795f9SHong Zhang xb = x; 1626547795f9SHong Zhang } 1627547795f9SHong Zhang 1628547795f9SHong Zhang switch (bs) { 1629547795f9SHong Zhang case 1: 1630547795f9SHong Zhang for (i=0; i<mbs; i++) { 1631547795f9SHong Zhang if (usecprow) xb = x + ridx[i]; 1632547795f9SHong Zhang x1 = xb[0]; 1633547795f9SHong Zhang ib = idx + ii[0]; 1634547795f9SHong Zhang n = ii[1] - ii[0]; ii++; 1635547795f9SHong Zhang for (j=0; j<n; j++) { 1636547795f9SHong Zhang rval = ib[j]; 1637547795f9SHong Zhang z[rval] += PetscConj(*v) * x1; 1638547795f9SHong Zhang v++; 1639547795f9SHong Zhang } 1640547795f9SHong Zhang if (!usecprow) xb++; 1641547795f9SHong Zhang } 1642547795f9SHong Zhang break; 1643547795f9SHong Zhang case 2: 1644547795f9SHong Zhang for (i=0; i<mbs; i++) { 1645547795f9SHong Zhang if (usecprow) xb = x + 2*ridx[i]; 1646547795f9SHong Zhang x1 = xb[0]; x2 = xb[1]; 1647547795f9SHong Zhang ib = idx + ii[0]; 1648547795f9SHong Zhang n = ii[1] - ii[0]; ii++; 1649547795f9SHong Zhang for (j=0; j<n; j++) { 1650547795f9SHong Zhang rval = ib[j]*2; 1651547795f9SHong Zhang z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2; 1652547795f9SHong Zhang z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2; 1653547795f9SHong Zhang v += 4; 1654547795f9SHong Zhang } 1655547795f9SHong Zhang if (!usecprow) xb += 2; 1656547795f9SHong Zhang } 1657547795f9SHong Zhang break; 1658547795f9SHong Zhang case 3: 1659547795f9SHong Zhang for (i=0; i<mbs; i++) { 1660547795f9SHong Zhang if (usecprow) xb = x + 3*ridx[i]; 1661547795f9SHong Zhang x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1662547795f9SHong Zhang ib = idx + ii[0]; 1663547795f9SHong Zhang n = ii[1] - ii[0]; ii++; 1664547795f9SHong Zhang for (j=0; j<n; j++) { 1665547795f9SHong Zhang rval = ib[j]*3; 1666547795f9SHong Zhang z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3; 1667547795f9SHong Zhang z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3; 1668547795f9SHong Zhang z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3; 1669547795f9SHong Zhang v += 9; 1670547795f9SHong Zhang } 1671547795f9SHong Zhang if (!usecprow) xb += 3; 1672547795f9SHong Zhang } 1673547795f9SHong Zhang break; 1674547795f9SHong Zhang case 4: 1675547795f9SHong Zhang for (i=0; i<mbs; i++) { 1676547795f9SHong Zhang if (usecprow) xb = x + 4*ridx[i]; 1677547795f9SHong Zhang x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1678547795f9SHong Zhang ib = idx + ii[0]; 1679547795f9SHong Zhang n = ii[1] - ii[0]; ii++; 1680547795f9SHong Zhang for (j=0; j<n; j++) { 1681547795f9SHong Zhang rval = ib[j]*4; 1682547795f9SHong Zhang z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4; 1683547795f9SHong Zhang z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4; 1684547795f9SHong Zhang z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4; 1685547795f9SHong Zhang z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4; 1686547795f9SHong Zhang v += 16; 1687547795f9SHong Zhang } 1688547795f9SHong Zhang if (!usecprow) xb += 4; 1689547795f9SHong Zhang } 1690547795f9SHong Zhang break; 1691547795f9SHong Zhang case 5: 1692547795f9SHong Zhang for (i=0; i<mbs; i++) { 1693547795f9SHong Zhang if (usecprow) xb = x + 5*ridx[i]; 1694547795f9SHong Zhang x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1695547795f9SHong Zhang x4 = xb[3]; x5 = xb[4]; 1696547795f9SHong Zhang ib = idx + ii[0]; 1697547795f9SHong Zhang n = ii[1] - ii[0]; ii++; 1698547795f9SHong Zhang for (j=0; j<n; j++) { 1699547795f9SHong Zhang rval = ib[j]*5; 1700547795f9SHong Zhang z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5; 1701547795f9SHong Zhang z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5; 1702547795f9SHong Zhang z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5; 1703547795f9SHong Zhang z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5; 1704547795f9SHong Zhang z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5; 1705547795f9SHong Zhang v += 25; 1706547795f9SHong Zhang } 1707547795f9SHong Zhang if (!usecprow) xb += 5; 1708547795f9SHong Zhang } 1709547795f9SHong Zhang break; 1710547795f9SHong Zhang default: { /* block sizes larger than 5 by 5 are handled by BLAS */ 1711547795f9SHong Zhang PetscInt ncols,k; 1712547795f9SHong Zhang PetscScalar *work,*workt,*xtmp; 1713547795f9SHong Zhang 1714e32f2f54SBarry Smith SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet"); 1715547795f9SHong Zhang if (!a->mult_work) { 1716547795f9SHong Zhang k = PetscMax(A->rmap->n,A->cmap->n); 1717547795f9SHong Zhang ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1718547795f9SHong Zhang } 1719547795f9SHong Zhang work = a->mult_work; 1720547795f9SHong Zhang xtmp = x; 1721547795f9SHong Zhang for (i=0; i<mbs; i++) { 1722547795f9SHong Zhang n = ii[1] - ii[0]; ii++; 1723547795f9SHong Zhang ncols = n*bs; 1724547795f9SHong Zhang ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 172526fbe8dcSKarl Rupp if (usecprow) xtmp = x + bs*ridx[i]; 172696b95a6bSBarry Smith PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 1727547795f9SHong Zhang /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 1728547795f9SHong Zhang v += n*bs2; 1729547795f9SHong Zhang if (!usecprow) xtmp += bs; 1730547795f9SHong Zhang workt = work; 1731547795f9SHong Zhang for (j=0; j<n; j++) { 1732547795f9SHong Zhang zb = z + bs*(*idx++); 1733547795f9SHong Zhang for (k=0; k<bs; k++) zb[k] += workt[k] ; 1734547795f9SHong Zhang workt += bs; 1735547795f9SHong Zhang } 1736547795f9SHong Zhang } 1737547795f9SHong Zhang } 1738547795f9SHong Zhang } 1739547795f9SHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1740547795f9SHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1741547795f9SHong Zhang ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 1742547795f9SHong Zhang PetscFunctionReturn(0); 1743547795f9SHong Zhang } 1744547795f9SHong Zhang 1745547795f9SHong Zhang #undef __FUNCT__ 17464a2ae208SSatish Balay #define __FUNCT__ "MatMultTransposeAdd_SeqBAIJ" 1747dfbe8321SBarry Smith PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 17482d61bbb3SSatish Balay { 17492d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1750dcf5cc72SBarry Smith PetscScalar *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5; 17513f1db9ecSBarry Smith MatScalar *v; 17526849ba73SBarry Smith PetscErrorCode ierr; 17530298fd71SBarry Smith PetscInt mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=NULL; 17543447b6efSHong Zhang Mat_CompressedRow cprow = a->compressedrow; 1755ace3abfcSBarry Smith PetscBool usecprow=cprow.use; 17562d61bbb3SSatish Balay 17572d61bbb3SSatish Balay PetscFunctionBegin; 1758343551c4SBarry Smith if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 17593447b6efSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 17603447b6efSHong Zhang ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 17612d61bbb3SSatish Balay 17622d61bbb3SSatish Balay idx = a->j; 17632d61bbb3SSatish Balay v = a->a; 17643447b6efSHong Zhang if (usecprow) { 17653447b6efSHong Zhang mbs = cprow.nrows; 17663447b6efSHong Zhang ii = cprow.i; 17677b2bb3b9SHong Zhang ridx = cprow.rindex; 17683447b6efSHong Zhang } else { 17693447b6efSHong Zhang mbs=a->mbs; 17702d61bbb3SSatish Balay ii = a->i; 1771f1af5d2fSBarry Smith xb = x; 17723447b6efSHong Zhang } 17732d61bbb3SSatish Balay 17742d61bbb3SSatish Balay switch (bs) { 17752d61bbb3SSatish Balay case 1: 17762d61bbb3SSatish Balay for (i=0; i<mbs; i++) { 17777b2bb3b9SHong Zhang if (usecprow) xb = x + ridx[i]; 1778f1af5d2fSBarry Smith x1 = xb[0]; 17793447b6efSHong Zhang ib = idx + ii[0]; 17803447b6efSHong Zhang n = ii[1] - ii[0]; ii++; 17812d61bbb3SSatish Balay for (j=0; j<n; j++) { 17822d61bbb3SSatish Balay rval = ib[j]; 1783f1af5d2fSBarry Smith z[rval] += *v * x1; 1784f1af5d2fSBarry Smith v++; 17852d61bbb3SSatish Balay } 17863447b6efSHong Zhang if (!usecprow) xb++; 17872d61bbb3SSatish Balay } 17882d61bbb3SSatish Balay break; 17892d61bbb3SSatish Balay case 2: 17902d61bbb3SSatish Balay for (i=0; i<mbs; i++) { 17917b2bb3b9SHong Zhang if (usecprow) xb = x + 2*ridx[i]; 1792f1af5d2fSBarry Smith x1 = xb[0]; x2 = xb[1]; 17933447b6efSHong Zhang ib = idx + ii[0]; 17943447b6efSHong Zhang n = ii[1] - ii[0]; ii++; 17952d61bbb3SSatish Balay for (j=0; j<n; j++) { 17962d61bbb3SSatish Balay rval = ib[j]*2; 17972d61bbb3SSatish Balay z[rval++] += v[0]*x1 + v[1]*x2; 17982d61bbb3SSatish Balay z[rval++] += v[2]*x1 + v[3]*x2; 17992d61bbb3SSatish Balay v += 4; 18002d61bbb3SSatish Balay } 18013447b6efSHong Zhang if (!usecprow) xb += 2; 18022d61bbb3SSatish Balay } 18032d61bbb3SSatish Balay break; 18042d61bbb3SSatish Balay case 3: 18052d61bbb3SSatish Balay for (i=0; i<mbs; i++) { 18067b2bb3b9SHong Zhang if (usecprow) xb = x + 3*ridx[i]; 1807f1af5d2fSBarry Smith x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 18083447b6efSHong Zhang ib = idx + ii[0]; 18093447b6efSHong Zhang n = ii[1] - ii[0]; ii++; 18102d61bbb3SSatish Balay for (j=0; j<n; j++) { 18112d61bbb3SSatish Balay rval = ib[j]*3; 18122d61bbb3SSatish Balay z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3; 18132d61bbb3SSatish Balay z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3; 18142d61bbb3SSatish Balay z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3; 18152d61bbb3SSatish Balay v += 9; 18162d61bbb3SSatish Balay } 18173447b6efSHong Zhang if (!usecprow) xb += 3; 18182d61bbb3SSatish Balay } 18192d61bbb3SSatish Balay break; 18202d61bbb3SSatish Balay case 4: 18212d61bbb3SSatish Balay for (i=0; i<mbs; i++) { 18227b2bb3b9SHong Zhang if (usecprow) xb = x + 4*ridx[i]; 1823f1af5d2fSBarry Smith x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 18243447b6efSHong Zhang ib = idx + ii[0]; 18253447b6efSHong Zhang n = ii[1] - ii[0]; ii++; 18262d61bbb3SSatish Balay for (j=0; j<n; j++) { 18272d61bbb3SSatish Balay rval = ib[j]*4; 18282d61bbb3SSatish Balay z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 18292d61bbb3SSatish Balay z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 18302d61bbb3SSatish Balay z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 18312d61bbb3SSatish Balay z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 18322d61bbb3SSatish Balay v += 16; 18332d61bbb3SSatish Balay } 18343447b6efSHong Zhang if (!usecprow) xb += 4; 18352d61bbb3SSatish Balay } 18362d61bbb3SSatish Balay break; 18372d61bbb3SSatish Balay case 5: 18382d61bbb3SSatish Balay for (i=0; i<mbs; i++) { 18397b2bb3b9SHong Zhang if (usecprow) xb = x + 5*ridx[i]; 1840f1af5d2fSBarry Smith x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 18412d61bbb3SSatish Balay x4 = xb[3]; x5 = xb[4]; 18423447b6efSHong Zhang ib = idx + ii[0]; 18433447b6efSHong Zhang n = ii[1] - ii[0]; ii++; 18442d61bbb3SSatish Balay for (j=0; j<n; j++) { 18452d61bbb3SSatish Balay rval = ib[j]*5; 18462d61bbb3SSatish Balay z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 18472d61bbb3SSatish Balay z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 18482d61bbb3SSatish Balay z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 18492d61bbb3SSatish Balay z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 18502d61bbb3SSatish Balay z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 18512d61bbb3SSatish Balay v += 25; 18522d61bbb3SSatish Balay } 18533447b6efSHong Zhang if (!usecprow) xb += 5; 18542d61bbb3SSatish Balay } 18552d61bbb3SSatish Balay break; 1856f1af5d2fSBarry Smith default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 1857690b6cddSBarry Smith PetscInt ncols,k; 18583447b6efSHong Zhang PetscScalar *work,*workt,*xtmp; 18593f1db9ecSBarry Smith 18602d61bbb3SSatish Balay if (!a->mult_work) { 1861d0f46423SBarry Smith k = PetscMax(A->rmap->n,A->cmap->n); 186287828ca2SBarry Smith ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 18632d61bbb3SSatish Balay } 18642d61bbb3SSatish Balay work = a->mult_work; 18653447b6efSHong Zhang xtmp = x; 18662d61bbb3SSatish Balay for (i=0; i<mbs; i++) { 18672d61bbb3SSatish Balay n = ii[1] - ii[0]; ii++; 18682d61bbb3SSatish Balay ncols = n*bs; 186987828ca2SBarry Smith ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 187026fbe8dcSKarl Rupp if (usecprow) xtmp = x + bs*ridx[i]; 187196b95a6bSBarry Smith PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 187271044d3cSBarry Smith /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 18732d61bbb3SSatish Balay v += n*bs2; 18743447b6efSHong Zhang if (!usecprow) xtmp += bs; 18752d61bbb3SSatish Balay workt = work; 18762d61bbb3SSatish Balay for (j=0; j<n; j++) { 18772d61bbb3SSatish Balay zb = z + bs*(*idx++); 18782d61bbb3SSatish Balay for (k=0; k<bs; k++) zb[k] += workt[k]; 18792d61bbb3SSatish Balay workt += bs; 18802d61bbb3SSatish Balay } 18812d61bbb3SSatish Balay } 18822d61bbb3SSatish Balay } 18832d61bbb3SSatish Balay } 18843447b6efSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 18853447b6efSHong Zhang ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1886dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 18872d61bbb3SSatish Balay PetscFunctionReturn(0); 18882d61bbb3SSatish Balay } 18892d61bbb3SSatish Balay 18904a2ae208SSatish Balay #undef __FUNCT__ 18914a2ae208SSatish Balay #define __FUNCT__ "MatScale_SeqBAIJ" 1892f4df32b1SMatthew Knepley PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha) 18932d61bbb3SSatish Balay { 18942d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data; 1895690b6cddSBarry Smith PetscInt totalnz = a->bs2*a->nz; 1896f4df32b1SMatthew Knepley PetscScalar oalpha = alpha; 1897efee365bSSatish Balay PetscErrorCode ierr; 1898c5df96a5SBarry Smith PetscBLASInt one = 1,tnz; 18992d61bbb3SSatish Balay 19002d61bbb3SSatish Balay PetscFunctionBegin; 1901c5df96a5SBarry Smith ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr); 19028b83055fSJed Brown PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one)); 1903efee365bSSatish Balay ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 19042d61bbb3SSatish Balay PetscFunctionReturn(0); 19052d61bbb3SSatish Balay } 19062d61bbb3SSatish Balay 19074a2ae208SSatish Balay #undef __FUNCT__ 19084a2ae208SSatish Balay #define __FUNCT__ "MatNorm_SeqBAIJ" 1909dfbe8321SBarry Smith PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm) 19102d61bbb3SSatish Balay { 19118a62d963SHong Zhang PetscErrorCode ierr; 19122d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 19133f1db9ecSBarry Smith MatScalar *v = a->a; 1914329f5518SBarry Smith PetscReal sum = 0.0; 1915d0f46423SBarry Smith PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1; 19162d61bbb3SSatish Balay 19172d61bbb3SSatish Balay PetscFunctionBegin; 19182d61bbb3SSatish Balay if (type == NORM_FROBENIUS) { 19192d61bbb3SSatish Balay for (i=0; i< bs2*nz; i++) { 1920329f5518SBarry Smith sum += PetscRealPart(PetscConj(*v)*(*v)); v++; 19212d61bbb3SSatish Balay } 19228f1a2a5eSBarry Smith *norm = PetscSqrtReal(sum); 19238a62d963SHong Zhang } else if (type == NORM_1) { /* maximum column sum */ 19248a62d963SHong Zhang PetscReal *tmp; 19258a62d963SHong Zhang PetscInt *bcol = a->j; 1926d0f46423SBarry Smith ierr = PetscMalloc((A->cmap->n+1)*sizeof(PetscReal),&tmp);CHKERRQ(ierr); 1927d0f46423SBarry Smith ierr = PetscMemzero(tmp,A->cmap->n*sizeof(PetscReal));CHKERRQ(ierr); 19288a62d963SHong Zhang for (i=0; i<nz; i++) { 19298a62d963SHong Zhang for (j=0; j<bs; j++) { 19308a62d963SHong Zhang k1 = bs*(*bcol) + j; /* column index */ 19318a62d963SHong Zhang for (k=0; k<bs; k++) { 19328a62d963SHong Zhang tmp[k1] += PetscAbsScalar(*v); v++; 19338a62d963SHong Zhang } 19348a62d963SHong Zhang } 19358a62d963SHong Zhang bcol++; 19368a62d963SHong Zhang } 19378a62d963SHong Zhang *norm = 0.0; 1938d0f46423SBarry Smith for (j=0; j<A->cmap->n; j++) { 19398a62d963SHong Zhang if (tmp[j] > *norm) *norm = tmp[j]; 19408a62d963SHong Zhang } 19418a62d963SHong Zhang ierr = PetscFree(tmp);CHKERRQ(ierr); 1942596552b5SBarry Smith } else if (type == NORM_INFINITY) { /* maximum row sum */ 1943596552b5SBarry Smith *norm = 0.0; 1944596552b5SBarry Smith for (k=0; k<bs; k++) { 194574f84c7bSSatish Balay for (j=0; j<a->mbs; j++) { 1946596552b5SBarry Smith v = a->a + bs2*a->i[j] + k; 1947596552b5SBarry Smith sum = 0.0; 1948596552b5SBarry Smith for (i=0; i<a->i[j+1]-a->i[j]; i++) { 19490e90e235SBarry Smith for (k1=0; k1<bs; k1++) { 1950596552b5SBarry Smith sum += PetscAbsScalar(*v); 1951596552b5SBarry Smith v += bs; 19522d61bbb3SSatish Balay } 19530e90e235SBarry Smith } 1954596552b5SBarry Smith if (sum > *norm) *norm = sum; 1955596552b5SBarry Smith } 1956596552b5SBarry Smith } 1957e7e72b3dSBarry Smith } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 19582d61bbb3SSatish Balay PetscFunctionReturn(0); 19592d61bbb3SSatish Balay } 19602d61bbb3SSatish Balay 19612d61bbb3SSatish Balay 19624a2ae208SSatish Balay #undef __FUNCT__ 19634a2ae208SSatish Balay #define __FUNCT__ "MatEqual_SeqBAIJ" 1964ace3abfcSBarry Smith PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg) 19652d61bbb3SSatish Balay { 19662d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 1967dfbe8321SBarry Smith PetscErrorCode ierr; 19682d61bbb3SSatish Balay 19692d61bbb3SSatish Balay PetscFunctionBegin; 19702d61bbb3SSatish Balay /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1971d0f46423SBarry Smith if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1972273d9f13SBarry Smith *flg = PETSC_FALSE; 1973273d9f13SBarry Smith PetscFunctionReturn(0); 19742d61bbb3SSatish Balay } 19752d61bbb3SSatish Balay 19762d61bbb3SSatish Balay /* if the a->i are the same */ 1977690b6cddSBarry Smith ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 197826fbe8dcSKarl Rupp if (!*flg) PetscFunctionReturn(0); 19792d61bbb3SSatish Balay 19802d61bbb3SSatish Balay /* if a->j are the same */ 1981690b6cddSBarry Smith ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 198226fbe8dcSKarl Rupp if (!*flg) PetscFunctionReturn(0); 198326fbe8dcSKarl Rupp 19842d61bbb3SSatish Balay /* if a->a are the same */ 1985d0f46423SBarry Smith ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 19862d61bbb3SSatish Balay PetscFunctionReturn(0); 19872d61bbb3SSatish Balay 19882d61bbb3SSatish Balay } 19892d61bbb3SSatish Balay 19904a2ae208SSatish Balay #undef __FUNCT__ 19914a2ae208SSatish Balay #define __FUNCT__ "MatGetDiagonal_SeqBAIJ" 1992dfbe8321SBarry Smith PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v) 19932d61bbb3SSatish Balay { 19942d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1995dfbe8321SBarry Smith PetscErrorCode ierr; 1996690b6cddSBarry Smith PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 199787828ca2SBarry Smith PetscScalar *x,zero = 0.0; 19983f1db9ecSBarry Smith MatScalar *aa,*aa_j; 19992d61bbb3SSatish Balay 20002d61bbb3SSatish Balay PetscFunctionBegin; 2001e32f2f54SBarry Smith if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 2002d0f46423SBarry Smith bs = A->rmap->bs; 20032d61bbb3SSatish Balay aa = a->a; 20042d61bbb3SSatish Balay ai = a->i; 20052d61bbb3SSatish Balay aj = a->j; 20062d61bbb3SSatish Balay ambs = a->mbs; 20072d61bbb3SSatish Balay bs2 = a->bs2; 20082d61bbb3SSatish Balay 20092dcb1b2aSMatthew Knepley ierr = VecSet(v,zero);CHKERRQ(ierr); 20101ebc52fbSHong Zhang ierr = VecGetArray(v,&x);CHKERRQ(ierr); 2011e1311b90SBarry Smith ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 2012e32f2f54SBarry Smith if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 20132d61bbb3SSatish Balay for (i=0; i<ambs; i++) { 20142d61bbb3SSatish Balay for (j=ai[i]; j<ai[i+1]; j++) { 20152d61bbb3SSatish Balay if (aj[j] == i) { 20162d61bbb3SSatish Balay row = i*bs; 20172d61bbb3SSatish Balay aa_j = aa+j*bs2; 20182d61bbb3SSatish Balay for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 20192d61bbb3SSatish Balay break; 20202d61bbb3SSatish Balay } 20212d61bbb3SSatish Balay } 20222d61bbb3SSatish Balay } 20231ebc52fbSHong Zhang ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 20242d61bbb3SSatish Balay PetscFunctionReturn(0); 20252d61bbb3SSatish Balay } 20262d61bbb3SSatish Balay 20274a2ae208SSatish Balay #undef __FUNCT__ 20284a2ae208SSatish Balay #define __FUNCT__ "MatDiagonalScale_SeqBAIJ" 2029dfbe8321SBarry Smith PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr) 20302d61bbb3SSatish Balay { 20312d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 203253ef36baSBarry Smith const PetscScalar *l,*r,*li,*ri; 203353ef36baSBarry Smith PetscScalar x; 20343f1db9ecSBarry Smith MatScalar *aa, *v; 2035dfbe8321SBarry Smith PetscErrorCode ierr; 203653ef36baSBarry Smith PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai; 203753ef36baSBarry Smith const PetscInt *ai,*aj; 20382d61bbb3SSatish Balay 20392d61bbb3SSatish Balay PetscFunctionBegin; 20402d61bbb3SSatish Balay ai = a->i; 20412d61bbb3SSatish Balay aj = a->j; 20422d61bbb3SSatish Balay aa = a->a; 2043d0f46423SBarry Smith m = A->rmap->n; 2044d0f46423SBarry Smith n = A->cmap->n; 2045d0f46423SBarry Smith bs = A->rmap->bs; 20462d61bbb3SSatish Balay mbs = a->mbs; 20472d61bbb3SSatish Balay bs2 = a->bs2; 20482d61bbb3SSatish Balay if (ll) { 204953ef36baSBarry Smith ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr); 2050e1311b90SBarry Smith ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 2051e32f2f54SBarry Smith if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 20522d61bbb3SSatish Balay for (i=0; i<mbs; i++) { /* for each block row */ 20532d61bbb3SSatish Balay M = ai[i+1] - ai[i]; 20542d61bbb3SSatish Balay li = l + i*bs; 20552d61bbb3SSatish Balay v = aa + bs2*ai[i]; 20562d61bbb3SSatish Balay for (j=0; j<M; j++) { /* for each block */ 20572d61bbb3SSatish Balay for (k=0; k<bs2; k++) { 20582d61bbb3SSatish Balay (*v++) *= li[k%bs]; 20592d61bbb3SSatish Balay } 20602d61bbb3SSatish Balay } 20612d61bbb3SSatish Balay } 206253ef36baSBarry Smith ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr); 2063efee365bSSatish Balay ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 20642d61bbb3SSatish Balay } 20652d61bbb3SSatish Balay 20662d61bbb3SSatish Balay if (rr) { 206753ef36baSBarry Smith ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr); 2068e1311b90SBarry Smith ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr); 2069e32f2f54SBarry Smith if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 20702d61bbb3SSatish Balay for (i=0; i<mbs; i++) { /* for each block row */ 207153ef36baSBarry Smith iai = ai[i]; 207253ef36baSBarry Smith M = ai[i+1] - iai; 207353ef36baSBarry Smith v = aa + bs2*iai; 20742d61bbb3SSatish Balay for (j=0; j<M; j++) { /* for each block */ 207553ef36baSBarry Smith ri = r + bs*aj[iai+j]; 20762d61bbb3SSatish Balay for (k=0; k<bs; k++) { 20772d61bbb3SSatish Balay x = ri[k]; 207853ef36baSBarry Smith for (tmp=0; tmp<bs; tmp++) v[tmp] *= x; 207953ef36baSBarry Smith v += bs; 20802d61bbb3SSatish Balay } 20812d61bbb3SSatish Balay } 20822d61bbb3SSatish Balay } 208353ef36baSBarry Smith ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr); 2084efee365bSSatish Balay ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 20852d61bbb3SSatish Balay } 20862d61bbb3SSatish Balay PetscFunctionReturn(0); 20872d61bbb3SSatish Balay } 20882d61bbb3SSatish Balay 20892d61bbb3SSatish Balay 20904a2ae208SSatish Balay #undef __FUNCT__ 20914a2ae208SSatish Balay #define __FUNCT__ "MatGetInfo_SeqBAIJ" 2092dfbe8321SBarry Smith PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info) 20932d61bbb3SSatish Balay { 20942d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 20952d61bbb3SSatish Balay 20962d61bbb3SSatish Balay PetscFunctionBegin; 20972d61bbb3SSatish Balay info->block_size = a->bs2; 2098ceed8ce5SJed Brown info->nz_allocated = a->bs2*a->maxnz; 20992d61bbb3SSatish Balay info->nz_used = a->bs2*a->nz; 21002d61bbb3SSatish Balay info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 21012d61bbb3SSatish Balay info->assemblies = A->num_ass; 21028e58a170SBarry Smith info->mallocs = A->info.mallocs; 21037adad957SLisandro Dalcin info->memory = ((PetscObject)A)->mem; 2104d5f3da31SBarry Smith if (A->factortype) { 21052d61bbb3SSatish Balay info->fill_ratio_given = A->info.fill_ratio_given; 21062d61bbb3SSatish Balay info->fill_ratio_needed = A->info.fill_ratio_needed; 21072d61bbb3SSatish Balay info->factor_mallocs = A->info.factor_mallocs; 21082d61bbb3SSatish Balay } else { 21092d61bbb3SSatish Balay info->fill_ratio_given = 0; 21102d61bbb3SSatish Balay info->fill_ratio_needed = 0; 21112d61bbb3SSatish Balay info->factor_mallocs = 0; 21122d61bbb3SSatish Balay } 21132d61bbb3SSatish Balay PetscFunctionReturn(0); 21142d61bbb3SSatish Balay } 21152d61bbb3SSatish Balay 21162d61bbb3SSatish Balay 21174a2ae208SSatish Balay #undef __FUNCT__ 21184a2ae208SSatish Balay #define __FUNCT__ "MatZeroEntries_SeqBAIJ" 2119dfbe8321SBarry Smith PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 21202d61bbb3SSatish Balay { 21212d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2122dfbe8321SBarry Smith PetscErrorCode ierr; 21232d61bbb3SSatish Balay 21242d61bbb3SSatish Balay PetscFunctionBegin; 2125549d3d68SSatish Balay ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 21262d61bbb3SSatish Balay PetscFunctionReturn(0); 21272d61bbb3SSatish Balay } 2128