1*49b5e25fSSatish Balay /*$Id: SBAIJ2.c,v 1.55 2000/01/11 21:00:52 bsmith Exp $*/ 2*49b5e25fSSatish Balay 3*49b5e25fSSatish Balay #include "petscsys.h" 4*49b5e25fSSatish Balay #include "src/mat/impls/baij/seq/baij.h" 5*49b5e25fSSatish Balay #include "src/vec/vecimpl.h" 6*49b5e25fSSatish Balay #include "src/inline/spops.h" 7*49b5e25fSSatish Balay #include "src/inline/ilu.h" 8*49b5e25fSSatish Balay #include "petscbt.h" 9*49b5e25fSSatish Balay #include "sbaij.h" 10*49b5e25fSSatish Balay 11*49b5e25fSSatish Balay #undef __FUNC__ 12*49b5e25fSSatish Balay #define __FUNC__ "MatIncreaseOverlap_SeqSBAIJ" 13*49b5e25fSSatish Balay int MatIncreaseOverlap_SeqSBAIJ(Mat A,int is_max,IS *is,int ov) 14*49b5e25fSSatish Balay { 15*49b5e25fSSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 16*49b5e25fSSatish Balay int row,i,j,k,l,m,n,*idx,ierr,*nidx,isz,val,ival; 17*49b5e25fSSatish Balay int start,end,*ai,*aj,bs,*nidx2; 18*49b5e25fSSatish Balay PetscBT table; 19*49b5e25fSSatish Balay 20*49b5e25fSSatish Balay PetscFunctionBegin; 21*49b5e25fSSatish Balay m = a->mbs; 22*49b5e25fSSatish Balay ai = a->i; 23*49b5e25fSSatish Balay aj = a->j; 24*49b5e25fSSatish Balay bs = a->bs; 25*49b5e25fSSatish Balay 26*49b5e25fSSatish Balay if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative overlap specified"); 27*49b5e25fSSatish Balay 28*49b5e25fSSatish Balay ierr = PetscBTCreate(m,table);CHKERRQ(ierr); 29*49b5e25fSSatish Balay nidx = (int*)PetscMalloc((m+1)*sizeof(int));CHKPTRQ(nidx); 30*49b5e25fSSatish Balay nidx2 = (int *)PetscMalloc((a->m+1)*sizeof(int));CHKPTRQ(nidx2); 31*49b5e25fSSatish Balay 32*49b5e25fSSatish Balay for (i=0; i<is_max; i++) { 33*49b5e25fSSatish Balay /* Initialise the two local arrays */ 34*49b5e25fSSatish Balay isz = 0; 35*49b5e25fSSatish Balay ierr = PetscBTMemzero(m,table);CHKERRQ(ierr); 36*49b5e25fSSatish Balay 37*49b5e25fSSatish Balay /* Extract the indices, assume there can be duplicate entries */ 38*49b5e25fSSatish Balay ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 39*49b5e25fSSatish Balay ierr = ISGetSize(is[i],&n);CHKERRQ(ierr); 40*49b5e25fSSatish Balay 41*49b5e25fSSatish Balay /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 42*49b5e25fSSatish Balay for (j=0; j<n ; ++j){ 43*49b5e25fSSatish Balay ival = idx[j]/bs; /* convert the indices into block indices */ 44*49b5e25fSSatish Balay if (ival>m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"index greater than mat-dim"); 45*49b5e25fSSatish Balay if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;} 46*49b5e25fSSatish Balay } 47*49b5e25fSSatish Balay ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 48*49b5e25fSSatish Balay ierr = ISDestroy(is[i]);CHKERRQ(ierr); 49*49b5e25fSSatish Balay k = 0; 50*49b5e25fSSatish Balay for (j=0; j<ov; j++){ /* for each overlap*/ 51*49b5e25fSSatish Balay n = isz; 52*49b5e25fSSatish Balay for (; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */ 53*49b5e25fSSatish Balay row = nidx[k]; 54*49b5e25fSSatish Balay start = ai[row]; 55*49b5e25fSSatish Balay end = ai[row+1]; 56*49b5e25fSSatish Balay for (l = start; l<end ; l++){ 57*49b5e25fSSatish Balay val = aj[l]; 58*49b5e25fSSatish Balay if (!PetscBTLookupSet(table,val)) {nidx[isz++] = val;} 59*49b5e25fSSatish Balay } 60*49b5e25fSSatish Balay } 61*49b5e25fSSatish Balay } 62*49b5e25fSSatish Balay /* expand the Index Set */ 63*49b5e25fSSatish Balay for (j=0; j<isz; j++) { 64*49b5e25fSSatish Balay for (k=0; k<bs; k++) 65*49b5e25fSSatish Balay nidx2[j*bs+k] = nidx[j]*bs+k; 66*49b5e25fSSatish Balay } 67*49b5e25fSSatish Balay ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr); 68*49b5e25fSSatish Balay } 69*49b5e25fSSatish Balay ierr = PetscBTDestroy(table);CHKERRQ(ierr); 70*49b5e25fSSatish Balay ierr = PetscFree(nidx);CHKERRQ(ierr); 71*49b5e25fSSatish Balay ierr = PetscFree(nidx2);CHKERRQ(ierr); 72*49b5e25fSSatish Balay PetscFunctionReturn(0); 73*49b5e25fSSatish Balay } 74*49b5e25fSSatish Balay 75*49b5e25fSSatish Balay #undef __FUNC__ 76*49b5e25fSSatish Balay #define __FUNC__ "MatGetSubMatrix_SeqSBAIJ_Private" 77*49b5e25fSSatish Balay int MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B) 78*49b5e25fSSatish Balay { 79*49b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c; 80*49b5e25fSSatish Balay int *smap,i,k,kstart,kend,ierr,oldcols = a->mbs,*lens; 81*49b5e25fSSatish Balay int row,mat_i,*mat_j,tcol,*mat_ilen; 82*49b5e25fSSatish Balay int *irow,nrows,*ssmap,bs=a->bs,bs2=a->bs2; 83*49b5e25fSSatish Balay int *aj = a->j,*ai = a->i; 84*49b5e25fSSatish Balay MatScalar *mat_a; 85*49b5e25fSSatish Balay Mat C; 86*49b5e25fSSatish Balay PetscTruth flag; 87*49b5e25fSSatish Balay 88*49b5e25fSSatish Balay PetscFunctionBegin; 89*49b5e25fSSatish Balay 90*49b5e25fSSatish Balay if ( isrow != iscol ) SETERRA(1,0,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro"); 91*49b5e25fSSatish Balay ierr = ISSorted(iscol,(PetscTruth*)&i);CHKERRQ(ierr); 92*49b5e25fSSatish Balay if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"IS is not sorted"); 93*49b5e25fSSatish Balay 94*49b5e25fSSatish Balay ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 95*49b5e25fSSatish Balay ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 96*49b5e25fSSatish Balay 97*49b5e25fSSatish Balay smap = (int*)PetscMalloc((1+oldcols)*sizeof(int));CHKPTRQ(smap); 98*49b5e25fSSatish Balay ssmap = smap; 99*49b5e25fSSatish Balay lens = (int*)PetscMalloc((1+nrows)*sizeof(int));CHKPTRQ(lens); 100*49b5e25fSSatish Balay ierr = PetscMemzero(smap,oldcols*sizeof(int));CHKERRQ(ierr); 101*49b5e25fSSatish Balay for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */ 102*49b5e25fSSatish Balay /* determine lens of each row */ 103*49b5e25fSSatish Balay for (i=0; i<nrows; i++) { 104*49b5e25fSSatish Balay kstart = ai[irow[i]]; 105*49b5e25fSSatish Balay kend = kstart + a->ilen[irow[i]]; 106*49b5e25fSSatish Balay lens[i] = 0; 107*49b5e25fSSatish Balay for (k=kstart; k<kend; k++) { 108*49b5e25fSSatish Balay if (ssmap[aj[k]]) { 109*49b5e25fSSatish Balay lens[i]++; 110*49b5e25fSSatish Balay } 111*49b5e25fSSatish Balay } 112*49b5e25fSSatish Balay } 113*49b5e25fSSatish Balay /* Create and fill new matrix */ 114*49b5e25fSSatish Balay if (scall == MAT_REUSE_MATRIX) { 115*49b5e25fSSatish Balay c = (Mat_SeqSBAIJ *)((*B)->data); 116*49b5e25fSSatish Balay 117*49b5e25fSSatish Balay if (c->mbs!=nrows || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Submatrix wrong size"); 118*49b5e25fSSatish Balay ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(int),&flag);CHKERRQ(ierr); 119*49b5e25fSSatish Balay if (flag == PETSC_FALSE) { 120*49b5e25fSSatish Balay SETERRQ(PETSC_ERR_ARG_SIZ,0,"Cannot reuse matrix. wrong no of nonzeros"); 121*49b5e25fSSatish Balay } 122*49b5e25fSSatish Balay ierr = PetscMemzero(c->ilen,c->mbs*sizeof(int));CHKERRQ(ierr); 123*49b5e25fSSatish Balay C = *B; 124*49b5e25fSSatish Balay } else { 125*49b5e25fSSatish Balay ierr = MatCreateSeqSBAIJ(A->comm,bs,nrows*bs,nrows*bs,0,lens,&C);CHKERRQ(ierr); 126*49b5e25fSSatish Balay } 127*49b5e25fSSatish Balay c = (Mat_SeqSBAIJ *)(C->data); 128*49b5e25fSSatish Balay for (i=0; i<nrows; i++) { 129*49b5e25fSSatish Balay row = irow[i]; 130*49b5e25fSSatish Balay kstart = ai[row]; 131*49b5e25fSSatish Balay kend = kstart + a->ilen[row]; 132*49b5e25fSSatish Balay mat_i = c->i[i]; 133*49b5e25fSSatish Balay mat_j = c->j + mat_i; 134*49b5e25fSSatish Balay mat_a = c->a + mat_i*bs2; 135*49b5e25fSSatish Balay mat_ilen = c->ilen + i; 136*49b5e25fSSatish Balay for (k=kstart; k<kend; k++) { 137*49b5e25fSSatish Balay if ((tcol=ssmap[a->j[k]])) { 138*49b5e25fSSatish Balay *mat_j++ = tcol - 1; 139*49b5e25fSSatish Balay ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 140*49b5e25fSSatish Balay mat_a += bs2; 141*49b5e25fSSatish Balay (*mat_ilen)++; 142*49b5e25fSSatish Balay } 143*49b5e25fSSatish Balay } 144*49b5e25fSSatish Balay } 145*49b5e25fSSatish Balay 146*49b5e25fSSatish Balay /* Free work space */ 147*49b5e25fSSatish Balay ierr = PetscFree(smap);CHKERRQ(ierr); 148*49b5e25fSSatish Balay ierr = PetscFree(lens);CHKERRQ(ierr); 149*49b5e25fSSatish Balay ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 150*49b5e25fSSatish Balay ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 151*49b5e25fSSatish Balay 152*49b5e25fSSatish Balay ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 153*49b5e25fSSatish Balay *B = C; 154*49b5e25fSSatish Balay PetscFunctionReturn(0); 155*49b5e25fSSatish Balay } 156*49b5e25fSSatish Balay 157*49b5e25fSSatish Balay #undef __FUNC__ 158*49b5e25fSSatish Balay #define __FUNC__ "MatGetSubMatrix_SeqSBAIJ" 159*49b5e25fSSatish Balay int MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B) 160*49b5e25fSSatish Balay { 161*49b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 162*49b5e25fSSatish Balay IS is1; 163*49b5e25fSSatish Balay int *vary,*iary,*irow,nrows,i,ierr,bs=a->bs,count; 164*49b5e25fSSatish Balay 165*49b5e25fSSatish Balay PetscFunctionBegin; 166*49b5e25fSSatish Balay if ( isrow != iscol ) SETERRA(1,0,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro"); 167*49b5e25fSSatish Balay 168*49b5e25fSSatish Balay ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 169*49b5e25fSSatish Balay ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 170*49b5e25fSSatish Balay 171*49b5e25fSSatish Balay /* Verify if the indices corespond to each element in a block 172*49b5e25fSSatish Balay and form the IS with compressed IS */ 173*49b5e25fSSatish Balay vary = (int*)PetscMalloc(2*(a->mbs+1)*sizeof(int));CHKPTRQ(vary); 174*49b5e25fSSatish Balay iary = vary + a->mbs; 175*49b5e25fSSatish Balay ierr = PetscMemzero(vary,(a->mbs)*sizeof(int));CHKERRQ(ierr); 176*49b5e25fSSatish Balay for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 177*49b5e25fSSatish Balay 178*49b5e25fSSatish Balay count = 0; 179*49b5e25fSSatish Balay for (i=0; i<a->mbs; i++) { 180*49b5e25fSSatish Balay if (vary[i]!=0 && vary[i]!=bs) SETERRA(1,0,"Index set does not match blocks"); 181*49b5e25fSSatish Balay if (vary[i]==bs) iary[count++] = i; 182*49b5e25fSSatish Balay } 183*49b5e25fSSatish Balay ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr); 184*49b5e25fSSatish Balay 185*49b5e25fSSatish Balay ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 186*49b5e25fSSatish Balay ierr = PetscFree(vary);CHKERRQ(ierr); 187*49b5e25fSSatish Balay 188*49b5e25fSSatish Balay ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);CHKERRQ(ierr); 189*49b5e25fSSatish Balay ISDestroy(is1); 190*49b5e25fSSatish Balay PetscFunctionReturn(0); 191*49b5e25fSSatish Balay } 192*49b5e25fSSatish Balay 193*49b5e25fSSatish Balay #undef __FUNC__ 194*49b5e25fSSatish Balay #define __FUNC__ "MatGetSubMatrices_SeqSBAIJ" 195*49b5e25fSSatish Balay int MatGetSubMatrices_SeqSBAIJ(Mat A,int n,IS *irow,IS *icol,MatReuse scall,Mat **B) 196*49b5e25fSSatish Balay { 197*49b5e25fSSatish Balay int ierr,i; 198*49b5e25fSSatish Balay 199*49b5e25fSSatish Balay PetscFunctionBegin; 200*49b5e25fSSatish Balay if (scall == MAT_INITIAL_MATRIX) { 201*49b5e25fSSatish Balay *B = (Mat*)PetscMalloc((n+1)*sizeof(Mat));CHKPTRQ(*B); 202*49b5e25fSSatish Balay } 203*49b5e25fSSatish Balay 204*49b5e25fSSatish Balay for (i=0; i<n; i++) { 205*49b5e25fSSatish Balay ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr); 206*49b5e25fSSatish Balay } 207*49b5e25fSSatish Balay PetscFunctionReturn(0); 208*49b5e25fSSatish Balay } 209*49b5e25fSSatish Balay 210*49b5e25fSSatish Balay /* -------------------------------------------------------*/ 211*49b5e25fSSatish Balay /* Should check that shapes of vectors and matrices match */ 212*49b5e25fSSatish Balay /* -------------------------------------------------------*/ 213*49b5e25fSSatish Balay #include "pinclude/blaslapack.h" 214*49b5e25fSSatish Balay 215*49b5e25fSSatish Balay #undef __FUNC__ 216*49b5e25fSSatish Balay #define __FUNC__ "MatMult_SeqSBAIJ_1" 217*49b5e25fSSatish Balay int MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz) 218*49b5e25fSSatish Balay { 219*49b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 220*49b5e25fSSatish Balay Scalar *x,*z,*xb,x1,zero=0.0; 221*49b5e25fSSatish Balay MatScalar *v; 222*49b5e25fSSatish Balay int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j; 223*49b5e25fSSatish Balay 224*49b5e25fSSatish Balay PetscFunctionBegin; 225*49b5e25fSSatish Balay ierr = VecSet(&zero,zz);CHKERRQ(ierr); 226*49b5e25fSSatish Balay ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 227*49b5e25fSSatish Balay ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 228*49b5e25fSSatish Balay 229*49b5e25fSSatish Balay v = a->a; 230*49b5e25fSSatish Balay xb = x; 231*49b5e25fSSatish Balay 232*49b5e25fSSatish Balay for (i=0; i<mbs; i++) { 233*49b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th row of A */ 234*49b5e25fSSatish Balay x1 = xb[0]; 235*49b5e25fSSatish Balay ib = aj + *ai; 236*49b5e25fSSatish Balay z[i] += *v++ * x[*ib++]; /* (diag of A)*x */ 237*49b5e25fSSatish Balay for (j=1; j<n; j++) { 238*49b5e25fSSatish Balay cval = *ib; 239*49b5e25fSSatish Balay z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 240*49b5e25fSSatish Balay z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 241*49b5e25fSSatish Balay } 242*49b5e25fSSatish Balay xb++; ai++; 243*49b5e25fSSatish Balay } 244*49b5e25fSSatish Balay 245*49b5e25fSSatish Balay ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 246*49b5e25fSSatish Balay ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 247*49b5e25fSSatish Balay PLogFlops(2*(a->s_nz*2 - a->m) - a->m); /* s_nz = (nz+m)/2 */ 248*49b5e25fSSatish Balay PetscFunctionReturn(0); 249*49b5e25fSSatish Balay } 250*49b5e25fSSatish Balay 251*49b5e25fSSatish Balay #undef __FUNC__ 252*49b5e25fSSatish Balay #define __FUNC__ "MatMult_SeqSBAIJ_2" 253*49b5e25fSSatish Balay int MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz) 254*49b5e25fSSatish Balay { 255*49b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 256*49b5e25fSSatish Balay Scalar *x,*z,*xb,x1,x2,zero=0.0; 257*49b5e25fSSatish Balay MatScalar *v; 258*49b5e25fSSatish Balay int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j; 259*49b5e25fSSatish Balay 260*49b5e25fSSatish Balay 261*49b5e25fSSatish Balay PetscFunctionBegin; 262*49b5e25fSSatish Balay ierr = VecSet(&zero,zz);CHKERRQ(ierr); 263*49b5e25fSSatish Balay ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 264*49b5e25fSSatish Balay ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 265*49b5e25fSSatish Balay 266*49b5e25fSSatish Balay v = a->a; 267*49b5e25fSSatish Balay xb = x; 268*49b5e25fSSatish Balay 269*49b5e25fSSatish Balay for (i=0; i<mbs; i++) { 270*49b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 271*49b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; 272*49b5e25fSSatish Balay ib = aj + *ai; 273*49b5e25fSSatish Balay /* (diag of A)*x */ 274*49b5e25fSSatish Balay z[2*i] += v[0]*x1 + v[2]*x2; 275*49b5e25fSSatish Balay z[2*i+1] += v[2]*x1 + v[3]*x2; 276*49b5e25fSSatish Balay v += 4; 277*49b5e25fSSatish Balay for (j=1; j<n; j++) { 278*49b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 279*49b5e25fSSatish Balay cval = ib[j]*2; 280*49b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2; 281*49b5e25fSSatish Balay z[cval+1] += v[2]*x1 + v[3]*x2; 282*49b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 283*49b5e25fSSatish Balay z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 284*49b5e25fSSatish Balay z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 285*49b5e25fSSatish Balay v += 4; 286*49b5e25fSSatish Balay } 287*49b5e25fSSatish Balay xb +=2; ai++; 288*49b5e25fSSatish Balay } 289*49b5e25fSSatish Balay 290*49b5e25fSSatish Balay ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 291*49b5e25fSSatish Balay ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 292*49b5e25fSSatish Balay PLogFlops(8*(a->s_nz*2 - a->m) - a->m); 293*49b5e25fSSatish Balay PetscFunctionReturn(0); 294*49b5e25fSSatish Balay } 295*49b5e25fSSatish Balay 296*49b5e25fSSatish Balay #undef __FUNC__ 297*49b5e25fSSatish Balay #define __FUNC__ "MatMult_SeqSBAIJ_3" 298*49b5e25fSSatish Balay int MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz) 299*49b5e25fSSatish Balay { 300*49b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 301*49b5e25fSSatish Balay Scalar *x,*z,*xb,x1,x2,x3,zero=0.0; 302*49b5e25fSSatish Balay MatScalar *v; 303*49b5e25fSSatish Balay int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j; 304*49b5e25fSSatish Balay 305*49b5e25fSSatish Balay 306*49b5e25fSSatish Balay PetscFunctionBegin; 307*49b5e25fSSatish Balay ierr = VecSet(&zero,zz);CHKERRQ(ierr); 308*49b5e25fSSatish Balay ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 309*49b5e25fSSatish Balay ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 310*49b5e25fSSatish Balay 311*49b5e25fSSatish Balay v = a->a; 312*49b5e25fSSatish Balay xb = x; 313*49b5e25fSSatish Balay 314*49b5e25fSSatish Balay for (i=0; i<mbs; i++) { 315*49b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 316*49b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 317*49b5e25fSSatish Balay ib = aj + *ai; 318*49b5e25fSSatish Balay /* (diag of A)*x */ 319*49b5e25fSSatish Balay z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 320*49b5e25fSSatish Balay z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 321*49b5e25fSSatish Balay z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 322*49b5e25fSSatish Balay v += 9; 323*49b5e25fSSatish Balay for (j=1; j<n; j++) { 324*49b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 325*49b5e25fSSatish Balay cval = ib[j]*3; 326*49b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 327*49b5e25fSSatish Balay z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 328*49b5e25fSSatish Balay z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 329*49b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 330*49b5e25fSSatish Balay z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 331*49b5e25fSSatish Balay z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 332*49b5e25fSSatish Balay z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 333*49b5e25fSSatish Balay v += 9; 334*49b5e25fSSatish Balay } 335*49b5e25fSSatish Balay xb +=3; ai++; 336*49b5e25fSSatish Balay } 337*49b5e25fSSatish Balay 338*49b5e25fSSatish Balay ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 339*49b5e25fSSatish Balay ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 340*49b5e25fSSatish Balay PLogFlops(18*(a->s_nz*2 - a->m) - a->m); 341*49b5e25fSSatish Balay PetscFunctionReturn(0); 342*49b5e25fSSatish Balay } 343*49b5e25fSSatish Balay 344*49b5e25fSSatish Balay #undef __FUNC__ 345*49b5e25fSSatish Balay #define __FUNC__ "MatMult_SeqSBAIJ_4" 346*49b5e25fSSatish Balay int MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz) 347*49b5e25fSSatish Balay { 348*49b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 349*49b5e25fSSatish Balay Scalar *x,*z,*xb,x1,x2,x3,x4,zero=0.0; 350*49b5e25fSSatish Balay MatScalar *v; 351*49b5e25fSSatish Balay int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j; 352*49b5e25fSSatish Balay 353*49b5e25fSSatish Balay PetscFunctionBegin; 354*49b5e25fSSatish Balay ierr = VecSet(&zero,zz);CHKERRQ(ierr); 355*49b5e25fSSatish Balay ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 356*49b5e25fSSatish Balay ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 357*49b5e25fSSatish Balay 358*49b5e25fSSatish Balay v = a->a; 359*49b5e25fSSatish Balay xb = x; 360*49b5e25fSSatish Balay 361*49b5e25fSSatish Balay for (i=0; i<mbs; i++) { 362*49b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 363*49b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 364*49b5e25fSSatish Balay ib = aj + *ai; 365*49b5e25fSSatish Balay /* (diag of A)*x */ 366*49b5e25fSSatish Balay z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 367*49b5e25fSSatish Balay z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 368*49b5e25fSSatish Balay z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 369*49b5e25fSSatish Balay z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 370*49b5e25fSSatish Balay v += 16; 371*49b5e25fSSatish Balay for (j=1; j<n; j++) { 372*49b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 373*49b5e25fSSatish Balay cval = ib[j]*4; 374*49b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 375*49b5e25fSSatish Balay z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 376*49b5e25fSSatish Balay z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 377*49b5e25fSSatish Balay z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 378*49b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 379*49b5e25fSSatish Balay z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 380*49b5e25fSSatish Balay z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 381*49b5e25fSSatish Balay z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 382*49b5e25fSSatish Balay z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 383*49b5e25fSSatish Balay v += 16; 384*49b5e25fSSatish Balay } 385*49b5e25fSSatish Balay xb +=4; ai++; 386*49b5e25fSSatish Balay } 387*49b5e25fSSatish Balay 388*49b5e25fSSatish Balay ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 389*49b5e25fSSatish Balay ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 390*49b5e25fSSatish Balay PLogFlops(32*(a->s_nz*2 - a->m) - a->m); 391*49b5e25fSSatish Balay PetscFunctionReturn(0); 392*49b5e25fSSatish Balay } 393*49b5e25fSSatish Balay 394*49b5e25fSSatish Balay #undef __FUNC__ 395*49b5e25fSSatish Balay #define __FUNC__ "MatMult_SeqSBAIJ_5" 396*49b5e25fSSatish Balay int MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz) 397*49b5e25fSSatish Balay { 398*49b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 399*49b5e25fSSatish Balay Scalar *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0; 400*49b5e25fSSatish Balay MatScalar *v; 401*49b5e25fSSatish Balay int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j; 402*49b5e25fSSatish Balay 403*49b5e25fSSatish Balay PetscFunctionBegin; 404*49b5e25fSSatish Balay ierr = VecSet(&zero,zz);CHKERRQ(ierr); 405*49b5e25fSSatish Balay ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 406*49b5e25fSSatish Balay ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 407*49b5e25fSSatish Balay 408*49b5e25fSSatish Balay v = a->a; 409*49b5e25fSSatish Balay xb = x; 410*49b5e25fSSatish Balay 411*49b5e25fSSatish Balay for (i=0; i<mbs; i++) { 412*49b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 413*49b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 414*49b5e25fSSatish Balay ib = aj + *ai; 415*49b5e25fSSatish Balay /* (diag of A)*x */ 416*49b5e25fSSatish Balay z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 417*49b5e25fSSatish Balay z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 418*49b5e25fSSatish Balay z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 419*49b5e25fSSatish Balay z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 420*49b5e25fSSatish Balay z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 421*49b5e25fSSatish Balay v += 25; 422*49b5e25fSSatish Balay for (j=1; j<n; j++) { 423*49b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 424*49b5e25fSSatish Balay cval = ib[j]*5; 425*49b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 426*49b5e25fSSatish Balay z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 427*49b5e25fSSatish Balay z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 428*49b5e25fSSatish Balay z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 429*49b5e25fSSatish Balay z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 430*49b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 431*49b5e25fSSatish Balay z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 432*49b5e25fSSatish Balay z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 433*49b5e25fSSatish Balay z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 434*49b5e25fSSatish Balay z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 435*49b5e25fSSatish Balay z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 436*49b5e25fSSatish Balay v += 25; 437*49b5e25fSSatish Balay } 438*49b5e25fSSatish Balay xb +=5; ai++; 439*49b5e25fSSatish Balay } 440*49b5e25fSSatish Balay 441*49b5e25fSSatish Balay ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 442*49b5e25fSSatish Balay ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 443*49b5e25fSSatish Balay PLogFlops(50*(a->s_nz*2 - a->m) - a->m); 444*49b5e25fSSatish Balay PetscFunctionReturn(0); 445*49b5e25fSSatish Balay } 446*49b5e25fSSatish Balay 447*49b5e25fSSatish Balay 448*49b5e25fSSatish Balay #undef __FUNC__ 449*49b5e25fSSatish Balay #define __FUNC__ "MatMult_SeqSBAIJ_6" 450*49b5e25fSSatish Balay int MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz) 451*49b5e25fSSatish Balay { 452*49b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 453*49b5e25fSSatish Balay Scalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0; 454*49b5e25fSSatish Balay MatScalar *v; 455*49b5e25fSSatish Balay int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j; 456*49b5e25fSSatish Balay 457*49b5e25fSSatish Balay PetscFunctionBegin; 458*49b5e25fSSatish Balay ierr = VecSet(&zero,zz);CHKERRQ(ierr); 459*49b5e25fSSatish Balay ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 460*49b5e25fSSatish Balay ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 461*49b5e25fSSatish Balay 462*49b5e25fSSatish Balay v = a->a; 463*49b5e25fSSatish Balay xb = x; 464*49b5e25fSSatish Balay 465*49b5e25fSSatish Balay for (i=0; i<mbs; i++) { 466*49b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 467*49b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 468*49b5e25fSSatish Balay ib = aj + *ai; 469*49b5e25fSSatish Balay /* (diag of A)*x */ 470*49b5e25fSSatish Balay z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 471*49b5e25fSSatish Balay z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 472*49b5e25fSSatish Balay z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 473*49b5e25fSSatish Balay z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 474*49b5e25fSSatish Balay z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 475*49b5e25fSSatish Balay z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 476*49b5e25fSSatish Balay v += 36; 477*49b5e25fSSatish Balay for (j=1; j<n; j++) { 478*49b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 479*49b5e25fSSatish Balay cval = ib[j]*6; 480*49b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 481*49b5e25fSSatish Balay z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 482*49b5e25fSSatish Balay z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 483*49b5e25fSSatish Balay z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 484*49b5e25fSSatish Balay z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 485*49b5e25fSSatish Balay z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 486*49b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 487*49b5e25fSSatish Balay z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 488*49b5e25fSSatish Balay z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 489*49b5e25fSSatish Balay z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 490*49b5e25fSSatish Balay z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 491*49b5e25fSSatish Balay z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 492*49b5e25fSSatish Balay z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 493*49b5e25fSSatish Balay v += 36; 494*49b5e25fSSatish Balay } 495*49b5e25fSSatish Balay xb +=6; ai++; 496*49b5e25fSSatish Balay } 497*49b5e25fSSatish Balay 498*49b5e25fSSatish Balay ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 499*49b5e25fSSatish Balay ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 500*49b5e25fSSatish Balay PLogFlops(72*(a->s_nz*2 - a->m) - a->m); 501*49b5e25fSSatish Balay PetscFunctionReturn(0); 502*49b5e25fSSatish Balay } 503*49b5e25fSSatish Balay #undef __FUNC__ 504*49b5e25fSSatish Balay #define __FUNC__ "MatMult_SeqSBAIJ_7" 505*49b5e25fSSatish Balay int MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz) 506*49b5e25fSSatish Balay { 507*49b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 508*49b5e25fSSatish Balay Scalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0; 509*49b5e25fSSatish Balay MatScalar *v; 510*49b5e25fSSatish Balay int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j; 511*49b5e25fSSatish Balay 512*49b5e25fSSatish Balay PetscFunctionBegin; 513*49b5e25fSSatish Balay ierr = VecSet(&zero,zz);CHKERRQ(ierr); 514*49b5e25fSSatish Balay ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 515*49b5e25fSSatish Balay ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 516*49b5e25fSSatish Balay 517*49b5e25fSSatish Balay v = a->a; 518*49b5e25fSSatish Balay xb = x; 519*49b5e25fSSatish Balay 520*49b5e25fSSatish Balay for (i=0; i<mbs; i++) { 521*49b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 522*49b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 523*49b5e25fSSatish Balay ib = aj + *ai; 524*49b5e25fSSatish Balay /* (diag of A)*x */ 525*49b5e25fSSatish Balay z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 526*49b5e25fSSatish Balay z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 527*49b5e25fSSatish Balay z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 528*49b5e25fSSatish Balay z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 529*49b5e25fSSatish Balay z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 530*49b5e25fSSatish Balay z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 531*49b5e25fSSatish Balay z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 532*49b5e25fSSatish Balay v += 49; 533*49b5e25fSSatish Balay for (j=1; j<n; j++) { 534*49b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 535*49b5e25fSSatish Balay cval = ib[j]*7; 536*49b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 537*49b5e25fSSatish Balay z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 538*49b5e25fSSatish Balay z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 539*49b5e25fSSatish Balay z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 540*49b5e25fSSatish Balay z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 541*49b5e25fSSatish Balay z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 542*49b5e25fSSatish Balay z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 543*49b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 544*49b5e25fSSatish Balay z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 545*49b5e25fSSatish Balay z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 546*49b5e25fSSatish Balay z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 547*49b5e25fSSatish Balay z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 548*49b5e25fSSatish Balay z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 549*49b5e25fSSatish Balay z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 550*49b5e25fSSatish Balay z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 551*49b5e25fSSatish Balay v += 49; 552*49b5e25fSSatish Balay } 553*49b5e25fSSatish Balay xb +=7; ai++; 554*49b5e25fSSatish Balay } 555*49b5e25fSSatish Balay ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 556*49b5e25fSSatish Balay ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 557*49b5e25fSSatish Balay PLogFlops(98*(a->s_nz*2 - a->m) - a->m); 558*49b5e25fSSatish Balay PetscFunctionReturn(0); 559*49b5e25fSSatish Balay } 560*49b5e25fSSatish Balay 561*49b5e25fSSatish Balay /* 562*49b5e25fSSatish Balay This will not work with MatScalar == float because it calls the BLAS 563*49b5e25fSSatish Balay */ 564*49b5e25fSSatish Balay #undef __FUNC__ 565*49b5e25fSSatish Balay #define __FUNC__ "MatMult_SeqSBAIJ_N" 566*49b5e25fSSatish Balay int MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz) 567*49b5e25fSSatish Balay { 568*49b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 569*49b5e25fSSatish Balay Scalar *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,*diag_ptr,zero=0.0; 570*49b5e25fSSatish Balay MatScalar *v,*vwk; 571*49b5e25fSSatish Balay int ierr,mbs=a->mbs,i,*idx,*idxt,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2; 572*49b5e25fSSatish Balay int ncols,k,col; 573*49b5e25fSSatish Balay 574*49b5e25fSSatish Balay PetscFunctionBegin; 575*49b5e25fSSatish Balay ierr = VecSet(&zero,zz);CHKERRQ(ierr); 576*49b5e25fSSatish Balay ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 577*49b5e25fSSatish Balay ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 578*49b5e25fSSatish Balay 579*49b5e25fSSatish Balay aj = a->j; 580*49b5e25fSSatish Balay v = a->a; 581*49b5e25fSSatish Balay ii = a->i; 582*49b5e25fSSatish Balay 583*49b5e25fSSatish Balay if (!a->mult_work) { 584*49b5e25fSSatish Balay k = a->m; 585*49b5e25fSSatish Balay a->mult_work = (Scalar*)PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work); 586*49b5e25fSSatish Balay } 587*49b5e25fSSatish Balay work = a->mult_work; 588*49b5e25fSSatish Balay 589*49b5e25fSSatish Balay for (i=0; i<mbs; i++) { 590*49b5e25fSSatish Balay n = ii[1] - ii[0]; ncols = n*bs; 591*49b5e25fSSatish Balay workt = work; idx=aj+ii[0]; 592*49b5e25fSSatish Balay 593*49b5e25fSSatish Balay /* upper triangular part */ 594*49b5e25fSSatish Balay for (j=0; j<n; j++) { 595*49b5e25fSSatish Balay col = bs*(*idx); 596*49b5e25fSSatish Balay xb = x_ptr + bs*(*idx++); 597*49b5e25fSSatish Balay for (k=0; k<bs; k++) workt[k] = xb[k]; 598*49b5e25fSSatish Balay workt += bs; 599*49b5e25fSSatish Balay } 600*49b5e25fSSatish Balay /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 601*49b5e25fSSatish Balay Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 602*49b5e25fSSatish Balay 603*49b5e25fSSatish Balay /* strict lower triangular part */ 604*49b5e25fSSatish Balay ncols -= bs; 605*49b5e25fSSatish Balay if (ncols > 0){ 606*49b5e25fSSatish Balay workt = work; 607*49b5e25fSSatish Balay ierr = PetscMemzero(workt,ncols*sizeof(Scalar));CHKERRQ(ierr); 608*49b5e25fSSatish Balay Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v+bs2,workt); 609*49b5e25fSSatish Balay 610*49b5e25fSSatish Balay idx=aj+ii[0]+1; 611*49b5e25fSSatish Balay for (j=1; j<n; j++) { 612*49b5e25fSSatish Balay zb = z_ptr + bs*(*idx); 613*49b5e25fSSatish Balay idx++; 614*49b5e25fSSatish Balay for (k=0; k<bs; k++) zb[k] += workt[k] ; 615*49b5e25fSSatish Balay workt += bs; 616*49b5e25fSSatish Balay } 617*49b5e25fSSatish Balay } 618*49b5e25fSSatish Balay 619*49b5e25fSSatish Balay x += bs; v += n*bs2; z += bs; ii++; 620*49b5e25fSSatish Balay } 621*49b5e25fSSatish Balay 622*49b5e25fSSatish Balay ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 623*49b5e25fSSatish Balay ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 624*49b5e25fSSatish Balay PLogFlops(2*(a->s_nz*2 - a->m)*bs2 - a->m); 625*49b5e25fSSatish Balay PetscFunctionReturn(0); 626*49b5e25fSSatish Balay } 627*49b5e25fSSatish Balay 628*49b5e25fSSatish Balay #undef __FUNC__ 629*49b5e25fSSatish Balay #define __FUNC__ "MatMultAdd_SeqSBAIJ_1" 630*49b5e25fSSatish Balay int MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 631*49b5e25fSSatish Balay { 632*49b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 633*49b5e25fSSatish Balay Scalar *x,*y,*z,*xb,x1; 634*49b5e25fSSatish Balay MatScalar *v; 635*49b5e25fSSatish Balay int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j; 636*49b5e25fSSatish Balay 637*49b5e25fSSatish Balay PetscFunctionBegin; 638*49b5e25fSSatish Balay ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 639*49b5e25fSSatish Balay if (yy != xx) { 640*49b5e25fSSatish Balay ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 641*49b5e25fSSatish Balay } else { 642*49b5e25fSSatish Balay y = x; 643*49b5e25fSSatish Balay } 644*49b5e25fSSatish Balay if (zz != yy) { 645*49b5e25fSSatish Balay ierr = VecCopy(yy,zz); CHKERRQ(ierr); 646*49b5e25fSSatish Balay ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 647*49b5e25fSSatish Balay } else { 648*49b5e25fSSatish Balay z = y; 649*49b5e25fSSatish Balay } 650*49b5e25fSSatish Balay 651*49b5e25fSSatish Balay v = a->a; 652*49b5e25fSSatish Balay xb = x; 653*49b5e25fSSatish Balay 654*49b5e25fSSatish Balay for (i=0; i<mbs; i++) { 655*49b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th row of A */ 656*49b5e25fSSatish Balay x1 = xb[0]; 657*49b5e25fSSatish Balay ib = aj + *ai; 658*49b5e25fSSatish Balay z[i] += *v++ * x[*ib++]; /* (diag of A)*x */ 659*49b5e25fSSatish Balay for (j=1; j<n; j++) { 660*49b5e25fSSatish Balay cval = *ib; 661*49b5e25fSSatish Balay z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 662*49b5e25fSSatish Balay z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 663*49b5e25fSSatish Balay } 664*49b5e25fSSatish Balay xb++; ai++; 665*49b5e25fSSatish Balay } 666*49b5e25fSSatish Balay 667*49b5e25fSSatish Balay ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 668*49b5e25fSSatish Balay if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 669*49b5e25fSSatish Balay if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 670*49b5e25fSSatish Balay 671*49b5e25fSSatish Balay PLogFlops(2*(a->s_nz*2 - a->m)); 672*49b5e25fSSatish Balay PetscFunctionReturn(0); 673*49b5e25fSSatish Balay } 674*49b5e25fSSatish Balay 675*49b5e25fSSatish Balay #undef __FUNC__ 676*49b5e25fSSatish Balay #define __FUNC__ "MatMultAdd_SeqSBAIJ_2" 677*49b5e25fSSatish Balay int MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 678*49b5e25fSSatish Balay { 679*49b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 680*49b5e25fSSatish Balay Scalar *x,*y,*z,*xb,x1,x2; 681*49b5e25fSSatish Balay MatScalar *v; 682*49b5e25fSSatish Balay int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j; 683*49b5e25fSSatish Balay 684*49b5e25fSSatish Balay PetscFunctionBegin; 685*49b5e25fSSatish Balay ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 686*49b5e25fSSatish Balay if (yy != xx) { 687*49b5e25fSSatish Balay ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 688*49b5e25fSSatish Balay } else { 689*49b5e25fSSatish Balay y = x; 690*49b5e25fSSatish Balay } 691*49b5e25fSSatish Balay if (zz != yy) { 692*49b5e25fSSatish Balay ierr = VecCopy(yy,zz); CHKERRQ(ierr); 693*49b5e25fSSatish Balay ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 694*49b5e25fSSatish Balay } else { 695*49b5e25fSSatish Balay z = y; 696*49b5e25fSSatish Balay } 697*49b5e25fSSatish Balay 698*49b5e25fSSatish Balay v = a->a; 699*49b5e25fSSatish Balay xb = x; 700*49b5e25fSSatish Balay 701*49b5e25fSSatish Balay for (i=0; i<mbs; i++) { 702*49b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 703*49b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; 704*49b5e25fSSatish Balay ib = aj + *ai; 705*49b5e25fSSatish Balay /* (diag of A)*x */ 706*49b5e25fSSatish Balay z[2*i] += v[0]*x1 + v[2]*x2; 707*49b5e25fSSatish Balay z[2*i+1] += v[2]*x1 + v[3]*x2; 708*49b5e25fSSatish Balay v += 4; 709*49b5e25fSSatish Balay for (j=1; j<n; j++) { 710*49b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 711*49b5e25fSSatish Balay cval = ib[j]*2; 712*49b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2; 713*49b5e25fSSatish Balay z[cval+1] += v[2]*x1 + v[3]*x2; 714*49b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 715*49b5e25fSSatish Balay z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 716*49b5e25fSSatish Balay z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 717*49b5e25fSSatish Balay v += 4; 718*49b5e25fSSatish Balay } 719*49b5e25fSSatish Balay xb +=2; ai++; 720*49b5e25fSSatish Balay } 721*49b5e25fSSatish Balay 722*49b5e25fSSatish Balay ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 723*49b5e25fSSatish Balay if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 724*49b5e25fSSatish Balay if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 725*49b5e25fSSatish Balay 726*49b5e25fSSatish Balay PLogFlops(4*(a->s_nz*2 - a->m)); 727*49b5e25fSSatish Balay PetscFunctionReturn(0); 728*49b5e25fSSatish Balay } 729*49b5e25fSSatish Balay 730*49b5e25fSSatish Balay #undef __FUNC__ 731*49b5e25fSSatish Balay #define __FUNC__ "MatMultAdd_SeqSBAIJ_3" 732*49b5e25fSSatish Balay int MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 733*49b5e25fSSatish Balay { 734*49b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 735*49b5e25fSSatish Balay Scalar *x,*y,*z,*xb,x1,x2,x3; 736*49b5e25fSSatish Balay MatScalar *v; 737*49b5e25fSSatish Balay int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j; 738*49b5e25fSSatish Balay 739*49b5e25fSSatish Balay PetscFunctionBegin; 740*49b5e25fSSatish Balay ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 741*49b5e25fSSatish Balay if (yy != xx) { 742*49b5e25fSSatish Balay ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 743*49b5e25fSSatish Balay } else { 744*49b5e25fSSatish Balay y = x; 745*49b5e25fSSatish Balay } 746*49b5e25fSSatish Balay if (zz != yy) { 747*49b5e25fSSatish Balay ierr = VecCopy(yy,zz); CHKERRQ(ierr); 748*49b5e25fSSatish Balay ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 749*49b5e25fSSatish Balay } else { 750*49b5e25fSSatish Balay z = y; 751*49b5e25fSSatish Balay } 752*49b5e25fSSatish Balay 753*49b5e25fSSatish Balay v = a->a; 754*49b5e25fSSatish Balay xb = x; 755*49b5e25fSSatish Balay 756*49b5e25fSSatish Balay for (i=0; i<mbs; i++) { 757*49b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 758*49b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 759*49b5e25fSSatish Balay ib = aj + *ai; 760*49b5e25fSSatish Balay /* (diag of A)*x */ 761*49b5e25fSSatish Balay z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 762*49b5e25fSSatish Balay z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 763*49b5e25fSSatish Balay z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 764*49b5e25fSSatish Balay v += 9; 765*49b5e25fSSatish Balay for (j=1; j<n; j++) { 766*49b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 767*49b5e25fSSatish Balay cval = ib[j]*3; 768*49b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 769*49b5e25fSSatish Balay z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 770*49b5e25fSSatish Balay z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 771*49b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 772*49b5e25fSSatish Balay z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 773*49b5e25fSSatish Balay z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 774*49b5e25fSSatish Balay z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 775*49b5e25fSSatish Balay v += 9; 776*49b5e25fSSatish Balay } 777*49b5e25fSSatish Balay xb +=3; ai++; 778*49b5e25fSSatish Balay } 779*49b5e25fSSatish Balay 780*49b5e25fSSatish Balay ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 781*49b5e25fSSatish Balay if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 782*49b5e25fSSatish Balay if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 783*49b5e25fSSatish Balay 784*49b5e25fSSatish Balay PLogFlops(18*(a->s_nz*2 - a->m)); 785*49b5e25fSSatish Balay PetscFunctionReturn(0); 786*49b5e25fSSatish Balay } 787*49b5e25fSSatish Balay 788*49b5e25fSSatish Balay #undef __FUNC__ 789*49b5e25fSSatish Balay #define __FUNC__ "MatMultAdd_SeqSBAIJ_4" 790*49b5e25fSSatish Balay int MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 791*49b5e25fSSatish Balay { 792*49b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 793*49b5e25fSSatish Balay Scalar *x,*y,*z,*xb,x1,x2,x3,x4; 794*49b5e25fSSatish Balay MatScalar *v; 795*49b5e25fSSatish Balay int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j; 796*49b5e25fSSatish Balay 797*49b5e25fSSatish Balay PetscFunctionBegin; 798*49b5e25fSSatish Balay ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 799*49b5e25fSSatish Balay if (yy != xx) { 800*49b5e25fSSatish Balay ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 801*49b5e25fSSatish Balay } else { 802*49b5e25fSSatish Balay y = x; 803*49b5e25fSSatish Balay } 804*49b5e25fSSatish Balay if (zz != yy) { 805*49b5e25fSSatish Balay ierr = VecCopy(yy,zz); CHKERRQ(ierr); 806*49b5e25fSSatish Balay ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 807*49b5e25fSSatish Balay } else { 808*49b5e25fSSatish Balay z = y; 809*49b5e25fSSatish Balay } 810*49b5e25fSSatish Balay 811*49b5e25fSSatish Balay v = a->a; 812*49b5e25fSSatish Balay xb = x; 813*49b5e25fSSatish Balay 814*49b5e25fSSatish Balay for (i=0; i<mbs; i++) { 815*49b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 816*49b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 817*49b5e25fSSatish Balay ib = aj + *ai; 818*49b5e25fSSatish Balay /* (diag of A)*x */ 819*49b5e25fSSatish Balay z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 820*49b5e25fSSatish Balay z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 821*49b5e25fSSatish Balay z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 822*49b5e25fSSatish Balay z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 823*49b5e25fSSatish Balay v += 16; 824*49b5e25fSSatish Balay for (j=1; j<n; j++) { 825*49b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 826*49b5e25fSSatish Balay cval = ib[j]*4; 827*49b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 828*49b5e25fSSatish Balay z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 829*49b5e25fSSatish Balay z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 830*49b5e25fSSatish Balay z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 831*49b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 832*49b5e25fSSatish Balay z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 833*49b5e25fSSatish Balay z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 834*49b5e25fSSatish Balay z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 835*49b5e25fSSatish Balay z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 836*49b5e25fSSatish Balay v += 16; 837*49b5e25fSSatish Balay } 838*49b5e25fSSatish Balay xb +=4; ai++; 839*49b5e25fSSatish Balay } 840*49b5e25fSSatish Balay 841*49b5e25fSSatish Balay ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 842*49b5e25fSSatish Balay if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 843*49b5e25fSSatish Balay if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 844*49b5e25fSSatish Balay 845*49b5e25fSSatish Balay PLogFlops(32*(a->s_nz*2 - a->m)); 846*49b5e25fSSatish Balay PetscFunctionReturn(0); 847*49b5e25fSSatish Balay } 848*49b5e25fSSatish Balay 849*49b5e25fSSatish Balay #undef __FUNC__ 850*49b5e25fSSatish Balay #define __FUNC__ "MatMultAdd_SeqSBAIJ_5" 851*49b5e25fSSatish Balay int MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 852*49b5e25fSSatish Balay { 853*49b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 854*49b5e25fSSatish Balay Scalar *x,*y,*z,*xb,x1,x2,x3,x4,x5; 855*49b5e25fSSatish Balay MatScalar *v; 856*49b5e25fSSatish Balay int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j; 857*49b5e25fSSatish Balay 858*49b5e25fSSatish Balay PetscFunctionBegin; 859*49b5e25fSSatish Balay ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 860*49b5e25fSSatish Balay if (yy != xx) { 861*49b5e25fSSatish Balay ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 862*49b5e25fSSatish Balay } else { 863*49b5e25fSSatish Balay y = x; 864*49b5e25fSSatish Balay } 865*49b5e25fSSatish Balay if (zz != yy) { 866*49b5e25fSSatish Balay ierr = VecCopy(yy,zz); CHKERRQ(ierr); 867*49b5e25fSSatish Balay ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 868*49b5e25fSSatish Balay } else { 869*49b5e25fSSatish Balay z = y; 870*49b5e25fSSatish Balay } 871*49b5e25fSSatish Balay 872*49b5e25fSSatish Balay v = a->a; 873*49b5e25fSSatish Balay xb = x; 874*49b5e25fSSatish Balay 875*49b5e25fSSatish Balay for (i=0; i<mbs; i++) { 876*49b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 877*49b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 878*49b5e25fSSatish Balay ib = aj + *ai; 879*49b5e25fSSatish Balay /* (diag of A)*x */ 880*49b5e25fSSatish Balay z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 881*49b5e25fSSatish Balay z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 882*49b5e25fSSatish Balay z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 883*49b5e25fSSatish Balay z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 884*49b5e25fSSatish Balay z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 885*49b5e25fSSatish Balay v += 25; 886*49b5e25fSSatish Balay for (j=1; j<n; j++) { 887*49b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 888*49b5e25fSSatish Balay cval = ib[j]*5; 889*49b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 890*49b5e25fSSatish Balay z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 891*49b5e25fSSatish Balay z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 892*49b5e25fSSatish Balay z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 893*49b5e25fSSatish Balay z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 894*49b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 895*49b5e25fSSatish Balay z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 896*49b5e25fSSatish Balay z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 897*49b5e25fSSatish Balay z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 898*49b5e25fSSatish Balay z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 899*49b5e25fSSatish Balay z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 900*49b5e25fSSatish Balay v += 25; 901*49b5e25fSSatish Balay } 902*49b5e25fSSatish Balay xb +=5; ai++; 903*49b5e25fSSatish Balay } 904*49b5e25fSSatish Balay 905*49b5e25fSSatish Balay ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 906*49b5e25fSSatish Balay if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 907*49b5e25fSSatish Balay if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 908*49b5e25fSSatish Balay 909*49b5e25fSSatish Balay PLogFlops(50*(a->s_nz*2 - a->m)); 910*49b5e25fSSatish Balay PetscFunctionReturn(0); 911*49b5e25fSSatish Balay } 912*49b5e25fSSatish Balay #undef __FUNC__ 913*49b5e25fSSatish Balay #define __FUNC__ "MatMultAdd_SeqSBAIJ_6" 914*49b5e25fSSatish Balay int MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 915*49b5e25fSSatish Balay { 916*49b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 917*49b5e25fSSatish Balay Scalar *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6; 918*49b5e25fSSatish Balay MatScalar *v; 919*49b5e25fSSatish Balay int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j; 920*49b5e25fSSatish Balay 921*49b5e25fSSatish Balay PetscFunctionBegin; 922*49b5e25fSSatish Balay ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 923*49b5e25fSSatish Balay if (yy != xx) { 924*49b5e25fSSatish Balay ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 925*49b5e25fSSatish Balay } else { 926*49b5e25fSSatish Balay y = x; 927*49b5e25fSSatish Balay } 928*49b5e25fSSatish Balay if (zz != yy) { 929*49b5e25fSSatish Balay ierr = VecCopy(yy,zz); CHKERRQ(ierr); 930*49b5e25fSSatish Balay ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 931*49b5e25fSSatish Balay } else { 932*49b5e25fSSatish Balay z = y; 933*49b5e25fSSatish Balay } 934*49b5e25fSSatish Balay 935*49b5e25fSSatish Balay v = a->a; 936*49b5e25fSSatish Balay xb = x; 937*49b5e25fSSatish Balay 938*49b5e25fSSatish Balay for (i=0; i<mbs; i++) { 939*49b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 940*49b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 941*49b5e25fSSatish Balay ib = aj + *ai; 942*49b5e25fSSatish Balay /* (diag of A)*x */ 943*49b5e25fSSatish Balay z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 944*49b5e25fSSatish Balay z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 945*49b5e25fSSatish Balay z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 946*49b5e25fSSatish Balay z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 947*49b5e25fSSatish Balay z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 948*49b5e25fSSatish Balay z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 949*49b5e25fSSatish Balay v += 36; 950*49b5e25fSSatish Balay for (j=1; j<n; j++) { 951*49b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 952*49b5e25fSSatish Balay cval = ib[j]*6; 953*49b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 954*49b5e25fSSatish Balay z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 955*49b5e25fSSatish Balay z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 956*49b5e25fSSatish Balay z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 957*49b5e25fSSatish Balay z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 958*49b5e25fSSatish Balay z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 959*49b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 960*49b5e25fSSatish Balay z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 961*49b5e25fSSatish Balay z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 962*49b5e25fSSatish Balay z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 963*49b5e25fSSatish Balay z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 964*49b5e25fSSatish Balay z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 965*49b5e25fSSatish Balay z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 966*49b5e25fSSatish Balay v += 36; 967*49b5e25fSSatish Balay } 968*49b5e25fSSatish Balay xb +=6; ai++; 969*49b5e25fSSatish Balay } 970*49b5e25fSSatish Balay 971*49b5e25fSSatish Balay ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 972*49b5e25fSSatish Balay if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 973*49b5e25fSSatish Balay if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 974*49b5e25fSSatish Balay 975*49b5e25fSSatish Balay PLogFlops(72*(a->s_nz*2 - a->m)); 976*49b5e25fSSatish Balay PetscFunctionReturn(0); 977*49b5e25fSSatish Balay } 978*49b5e25fSSatish Balay 979*49b5e25fSSatish Balay #undef __FUNC__ 980*49b5e25fSSatish Balay #define __FUNC__ "MatMultAdd_SeqSBAIJ_7" 981*49b5e25fSSatish Balay int MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 982*49b5e25fSSatish Balay { 983*49b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 984*49b5e25fSSatish Balay Scalar *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6,x7; 985*49b5e25fSSatish Balay MatScalar *v; 986*49b5e25fSSatish Balay int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j; 987*49b5e25fSSatish Balay 988*49b5e25fSSatish Balay PetscFunctionBegin; 989*49b5e25fSSatish Balay ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 990*49b5e25fSSatish Balay if (yy != xx) { 991*49b5e25fSSatish Balay ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 992*49b5e25fSSatish Balay } else { 993*49b5e25fSSatish Balay y = x; 994*49b5e25fSSatish Balay } 995*49b5e25fSSatish Balay if (zz != yy) { 996*49b5e25fSSatish Balay ierr = VecCopy(yy,zz); CHKERRQ(ierr); 997*49b5e25fSSatish Balay ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 998*49b5e25fSSatish Balay } else { 999*49b5e25fSSatish Balay z = y; 1000*49b5e25fSSatish Balay } 1001*49b5e25fSSatish Balay 1002*49b5e25fSSatish Balay v = a->a; 1003*49b5e25fSSatish Balay xb = x; 1004*49b5e25fSSatish Balay 1005*49b5e25fSSatish Balay for (i=0; i<mbs; i++) { 1006*49b5e25fSSatish Balay n = ai[1] - ai[0]; /* length of i_th block row of A */ 1007*49b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 1008*49b5e25fSSatish Balay ib = aj + *ai; 1009*49b5e25fSSatish Balay /* (diag of A)*x */ 1010*49b5e25fSSatish Balay z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 1011*49b5e25fSSatish Balay z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 1012*49b5e25fSSatish Balay z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 1013*49b5e25fSSatish Balay z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 1014*49b5e25fSSatish Balay z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 1015*49b5e25fSSatish Balay z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 1016*49b5e25fSSatish Balay z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1017*49b5e25fSSatish Balay v += 49; 1018*49b5e25fSSatish Balay for (j=1; j<n; j++) { 1019*49b5e25fSSatish Balay /* (strict lower triangular part of A)*x */ 1020*49b5e25fSSatish Balay cval = ib[j]*7; 1021*49b5e25fSSatish Balay z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 1022*49b5e25fSSatish Balay z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 1023*49b5e25fSSatish Balay z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 1024*49b5e25fSSatish Balay z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 1025*49b5e25fSSatish Balay z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 1026*49b5e25fSSatish Balay z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 1027*49b5e25fSSatish Balay z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1028*49b5e25fSSatish Balay /* (strict upper triangular part of A)*x */ 1029*49b5e25fSSatish Balay z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 1030*49b5e25fSSatish Balay z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 1031*49b5e25fSSatish Balay z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 1032*49b5e25fSSatish Balay z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 1033*49b5e25fSSatish Balay z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 1034*49b5e25fSSatish Balay z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 1035*49b5e25fSSatish Balay z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 1036*49b5e25fSSatish Balay v += 49; 1037*49b5e25fSSatish Balay } 1038*49b5e25fSSatish Balay xb +=7; ai++; 1039*49b5e25fSSatish Balay } 1040*49b5e25fSSatish Balay 1041*49b5e25fSSatish Balay ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1042*49b5e25fSSatish Balay if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 1043*49b5e25fSSatish Balay if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1044*49b5e25fSSatish Balay 1045*49b5e25fSSatish Balay PLogFlops(98*(a->s_nz*2 - a->m)); 1046*49b5e25fSSatish Balay PetscFunctionReturn(0); 1047*49b5e25fSSatish Balay } 1048*49b5e25fSSatish Balay 1049*49b5e25fSSatish Balay #undef __FUNC__ 1050*49b5e25fSSatish Balay #define __FUNC__ "MatMultAdd_SeqSBAIJ_N" 1051*49b5e25fSSatish Balay int MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1052*49b5e25fSSatish Balay { 1053*49b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1054*49b5e25fSSatish Balay Scalar *x,*x_ptr,*y,*z,*z_ptr,*xb,*zb,*work,*workt,*diag_ptr; 1055*49b5e25fSSatish Balay MatScalar *v,*vwk; 1056*49b5e25fSSatish Balay int ierr,mbs=a->mbs,i,*idx,*idxt,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2; 1057*49b5e25fSSatish Balay int ncols,k,col; 1058*49b5e25fSSatish Balay 1059*49b5e25fSSatish Balay PetscFunctionBegin; 1060*49b5e25fSSatish Balay ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 1061*49b5e25fSSatish Balay if (yy != xx) { 1062*49b5e25fSSatish Balay ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 1063*49b5e25fSSatish Balay } else { 1064*49b5e25fSSatish Balay y = x; 1065*49b5e25fSSatish Balay } 1066*49b5e25fSSatish Balay if (zz != yy) { 1067*49b5e25fSSatish Balay ierr = VecCopy(yy,zz); CHKERRQ(ierr); 1068*49b5e25fSSatish Balay ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 1069*49b5e25fSSatish Balay } else { 1070*49b5e25fSSatish Balay z = y; 1071*49b5e25fSSatish Balay } 1072*49b5e25fSSatish Balay 1073*49b5e25fSSatish Balay aj = a->j; 1074*49b5e25fSSatish Balay v = a->a; 1075*49b5e25fSSatish Balay ii = a->i; 1076*49b5e25fSSatish Balay 1077*49b5e25fSSatish Balay if (!a->mult_work) { 1078*49b5e25fSSatish Balay k = a->m; 1079*49b5e25fSSatish Balay a->mult_work = (Scalar*)PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work); 1080*49b5e25fSSatish Balay } 1081*49b5e25fSSatish Balay work = a->mult_work; 1082*49b5e25fSSatish Balay 1083*49b5e25fSSatish Balay 1084*49b5e25fSSatish Balay for (i=0; i<mbs; i++) { 1085*49b5e25fSSatish Balay n = ii[1] - ii[0]; ncols = n*bs; 1086*49b5e25fSSatish Balay workt = work; idx=aj+ii[0]; 1087*49b5e25fSSatish Balay 1088*49b5e25fSSatish Balay /* upper triangular part */ 1089*49b5e25fSSatish Balay for (j=0; j<n; j++) { 1090*49b5e25fSSatish Balay col = bs*(*idx); 1091*49b5e25fSSatish Balay xb = x_ptr + bs*(*idx++); 1092*49b5e25fSSatish Balay for (k=0; k<bs; k++) workt[k] = xb[k]; 1093*49b5e25fSSatish Balay workt += bs; 1094*49b5e25fSSatish Balay } 1095*49b5e25fSSatish Balay /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1096*49b5e25fSSatish Balay Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1097*49b5e25fSSatish Balay 1098*49b5e25fSSatish Balay /* strict lower triangular part */ 1099*49b5e25fSSatish Balay ncols -= bs; 1100*49b5e25fSSatish Balay if (ncols > 0){ 1101*49b5e25fSSatish Balay workt = work; 1102*49b5e25fSSatish Balay ierr = PetscMemzero(workt,ncols*sizeof(Scalar));CHKERRQ(ierr); 1103*49b5e25fSSatish Balay Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v+bs2,workt); 1104*49b5e25fSSatish Balay 1105*49b5e25fSSatish Balay idx=aj+ii[0]+1; 1106*49b5e25fSSatish Balay for (j=1; j<n; j++) { 1107*49b5e25fSSatish Balay zb = z_ptr + bs*(*idx); 1108*49b5e25fSSatish Balay idx++; 1109*49b5e25fSSatish Balay for (k=0; k<bs; k++) zb[k] += workt[k] ; 1110*49b5e25fSSatish Balay workt += bs; 1111*49b5e25fSSatish Balay } 1112*49b5e25fSSatish Balay } 1113*49b5e25fSSatish Balay 1114*49b5e25fSSatish Balay x += bs; v += n*bs2; z += bs; ii++; 1115*49b5e25fSSatish Balay } 1116*49b5e25fSSatish Balay 1117*49b5e25fSSatish Balay ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1118*49b5e25fSSatish Balay if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 1119*49b5e25fSSatish Balay if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1120*49b5e25fSSatish Balay 1121*49b5e25fSSatish Balay PLogFlops(2*(a->s_nz*2 - a->m)); 1122*49b5e25fSSatish Balay PetscFunctionReturn(0); 1123*49b5e25fSSatish Balay } 1124*49b5e25fSSatish Balay 1125*49b5e25fSSatish Balay #undef __FUNC__ 1126*49b5e25fSSatish Balay #define __FUNC__ "MatMultTranspose_SeqSBAIJ" 1127*49b5e25fSSatish Balay int MatMultTranspose_SeqSBAIJ(Mat A,Vec xx,Vec zz) 1128*49b5e25fSSatish Balay { 1129*49b5e25fSSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1130*49b5e25fSSatish Balay Scalar *xg,*zg,*zb,zero = 0.0; 1131*49b5e25fSSatish Balay Scalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7; 1132*49b5e25fSSatish Balay MatScalar *v; 1133*49b5e25fSSatish Balay int mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval; 1134*49b5e25fSSatish Balay int bs=a->bs,j,n,bs2=a->bs2,*ib,ierr; 1135*49b5e25fSSatish Balay 1136*49b5e25fSSatish Balay PetscFunctionBegin; 1137*49b5e25fSSatish Balay ierr = VecSet(&zero,zz);CHKERRQ(ierr); 1138*49b5e25fSSatish Balay ierr = VecGetArray(xx,&xg);CHKERRQ(ierr); x = xg; 1139*49b5e25fSSatish Balay ierr = VecGetArray(zz,&zg);CHKERRQ(ierr); z = zg; 1140*49b5e25fSSatish Balay 1141*49b5e25fSSatish Balay idx = a->j; 1142*49b5e25fSSatish Balay v = a->a; 1143*49b5e25fSSatish Balay ii = a->i; 1144*49b5e25fSSatish Balay xb = x; 1145*49b5e25fSSatish Balay switch (bs) { 1146*49b5e25fSSatish Balay case 1: 1147*49b5e25fSSatish Balay for (i=0; i<mbs; i++) { 1148*49b5e25fSSatish Balay n = ii[1] - ii[0]; ii++; 1149*49b5e25fSSatish Balay x1 = xb[0]; 1150*49b5e25fSSatish Balay ib = idx + ai[i]; 1151*49b5e25fSSatish Balay for (j=0; j<n; j++) { 1152*49b5e25fSSatish Balay rval = ib[j]; 1153*49b5e25fSSatish Balay z[rval] += *v * x1; 1154*49b5e25fSSatish Balay v++; 1155*49b5e25fSSatish Balay } 1156*49b5e25fSSatish Balay xb++; 1157*49b5e25fSSatish Balay } 1158*49b5e25fSSatish Balay break; 1159*49b5e25fSSatish Balay case 2: 1160*49b5e25fSSatish Balay for (i=0; i<mbs; i++) { 1161*49b5e25fSSatish Balay n = ii[1] - ii[0]; ii++; 1162*49b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; 1163*49b5e25fSSatish Balay ib = idx + ai[i]; 1164*49b5e25fSSatish Balay for (j=0; j<n; j++) { 1165*49b5e25fSSatish Balay rval = ib[j]*2; 1166*49b5e25fSSatish Balay z[rval++] += v[0]*x1 + v[1]*x2; 1167*49b5e25fSSatish Balay z[rval] += v[2]*x1 + v[3]*x2; 1168*49b5e25fSSatish Balay v += 4; 1169*49b5e25fSSatish Balay } 1170*49b5e25fSSatish Balay xb += 2; 1171*49b5e25fSSatish Balay } 1172*49b5e25fSSatish Balay break; 1173*49b5e25fSSatish Balay case 3: 1174*49b5e25fSSatish Balay for (i=0; i<mbs; i++) { 1175*49b5e25fSSatish Balay n = ii[1] - ii[0]; ii++; 1176*49b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1177*49b5e25fSSatish Balay ib = idx + ai[i]; 1178*49b5e25fSSatish Balay for (j=0; j<n; j++) { 1179*49b5e25fSSatish Balay rval = ib[j]*3; 1180*49b5e25fSSatish Balay z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3; 1181*49b5e25fSSatish Balay z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3; 1182*49b5e25fSSatish Balay z[rval] += v[6]*x1 + v[7]*x2 + v[8]*x3; 1183*49b5e25fSSatish Balay v += 9; 1184*49b5e25fSSatish Balay } 1185*49b5e25fSSatish Balay xb += 3; 1186*49b5e25fSSatish Balay } 1187*49b5e25fSSatish Balay break; 1188*49b5e25fSSatish Balay case 4: 1189*49b5e25fSSatish Balay for (i=0; i<mbs; i++) { 1190*49b5e25fSSatish Balay n = ii[1] - ii[0]; ii++; 1191*49b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1192*49b5e25fSSatish Balay ib = idx + ai[i]; 1193*49b5e25fSSatish Balay for (j=0; j<n; j++) { 1194*49b5e25fSSatish Balay rval = ib[j]*4; 1195*49b5e25fSSatish Balay z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 1196*49b5e25fSSatish Balay z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 1197*49b5e25fSSatish Balay z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 1198*49b5e25fSSatish Balay z[rval] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 1199*49b5e25fSSatish Balay v += 16; 1200*49b5e25fSSatish Balay } 1201*49b5e25fSSatish Balay xb += 4; 1202*49b5e25fSSatish Balay } 1203*49b5e25fSSatish Balay break; 1204*49b5e25fSSatish Balay case 5: 1205*49b5e25fSSatish Balay for (i=0; i<mbs; i++) { 1206*49b5e25fSSatish Balay n = ii[1] - ii[0]; ii++; 1207*49b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1208*49b5e25fSSatish Balay x4 = xb[3]; x5 = xb[4]; 1209*49b5e25fSSatish Balay ib = idx + ai[i]; 1210*49b5e25fSSatish Balay for (j=0; j<n; j++) { 1211*49b5e25fSSatish Balay rval = ib[j]*5; 1212*49b5e25fSSatish Balay z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1213*49b5e25fSSatish Balay z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1214*49b5e25fSSatish Balay z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 1215*49b5e25fSSatish Balay z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 1216*49b5e25fSSatish Balay z[rval] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 1217*49b5e25fSSatish Balay v += 25; 1218*49b5e25fSSatish Balay } 1219*49b5e25fSSatish Balay xb += 5; 1220*49b5e25fSSatish Balay } 1221*49b5e25fSSatish Balay break; 1222*49b5e25fSSatish Balay case 6: 1223*49b5e25fSSatish Balay for (i=0; i<mbs; i++) { 1224*49b5e25fSSatish Balay n = ii[1] - ii[0]; ii++; 1225*49b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1226*49b5e25fSSatish Balay x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 1227*49b5e25fSSatish Balay ib = idx + ai[i]; 1228*49b5e25fSSatish Balay for (j=0; j<n; j++) { 1229*49b5e25fSSatish Balay rval = ib[j]*6; 1230*49b5e25fSSatish Balay z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 1231*49b5e25fSSatish Balay z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 1232*49b5e25fSSatish Balay z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 1233*49b5e25fSSatish Balay z[rval++] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 1234*49b5e25fSSatish Balay z[rval++] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 1235*49b5e25fSSatish Balay z[rval] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 1236*49b5e25fSSatish Balay v += 36; 1237*49b5e25fSSatish Balay } 1238*49b5e25fSSatish Balay xb += 6; 1239*49b5e25fSSatish Balay } 1240*49b5e25fSSatish Balay break; 1241*49b5e25fSSatish Balay case 7: 1242*49b5e25fSSatish Balay for (i=0; i<mbs; i++) { 1243*49b5e25fSSatish Balay n = ii[1] - ii[0]; ii++; 1244*49b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1245*49b5e25fSSatish Balay x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1246*49b5e25fSSatish Balay ib = idx + ai[i]; 1247*49b5e25fSSatish Balay for (j=0; j<n; j++) { 1248*49b5e25fSSatish Balay rval = ib[j]*7; 1249*49b5e25fSSatish Balay z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1250*49b5e25fSSatish Balay z[rval++] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1251*49b5e25fSSatish Balay z[rval++] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1252*49b5e25fSSatish Balay z[rval++] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1253*49b5e25fSSatish Balay z[rval++] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1254*49b5e25fSSatish Balay z[rval++] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1255*49b5e25fSSatish Balay z[rval] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1256*49b5e25fSSatish Balay v += 49; 1257*49b5e25fSSatish Balay } 1258*49b5e25fSSatish Balay xb += 7; 1259*49b5e25fSSatish Balay } 1260*49b5e25fSSatish Balay break; 1261*49b5e25fSSatish Balay default: { /* block sizes larger then 7 by 7 are handled by BLAS */ 1262*49b5e25fSSatish Balay int ncols,k; 1263*49b5e25fSSatish Balay Scalar *work,*workt; 1264*49b5e25fSSatish Balay 1265*49b5e25fSSatish Balay if (!a->mult_work) { 1266*49b5e25fSSatish Balay k = PetscMax(a->m,a->n); 1267*49b5e25fSSatish Balay a->mult_work = (Scalar*)PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work); 1268*49b5e25fSSatish Balay } 1269*49b5e25fSSatish Balay work = a->mult_work; 1270*49b5e25fSSatish Balay for (i=0; i<mbs; i++) { 1271*49b5e25fSSatish Balay n = ii[1] - ii[0]; ii++; 1272*49b5e25fSSatish Balay ncols = n*bs; 1273*49b5e25fSSatish Balay ierr = PetscMemzero(work,ncols*sizeof(Scalar));CHKERRQ(ierr); 1274*49b5e25fSSatish Balay Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,work); 1275*49b5e25fSSatish Balay /* LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One); */ 1276*49b5e25fSSatish Balay v += n*bs2; 1277*49b5e25fSSatish Balay x += bs; 1278*49b5e25fSSatish Balay workt = work; 1279*49b5e25fSSatish Balay for (j=0; j<n; j++) { 1280*49b5e25fSSatish Balay zb = z + bs*(*idx++); 1281*49b5e25fSSatish Balay for (k=0; k<bs; k++) zb[k] += workt[k] ; 1282*49b5e25fSSatish Balay workt += bs; 1283*49b5e25fSSatish Balay } 1284*49b5e25fSSatish Balay } 1285*49b5e25fSSatish Balay } 1286*49b5e25fSSatish Balay } 1287*49b5e25fSSatish Balay ierr = VecRestoreArray(xx,&xg);CHKERRQ(ierr); 1288*49b5e25fSSatish Balay ierr = VecRestoreArray(zz,&zg);CHKERRQ(ierr); 1289*49b5e25fSSatish Balay PLogFlops(2*a->nz*a->bs2 - a->n); 1290*49b5e25fSSatish Balay PetscFunctionReturn(0); 1291*49b5e25fSSatish Balay } 1292*49b5e25fSSatish Balay 1293*49b5e25fSSatish Balay #undef __FUNC__ 1294*49b5e25fSSatish Balay #define __FUNC__ "MatMultTransposeAdd_SeqSBAIJ" 1295*49b5e25fSSatish Balay int MatMultTransposeAdd_SeqSBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1296*49b5e25fSSatish Balay 1297*49b5e25fSSatish Balay { 1298*49b5e25fSSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1299*49b5e25fSSatish Balay Scalar *xg,*zg,*zb,*x,*z,*xb,x1,x2,x3,x4,x5; 1300*49b5e25fSSatish Balay MatScalar *v; 1301*49b5e25fSSatish Balay int mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,bs=a->bs,j,n,bs2=a->bs2,*ib,ierr; 1302*49b5e25fSSatish Balay 1303*49b5e25fSSatish Balay PetscFunctionBegin; 1304*49b5e25fSSatish Balay ierr = VecGetArray(xx,&xg);CHKERRQ(ierr); x = xg; 1305*49b5e25fSSatish Balay ierr = VecGetArray(zz,&zg);CHKERRQ(ierr); z = zg; 1306*49b5e25fSSatish Balay 1307*49b5e25fSSatish Balay if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 1308*49b5e25fSSatish Balay 1309*49b5e25fSSatish Balay idx = a->j; 1310*49b5e25fSSatish Balay v = a->a; 1311*49b5e25fSSatish Balay ii = a->i; 1312*49b5e25fSSatish Balay xb = x; 1313*49b5e25fSSatish Balay 1314*49b5e25fSSatish Balay switch (bs) { 1315*49b5e25fSSatish Balay case 1: 1316*49b5e25fSSatish Balay for (i=0; i<mbs; i++) { 1317*49b5e25fSSatish Balay n = ii[1] - ii[0]; ii++; 1318*49b5e25fSSatish Balay x1 = xb[0]; 1319*49b5e25fSSatish Balay ib = idx + ai[i]; 1320*49b5e25fSSatish Balay for (j=0; j<n; j++) { 1321*49b5e25fSSatish Balay rval = ib[j]; 1322*49b5e25fSSatish Balay z[rval] += *v * x1; 1323*49b5e25fSSatish Balay v++; 1324*49b5e25fSSatish Balay } 1325*49b5e25fSSatish Balay xb++; 1326*49b5e25fSSatish Balay } 1327*49b5e25fSSatish Balay break; 1328*49b5e25fSSatish Balay case 2: 1329*49b5e25fSSatish Balay for (i=0; i<mbs; i++) { 1330*49b5e25fSSatish Balay n = ii[1] - ii[0]; ii++; 1331*49b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; 1332*49b5e25fSSatish Balay ib = idx + ai[i]; 1333*49b5e25fSSatish Balay for (j=0; j<n; j++) { 1334*49b5e25fSSatish Balay rval = ib[j]*2; 1335*49b5e25fSSatish Balay z[rval++] += v[0]*x1 + v[1]*x2; 1336*49b5e25fSSatish Balay z[rval++] += v[2]*x1 + v[3]*x2; 1337*49b5e25fSSatish Balay v += 4; 1338*49b5e25fSSatish Balay } 1339*49b5e25fSSatish Balay xb += 2; 1340*49b5e25fSSatish Balay } 1341*49b5e25fSSatish Balay break; 1342*49b5e25fSSatish Balay case 3: 1343*49b5e25fSSatish Balay for (i=0; i<mbs; i++) { 1344*49b5e25fSSatish Balay n = ii[1] - ii[0]; ii++; 1345*49b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1346*49b5e25fSSatish Balay ib = idx + ai[i]; 1347*49b5e25fSSatish Balay for (j=0; j<n; j++) { 1348*49b5e25fSSatish Balay rval = ib[j]*3; 1349*49b5e25fSSatish Balay z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3; 1350*49b5e25fSSatish Balay z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3; 1351*49b5e25fSSatish Balay z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3; 1352*49b5e25fSSatish Balay v += 9; 1353*49b5e25fSSatish Balay } 1354*49b5e25fSSatish Balay xb += 3; 1355*49b5e25fSSatish Balay } 1356*49b5e25fSSatish Balay break; 1357*49b5e25fSSatish Balay case 4: 1358*49b5e25fSSatish Balay for (i=0; i<mbs; i++) { 1359*49b5e25fSSatish Balay n = ii[1] - ii[0]; ii++; 1360*49b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1361*49b5e25fSSatish Balay ib = idx + ai[i]; 1362*49b5e25fSSatish Balay for (j=0; j<n; j++) { 1363*49b5e25fSSatish Balay rval = ib[j]*4; 1364*49b5e25fSSatish Balay z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 1365*49b5e25fSSatish Balay z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 1366*49b5e25fSSatish Balay z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 1367*49b5e25fSSatish Balay z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 1368*49b5e25fSSatish Balay v += 16; 1369*49b5e25fSSatish Balay } 1370*49b5e25fSSatish Balay xb += 4; 1371*49b5e25fSSatish Balay } 1372*49b5e25fSSatish Balay break; 1373*49b5e25fSSatish Balay case 5: 1374*49b5e25fSSatish Balay for (i=0; i<mbs; i++) { 1375*49b5e25fSSatish Balay n = ii[1] - ii[0]; ii++; 1376*49b5e25fSSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1377*49b5e25fSSatish Balay x4 = xb[3]; x5 = xb[4]; 1378*49b5e25fSSatish Balay ib = idx + ai[i]; 1379*49b5e25fSSatish Balay for (j=0; j<n; j++) { 1380*49b5e25fSSatish Balay rval = ib[j]*5; 1381*49b5e25fSSatish Balay z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1382*49b5e25fSSatish Balay z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1383*49b5e25fSSatish Balay z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 1384*49b5e25fSSatish Balay z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 1385*49b5e25fSSatish Balay z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 1386*49b5e25fSSatish Balay v += 25; 1387*49b5e25fSSatish Balay } 1388*49b5e25fSSatish Balay xb += 5; 1389*49b5e25fSSatish Balay } 1390*49b5e25fSSatish Balay break; 1391*49b5e25fSSatish Balay default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 1392*49b5e25fSSatish Balay int ncols,k; 1393*49b5e25fSSatish Balay Scalar *work,*workt; 1394*49b5e25fSSatish Balay 1395*49b5e25fSSatish Balay if (!a->mult_work) { 1396*49b5e25fSSatish Balay k = PetscMax(a->m,a->n); 1397*49b5e25fSSatish Balay a->mult_work = (Scalar*)PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work); 1398*49b5e25fSSatish Balay } 1399*49b5e25fSSatish Balay work = a->mult_work; 1400*49b5e25fSSatish Balay for (i=0; i<mbs; i++) { 1401*49b5e25fSSatish Balay n = ii[1] - ii[0]; ii++; 1402*49b5e25fSSatish Balay ncols = n*bs; 1403*49b5e25fSSatish Balay ierr = PetscMemzero(work,ncols*sizeof(Scalar));CHKERRQ(ierr); 1404*49b5e25fSSatish Balay Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,work); 1405*49b5e25fSSatish Balay /* LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One); */ 1406*49b5e25fSSatish Balay v += n*bs2; 1407*49b5e25fSSatish Balay x += bs; 1408*49b5e25fSSatish Balay workt = work; 1409*49b5e25fSSatish Balay for (j=0; j<n; j++) { 1410*49b5e25fSSatish Balay zb = z + bs*(*idx++); 1411*49b5e25fSSatish Balay for (k=0; k<bs; k++) zb[k] += workt[k] ; 1412*49b5e25fSSatish Balay workt += bs; 1413*49b5e25fSSatish Balay } 1414*49b5e25fSSatish Balay } 1415*49b5e25fSSatish Balay } 1416*49b5e25fSSatish Balay } 1417*49b5e25fSSatish Balay ierr = VecRestoreArray(xx,&xg);CHKERRQ(ierr); 1418*49b5e25fSSatish Balay ierr = VecRestoreArray(zz,&zg);CHKERRQ(ierr); 1419*49b5e25fSSatish Balay PLogFlops(2*a->nz*a->bs2); 1420*49b5e25fSSatish Balay PetscFunctionReturn(0); 1421*49b5e25fSSatish Balay } 1422*49b5e25fSSatish Balay 1423*49b5e25fSSatish Balay #undef __FUNC__ 1424*49b5e25fSSatish Balay #define __FUNC__ "MatScale_SeqSBAIJ" 1425*49b5e25fSSatish Balay int MatScale_SeqSBAIJ(Scalar *alpha,Mat inA) 1426*49b5e25fSSatish Balay { 1427*49b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data; 1428*49b5e25fSSatish Balay int one = 1,totalnz = a->bs2*a->s_nz; 1429*49b5e25fSSatish Balay 1430*49b5e25fSSatish Balay PetscFunctionBegin; 1431*49b5e25fSSatish Balay BLscal_(&totalnz,alpha,a->a,&one); 1432*49b5e25fSSatish Balay PLogFlops(totalnz); 1433*49b5e25fSSatish Balay PetscFunctionReturn(0); 1434*49b5e25fSSatish Balay } 1435*49b5e25fSSatish Balay 1436*49b5e25fSSatish Balay #undef __FUNC__ 1437*49b5e25fSSatish Balay #define __FUNC__ "MatNorm_SeqSBAIJ" 1438*49b5e25fSSatish Balay int MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm) 1439*49b5e25fSSatish Balay { 1440*49b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1441*49b5e25fSSatish Balay MatScalar *v = a->a; 1442*49b5e25fSSatish Balay PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1443*49b5e25fSSatish Balay int i,j,k,bs = a->bs,nz=a->s_nz,bs2=a->bs2,k1,mbs=a->mbs; 1444*49b5e25fSSatish Balay int *jl,*il,jmin,jmax,ierr,nexti,ik; 1445*49b5e25fSSatish Balay 1446*49b5e25fSSatish Balay PetscFunctionBegin; 1447*49b5e25fSSatish Balay if (type == NORM_FROBENIUS) { 1448*49b5e25fSSatish Balay for (k=0; k<mbs; k++){ 1449*49b5e25fSSatish Balay jmin = a->i[k]; jmax = a->i[k+1]; 1450*49b5e25fSSatish Balay /* diagonal block */ 1451*49b5e25fSSatish Balay for (i=0; i<bs2; i++){ 1452*49b5e25fSSatish Balay #if defined(PETSC_USE_COMPLEX) 1453*49b5e25fSSatish Balay sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++; 1454*49b5e25fSSatish Balay #else 1455*49b5e25fSSatish Balay sum_diag += (*v)*(*v); v++; 1456*49b5e25fSSatish Balay #endif 1457*49b5e25fSSatish Balay } 1458*49b5e25fSSatish Balay for (j=jmin+1; j<jmax; j++){ /* off-diagonal blocks */ 1459*49b5e25fSSatish Balay for (i=0; i<bs2; i++){ 1460*49b5e25fSSatish Balay #if defined(PETSC_USE_COMPLEX) 1461*49b5e25fSSatish Balay sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++; 1462*49b5e25fSSatish Balay #else 1463*49b5e25fSSatish Balay sum_off += (*v)*(*v); v++; 1464*49b5e25fSSatish Balay #endif 1465*49b5e25fSSatish Balay } 1466*49b5e25fSSatish Balay } 1467*49b5e25fSSatish Balay } 1468*49b5e25fSSatish Balay *norm = sqrt(sum_diag + 2*sum_off); 1469*49b5e25fSSatish Balay 1470*49b5e25fSSatish Balay } else if (type == NORM_INFINITY) { /* maximum row sum */ 1471*49b5e25fSSatish Balay il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il); 1472*49b5e25fSSatish Balay jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl); 1473*49b5e25fSSatish Balay sum = (Scalar*)PetscMalloc(bs*sizeof(Scalar));CHKPTRQ(sum); 1474*49b5e25fSSatish Balay for (i=0; i<mbs; i++) { 1475*49b5e25fSSatish Balay jl[i] = mbs; il[0] = 0; 1476*49b5e25fSSatish Balay } 1477*49b5e25fSSatish Balay 1478*49b5e25fSSatish Balay *norm = 0.0; 1479*49b5e25fSSatish Balay for (k=0; k<mbs; k++) { /* k_th block row */ 1480*49b5e25fSSatish Balay for (j=0; j<bs; j++) sum[j]=0.0; 1481*49b5e25fSSatish Balay 1482*49b5e25fSSatish Balay /*-- col sum --*/ 1483*49b5e25fSSatish Balay i = jl[k]; /* first |A(i,k)| to be added */ 1484*49b5e25fSSatish Balay while (i<mbs){ 1485*49b5e25fSSatish Balay nexti = jl[i]; /* next block row to be added */ 1486*49b5e25fSSatish Balay ik = il[i]; /* block index of A(i,k) in the array a */ 1487*49b5e25fSSatish Balay for (j=0; j<bs; j++){ 1488*49b5e25fSSatish Balay v = a->a + ik*bs2 + j*bs; 1489*49b5e25fSSatish Balay for (k1=0; k1<bs; k1++) { 1490*49b5e25fSSatish Balay sum[j] += PetscAbsScalar(*v); v++; 1491*49b5e25fSSatish Balay } 1492*49b5e25fSSatish Balay } 1493*49b5e25fSSatish Balay /* update il, jl */ 1494*49b5e25fSSatish Balay jmin = ik + 1; jmax = a->i[i+1]; 1495*49b5e25fSSatish Balay if (jmin < jmax){ 1496*49b5e25fSSatish Balay il[i] = jmin; 1497*49b5e25fSSatish Balay j = a->j[jmin]; 1498*49b5e25fSSatish Balay jl[i] = jl[j]; jl[j]=i; 1499*49b5e25fSSatish Balay } 1500*49b5e25fSSatish Balay i = nexti; 1501*49b5e25fSSatish Balay } 1502*49b5e25fSSatish Balay 1503*49b5e25fSSatish Balay /*-- row sum --*/ 1504*49b5e25fSSatish Balay jmin = a->i[k]; jmax = a->i[k+1]; 1505*49b5e25fSSatish Balay for (i=jmin; i<jmax; i++) { 1506*49b5e25fSSatish Balay for (j=0; j<bs; j++){ 1507*49b5e25fSSatish Balay v = a->a + i*bs2 + j; 1508*49b5e25fSSatish Balay for (k1=0; k1<bs; k1++){ 1509*49b5e25fSSatish Balay sum[j] += PetscAbsScalar(*v); 1510*49b5e25fSSatish Balay v += bs; 1511*49b5e25fSSatish Balay } 1512*49b5e25fSSatish Balay } 1513*49b5e25fSSatish Balay } 1514*49b5e25fSSatish Balay /* add k_th block row to il, jl */ 1515*49b5e25fSSatish Balay jmin++; 1516*49b5e25fSSatish Balay if (jmin < jmax){ 1517*49b5e25fSSatish Balay il[k] = jmin; 1518*49b5e25fSSatish Balay j = a->j[jmin]; 1519*49b5e25fSSatish Balay jl[k] = jl[j]; jl[j] = k; 1520*49b5e25fSSatish Balay } 1521*49b5e25fSSatish Balay for (j=0; j<bs; j++){ 1522*49b5e25fSSatish Balay if (sum[j] > *norm) *norm = sum[j]; 1523*49b5e25fSSatish Balay } 1524*49b5e25fSSatish Balay } 1525*49b5e25fSSatish Balay ierr = PetscFree(il);CHKERRQ(ierr); 1526*49b5e25fSSatish Balay ierr = PetscFree(jl);CHKERRQ(ierr); 1527*49b5e25fSSatish Balay ierr = PetscFree(sum);CHKERRQ(ierr); 1528*49b5e25fSSatish Balay } else { 1529*49b5e25fSSatish Balay SETERRQ(PETSC_ERR_SUP,0,"No support for this norm yet"); 1530*49b5e25fSSatish Balay } 1531*49b5e25fSSatish Balay PetscFunctionReturn(0); 1532*49b5e25fSSatish Balay } 1533*49b5e25fSSatish Balay 1534*49b5e25fSSatish Balay #ifdef MatNorm_SeqBAIJ 1535*49b5e25fSSatish Balay /* This is modified MatNorm_SeqBAIJ. 1536*49b5e25fSSatish Balay MatNorm_SeqBAIJ in baij/seq/baij2.c is not correct for bs>1 */ 1537*49b5e25fSSatish Balay 1538*49b5e25fSSatish Balay #undef __FUNC__ 1539*49b5e25fSSatish Balay #define __FUNC__ "MatNorm_SeqBAIJ" 1540*49b5e25fSSatish Balay int MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm) 1541*49b5e25fSSatish Balay { 1542*49b5e25fSSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1543*49b5e25fSSatish Balay MatScalar *v = a->a; 1544*49b5e25fSSatish Balay PetscReal sum = 0.0; 1545*49b5e25fSSatish Balay int i,j,k,bs = a->bs,nz=a->nz,bs2=a->bs2,k1; 1546*49b5e25fSSatish Balay 1547*49b5e25fSSatish Balay PetscFunctionBegin; 1548*49b5e25fSSatish Balay if (type == NORM_FROBENIUS) { 1549*49b5e25fSSatish Balay for (i=0; i< bs2*nz; i++) { 1550*49b5e25fSSatish Balay #if defined(PETSC_USE_COMPLEX) 1551*49b5e25fSSatish Balay sum += PetscRealPart(PetscConj(*v)*(*v)); v++; 1552*49b5e25fSSatish Balay #else 1553*49b5e25fSSatish Balay sum += (*v)*(*v); v++; 1554*49b5e25fSSatish Balay #endif 1555*49b5e25fSSatish Balay } 1556*49b5e25fSSatish Balay *norm = sqrt(sum); 1557*49b5e25fSSatish Balay } else if (type == NORM_INFINITY) { /* maximum row sum */ 1558*49b5e25fSSatish Balay *norm = 0.0; 1559*49b5e25fSSatish Balay for (j=0; j<a->mbs; j++) { 1560*49b5e25fSSatish Balay for (k=0; k<bs; k++) { 1561*49b5e25fSSatish Balay v = a->a + bs2*a->i[j] + k; 1562*49b5e25fSSatish Balay sum = 0.0; 1563*49b5e25fSSatish Balay for (i=0; i<a->i[j+1]-a->i[j]; i++) { 1564*49b5e25fSSatish Balay for (k1=0; k1<bs; k1++){ /* this loop was missing in the original code*/ 1565*49b5e25fSSatish Balay sum += PetscAbsScalar(*v); 1566*49b5e25fSSatish Balay v += bs; 1567*49b5e25fSSatish Balay } 1568*49b5e25fSSatish Balay } 1569*49b5e25fSSatish Balay if (sum > *norm) *norm = sum; 1570*49b5e25fSSatish Balay } 1571*49b5e25fSSatish Balay } 1572*49b5e25fSSatish Balay } else { 1573*49b5e25fSSatish Balay SETERRQ(PETSC_ERR_SUP,0,"No support for this norm yet"); 1574*49b5e25fSSatish Balay } 1575*49b5e25fSSatish Balay PetscFunctionReturn(0); 1576*49b5e25fSSatish Balay } 1577*49b5e25fSSatish Balay #endif 1578*49b5e25fSSatish Balay 1579*49b5e25fSSatish Balay #undef __FUNC__ 1580*49b5e25fSSatish Balay #define __FUNC__ "MatEqual_SeqSBAIJ" 1581*49b5e25fSSatish Balay int MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg) 1582*49b5e25fSSatish Balay { 1583*49b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data; 1584*49b5e25fSSatish Balay int ierr; 1585*49b5e25fSSatish Balay 1586*49b5e25fSSatish Balay PetscFunctionBegin; 1587*49b5e25fSSatish Balay if (B->type !=MATSEQSBAIJ) SETERRQ(PETSC_ERR_ARG_INCOMP,0,"Matrices must be same type"); 1588*49b5e25fSSatish Balay 1589*49b5e25fSSatish Balay /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1590*49b5e25fSSatish Balay if ((a->m != b->m) || (a->n !=b->n) || (a->bs != b->bs)|| (a->s_nz != b->s_nz)) { 1591*49b5e25fSSatish Balay *flg = PETSC_FALSE; PetscFunctionReturn(0); 1592*49b5e25fSSatish Balay } 1593*49b5e25fSSatish Balay 1594*49b5e25fSSatish Balay /* if the a->i are the same */ 1595*49b5e25fSSatish Balay ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(int),flg);CHKERRQ(ierr); 1596*49b5e25fSSatish Balay if (*flg == PETSC_FALSE) { 1597*49b5e25fSSatish Balay PetscFunctionReturn(0); 1598*49b5e25fSSatish Balay } 1599*49b5e25fSSatish Balay 1600*49b5e25fSSatish Balay /* if a->j are the same */ 1601*49b5e25fSSatish Balay ierr = PetscMemcmp(a->j,b->j,(a->s_nz)*sizeof(int),flg);CHKERRQ(ierr); 1602*49b5e25fSSatish Balay if (*flg == PETSC_FALSE) { 1603*49b5e25fSSatish Balay PetscFunctionReturn(0); 1604*49b5e25fSSatish Balay } 1605*49b5e25fSSatish Balay /* if a->a are the same */ 1606*49b5e25fSSatish Balay ierr = PetscMemcmp(a->a,b->a,(a->s_nz)*(a->bs)*(a->bs)*sizeof(Scalar),flg);CHKERRQ(ierr); 1607*49b5e25fSSatish Balay PetscFunctionReturn(0); 1608*49b5e25fSSatish Balay 1609*49b5e25fSSatish Balay } 1610*49b5e25fSSatish Balay 1611*49b5e25fSSatish Balay #undef __FUNC__ 1612*49b5e25fSSatish Balay #define __FUNC__ "MatGetDiagonal_SeqSBAIJ" 1613*49b5e25fSSatish Balay int MatGetDiagonal_SeqSBAIJ(Mat A,Vec v) 1614*49b5e25fSSatish Balay { 1615*49b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1616*49b5e25fSSatish Balay int ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 1617*49b5e25fSSatish Balay Scalar *x,zero = 0.0; 1618*49b5e25fSSatish Balay MatScalar *aa,*aa_j; 1619*49b5e25fSSatish Balay 1620*49b5e25fSSatish Balay PetscFunctionBegin; 1621*49b5e25fSSatish Balay if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Not for factored matrix"); 1622*49b5e25fSSatish Balay bs = a->bs; 1623*49b5e25fSSatish Balay aa = a->a; 1624*49b5e25fSSatish Balay ai = a->i; 1625*49b5e25fSSatish Balay aj = a->j; 1626*49b5e25fSSatish Balay ambs = a->mbs; 1627*49b5e25fSSatish Balay bs2 = a->bs2; 1628*49b5e25fSSatish Balay 1629*49b5e25fSSatish Balay ierr = VecSet(&zero,v);CHKERRQ(ierr); 1630*49b5e25fSSatish Balay ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1631*49b5e25fSSatish Balay ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1632*49b5e25fSSatish Balay /* if (n != a->m) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Nonconforming matrix and vector");*/ 1633*49b5e25fSSatish Balay for (i=0; i<ambs; i++) { 1634*49b5e25fSSatish Balay j=ai[i]; 1635*49b5e25fSSatish Balay if (aj[j] == i) { /* if this is a diagonal element */ 1636*49b5e25fSSatish Balay row = i*bs; 1637*49b5e25fSSatish Balay aa_j = aa + j*bs2; 1638*49b5e25fSSatish Balay for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1639*49b5e25fSSatish Balay } 1640*49b5e25fSSatish Balay } 1641*49b5e25fSSatish Balay ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1642*49b5e25fSSatish Balay PetscFunctionReturn(0); 1643*49b5e25fSSatish Balay } 1644*49b5e25fSSatish Balay 1645*49b5e25fSSatish Balay #undef __FUNC__ 1646*49b5e25fSSatish Balay #define __FUNC__ "MatDiagonalScale_SeqSBAIJ" 1647*49b5e25fSSatish Balay int MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr) 1648*49b5e25fSSatish Balay { 1649*49b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1650*49b5e25fSSatish Balay Scalar *l,*r,x,*li,*ri; 1651*49b5e25fSSatish Balay MatScalar *aa,*v; 1652*49b5e25fSSatish Balay int ierr,i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2; 1653*49b5e25fSSatish Balay 1654*49b5e25fSSatish Balay PetscFunctionBegin; 1655*49b5e25fSSatish Balay ai = a->i; 1656*49b5e25fSSatish Balay aj = a->j; 1657*49b5e25fSSatish Balay aa = a->a; 1658*49b5e25fSSatish Balay m = a->m; 1659*49b5e25fSSatish Balay n = a->n; 1660*49b5e25fSSatish Balay bs = a->bs; 1661*49b5e25fSSatish Balay mbs = a->mbs; 1662*49b5e25fSSatish Balay bs2 = a->bs2; 1663*49b5e25fSSatish Balay 1664*49b5e25fSSatish Balay if (ll != rr) { 1665*49b5e25fSSatish Balay SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"For symmetric format, left and right scaling vectors must be same\n"); 1666*49b5e25fSSatish Balay } 1667*49b5e25fSSatish Balay if (ll) { 1668*49b5e25fSSatish Balay ierr = VecGetArray(ll,&l);CHKERRQ(ierr); 1669*49b5e25fSSatish Balay ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1670*49b5e25fSSatish Balay if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Left scaling vector wrong length"); 1671*49b5e25fSSatish Balay for (i=0; i<mbs; i++) { /* for each block row */ 1672*49b5e25fSSatish Balay M = ai[i+1] - ai[i]; 1673*49b5e25fSSatish Balay li = l + i*bs; 1674*49b5e25fSSatish Balay v = aa + bs2*ai[i]; 1675*49b5e25fSSatish Balay for (j=0; j<M; j++) { /* for each block */ 1676*49b5e25fSSatish Balay for (k=0; k<bs2; k++) { 1677*49b5e25fSSatish Balay (*v++) *= li[k%bs]; 1678*49b5e25fSSatish Balay } 1679*49b5e25fSSatish Balay #ifdef CONT 1680*49b5e25fSSatish Balay /* will be used to replace the above loop */ 1681*49b5e25fSSatish Balay ri = l + bs*aj[ai[i]+j]; 1682*49b5e25fSSatish Balay for (k=0; k<bs; k++) { /* column value */ 1683*49b5e25fSSatish Balay x = ri[k]; 1684*49b5e25fSSatish Balay for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x; 1685*49b5e25fSSatish Balay } 1686*49b5e25fSSatish Balay #endif 1687*49b5e25fSSatish Balay 1688*49b5e25fSSatish Balay } 1689*49b5e25fSSatish Balay } 1690*49b5e25fSSatish Balay ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr); 1691*49b5e25fSSatish Balay PLogFlops(2*a->s_nz); 1692*49b5e25fSSatish Balay } 1693*49b5e25fSSatish Balay /* will be deleted */ 1694*49b5e25fSSatish Balay if (rr) { 1695*49b5e25fSSatish Balay ierr = VecGetArray(rr,&r);CHKERRQ(ierr); 1696*49b5e25fSSatish Balay ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr); 1697*49b5e25fSSatish Balay if (rn != m) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Right scaling vector wrong length"); 1698*49b5e25fSSatish Balay for (i=0; i<mbs; i++) { /* for each block row */ 1699*49b5e25fSSatish Balay M = ai[i+1] - ai[i]; 1700*49b5e25fSSatish Balay v = aa + bs2*ai[i]; 1701*49b5e25fSSatish Balay for (j=0; j<M; j++) { /* for each block */ 1702*49b5e25fSSatish Balay ri = r + bs*aj[ai[i]+j]; 1703*49b5e25fSSatish Balay for (k=0; k<bs; k++) { 1704*49b5e25fSSatish Balay x = ri[k]; 1705*49b5e25fSSatish Balay for (tmp=0; tmp<bs; tmp++) (*v++) *= x; 1706*49b5e25fSSatish Balay } 1707*49b5e25fSSatish Balay } 1708*49b5e25fSSatish Balay } 1709*49b5e25fSSatish Balay ierr = VecRestoreArray(rr,&r);CHKERRQ(ierr); 1710*49b5e25fSSatish Balay PLogFlops(a->s_nz); 1711*49b5e25fSSatish Balay } 1712*49b5e25fSSatish Balay PetscFunctionReturn(0); 1713*49b5e25fSSatish Balay } 1714*49b5e25fSSatish Balay 1715*49b5e25fSSatish Balay #undef __FUNC__ 1716*49b5e25fSSatish Balay #define __FUNC__ "MatGetInfo_SeqSBAIJ" 1717*49b5e25fSSatish Balay int MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info) 1718*49b5e25fSSatish Balay { 1719*49b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1720*49b5e25fSSatish Balay 1721*49b5e25fSSatish Balay PetscFunctionBegin; 1722*49b5e25fSSatish Balay info->rows_global = (double)a->m; 1723*49b5e25fSSatish Balay info->columns_global = (double)a->m; 1724*49b5e25fSSatish Balay info->rows_local = (double)a->m; 1725*49b5e25fSSatish Balay info->columns_local = (double)a->m; 1726*49b5e25fSSatish Balay info->block_size = a->bs2; 1727*49b5e25fSSatish Balay info->nz_allocated = a->s_maxnz; /*num. of nonzeros in upper triangular part */ 1728*49b5e25fSSatish Balay info->nz_used = a->bs2*a->s_nz; /*num. of nonzeros in upper triangular part */ 1729*49b5e25fSSatish Balay info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 1730*49b5e25fSSatish Balay info->assemblies = A->num_ass; 1731*49b5e25fSSatish Balay info->mallocs = a->reallocs; 1732*49b5e25fSSatish Balay info->memory = A->mem; 1733*49b5e25fSSatish Balay if (A->factor) { 1734*49b5e25fSSatish Balay info->fill_ratio_given = A->info.fill_ratio_given; 1735*49b5e25fSSatish Balay info->fill_ratio_needed = A->info.fill_ratio_needed; 1736*49b5e25fSSatish Balay info->factor_mallocs = A->info.factor_mallocs; 1737*49b5e25fSSatish Balay } else { 1738*49b5e25fSSatish Balay info->fill_ratio_given = 0; 1739*49b5e25fSSatish Balay info->fill_ratio_needed = 0; 1740*49b5e25fSSatish Balay info->factor_mallocs = 0; 1741*49b5e25fSSatish Balay } 1742*49b5e25fSSatish Balay PetscFunctionReturn(0); 1743*49b5e25fSSatish Balay } 1744*49b5e25fSSatish Balay 1745*49b5e25fSSatish Balay 1746*49b5e25fSSatish Balay #undef __FUNC__ 1747*49b5e25fSSatish Balay #define __FUNC__ "MatZeroEntries_SeqSBAIJ" 1748*49b5e25fSSatish Balay int MatZeroEntries_SeqSBAIJ(Mat A) 1749*49b5e25fSSatish Balay { 1750*49b5e25fSSatish Balay Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1751*49b5e25fSSatish Balay int ierr; 1752*49b5e25fSSatish Balay 1753*49b5e25fSSatish Balay PetscFunctionBegin; 1754*49b5e25fSSatish Balay ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 1755*49b5e25fSSatish Balay PetscFunctionReturn(0); 1756*49b5e25fSSatish Balay } 1757