1a5eb4965SSatish Balay #ifdef PETSC_RCS_HEADER 2*2d61bbb3SSatish Balay static char vcid[] = "$Id: baij2.c,v 1.20 1997/12/01 01:55:02 bsmith Exp balay $"; 3cac129eeSSatish Balay #endif 4cac129eeSSatish Balay 5*2d61bbb3SSatish Balay #include "pinclude/pviewer.h" 6*2d61bbb3SSatish Balay #include "sys.h" 770f55243SBarry Smith #include "src/mat/impls/baij/seq/baij.h" 8*2d61bbb3SSatish Balay #include "src/vec/vecimpl.h" 9*2d61bbb3SSatish Balay #include "src/inline/spops.h" 10cac129eeSSatish Balay #include "petsc.h" 11736121d4SSatish Balay #include "src/inline/bitarray.h" 12cac129eeSSatish Balay 13*2d61bbb3SSatish Balay 145615d1e5SSatish Balay #undef __FUNC__ 15d4bb536fSBarry Smith #define __FUNC__ "MatIncreaseOverlap_SeqBAIJ" 16736121d4SSatish Balay int MatIncreaseOverlap_SeqBAIJ(Mat A,int is_max,IS *is,int ov) 17a3192f15SSatish Balay { 18a3192f15SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 19218c64b6SSatish Balay int row, i,j,k,l,m,n, *idx,ierr, *nidx, isz, val, ival; 20218c64b6SSatish Balay int start, end, *ai, *aj,bs,*nidx2; 2142523b56SSatish Balay BT table; 22a3192f15SSatish Balay 233a40ed3dSBarry Smith PetscFunctionBegin; 24a3192f15SSatish Balay m = a->mbs; 25a3192f15SSatish Balay ai = a->i; 26a3192f15SSatish Balay aj = a->j; 27218c64b6SSatish Balay bs = a->bs; 28a3192f15SSatish Balay 29a8c6a408SBarry Smith if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative overlap specified"); 30a3192f15SSatish Balay 3142523b56SSatish Balay ierr = BTCreate(m,table); CHKERRQ(ierr); 32a3192f15SSatish Balay nidx = (int *) PetscMalloc((m+1)*sizeof(int)); CHKPTRQ(nidx); 33218c64b6SSatish Balay nidx2 = (int *)PetscMalloc((a->m+1)*sizeof(int)); CHKPTRQ(nidx2); 34a3192f15SSatish Balay 35a3192f15SSatish Balay for ( i=0; i<is_max; i++ ) { 36a3192f15SSatish Balay /* Initialise the two local arrays */ 37a3192f15SSatish Balay isz = 0; 3842523b56SSatish Balay BTMemzero(m,table); 39a3192f15SSatish Balay 40a3192f15SSatish Balay /* Extract the indices, assume there can be duplicate entries */ 41a3192f15SSatish Balay ierr = ISGetIndices(is[i],&idx); CHKERRQ(ierr); 42a3192f15SSatish Balay ierr = ISGetSize(is[i],&n); CHKERRQ(ierr); 43a3192f15SSatish Balay 44a3192f15SSatish Balay /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 45a3192f15SSatish Balay for ( j=0; j<n ; ++j){ 46218c64b6SSatish Balay ival = idx[j]/bs; /* convert the indices into block indices */ 47a8c6a408SBarry Smith if (ival>m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"index greater than mat-dim"); 4842523b56SSatish Balay if(!BTLookupSet(table, ival)) { nidx[isz++] = ival;} 49a3192f15SSatish Balay } 50a3192f15SSatish Balay ierr = ISRestoreIndices(is[i],&idx); CHKERRQ(ierr); 51a3192f15SSatish Balay ierr = ISDestroy(is[i]); CHKERRQ(ierr); 52a3192f15SSatish Balay 53a3192f15SSatish Balay k = 0; 54a3192f15SSatish Balay for ( j=0; j<ov; j++){ /* for each overlap*/ 55a3192f15SSatish Balay n = isz; 56a3192f15SSatish Balay for ( ; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */ 57a3192f15SSatish Balay row = nidx[k]; 58a3192f15SSatish Balay start = ai[row]; 59a3192f15SSatish Balay end = ai[row+1]; 60a3192f15SSatish Balay for ( l = start; l<end ; l++){ 61a3192f15SSatish Balay val = aj[l]; 6242523b56SSatish Balay if (!BTLookupSet(table,val)) {nidx[isz++] = val;} 63a3192f15SSatish Balay } 64a3192f15SSatish Balay } 65a3192f15SSatish Balay } 66218c64b6SSatish Balay /* expand the Index Set */ 67218c64b6SSatish Balay for (j=0; j<isz; j++ ) { 68218c64b6SSatish Balay for (k=0; k<bs; k++ ) 69218c64b6SSatish Balay nidx2[j*bs+k] = nidx[j]*bs+k; 70218c64b6SSatish Balay } 71029af93fSBarry Smith ierr = ISCreateGeneral(PETSC_COMM_SELF, isz*bs, nidx2, (is+i)); CHKERRQ(ierr); 72a3192f15SSatish Balay } 7342523b56SSatish Balay BTDestroy(table); 74a3192f15SSatish Balay PetscFree(nidx); 75218c64b6SSatish Balay PetscFree(nidx2); 763a40ed3dSBarry Smith PetscFunctionReturn(0); 77a3192f15SSatish Balay } 781c351548SSatish Balay 795615d1e5SSatish Balay #undef __FUNC__ 805615d1e5SSatish Balay #define __FUNC__ "MatGetSubMatrix_SeqBAIJ_Private" 81218c64b6SSatish Balay int MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatGetSubMatrixCall scall,Mat *B) 82736121d4SSatish Balay { 83736121d4SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data,*c; 84736121d4SSatish Balay int nznew, *smap, i, k, kstart, kend, ierr, oldcols = a->nbs,*lens; 85218c64b6SSatish Balay int row,mat_i,*mat_j,tcol,*mat_ilen; 86736121d4SSatish Balay int *irow, *icol, nrows, ncols,*ssmap,bs=a->bs, bs2=a->bs2; 87218c64b6SSatish Balay int *aj = a->j, *ai = a->i; 88218c64b6SSatish Balay Scalar *mat_a; 89736121d4SSatish Balay Mat C; 90736121d4SSatish Balay 913a40ed3dSBarry Smith PetscFunctionBegin; 92736121d4SSatish Balay ierr = ISSorted(iscol,(PetscTruth*)&i); 93a8c6a408SBarry Smith if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"IS is not sorted"); 94736121d4SSatish Balay 95736121d4SSatish Balay ierr = ISGetIndices(isrow,&irow); CHKERRQ(ierr); 96218c64b6SSatish Balay ierr = ISGetIndices(iscol,&icol); CHKERRQ(ierr); 97736121d4SSatish Balay ierr = ISGetSize(isrow,&nrows); CHKERRQ(ierr); 98736121d4SSatish Balay ierr = ISGetSize(iscol,&ncols); CHKERRQ(ierr); 99736121d4SSatish Balay 100736121d4SSatish Balay smap = (int *) PetscMalloc((1+oldcols)*sizeof(int)); CHKPTRQ(smap); 101736121d4SSatish Balay ssmap = smap; 102736121d4SSatish Balay lens = (int *) PetscMalloc((1+nrows)*sizeof(int)); CHKPTRQ(lens); 103736121d4SSatish Balay PetscMemzero(smap,oldcols*sizeof(int)); 104736121d4SSatish Balay for ( i=0; i<ncols; i++ ) smap[icol[i]] = i+1; 105736121d4SSatish Balay /* determine lens of each row */ 106736121d4SSatish Balay for (i=0; i<nrows; i++) { 107736121d4SSatish Balay kstart = ai[irow[i]]; 108736121d4SSatish Balay kend = kstart + a->ilen[irow[i]]; 109736121d4SSatish Balay lens[i] = 0; 110736121d4SSatish Balay for ( k=kstart; k<kend; k++ ) { 111736121d4SSatish Balay if (ssmap[aj[k]]) { 112736121d4SSatish Balay lens[i]++; 113736121d4SSatish Balay } 114736121d4SSatish Balay } 115736121d4SSatish Balay } 116736121d4SSatish Balay /* Create and fill new matrix */ 117736121d4SSatish Balay if (scall == MAT_REUSE_MATRIX) { 118736121d4SSatish Balay c = (Mat_SeqBAIJ *)((*B)->data); 119736121d4SSatish Balay 120a8c6a408SBarry Smith if (c->mbs!=nrows || c->nbs!=ncols || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Submatrix wrong size"); 121736121d4SSatish Balay if (PetscMemcmp(c->ilen,lens, c->mbs *sizeof(int))) { 122a8c6a408SBarry Smith SETERRQ(PETSC_ERR_ARG_SIZ,0,"Cannot reuse matrix. wrong no of nonzeros"); 123736121d4SSatish Balay } 124736121d4SSatish Balay PetscMemzero(c->ilen,c->mbs*sizeof(int)); 125736121d4SSatish Balay C = *B; 1263a40ed3dSBarry Smith } else { 127736121d4SSatish Balay ierr = MatCreateSeqBAIJ(A->comm,bs,nrows*bs,ncols*bs,0,lens,&C);CHKERRQ(ierr); 128736121d4SSatish Balay } 129736121d4SSatish Balay c = (Mat_SeqBAIJ *)(C->data); 130736121d4SSatish Balay for (i=0; i<nrows; i++) { 131736121d4SSatish Balay row = irow[i]; 132736121d4SSatish Balay nznew = 0; 133736121d4SSatish Balay kstart = ai[row]; 134736121d4SSatish Balay kend = kstart + a->ilen[row]; 135736121d4SSatish Balay mat_i = c->i[i]; 136736121d4SSatish Balay mat_j = c->j + mat_i; 137218c64b6SSatish Balay mat_a = c->a + mat_i*bs2; 138736121d4SSatish Balay mat_ilen = c->ilen + i; 139736121d4SSatish Balay for ( k=kstart; k<kend; k++ ) { 140736121d4SSatish Balay if ((tcol=ssmap[a->j[k]])) { 141736121d4SSatish Balay *mat_j++ = tcol - 1; 142736121d4SSatish Balay PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(Scalar)); mat_a+=bs2; 143736121d4SSatish Balay (*mat_ilen)++; 144736121d4SSatish Balay 145736121d4SSatish Balay } 146736121d4SSatish Balay } 147736121d4SSatish Balay } 148218c64b6SSatish Balay 149736121d4SSatish Balay /* Free work space */ 150736121d4SSatish Balay ierr = ISRestoreIndices(iscol,&icol); CHKERRQ(ierr); 151736121d4SSatish Balay PetscFree(smap); PetscFree(lens); 1526d4a8577SBarry Smith ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr); 1536d4a8577SBarry Smith ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr); 154736121d4SSatish Balay 155736121d4SSatish Balay ierr = ISRestoreIndices(isrow,&irow); CHKERRQ(ierr); 156736121d4SSatish Balay *B = C; 1573a40ed3dSBarry Smith PetscFunctionReturn(0); 158736121d4SSatish Balay } 159736121d4SSatish Balay 1605615d1e5SSatish Balay #undef __FUNC__ 1615615d1e5SSatish Balay #define __FUNC__ "MatGetSubMatrix_SeqBAIJ" 162218c64b6SSatish Balay int MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatGetSubMatrixCall scall,Mat *B) 163218c64b6SSatish Balay { 164218c64b6SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 165218c64b6SSatish Balay IS is1,is2; 166218c64b6SSatish Balay int *vary,*iary,*irow,*icol,nrows,ncols,i,ierr,bs=a->bs,count; 167218c64b6SSatish Balay 1683a40ed3dSBarry Smith PetscFunctionBegin; 169218c64b6SSatish Balay ierr = ISGetIndices(isrow,&irow); CHKERRQ(ierr); 170218c64b6SSatish Balay ierr = ISGetIndices(iscol,&icol); CHKERRQ(ierr); 171218c64b6SSatish Balay ierr = ISGetSize(isrow,&nrows); CHKERRQ(ierr); 172218c64b6SSatish Balay ierr = ISGetSize(iscol,&ncols); CHKERRQ(ierr); 173218c64b6SSatish Balay 174218c64b6SSatish Balay /* Verify if the indices corespond to each elementin a block 175218c64b6SSatish Balay and form the IS with compressed IS */ 176218c64b6SSatish Balay vary = (int *) PetscMalloc(2*(a->mbs+1)*sizeof(int)); CHKPTRQ(vary); 177218c64b6SSatish Balay iary = vary + a->mbs; 178218c64b6SSatish Balay PetscMemzero(vary,(a->mbs)*sizeof(int)); 179218c64b6SSatish Balay for ( i=0; i<nrows; i++) vary[irow[i]/bs]++; 180218c64b6SSatish Balay count = 0; 181218c64b6SSatish Balay for (i=0; i<a->mbs; i++) { 182e3372554SBarry Smith if (vary[i]!=0 && vary[i]!=bs) SETERRA(1,0,"MatGetSubmatrices_SeqBAIJ:"); 183218c64b6SSatish Balay if (vary[i]==bs) iary[count++] = i; 184218c64b6SSatish Balay } 185029af93fSBarry Smith ierr = ISCreateGeneral(PETSC_COMM_SELF, count, iary,&is1); CHKERRQ(ierr); 186218c64b6SSatish Balay 187218c64b6SSatish Balay PetscMemzero(vary,(a->mbs)*sizeof(int)); 188218c64b6SSatish Balay for ( i=0; i<ncols; i++) vary[icol[i]/bs]++; 189218c64b6SSatish Balay count = 0; 190218c64b6SSatish Balay for (i=0; i<a->mbs; i++) { 191e3372554SBarry Smith if (vary[i]!=0 && vary[i]!=bs) SETERRA(1,0,"MatGetSubmatrices_SeqBAIJ:"); 192218c64b6SSatish Balay if (vary[i]==bs) iary[count++] = i; 193218c64b6SSatish Balay } 194029af93fSBarry Smith ierr = ISCreateGeneral(PETSC_COMM_SELF, count, iary,&is2); CHKERRQ(ierr); 195218c64b6SSatish Balay ierr = ISRestoreIndices(isrow,&irow); CHKERRQ(ierr); 196218c64b6SSatish Balay ierr = ISRestoreIndices(iscol,&icol); CHKERRQ(ierr); 197218c64b6SSatish Balay PetscFree(vary); 198218c64b6SSatish Balay 199218c64b6SSatish Balay ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B); CHKERRQ(ierr); 200218c64b6SSatish Balay ISDestroy(is1); 201218c64b6SSatish Balay ISDestroy(is2); 2023a40ed3dSBarry Smith PetscFunctionReturn(0); 203218c64b6SSatish Balay } 204218c64b6SSatish Balay 205905e6a2fSBarry Smith extern int MatGetSubMatrix_SeqBAIJ(Mat,IS,IS,MatGetSubMatrixCall,Mat*); 206905e6a2fSBarry Smith 2075615d1e5SSatish Balay #undef __FUNC__ 2085615d1e5SSatish Balay #define __FUNC__ "MatGetSubMatrices_SeqBAIJ" 209736121d4SSatish Balay int MatGetSubMatrices_SeqBAIJ(Mat A,int n, IS *irow,IS *icol,MatGetSubMatrixCall scall, 210736121d4SSatish Balay Mat **B) 211736121d4SSatish Balay { 212736121d4SSatish Balay int ierr,i; 213736121d4SSatish Balay 2143a40ed3dSBarry Smith PetscFunctionBegin; 215736121d4SSatish Balay if (scall == MAT_INITIAL_MATRIX) { 216736121d4SSatish Balay *B = (Mat *) PetscMalloc( (n+1)*sizeof(Mat) ); CHKPTRQ(*B); 217736121d4SSatish Balay } 218736121d4SSatish Balay 219736121d4SSatish Balay for ( i=0; i<n; i++ ) { 220905e6a2fSBarry Smith ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 221736121d4SSatish Balay } 2223a40ed3dSBarry Smith PetscFunctionReturn(0); 223736121d4SSatish Balay } 224218c64b6SSatish Balay 225218c64b6SSatish Balay 226*2d61bbb3SSatish Balay /* -------------------------------------------------------*/ 227*2d61bbb3SSatish Balay /* Should check that shapes of vectors and matrices match */ 228*2d61bbb3SSatish Balay /* -------------------------------------------------------*/ 229*2d61bbb3SSatish Balay #include "pinclude/plapack.h" 230*2d61bbb3SSatish Balay 231*2d61bbb3SSatish Balay #undef __FUNC__ 232*2d61bbb3SSatish Balay #define __FUNC__ "MatMult_SeqBAIJ_1" 233*2d61bbb3SSatish Balay int MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz) 234*2d61bbb3SSatish Balay { 235*2d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 236*2d61bbb3SSatish Balay register Scalar *x,*z,*v,sum; 237*2d61bbb3SSatish Balay int mbs=a->mbs,i,*idx,*ii,n; 238*2d61bbb3SSatish Balay 239*2d61bbb3SSatish Balay PetscFunctionBegin; 240*2d61bbb3SSatish Balay VecGetArray_Fast(xx,x); 241*2d61bbb3SSatish Balay VecGetArray_Fast(zz,z); 242*2d61bbb3SSatish Balay 243*2d61bbb3SSatish Balay idx = a->j; 244*2d61bbb3SSatish Balay v = a->a; 245*2d61bbb3SSatish Balay ii = a->i; 246*2d61bbb3SSatish Balay 247*2d61bbb3SSatish Balay for ( i=0; i<mbs; i++ ) { 248*2d61bbb3SSatish Balay n = ii[1] - ii[0]; ii++; 249*2d61bbb3SSatish Balay sum = 0.0; 250*2d61bbb3SSatish Balay while (n--) sum += *v++ * x[*idx++]; 251*2d61bbb3SSatish Balay z[i] = sum; 252*2d61bbb3SSatish Balay } 253*2d61bbb3SSatish Balay VecRestoreArray_Fast(xx,x); 254*2d61bbb3SSatish Balay VecRestoreArray_Fast(zz,z); 255*2d61bbb3SSatish Balay PLogFlops(2*a->nz - a->m); 256*2d61bbb3SSatish Balay PetscFunctionReturn(0); 257*2d61bbb3SSatish Balay } 258*2d61bbb3SSatish Balay 259*2d61bbb3SSatish Balay #undef __FUNC__ 260*2d61bbb3SSatish Balay #define __FUNC__ "MatMult_SeqBAIJ_2" 261*2d61bbb3SSatish Balay int MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) 262*2d61bbb3SSatish Balay { 263*2d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 264*2d61bbb3SSatish Balay register Scalar *x,*z,*v,*xb,sum1,sum2; 265*2d61bbb3SSatish Balay register Scalar x1,x2; 266*2d61bbb3SSatish Balay int mbs=a->mbs,i,*idx,*ii,j,n; 267*2d61bbb3SSatish Balay 268*2d61bbb3SSatish Balay PetscFunctionBegin; 269*2d61bbb3SSatish Balay VecGetArray_Fast(xx,x); 270*2d61bbb3SSatish Balay VecGetArray_Fast(zz,z); 271*2d61bbb3SSatish Balay 272*2d61bbb3SSatish Balay idx = a->j; 273*2d61bbb3SSatish Balay v = a->a; 274*2d61bbb3SSatish Balay ii = a->i; 275*2d61bbb3SSatish Balay 276*2d61bbb3SSatish Balay for ( i=0; i<mbs; i++ ) { 277*2d61bbb3SSatish Balay n = ii[1] - ii[0]; ii++; 278*2d61bbb3SSatish Balay sum1 = 0.0; sum2 = 0.0; 279*2d61bbb3SSatish Balay for ( j=0; j<n; j++ ) { 280*2d61bbb3SSatish Balay xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 281*2d61bbb3SSatish Balay sum1 += v[0]*x1 + v[2]*x2; 282*2d61bbb3SSatish Balay sum2 += v[1]*x1 + v[3]*x2; 283*2d61bbb3SSatish Balay v += 4; 284*2d61bbb3SSatish Balay } 285*2d61bbb3SSatish Balay z[0] = sum1; z[1] = sum2; 286*2d61bbb3SSatish Balay z += 2; 287*2d61bbb3SSatish Balay } 288*2d61bbb3SSatish Balay VecRestoreArray_Fast(xx,x); 289*2d61bbb3SSatish Balay VecRestoreArray_Fast(zz,z); 290*2d61bbb3SSatish Balay PLogFlops(4*a->nz - a->m); 291*2d61bbb3SSatish Balay PetscFunctionReturn(0); 292*2d61bbb3SSatish Balay } 293*2d61bbb3SSatish Balay 294*2d61bbb3SSatish Balay #undef __FUNC__ 295*2d61bbb3SSatish Balay #define __FUNC__ "MatMult_SeqBAIJ_3" 296*2d61bbb3SSatish Balay int MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) 297*2d61bbb3SSatish Balay { 298*2d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 299*2d61bbb3SSatish Balay register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,x1,x2,x3; 300*2d61bbb3SSatish Balay int mbs=a->mbs,i,*idx,*ii,j,n; 301*2d61bbb3SSatish Balay 302*2d61bbb3SSatish Balay PetscFunctionBegin; 303*2d61bbb3SSatish Balay VecGetArray_Fast(xx,x); 304*2d61bbb3SSatish Balay VecGetArray_Fast(zz,z); 305*2d61bbb3SSatish Balay 306*2d61bbb3SSatish Balay idx = a->j; 307*2d61bbb3SSatish Balay v = a->a; 308*2d61bbb3SSatish Balay ii = a->i; 309*2d61bbb3SSatish Balay 310*2d61bbb3SSatish Balay for ( i=0; i<mbs; i++ ) { 311*2d61bbb3SSatish Balay n = ii[1] - ii[0]; ii++; 312*2d61bbb3SSatish Balay sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 313*2d61bbb3SSatish Balay for ( j=0; j<n; j++ ) { 314*2d61bbb3SSatish Balay xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 315*2d61bbb3SSatish Balay sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 316*2d61bbb3SSatish Balay sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 317*2d61bbb3SSatish Balay sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 318*2d61bbb3SSatish Balay v += 9; 319*2d61bbb3SSatish Balay } 320*2d61bbb3SSatish Balay z[0] = sum1; z[1] = sum2; z[2] = sum3; 321*2d61bbb3SSatish Balay z += 3; 322*2d61bbb3SSatish Balay } 323*2d61bbb3SSatish Balay VecRestoreArray_Fast(xx,x); 324*2d61bbb3SSatish Balay VecRestoreArray_Fast(zz,z); 325*2d61bbb3SSatish Balay PLogFlops(18*a->nz - a->m); 326*2d61bbb3SSatish Balay PetscFunctionReturn(0); 327*2d61bbb3SSatish Balay } 328*2d61bbb3SSatish Balay 329*2d61bbb3SSatish Balay #undef __FUNC__ 330*2d61bbb3SSatish Balay #define __FUNC__ "MatMult_SeqBAIJ_4" 331*2d61bbb3SSatish Balay int MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) 332*2d61bbb3SSatish Balay { 333*2d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 334*2d61bbb3SSatish Balay register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4; 335*2d61bbb3SSatish Balay int mbs=a->mbs,i,*idx,*ii,j,n; 336*2d61bbb3SSatish Balay 337*2d61bbb3SSatish Balay PetscFunctionBegin; 338*2d61bbb3SSatish Balay VecGetArray_Fast(xx,x); 339*2d61bbb3SSatish Balay VecGetArray_Fast(zz,z); 340*2d61bbb3SSatish Balay 341*2d61bbb3SSatish Balay idx = a->j; 342*2d61bbb3SSatish Balay v = a->a; 343*2d61bbb3SSatish Balay ii = a->i; 344*2d61bbb3SSatish Balay 345*2d61bbb3SSatish Balay for ( i=0; i<mbs; i++ ) { 346*2d61bbb3SSatish Balay n = ii[1] - ii[0]; ii++; 347*2d61bbb3SSatish Balay sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; 348*2d61bbb3SSatish Balay for ( j=0; j<n; j++ ) { 349*2d61bbb3SSatish Balay xb = x + 4*(*idx++); 350*2d61bbb3SSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 351*2d61bbb3SSatish Balay sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 352*2d61bbb3SSatish Balay sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 353*2d61bbb3SSatish Balay sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 354*2d61bbb3SSatish Balay sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 355*2d61bbb3SSatish Balay v += 16; 356*2d61bbb3SSatish Balay } 357*2d61bbb3SSatish Balay z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 358*2d61bbb3SSatish Balay z += 4; 359*2d61bbb3SSatish Balay } 360*2d61bbb3SSatish Balay VecRestoreArray_Fast(xx,x); 361*2d61bbb3SSatish Balay VecRestoreArray_Fast(zz,z); 362*2d61bbb3SSatish Balay PLogFlops(32*a->nz - a->m); 363*2d61bbb3SSatish Balay PetscFunctionReturn(0); 364*2d61bbb3SSatish Balay } 365*2d61bbb3SSatish Balay 366*2d61bbb3SSatish Balay #undef __FUNC__ 367*2d61bbb3SSatish Balay #define __FUNC__ "MatMult_SeqBAIJ_5" 368*2d61bbb3SSatish Balay int MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) 369*2d61bbb3SSatish Balay { 370*2d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 371*2d61bbb3SSatish Balay register Scalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5; 372*2d61bbb3SSatish Balay register Scalar * restrict v,* restrict xb,* restrict z, * restrict x; 373*2d61bbb3SSatish Balay int mbs=a->mbs,i,*idx,*ii,j,n; 374*2d61bbb3SSatish Balay 375*2d61bbb3SSatish Balay VecGetArray_Fast(xx,x); 376*2d61bbb3SSatish Balay VecGetArray_Fast(zz,z); 377*2d61bbb3SSatish Balay 378*2d61bbb3SSatish Balay idx = a->j; 379*2d61bbb3SSatish Balay v = a->a; 380*2d61bbb3SSatish Balay ii = a->i; 381*2d61bbb3SSatish Balay 382*2d61bbb3SSatish Balay for ( i=0; i<mbs; i++ ) { 383*2d61bbb3SSatish Balay n = ii[1] - ii[0]; ii++; 384*2d61bbb3SSatish Balay sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 385*2d61bbb3SSatish Balay for ( j=0; j<n; j++ ) { 386*2d61bbb3SSatish Balay xb = x + 5*(*idx++); 387*2d61bbb3SSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 388*2d61bbb3SSatish Balay sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 389*2d61bbb3SSatish Balay sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 390*2d61bbb3SSatish Balay sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 391*2d61bbb3SSatish Balay sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 392*2d61bbb3SSatish Balay sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 393*2d61bbb3SSatish Balay v += 25; 394*2d61bbb3SSatish Balay } 395*2d61bbb3SSatish Balay z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 396*2d61bbb3SSatish Balay z += 5; 397*2d61bbb3SSatish Balay } 398*2d61bbb3SSatish Balay VecRestoreArray_Fast(xx,x); 399*2d61bbb3SSatish Balay VecRestoreArray_Fast(zz,z); 400*2d61bbb3SSatish Balay PLogFlops(50*a->nz - a->m); 401*2d61bbb3SSatish Balay PetscFunctionReturn(0); 402*2d61bbb3SSatish Balay } 403*2d61bbb3SSatish Balay 404*2d61bbb3SSatish Balay #undef __FUNC__ 405*2d61bbb3SSatish Balay #define __FUNC__ "MatMult_SeqBAIJ_7" 406*2d61bbb3SSatish Balay int MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) 407*2d61bbb3SSatish Balay { 408*2d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 409*2d61bbb3SSatish Balay register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 410*2d61bbb3SSatish Balay register Scalar x1,x2,x3,x4,x5,x6,x7; 411*2d61bbb3SSatish Balay int mbs=a->mbs,i,*idx,*ii,j,n; 412*2d61bbb3SSatish Balay 413*2d61bbb3SSatish Balay VecGetArray_Fast(xx,x); 414*2d61bbb3SSatish Balay VecGetArray_Fast(zz,z); 415*2d61bbb3SSatish Balay 416*2d61bbb3SSatish Balay idx = a->j; 417*2d61bbb3SSatish Balay v = a->a; 418*2d61bbb3SSatish Balay ii = a->i; 419*2d61bbb3SSatish Balay 420*2d61bbb3SSatish Balay for ( i=0; i<mbs; i++ ) { 421*2d61bbb3SSatish Balay n = ii[1] - ii[0]; ii++; 422*2d61bbb3SSatish Balay sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 423*2d61bbb3SSatish Balay for ( j=0; j<n; j++ ) { 424*2d61bbb3SSatish Balay xb = x + 7*(*idx++); 425*2d61bbb3SSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 426*2d61bbb3SSatish Balay sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 427*2d61bbb3SSatish Balay sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 428*2d61bbb3SSatish Balay sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 429*2d61bbb3SSatish Balay sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 430*2d61bbb3SSatish Balay sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 431*2d61bbb3SSatish Balay sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 432*2d61bbb3SSatish Balay sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 433*2d61bbb3SSatish Balay v += 49; 434*2d61bbb3SSatish Balay } 435*2d61bbb3SSatish Balay z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 436*2d61bbb3SSatish Balay z += 7; 437*2d61bbb3SSatish Balay } 438*2d61bbb3SSatish Balay 439*2d61bbb3SSatish Balay VecRestoreArray_Fast(xx,x); 440*2d61bbb3SSatish Balay VecRestoreArray_Fast(zz,z); 441*2d61bbb3SSatish Balay PLogFlops(98*a->nz - a->m); 442*2d61bbb3SSatish Balay PetscFunctionReturn(0); 443*2d61bbb3SSatish Balay } 444*2d61bbb3SSatish Balay 445*2d61bbb3SSatish Balay #undef __FUNC__ 446*2d61bbb3SSatish Balay #define __FUNC__ "MatMult_SeqBAIJ_N" 447*2d61bbb3SSatish Balay int MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz) 448*2d61bbb3SSatish Balay { 449*2d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 450*2d61bbb3SSatish Balay register Scalar *x,*z,*v,*xb; 451*2d61bbb3SSatish Balay Scalar _DOne = 1.0,*work,*workt,_DZero = 0.0; 452*2d61bbb3SSatish Balay int mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2; 453*2d61bbb3SSatish Balay int _One = 1,ncols,k; 454*2d61bbb3SSatish Balay 455*2d61bbb3SSatish Balay PetscFunctionBegin; 456*2d61bbb3SSatish Balay VecGetArray_Fast(xx,x); 457*2d61bbb3SSatish Balay VecGetArray_Fast(zz,z); 458*2d61bbb3SSatish Balay 459*2d61bbb3SSatish Balay idx = a->j; 460*2d61bbb3SSatish Balay v = a->a; 461*2d61bbb3SSatish Balay ii = a->i; 462218c64b6SSatish Balay 463218c64b6SSatish Balay 464*2d61bbb3SSatish Balay if (!a->mult_work) { 465*2d61bbb3SSatish Balay k = PetscMax(a->m,a->n); 466*2d61bbb3SSatish Balay a->mult_work = (Scalar *) PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work); 467*2d61bbb3SSatish Balay } 468*2d61bbb3SSatish Balay work = a->mult_work; 469*2d61bbb3SSatish Balay for ( i=0; i<mbs; i++ ) { 470*2d61bbb3SSatish Balay n = ii[1] - ii[0]; ii++; 471*2d61bbb3SSatish Balay ncols = n*bs; 472*2d61bbb3SSatish Balay workt = work; 473*2d61bbb3SSatish Balay for ( j=0; j<n; j++ ) { 474*2d61bbb3SSatish Balay xb = x + bs*(*idx++); 475*2d61bbb3SSatish Balay for ( k=0; k<bs; k++ ) workt[k] = xb[k]; 476*2d61bbb3SSatish Balay workt += bs; 477*2d61bbb3SSatish Balay } 478*2d61bbb3SSatish Balay LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); 479*2d61bbb3SSatish Balay v += n*bs2; 480*2d61bbb3SSatish Balay z += bs; 481*2d61bbb3SSatish Balay } 482*2d61bbb3SSatish Balay VecRestoreArray_Fast(xx,x); 483*2d61bbb3SSatish Balay VecRestoreArray_Fast(zz,z); 484*2d61bbb3SSatish Balay PLogFlops(2*a->nz*bs2 - a->m); 485*2d61bbb3SSatish Balay PetscFunctionReturn(0); 486*2d61bbb3SSatish Balay } 487*2d61bbb3SSatish Balay 488*2d61bbb3SSatish Balay #undef __FUNC__ 489*2d61bbb3SSatish Balay #define __FUNC__ "MatMultAdd_SeqBAIJ_1" 490*2d61bbb3SSatish Balay int MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 491*2d61bbb3SSatish Balay { 492*2d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 493*2d61bbb3SSatish Balay register Scalar *x,*y,*z,*v,sum; 494*2d61bbb3SSatish Balay int mbs=a->mbs,i,*idx,*ii,n; 495*2d61bbb3SSatish Balay 496*2d61bbb3SSatish Balay PetscFunctionBegin; 497*2d61bbb3SSatish Balay VecGetArray_Fast(xx,x); 498*2d61bbb3SSatish Balay VecGetArray_Fast(yy,y); 499*2d61bbb3SSatish Balay VecGetArray_Fast(zz,z); 500*2d61bbb3SSatish Balay 501*2d61bbb3SSatish Balay idx = a->j; 502*2d61bbb3SSatish Balay v = a->a; 503*2d61bbb3SSatish Balay ii = a->i; 504*2d61bbb3SSatish Balay 505*2d61bbb3SSatish Balay for ( i=0; i<mbs; i++ ) { 506*2d61bbb3SSatish Balay n = ii[1] - ii[0]; ii++; 507*2d61bbb3SSatish Balay sum = y[i]; 508*2d61bbb3SSatish Balay while (n--) sum += *v++ * x[*idx++]; 509*2d61bbb3SSatish Balay z[i] = sum; 510*2d61bbb3SSatish Balay } 511*2d61bbb3SSatish Balay VecRestoreArray_Fast(xx,x); 512*2d61bbb3SSatish Balay VecRestoreArray_Fast(yy,y); 513*2d61bbb3SSatish Balay VecRestoreArray_Fast(zz,z); 514*2d61bbb3SSatish Balay PLogFlops(2*a->nz); 515*2d61bbb3SSatish Balay PetscFunctionReturn(0); 516*2d61bbb3SSatish Balay } 517*2d61bbb3SSatish Balay 518*2d61bbb3SSatish Balay #undef __FUNC__ 519*2d61bbb3SSatish Balay #define __FUNC__ "MatMultAdd_SeqBAIJ_2" 520*2d61bbb3SSatish Balay int MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 521*2d61bbb3SSatish Balay { 522*2d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 523*2d61bbb3SSatish Balay register Scalar *x,*y,*z,*v,*xb,sum1,sum2; 524*2d61bbb3SSatish Balay register Scalar x1,x2; 525*2d61bbb3SSatish Balay int mbs=a->mbs,i,*idx,*ii,j,n; 526*2d61bbb3SSatish Balay 527*2d61bbb3SSatish Balay PetscFunctionBegin; 528*2d61bbb3SSatish Balay VecGetArray_Fast(xx,x); 529*2d61bbb3SSatish Balay VecGetArray_Fast(yy,y); 530*2d61bbb3SSatish Balay VecGetArray_Fast(zz,z); 531*2d61bbb3SSatish Balay 532*2d61bbb3SSatish Balay idx = a->j; 533*2d61bbb3SSatish Balay v = a->a; 534*2d61bbb3SSatish Balay ii = a->i; 535*2d61bbb3SSatish Balay 536*2d61bbb3SSatish Balay for ( i=0; i<mbs; i++ ) { 537*2d61bbb3SSatish Balay n = ii[1] - ii[0]; ii++; 538*2d61bbb3SSatish Balay sum1 = y[0]; sum2 = y[1]; 539*2d61bbb3SSatish Balay for ( j=0; j<n; j++ ) { 540*2d61bbb3SSatish Balay xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 541*2d61bbb3SSatish Balay sum1 += v[0]*x1 + v[2]*x2; 542*2d61bbb3SSatish Balay sum2 += v[1]*x1 + v[3]*x2; 543*2d61bbb3SSatish Balay v += 4; 544*2d61bbb3SSatish Balay } 545*2d61bbb3SSatish Balay z[0] = sum1; z[1] = sum2; 546*2d61bbb3SSatish Balay z += 2; y += 2; 547*2d61bbb3SSatish Balay } 548*2d61bbb3SSatish Balay VecRestoreArray_Fast(xx,x); 549*2d61bbb3SSatish Balay VecRestoreArray_Fast(yy,y); 550*2d61bbb3SSatish Balay VecRestoreArray_Fast(zz,z); 551*2d61bbb3SSatish Balay PLogFlops(4*a->nz); 552*2d61bbb3SSatish Balay PetscFunctionReturn(0); 553*2d61bbb3SSatish Balay } 554*2d61bbb3SSatish Balay 555*2d61bbb3SSatish Balay #undef __FUNC__ 556*2d61bbb3SSatish Balay #define __FUNC__ "MatMultAdd_SeqBAIJ_3" 557*2d61bbb3SSatish Balay int MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 558*2d61bbb3SSatish Balay { 559*2d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 560*2d61bbb3SSatish Balay register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,x1,x2,x3; 561*2d61bbb3SSatish Balay int mbs=a->mbs,i,*idx,*ii,j,n; 562*2d61bbb3SSatish Balay 563*2d61bbb3SSatish Balay PetscFunctionBegin; 564*2d61bbb3SSatish Balay VecGetArray_Fast(xx,x); 565*2d61bbb3SSatish Balay VecGetArray_Fast(yy,y); 566*2d61bbb3SSatish Balay VecGetArray_Fast(zz,z); 567*2d61bbb3SSatish Balay 568*2d61bbb3SSatish Balay idx = a->j; 569*2d61bbb3SSatish Balay v = a->a; 570*2d61bbb3SSatish Balay ii = a->i; 571*2d61bbb3SSatish Balay 572*2d61bbb3SSatish Balay for ( i=0; i<mbs; i++ ) { 573*2d61bbb3SSatish Balay n = ii[1] - ii[0]; ii++; 574*2d61bbb3SSatish Balay sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; 575*2d61bbb3SSatish Balay for ( j=0; j<n; j++ ) { 576*2d61bbb3SSatish Balay xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 577*2d61bbb3SSatish Balay sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 578*2d61bbb3SSatish Balay sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 579*2d61bbb3SSatish Balay sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 580*2d61bbb3SSatish Balay v += 9; 581*2d61bbb3SSatish Balay } 582*2d61bbb3SSatish Balay z[0] = sum1; z[1] = sum2; z[2] = sum3; 583*2d61bbb3SSatish Balay z += 3; y += 3; 584*2d61bbb3SSatish Balay } 585*2d61bbb3SSatish Balay VecRestoreArray_Fast(xx,x); 586*2d61bbb3SSatish Balay VecRestoreArray_Fast(yy,y); 587*2d61bbb3SSatish Balay VecRestoreArray_Fast(zz,z); 588*2d61bbb3SSatish Balay PLogFlops(18*a->nz); 589*2d61bbb3SSatish Balay PetscFunctionReturn(0); 590*2d61bbb3SSatish Balay } 591*2d61bbb3SSatish Balay 592*2d61bbb3SSatish Balay #undef __FUNC__ 593*2d61bbb3SSatish Balay #define __FUNC__ "MatMultAdd_SeqBAIJ_4" 594*2d61bbb3SSatish Balay int MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 595*2d61bbb3SSatish Balay { 596*2d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 597*2d61bbb3SSatish Balay register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4; 598*2d61bbb3SSatish Balay int mbs=a->mbs,i,*idx,*ii; 599*2d61bbb3SSatish Balay int j,n; 600*2d61bbb3SSatish Balay 601*2d61bbb3SSatish Balay PetscFunctionBegin; 602*2d61bbb3SSatish Balay VecGetArray_Fast(xx,x); 603*2d61bbb3SSatish Balay VecGetArray_Fast(yy,y); 604*2d61bbb3SSatish Balay VecGetArray_Fast(zz,z); 605*2d61bbb3SSatish Balay 606*2d61bbb3SSatish Balay idx = a->j; 607*2d61bbb3SSatish Balay v = a->a; 608*2d61bbb3SSatish Balay ii = a->i; 609*2d61bbb3SSatish Balay 610*2d61bbb3SSatish Balay for ( i=0; i<mbs; i++ ) { 611*2d61bbb3SSatish Balay n = ii[1] - ii[0]; ii++; 612*2d61bbb3SSatish Balay sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; 613*2d61bbb3SSatish Balay for ( j=0; j<n; j++ ) { 614*2d61bbb3SSatish Balay xb = x + 4*(*idx++); 615*2d61bbb3SSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 616*2d61bbb3SSatish Balay sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 617*2d61bbb3SSatish Balay sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 618*2d61bbb3SSatish Balay sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 619*2d61bbb3SSatish Balay sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 620*2d61bbb3SSatish Balay v += 16; 621*2d61bbb3SSatish Balay } 622*2d61bbb3SSatish Balay z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 623*2d61bbb3SSatish Balay z += 4; y += 4; 624*2d61bbb3SSatish Balay } 625*2d61bbb3SSatish Balay VecRestoreArray_Fast(xx,x); 626*2d61bbb3SSatish Balay VecRestoreArray_Fast(yy,y); 627*2d61bbb3SSatish Balay VecRestoreArray_Fast(zz,z); 628*2d61bbb3SSatish Balay PLogFlops(32*a->nz); 629*2d61bbb3SSatish Balay PetscFunctionReturn(0); 630*2d61bbb3SSatish Balay } 631*2d61bbb3SSatish Balay 632*2d61bbb3SSatish Balay #undef __FUNC__ 633*2d61bbb3SSatish Balay #define __FUNC__ "MatMultAdd_SeqBAIJ_5" 634*2d61bbb3SSatish Balay int MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 635*2d61bbb3SSatish Balay { 636*2d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 637*2d61bbb3SSatish Balay register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5; 638*2d61bbb3SSatish Balay int mbs=a->mbs,i,*idx,*ii,j,n; 639*2d61bbb3SSatish Balay 640*2d61bbb3SSatish Balay PetscFunctionBegin; 641*2d61bbb3SSatish Balay VecGetArray_Fast(xx,x); 642*2d61bbb3SSatish Balay VecGetArray_Fast(yy,y); 643*2d61bbb3SSatish Balay VecGetArray_Fast(zz,z); 644*2d61bbb3SSatish Balay 645*2d61bbb3SSatish Balay idx = a->j; 646*2d61bbb3SSatish Balay v = a->a; 647*2d61bbb3SSatish Balay ii = a->i; 648*2d61bbb3SSatish Balay 649*2d61bbb3SSatish Balay for ( i=0; i<mbs; i++ ) { 650*2d61bbb3SSatish Balay n = ii[1] - ii[0]; ii++; 651*2d61bbb3SSatish Balay sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; 652*2d61bbb3SSatish Balay for ( j=0; j<n; j++ ) { 653*2d61bbb3SSatish Balay xb = x + 5*(*idx++); 654*2d61bbb3SSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 655*2d61bbb3SSatish Balay sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 656*2d61bbb3SSatish Balay sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 657*2d61bbb3SSatish Balay sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 658*2d61bbb3SSatish Balay sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 659*2d61bbb3SSatish Balay sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 660*2d61bbb3SSatish Balay v += 25; 661*2d61bbb3SSatish Balay } 662*2d61bbb3SSatish Balay z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 663*2d61bbb3SSatish Balay z += 5; y += 5; 664*2d61bbb3SSatish Balay } 665*2d61bbb3SSatish Balay VecRestoreArray_Fast(xx,x); 666*2d61bbb3SSatish Balay VecRestoreArray_Fast(yy,y); 667*2d61bbb3SSatish Balay VecRestoreArray_Fast(zz,z); 668*2d61bbb3SSatish Balay PLogFlops(50*a->nz); 669*2d61bbb3SSatish Balay PetscFunctionReturn(0); 670*2d61bbb3SSatish Balay } 671*2d61bbb3SSatish Balay 672*2d61bbb3SSatish Balay #undef __FUNC__ 673*2d61bbb3SSatish Balay #define __FUNC__ "MatMultAdd_SeqBAIJ_7" 674*2d61bbb3SSatish Balay int MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 675*2d61bbb3SSatish Balay { 676*2d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 677*2d61bbb3SSatish Balay register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 678*2d61bbb3SSatish Balay register Scalar x1,x2,x3,x4,x5,x6,x7; 679*2d61bbb3SSatish Balay int mbs=a->mbs,i,*idx,*ii,j,n; 680*2d61bbb3SSatish Balay 681*2d61bbb3SSatish Balay PetscFunctionBegin; 682*2d61bbb3SSatish Balay VecGetArray_Fast(xx,x); 683*2d61bbb3SSatish Balay VecGetArray_Fast(yy,y); 684*2d61bbb3SSatish Balay VecGetArray_Fast(zz,z); 685*2d61bbb3SSatish Balay 686*2d61bbb3SSatish Balay idx = a->j; 687*2d61bbb3SSatish Balay v = a->a; 688*2d61bbb3SSatish Balay ii = a->i; 689*2d61bbb3SSatish Balay 690*2d61bbb3SSatish Balay for ( i=0; i<mbs; i++ ) { 691*2d61bbb3SSatish Balay n = ii[1] - ii[0]; ii++; 692*2d61bbb3SSatish Balay sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 693*2d61bbb3SSatish Balay for ( j=0; j<n; j++ ) { 694*2d61bbb3SSatish Balay xb = x + 7*(*idx++); 695*2d61bbb3SSatish Balay x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 696*2d61bbb3SSatish Balay sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 697*2d61bbb3SSatish Balay sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 698*2d61bbb3SSatish Balay sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 699*2d61bbb3SSatish Balay sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 700*2d61bbb3SSatish Balay sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 701*2d61bbb3SSatish Balay sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 702*2d61bbb3SSatish Balay sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 703*2d61bbb3SSatish Balay v += 49; 704*2d61bbb3SSatish Balay } 705*2d61bbb3SSatish Balay z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 706*2d61bbb3SSatish Balay z += 7; y += 7; 707*2d61bbb3SSatish Balay } 708*2d61bbb3SSatish Balay VecRestoreArray_Fast(xx,x); 709*2d61bbb3SSatish Balay VecRestoreArray_Fast(yy,y); 710*2d61bbb3SSatish Balay VecRestoreArray_Fast(zz,z); 711*2d61bbb3SSatish Balay PLogFlops(98*a->nz); 712*2d61bbb3SSatish Balay PetscFunctionReturn(0); 713*2d61bbb3SSatish Balay } 714218c64b6SSatish Balay 715218c64b6SSatish Balay 716*2d61bbb3SSatish Balay #undef __FUNC__ 717*2d61bbb3SSatish Balay #define __FUNC__ "MatMultAdd_SeqBAIJ_N" 718*2d61bbb3SSatish Balay int MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 719*2d61bbb3SSatish Balay { 720*2d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 721*2d61bbb3SSatish Balay register Scalar *x,*z,*v,*xb; 722*2d61bbb3SSatish Balay int mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2,ierr; 723*2d61bbb3SSatish Balay int _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt; 724218c64b6SSatish Balay 725*2d61bbb3SSatish Balay PetscFunctionBegin; 726*2d61bbb3SSatish Balay if ( xx != yy) { ierr = VecCopy(yy,zz); CHKERRQ(ierr); } 727*2d61bbb3SSatish Balay 728*2d61bbb3SSatish Balay VecGetArray_Fast(xx,x); 729*2d61bbb3SSatish Balay VecGetArray_Fast(zz,z); 730*2d61bbb3SSatish Balay 731*2d61bbb3SSatish Balay idx = a->j; 732*2d61bbb3SSatish Balay v = a->a; 733*2d61bbb3SSatish Balay ii = a->i; 734*2d61bbb3SSatish Balay 735*2d61bbb3SSatish Balay 736*2d61bbb3SSatish Balay if (!a->mult_work) { 737*2d61bbb3SSatish Balay k = PetscMax(a->m,a->n); 738*2d61bbb3SSatish Balay a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));CHKPTRQ(a->mult_work); 739*2d61bbb3SSatish Balay } 740*2d61bbb3SSatish Balay work = a->mult_work; 741*2d61bbb3SSatish Balay for ( i=0; i<mbs; i++ ) { 742*2d61bbb3SSatish Balay n = ii[1] - ii[0]; ii++; 743*2d61bbb3SSatish Balay ncols = n*bs; 744*2d61bbb3SSatish Balay workt = work; 745*2d61bbb3SSatish Balay for ( j=0; j<n; j++ ) { 746*2d61bbb3SSatish Balay xb = x + bs*(*idx++); 747*2d61bbb3SSatish Balay for ( k=0; k<bs; k++ ) workt[k] = xb[k]; 748*2d61bbb3SSatish Balay workt += bs; 749*2d61bbb3SSatish Balay } 750*2d61bbb3SSatish Balay LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); 751*2d61bbb3SSatish Balay v += n*bs2; 752*2d61bbb3SSatish Balay z += bs; 753*2d61bbb3SSatish Balay } 754*2d61bbb3SSatish Balay VecRestoreArray_Fast(xx,x); 755*2d61bbb3SSatish Balay VecRestoreArray_Fast(zz,z); 756*2d61bbb3SSatish Balay PLogFlops(2*a->nz*bs2 ); 757*2d61bbb3SSatish Balay PetscFunctionReturn(0); 758*2d61bbb3SSatish Balay } 759*2d61bbb3SSatish Balay 760*2d61bbb3SSatish Balay #undef __FUNC__ 761*2d61bbb3SSatish Balay #define __FUNC__ "MatMultTrans_SeqBAIJ" 762*2d61bbb3SSatish Balay int MatMultTrans_SeqBAIJ(Mat A,Vec xx,Vec zz) 763*2d61bbb3SSatish Balay { 764*2d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 765*2d61bbb3SSatish Balay Scalar *xg,*zg,*zb; 766*2d61bbb3SSatish Balay register Scalar *x,*z,*v,*xb,x1,x2,x3,x4,x5; 767*2d61bbb3SSatish Balay int mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n; 768*2d61bbb3SSatish Balay int bs=a->bs,j,n,bs2=a->bs2,*ib,ierr; 769*2d61bbb3SSatish Balay 770*2d61bbb3SSatish Balay 771*2d61bbb3SSatish Balay PetscFunctionBegin; 772*2d61bbb3SSatish Balay VecGetArray_Fast(xx,xg); x = xg; 773*2d61bbb3SSatish Balay VecGetArray_Fast(zz,zg); z = zg; 774*2d61bbb3SSatish Balay PetscMemzero(z,N*sizeof(Scalar)); 775*2d61bbb3SSatish Balay 776*2d61bbb3SSatish Balay idx = a->j; 777*2d61bbb3SSatish Balay v = a->a; 778*2d61bbb3SSatish Balay ii = a->i; 779*2d61bbb3SSatish Balay 780*2d61bbb3SSatish Balay switch (bs) { 781*2d61bbb3SSatish Balay case 1: 782*2d61bbb3SSatish Balay for ( i=0; i<mbs; i++ ) { 783*2d61bbb3SSatish Balay n = ii[1] - ii[0]; ii++; 784*2d61bbb3SSatish Balay xb = x + i; x1 = xb[0]; 785*2d61bbb3SSatish Balay ib = idx + ai[i]; 786*2d61bbb3SSatish Balay for ( j=0; j<n; j++ ) { 787*2d61bbb3SSatish Balay rval = ib[j]; 788*2d61bbb3SSatish Balay z[rval] += *v++ * x1; 789*2d61bbb3SSatish Balay } 790*2d61bbb3SSatish Balay } 791*2d61bbb3SSatish Balay break; 792*2d61bbb3SSatish Balay case 2: 793*2d61bbb3SSatish Balay for ( i=0; i<mbs; i++ ) { 794*2d61bbb3SSatish Balay n = ii[1] - ii[0]; ii++; 795*2d61bbb3SSatish Balay xb = x + 2*i; x1 = xb[0]; x2 = xb[1]; 796*2d61bbb3SSatish Balay ib = idx + ai[i]; 797*2d61bbb3SSatish Balay for ( j=0; j<n; j++ ) { 798*2d61bbb3SSatish Balay rval = ib[j]*2; 799*2d61bbb3SSatish Balay z[rval++] += v[0]*x1 + v[1]*x2; 800*2d61bbb3SSatish Balay z[rval++] += v[2]*x1 + v[3]*x2; 801*2d61bbb3SSatish Balay v += 4; 802*2d61bbb3SSatish Balay } 803*2d61bbb3SSatish Balay } 804*2d61bbb3SSatish Balay break; 805*2d61bbb3SSatish Balay case 3: 806*2d61bbb3SSatish Balay for ( i=0; i<mbs; i++ ) { 807*2d61bbb3SSatish Balay n = ii[1] - ii[0]; ii++; 808*2d61bbb3SSatish Balay xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 809*2d61bbb3SSatish Balay ib = idx + ai[i]; 810*2d61bbb3SSatish Balay for ( j=0; j<n; j++ ) { 811*2d61bbb3SSatish Balay rval = ib[j]*3; 812*2d61bbb3SSatish Balay z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3; 813*2d61bbb3SSatish Balay z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3; 814*2d61bbb3SSatish Balay z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3; 815*2d61bbb3SSatish Balay v += 9; 816*2d61bbb3SSatish Balay } 817*2d61bbb3SSatish Balay } 818*2d61bbb3SSatish Balay break; 819*2d61bbb3SSatish Balay case 4: 820*2d61bbb3SSatish Balay for ( i=0; i<mbs; i++ ) { 821*2d61bbb3SSatish Balay n = ii[1] - ii[0]; ii++; 822*2d61bbb3SSatish Balay xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 823*2d61bbb3SSatish Balay ib = idx + ai[i]; 824*2d61bbb3SSatish Balay for ( j=0; j<n; j++ ) { 825*2d61bbb3SSatish Balay rval = ib[j]*4; 826*2d61bbb3SSatish Balay z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 827*2d61bbb3SSatish Balay z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 828*2d61bbb3SSatish Balay z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 829*2d61bbb3SSatish Balay z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 830*2d61bbb3SSatish Balay v += 16; 831*2d61bbb3SSatish Balay } 832*2d61bbb3SSatish Balay } 833*2d61bbb3SSatish Balay break; 834*2d61bbb3SSatish Balay case 5: 835*2d61bbb3SSatish Balay for ( i=0; i<mbs; i++ ) { 836*2d61bbb3SSatish Balay n = ii[1] - ii[0]; ii++; 837*2d61bbb3SSatish Balay xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 838*2d61bbb3SSatish Balay x4 = xb[3]; x5 = xb[4]; 839*2d61bbb3SSatish Balay ib = idx + ai[i]; 840*2d61bbb3SSatish Balay for ( j=0; j<n; j++ ) { 841*2d61bbb3SSatish Balay rval = ib[j]*5; 842*2d61bbb3SSatish Balay z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 843*2d61bbb3SSatish Balay z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 844*2d61bbb3SSatish Balay z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 845*2d61bbb3SSatish Balay z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 846*2d61bbb3SSatish Balay z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 847*2d61bbb3SSatish Balay v += 25; 848*2d61bbb3SSatish Balay } 849*2d61bbb3SSatish Balay } 850*2d61bbb3SSatish Balay break; 851*2d61bbb3SSatish Balay /* block sizes larger then 5 by 5 are handled by BLAS */ 852*2d61bbb3SSatish Balay default: { 853*2d61bbb3SSatish Balay int _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt; 854*2d61bbb3SSatish Balay if (!a->mult_work) { 855*2d61bbb3SSatish Balay k = PetscMax(a->m,a->n); 856*2d61bbb3SSatish Balay a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar)); 857*2d61bbb3SSatish Balay CHKPTRQ(a->mult_work); 858*2d61bbb3SSatish Balay } 859*2d61bbb3SSatish Balay work = a->mult_work; 860*2d61bbb3SSatish Balay for ( i=0; i<mbs; i++ ) { 861*2d61bbb3SSatish Balay n = ii[1] - ii[0]; ii++; 862*2d61bbb3SSatish Balay ncols = n*bs; 863*2d61bbb3SSatish Balay PetscMemzero(work,ncols*sizeof(Scalar)); 864*2d61bbb3SSatish Balay LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One); 865*2d61bbb3SSatish Balay v += n*bs2; 866*2d61bbb3SSatish Balay x += bs; 867*2d61bbb3SSatish Balay workt = work; 868*2d61bbb3SSatish Balay for ( j=0; j<n; j++ ) { 869*2d61bbb3SSatish Balay zb = z + bs*(*idx++); 870*2d61bbb3SSatish Balay for ( k=0; k<bs; k++ ) zb[k] += workt[k] ; 871*2d61bbb3SSatish Balay workt += bs; 872*2d61bbb3SSatish Balay } 873*2d61bbb3SSatish Balay } 874*2d61bbb3SSatish Balay } 875*2d61bbb3SSatish Balay } 876*2d61bbb3SSatish Balay ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr); 877*2d61bbb3SSatish Balay ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr); 878*2d61bbb3SSatish Balay PLogFlops(2*a->nz*a->bs2 - a->n); 879*2d61bbb3SSatish Balay PetscFunctionReturn(0); 880*2d61bbb3SSatish Balay } 881*2d61bbb3SSatish Balay 882*2d61bbb3SSatish Balay #undef __FUNC__ 883*2d61bbb3SSatish Balay #define __FUNC__ "MatMultTransAdd_SeqBAIJ" 884*2d61bbb3SSatish Balay int MatMultTransAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 885*2d61bbb3SSatish Balay 886*2d61bbb3SSatish Balay { 887*2d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 888*2d61bbb3SSatish Balay Scalar *xg,*zg,*zb; 889*2d61bbb3SSatish Balay register Scalar *x,*z,*v,*xb,x1,x2,x3,x4,x5; 890*2d61bbb3SSatish Balay int mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n; 891*2d61bbb3SSatish Balay int bs=a->bs,j,n,bs2=a->bs2,*ib,ierr; 892*2d61bbb3SSatish Balay 893*2d61bbb3SSatish Balay PetscFunctionBegin; 894*2d61bbb3SSatish Balay VecGetArray_Fast(xx,xg); x = xg; 895*2d61bbb3SSatish Balay VecGetArray_Fast(zz,zg); z = zg; 896*2d61bbb3SSatish Balay 897*2d61bbb3SSatish Balay if ( yy != zz ) { ierr = VecCopy(yy,zz); CHKERRQ(ierr); } 898*2d61bbb3SSatish Balay else PetscMemzero(z,N*sizeof(Scalar)); 899*2d61bbb3SSatish Balay 900*2d61bbb3SSatish Balay idx = a->j; 901*2d61bbb3SSatish Balay v = a->a; 902*2d61bbb3SSatish Balay ii = a->i; 903*2d61bbb3SSatish Balay 904*2d61bbb3SSatish Balay switch (bs) { 905*2d61bbb3SSatish Balay case 1: 906*2d61bbb3SSatish Balay for ( i=0; i<mbs; i++ ) { 907*2d61bbb3SSatish Balay n = ii[1] - ii[0]; ii++; 908*2d61bbb3SSatish Balay xb = x + i; x1 = xb[0]; 909*2d61bbb3SSatish Balay ib = idx + ai[i]; 910*2d61bbb3SSatish Balay for ( j=0; j<n; j++ ) { 911*2d61bbb3SSatish Balay rval = ib[j]; 912*2d61bbb3SSatish Balay z[rval] += *v++ * x1; 913*2d61bbb3SSatish Balay } 914*2d61bbb3SSatish Balay } 915*2d61bbb3SSatish Balay break; 916*2d61bbb3SSatish Balay case 2: 917*2d61bbb3SSatish Balay for ( i=0; i<mbs; i++ ) { 918*2d61bbb3SSatish Balay n = ii[1] - ii[0]; ii++; 919*2d61bbb3SSatish Balay xb = x + 2*i; x1 = xb[0]; x2 = xb[1]; 920*2d61bbb3SSatish Balay ib = idx + ai[i]; 921*2d61bbb3SSatish Balay for ( j=0; j<n; j++ ) { 922*2d61bbb3SSatish Balay rval = ib[j]*2; 923*2d61bbb3SSatish Balay z[rval++] += v[0]*x1 + v[1]*x2; 924*2d61bbb3SSatish Balay z[rval++] += v[2]*x1 + v[3]*x2; 925*2d61bbb3SSatish Balay v += 4; 926*2d61bbb3SSatish Balay } 927*2d61bbb3SSatish Balay } 928*2d61bbb3SSatish Balay break; 929*2d61bbb3SSatish Balay case 3: 930*2d61bbb3SSatish Balay for ( i=0; i<mbs; i++ ) { 931*2d61bbb3SSatish Balay n = ii[1] - ii[0]; ii++; 932*2d61bbb3SSatish Balay xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 933*2d61bbb3SSatish Balay ib = idx + ai[i]; 934*2d61bbb3SSatish Balay for ( j=0; j<n; j++ ) { 935*2d61bbb3SSatish Balay rval = ib[j]*3; 936*2d61bbb3SSatish Balay z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3; 937*2d61bbb3SSatish Balay z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3; 938*2d61bbb3SSatish Balay z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3; 939*2d61bbb3SSatish Balay v += 9; 940*2d61bbb3SSatish Balay } 941*2d61bbb3SSatish Balay } 942*2d61bbb3SSatish Balay break; 943*2d61bbb3SSatish Balay case 4: 944*2d61bbb3SSatish Balay for ( i=0; i<mbs; i++ ) { 945*2d61bbb3SSatish Balay n = ii[1] - ii[0]; ii++; 946*2d61bbb3SSatish Balay xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 947*2d61bbb3SSatish Balay ib = idx + ai[i]; 948*2d61bbb3SSatish Balay for ( j=0; j<n; j++ ) { 949*2d61bbb3SSatish Balay rval = ib[j]*4; 950*2d61bbb3SSatish Balay z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 951*2d61bbb3SSatish Balay z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 952*2d61bbb3SSatish Balay z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 953*2d61bbb3SSatish Balay z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 954*2d61bbb3SSatish Balay v += 16; 955*2d61bbb3SSatish Balay } 956*2d61bbb3SSatish Balay } 957*2d61bbb3SSatish Balay break; 958*2d61bbb3SSatish Balay case 5: 959*2d61bbb3SSatish Balay for ( i=0; i<mbs; i++ ) { 960*2d61bbb3SSatish Balay n = ii[1] - ii[0]; ii++; 961*2d61bbb3SSatish Balay xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 962*2d61bbb3SSatish Balay x4 = xb[3]; x5 = xb[4]; 963*2d61bbb3SSatish Balay ib = idx + ai[i]; 964*2d61bbb3SSatish Balay for ( j=0; j<n; j++ ) { 965*2d61bbb3SSatish Balay rval = ib[j]*5; 966*2d61bbb3SSatish Balay z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 967*2d61bbb3SSatish Balay z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 968*2d61bbb3SSatish Balay z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 969*2d61bbb3SSatish Balay z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 970*2d61bbb3SSatish Balay z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 971*2d61bbb3SSatish Balay v += 25; 972*2d61bbb3SSatish Balay } 973*2d61bbb3SSatish Balay } 974*2d61bbb3SSatish Balay break; 975*2d61bbb3SSatish Balay /* block sizes larger then 5 by 5 are handled by BLAS */ 976*2d61bbb3SSatish Balay default: { 977*2d61bbb3SSatish Balay int _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt; 978*2d61bbb3SSatish Balay if (!a->mult_work) { 979*2d61bbb3SSatish Balay k = PetscMax(a->m,a->n); 980*2d61bbb3SSatish Balay a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar)); 981*2d61bbb3SSatish Balay CHKPTRQ(a->mult_work); 982*2d61bbb3SSatish Balay } 983*2d61bbb3SSatish Balay work = a->mult_work; 984*2d61bbb3SSatish Balay for ( i=0; i<mbs; i++ ) { 985*2d61bbb3SSatish Balay n = ii[1] - ii[0]; ii++; 986*2d61bbb3SSatish Balay ncols = n*bs; 987*2d61bbb3SSatish Balay PetscMemzero(work,ncols*sizeof(Scalar)); 988*2d61bbb3SSatish Balay LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One); 989*2d61bbb3SSatish Balay v += n*bs2; 990*2d61bbb3SSatish Balay x += bs; 991*2d61bbb3SSatish Balay workt = work; 992*2d61bbb3SSatish Balay for ( j=0; j<n; j++ ) { 993*2d61bbb3SSatish Balay zb = z + bs*(*idx++); 994*2d61bbb3SSatish Balay for ( k=0; k<bs; k++ ) zb[k] += workt[k] ; 995*2d61bbb3SSatish Balay workt += bs; 996*2d61bbb3SSatish Balay } 997*2d61bbb3SSatish Balay } 998*2d61bbb3SSatish Balay } 999*2d61bbb3SSatish Balay } 1000*2d61bbb3SSatish Balay ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr); 1001*2d61bbb3SSatish Balay ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr); 1002*2d61bbb3SSatish Balay PLogFlops(2*a->nz*a->bs2); 1003*2d61bbb3SSatish Balay PetscFunctionReturn(0); 1004*2d61bbb3SSatish Balay } 1005*2d61bbb3SSatish Balay 1006*2d61bbb3SSatish Balay #undef __FUNC__ 1007*2d61bbb3SSatish Balay #define __FUNC__ "MatScale_SeqBAIJ" 1008*2d61bbb3SSatish Balay int MatScale_SeqBAIJ(Scalar *alpha,Mat inA) 1009*2d61bbb3SSatish Balay { 1010*2d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) inA->data; 1011*2d61bbb3SSatish Balay int one = 1, totalnz = a->bs2*a->nz; 1012*2d61bbb3SSatish Balay 1013*2d61bbb3SSatish Balay PetscFunctionBegin; 1014*2d61bbb3SSatish Balay BLscal_( &totalnz, alpha, a->a, &one ); 1015*2d61bbb3SSatish Balay PLogFlops(totalnz); 1016*2d61bbb3SSatish Balay PetscFunctionReturn(0); 1017*2d61bbb3SSatish Balay } 1018*2d61bbb3SSatish Balay 1019*2d61bbb3SSatish Balay #undef __FUNC__ 1020*2d61bbb3SSatish Balay #define __FUNC__ "MatNorm_SeqBAIJ" 1021*2d61bbb3SSatish Balay int MatNorm_SeqBAIJ(Mat A,NormType type,double *norm) 1022*2d61bbb3SSatish Balay { 1023*2d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 1024*2d61bbb3SSatish Balay Scalar *v = a->a; 1025*2d61bbb3SSatish Balay double sum = 0.0; 1026*2d61bbb3SSatish Balay int i,nz=a->nz,bs2=a->bs2; 1027*2d61bbb3SSatish Balay 1028*2d61bbb3SSatish Balay PetscFunctionBegin; 1029*2d61bbb3SSatish Balay if (type == NORM_FROBENIUS) { 1030*2d61bbb3SSatish Balay for (i=0; i< bs2*nz; i++ ) { 1031*2d61bbb3SSatish Balay #if defined(USE_PETSC_COMPLEX) 1032*2d61bbb3SSatish Balay sum += real(conj(*v)*(*v)); v++; 1033*2d61bbb3SSatish Balay #else 1034*2d61bbb3SSatish Balay sum += (*v)*(*v); v++; 1035*2d61bbb3SSatish Balay #endif 1036*2d61bbb3SSatish Balay } 1037*2d61bbb3SSatish Balay *norm = sqrt(sum); 1038*2d61bbb3SSatish Balay } 1039*2d61bbb3SSatish Balay else { 1040*2d61bbb3SSatish Balay SETERRQ(PETSC_ERR_SUP,0,"No support for this norm yet"); 1041*2d61bbb3SSatish Balay } 1042*2d61bbb3SSatish Balay PetscFunctionReturn(0); 1043*2d61bbb3SSatish Balay } 1044*2d61bbb3SSatish Balay 1045*2d61bbb3SSatish Balay 1046*2d61bbb3SSatish Balay #undef __FUNC__ 1047*2d61bbb3SSatish Balay #define __FUNC__ "MatEqual_SeqBAIJ" 1048*2d61bbb3SSatish Balay int MatEqual_SeqBAIJ(Mat A,Mat B, PetscTruth* flg) 1049*2d61bbb3SSatish Balay { 1050*2d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data; 1051*2d61bbb3SSatish Balay 1052*2d61bbb3SSatish Balay PetscFunctionBegin; 1053*2d61bbb3SSatish Balay if (B->type !=MATSEQBAIJ) SETERRQ(PETSC_ERR_ARG_INCOMP,0,"Matrices must be same type"); 1054*2d61bbb3SSatish Balay 1055*2d61bbb3SSatish Balay /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1056*2d61bbb3SSatish Balay if ((a->m != b->m) || (a->n !=b->n) || (a->bs != b->bs)|| (a->nz != b->nz)) { 1057*2d61bbb3SSatish Balay *flg = PETSC_FALSE; PetscFunctionReturn(0); 1058*2d61bbb3SSatish Balay } 1059*2d61bbb3SSatish Balay 1060*2d61bbb3SSatish Balay /* if the a->i are the same */ 1061*2d61bbb3SSatish Balay if (PetscMemcmp(a->i,b->i, (a->mbs+1)*sizeof(int))) { 1062*2d61bbb3SSatish Balay *flg = PETSC_FALSE; PetscFunctionReturn(0); 1063*2d61bbb3SSatish Balay } 1064*2d61bbb3SSatish Balay 1065*2d61bbb3SSatish Balay /* if a->j are the same */ 1066*2d61bbb3SSatish Balay if (PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int))) { 1067*2d61bbb3SSatish Balay *flg = PETSC_FALSE; PetscFunctionReturn(0); 1068*2d61bbb3SSatish Balay } 1069*2d61bbb3SSatish Balay 1070*2d61bbb3SSatish Balay /* if a->a are the same */ 1071*2d61bbb3SSatish Balay if (PetscMemcmp(a->a, b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(Scalar))) { 1072*2d61bbb3SSatish Balay *flg = PETSC_FALSE; PetscFunctionReturn(0); 1073*2d61bbb3SSatish Balay } 1074*2d61bbb3SSatish Balay *flg = PETSC_TRUE; 1075*2d61bbb3SSatish Balay PetscFunctionReturn(0); 1076*2d61bbb3SSatish Balay 1077*2d61bbb3SSatish Balay } 1078*2d61bbb3SSatish Balay 1079*2d61bbb3SSatish Balay #undef __FUNC__ 1080*2d61bbb3SSatish Balay #define __FUNC__ "MatGetDiagonal_SeqBAIJ" 1081*2d61bbb3SSatish Balay int MatGetDiagonal_SeqBAIJ(Mat A,Vec v) 1082*2d61bbb3SSatish Balay { 1083*2d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 1084*2d61bbb3SSatish Balay int i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 1085*2d61bbb3SSatish Balay Scalar *x, zero = 0.0,*aa,*aa_j; 1086*2d61bbb3SSatish Balay 1087*2d61bbb3SSatish Balay PetscFunctionBegin; 1088*2d61bbb3SSatish Balay if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Not for factored matrix"); 1089*2d61bbb3SSatish Balay bs = a->bs; 1090*2d61bbb3SSatish Balay aa = a->a; 1091*2d61bbb3SSatish Balay ai = a->i; 1092*2d61bbb3SSatish Balay aj = a->j; 1093*2d61bbb3SSatish Balay ambs = a->mbs; 1094*2d61bbb3SSatish Balay bs2 = a->bs2; 1095*2d61bbb3SSatish Balay 1096*2d61bbb3SSatish Balay VecSet(&zero,v); 1097*2d61bbb3SSatish Balay VecGetArray_Fast(v,x); VecGetLocalSize_Fast(v,n); 1098*2d61bbb3SSatish Balay if (n != a->m) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Nonconforming matrix and vector"); 1099*2d61bbb3SSatish Balay for ( i=0; i<ambs; i++ ) { 1100*2d61bbb3SSatish Balay for ( j=ai[i]; j<ai[i+1]; j++ ) { 1101*2d61bbb3SSatish Balay if (aj[j] == i) { 1102*2d61bbb3SSatish Balay row = i*bs; 1103*2d61bbb3SSatish Balay aa_j = aa+j*bs2; 1104*2d61bbb3SSatish Balay for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1105*2d61bbb3SSatish Balay break; 1106*2d61bbb3SSatish Balay } 1107*2d61bbb3SSatish Balay } 1108*2d61bbb3SSatish Balay } 1109*2d61bbb3SSatish Balay PetscFunctionReturn(0); 1110*2d61bbb3SSatish Balay } 1111*2d61bbb3SSatish Balay 1112*2d61bbb3SSatish Balay #undef __FUNC__ 1113*2d61bbb3SSatish Balay #define __FUNC__ "MatDiagonalScale_SeqBAIJ" 1114*2d61bbb3SSatish Balay int MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr) 1115*2d61bbb3SSatish Balay { 1116*2d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 1117*2d61bbb3SSatish Balay Scalar *l,*r,x,*v,*aa,*li,*ri; 1118*2d61bbb3SSatish Balay int i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2; 1119*2d61bbb3SSatish Balay 1120*2d61bbb3SSatish Balay PetscFunctionBegin; 1121*2d61bbb3SSatish Balay ai = a->i; 1122*2d61bbb3SSatish Balay aj = a->j; 1123*2d61bbb3SSatish Balay aa = a->a; 1124*2d61bbb3SSatish Balay m = a->m; 1125*2d61bbb3SSatish Balay n = a->n; 1126*2d61bbb3SSatish Balay bs = a->bs; 1127*2d61bbb3SSatish Balay mbs = a->mbs; 1128*2d61bbb3SSatish Balay bs2 = a->bs2; 1129*2d61bbb3SSatish Balay if (ll) { 1130*2d61bbb3SSatish Balay VecGetArray_Fast(ll,l); VecGetLocalSize_Fast(ll,lm); 1131*2d61bbb3SSatish Balay if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Left scaling vector wrong length"); 1132*2d61bbb3SSatish Balay for ( i=0; i<mbs; i++ ) { /* for each block row */ 1133*2d61bbb3SSatish Balay M = ai[i+1] - ai[i]; 1134*2d61bbb3SSatish Balay li = l + i*bs; 1135*2d61bbb3SSatish Balay v = aa + bs2*ai[i]; 1136*2d61bbb3SSatish Balay for ( j=0; j<M; j++ ) { /* for each block */ 1137*2d61bbb3SSatish Balay for ( k=0; k<bs2; k++ ) { 1138*2d61bbb3SSatish Balay (*v++) *= li[k%bs]; 1139*2d61bbb3SSatish Balay } 1140*2d61bbb3SSatish Balay } 1141*2d61bbb3SSatish Balay } 1142*2d61bbb3SSatish Balay } 1143*2d61bbb3SSatish Balay 1144*2d61bbb3SSatish Balay if (rr) { 1145*2d61bbb3SSatish Balay VecGetArray_Fast(rr,r); VecGetLocalSize_Fast(rr,rn); 1146*2d61bbb3SSatish Balay if (rn != n) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Right scaling vector wrong length"); 1147*2d61bbb3SSatish Balay for ( i=0; i<mbs; i++ ) { /* for each block row */ 1148*2d61bbb3SSatish Balay M = ai[i+1] - ai[i]; 1149*2d61bbb3SSatish Balay v = aa + bs2*ai[i]; 1150*2d61bbb3SSatish Balay for ( j=0; j<M; j++ ) { /* for each block */ 1151*2d61bbb3SSatish Balay ri = r + bs*aj[ai[i]+j]; 1152*2d61bbb3SSatish Balay for ( k=0; k<bs; k++ ) { 1153*2d61bbb3SSatish Balay x = ri[k]; 1154*2d61bbb3SSatish Balay for ( tmp=0; tmp<bs; tmp++ ) (*v++) *= x; 1155*2d61bbb3SSatish Balay } 1156*2d61bbb3SSatish Balay } 1157*2d61bbb3SSatish Balay } 1158*2d61bbb3SSatish Balay } 1159*2d61bbb3SSatish Balay PetscFunctionReturn(0); 1160*2d61bbb3SSatish Balay } 1161*2d61bbb3SSatish Balay 1162*2d61bbb3SSatish Balay 1163*2d61bbb3SSatish Balay #undef __FUNC__ 1164*2d61bbb3SSatish Balay #define __FUNC__ "MatGetInfo_SeqBAIJ" 1165*2d61bbb3SSatish Balay int MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info) 1166*2d61bbb3SSatish Balay { 1167*2d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 1168*2d61bbb3SSatish Balay 1169*2d61bbb3SSatish Balay PetscFunctionBegin; 1170*2d61bbb3SSatish Balay info->rows_global = (double)a->m; 1171*2d61bbb3SSatish Balay info->columns_global = (double)a->n; 1172*2d61bbb3SSatish Balay info->rows_local = (double)a->m; 1173*2d61bbb3SSatish Balay info->columns_local = (double)a->n; 1174*2d61bbb3SSatish Balay info->block_size = a->bs2; 1175*2d61bbb3SSatish Balay info->nz_allocated = a->maxnz; 1176*2d61bbb3SSatish Balay info->nz_used = a->bs2*a->nz; 1177*2d61bbb3SSatish Balay info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 1178*2d61bbb3SSatish Balay /* if (info->nz_unneeded != A->info.nz_unneeded) 1179*2d61bbb3SSatish Balay printf("space descrepancy: maxnz-nz = %d, nz_unneeded = %d\n",(int)info->nz_unneeded,(int)A->info.nz_unneeded); */ 1180*2d61bbb3SSatish Balay info->assemblies = A->num_ass; 1181*2d61bbb3SSatish Balay info->mallocs = a->reallocs; 1182*2d61bbb3SSatish Balay info->memory = A->mem; 1183*2d61bbb3SSatish Balay if (A->factor) { 1184*2d61bbb3SSatish Balay info->fill_ratio_given = A->info.fill_ratio_given; 1185*2d61bbb3SSatish Balay info->fill_ratio_needed = A->info.fill_ratio_needed; 1186*2d61bbb3SSatish Balay info->factor_mallocs = A->info.factor_mallocs; 1187*2d61bbb3SSatish Balay } else { 1188*2d61bbb3SSatish Balay info->fill_ratio_given = 0; 1189*2d61bbb3SSatish Balay info->fill_ratio_needed = 0; 1190*2d61bbb3SSatish Balay info->factor_mallocs = 0; 1191*2d61bbb3SSatish Balay } 1192*2d61bbb3SSatish Balay PetscFunctionReturn(0); 1193*2d61bbb3SSatish Balay } 1194*2d61bbb3SSatish Balay 1195*2d61bbb3SSatish Balay 1196*2d61bbb3SSatish Balay #undef __FUNC__ 1197*2d61bbb3SSatish Balay #define __FUNC__ "MatZeroEntries_SeqBAIJ" 1198*2d61bbb3SSatish Balay int MatZeroEntries_SeqBAIJ(Mat A) 1199*2d61bbb3SSatish Balay { 1200*2d61bbb3SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 1201*2d61bbb3SSatish Balay 1202*2d61bbb3SSatish Balay PetscFunctionBegin; 1203*2d61bbb3SSatish Balay PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(Scalar)); 1204*2d61bbb3SSatish Balay PetscFunctionReturn(0); 1205*2d61bbb3SSatish Balay } 1206