1 #ifdef PETSC_RCS_HEADER 2 static char vcid[] = "$Id: baij2.c,v 1.24 1998/03/16 18:55:40 bsmith Exp bsmith $"; 3 #endif 4 5 #include "pinclude/pviewer.h" 6 #include "sys.h" 7 #include "src/mat/impls/baij/seq/baij.h" 8 #include "src/vec/vecimpl.h" 9 #include "src/inline/spops.h" 10 #include "petsc.h" 11 #include "src/inline/bitarray.h" 12 13 14 #undef __FUNC__ 15 #define __FUNC__ "MatIncreaseOverlap_SeqBAIJ" 16 int MatIncreaseOverlap_SeqBAIJ(Mat A,int is_max,IS *is,int ov) 17 { 18 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 19 int row, i,j,k,l,m,n, *idx,ierr, *nidx, isz, val, ival; 20 int start, end, *ai, *aj,bs,*nidx2; 21 BT table; 22 23 PetscFunctionBegin; 24 m = a->mbs; 25 ai = a->i; 26 aj = a->j; 27 bs = a->bs; 28 29 if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative overlap specified"); 30 31 ierr = BTCreate(m,table); CHKERRQ(ierr); 32 nidx = (int *) PetscMalloc((m+1)*sizeof(int)); CHKPTRQ(nidx); 33 nidx2 = (int *)PetscMalloc((a->m+1)*sizeof(int)); CHKPTRQ(nidx2); 34 35 for ( i=0; i<is_max; i++ ) { 36 /* Initialise the two local arrays */ 37 isz = 0; 38 BTMemzero(m,table); 39 40 /* Extract the indices, assume there can be duplicate entries */ 41 ierr = ISGetIndices(is[i],&idx); CHKERRQ(ierr); 42 ierr = ISGetSize(is[i],&n); CHKERRQ(ierr); 43 44 /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 45 for ( j=0; j<n ; ++j){ 46 ival = idx[j]/bs; /* convert the indices into block indices */ 47 if (ival>m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"index greater than mat-dim"); 48 if(!BTLookupSet(table, ival)) { nidx[isz++] = ival;} 49 } 50 ierr = ISRestoreIndices(is[i],&idx); CHKERRQ(ierr); 51 ierr = ISDestroy(is[i]); CHKERRQ(ierr); 52 53 k = 0; 54 for ( j=0; j<ov; j++){ /* for each overlap*/ 55 n = isz; 56 for ( ; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */ 57 row = nidx[k]; 58 start = ai[row]; 59 end = ai[row+1]; 60 for ( l = start; l<end ; l++){ 61 val = aj[l]; 62 if (!BTLookupSet(table,val)) {nidx[isz++] = val;} 63 } 64 } 65 } 66 /* expand the Index Set */ 67 for (j=0; j<isz; j++ ) { 68 for (k=0; k<bs; k++ ) 69 nidx2[j*bs+k] = nidx[j]*bs+k; 70 } 71 ierr = ISCreateGeneral(PETSC_COMM_SELF, isz*bs, nidx2, (is+i)); CHKERRQ(ierr); 72 } 73 BTDestroy(table); 74 PetscFree(nidx); 75 PetscFree(nidx2); 76 PetscFunctionReturn(0); 77 } 78 79 #undef __FUNC__ 80 #define __FUNC__ "MatGetSubMatrix_SeqBAIJ_Private" 81 int MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatGetSubMatrixCall scall,Mat *B) 82 { 83 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data,*c; 84 int *smap, i, k, kstart, kend, ierr, oldcols = a->nbs,*lens; 85 int row,mat_i,*mat_j,tcol,*mat_ilen; 86 int *irow, *icol, nrows, ncols,*ssmap,bs=a->bs, bs2=a->bs2; 87 int *aj = a->j, *ai = a->i; 88 Scalar *mat_a; 89 Mat C; 90 91 PetscFunctionBegin; 92 ierr = ISSorted(iscol,(PetscTruth*)&i); 93 if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"IS is not sorted"); 94 95 ierr = ISGetIndices(isrow,&irow); CHKERRQ(ierr); 96 ierr = ISGetIndices(iscol,&icol); CHKERRQ(ierr); 97 ierr = ISGetSize(isrow,&nrows); CHKERRQ(ierr); 98 ierr = ISGetSize(iscol,&ncols); CHKERRQ(ierr); 99 100 smap = (int *) PetscMalloc((1+oldcols)*sizeof(int)); CHKPTRQ(smap); 101 ssmap = smap; 102 lens = (int *) PetscMalloc((1+nrows)*sizeof(int)); CHKPTRQ(lens); 103 PetscMemzero(smap,oldcols*sizeof(int)); 104 for ( i=0; i<ncols; i++ ) smap[icol[i]] = i+1; 105 /* determine lens of each row */ 106 for (i=0; i<nrows; i++) { 107 kstart = ai[irow[i]]; 108 kend = kstart + a->ilen[irow[i]]; 109 lens[i] = 0; 110 for ( k=kstart; k<kend; k++ ) { 111 if (ssmap[aj[k]]) { 112 lens[i]++; 113 } 114 } 115 } 116 /* Create and fill new matrix */ 117 if (scall == MAT_REUSE_MATRIX) { 118 c = (Mat_SeqBAIJ *)((*B)->data); 119 120 if (c->mbs!=nrows || c->nbs!=ncols || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Submatrix wrong size"); 121 if (PetscMemcmp(c->ilen,lens, c->mbs *sizeof(int))) { 122 SETERRQ(PETSC_ERR_ARG_SIZ,0,"Cannot reuse matrix. wrong no of nonzeros"); 123 } 124 PetscMemzero(c->ilen,c->mbs*sizeof(int)); 125 C = *B; 126 } else { 127 ierr = MatCreateSeqBAIJ(A->comm,bs,nrows*bs,ncols*bs,0,lens,&C);CHKERRQ(ierr); 128 } 129 c = (Mat_SeqBAIJ *)(C->data); 130 for (i=0; i<nrows; i++) { 131 row = irow[i]; 132 kstart = ai[row]; 133 kend = kstart + a->ilen[row]; 134 mat_i = c->i[i]; 135 mat_j = c->j + mat_i; 136 mat_a = c->a + mat_i*bs2; 137 mat_ilen = c->ilen + i; 138 for ( k=kstart; k<kend; k++ ) { 139 if ((tcol=ssmap[a->j[k]])) { 140 *mat_j++ = tcol - 1; 141 PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(Scalar)); mat_a+=bs2; 142 (*mat_ilen)++; 143 } 144 } 145 } 146 147 /* Free work space */ 148 ierr = ISRestoreIndices(iscol,&icol); CHKERRQ(ierr); 149 PetscFree(smap); PetscFree(lens); 150 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr); 151 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr); 152 153 ierr = ISRestoreIndices(isrow,&irow); CHKERRQ(ierr); 154 *B = C; 155 PetscFunctionReturn(0); 156 } 157 158 #undef __FUNC__ 159 #define __FUNC__ "MatGetSubMatrix_SeqBAIJ" 160 int MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,int cs,MatGetSubMatrixCall scall,Mat *B) 161 { 162 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 163 IS is1,is2; 164 int *vary,*iary,*irow,*icol,nrows,ncols,i,ierr,bs=a->bs,count; 165 166 PetscFunctionBegin; 167 ierr = ISGetIndices(isrow,&irow); CHKERRQ(ierr); 168 ierr = ISGetIndices(iscol,&icol); CHKERRQ(ierr); 169 ierr = ISGetSize(isrow,&nrows); CHKERRQ(ierr); 170 ierr = ISGetSize(iscol,&ncols); CHKERRQ(ierr); 171 172 /* Verify if the indices corespond to each element in a block 173 and form the IS with compressed IS */ 174 vary = (int *) PetscMalloc(2*(a->mbs+1)*sizeof(int)); CHKPTRQ(vary); 175 iary = vary + a->mbs; 176 PetscMemzero(vary,(a->mbs)*sizeof(int)); 177 for ( i=0; i<nrows; i++) vary[irow[i]/bs]++; 178 count = 0; 179 for (i=0; i<a->mbs; i++) { 180 if (vary[i]!=0 && vary[i]!=bs) SETERRA(1,0,"Index set does not match blocks"); 181 if (vary[i]==bs) iary[count++] = i; 182 } 183 ierr = ISCreateGeneral(PETSC_COMM_SELF, count, iary,&is1); CHKERRQ(ierr); 184 185 PetscMemzero(vary,(a->mbs)*sizeof(int)); 186 for ( i=0; i<ncols; i++) vary[icol[i]/bs]++; 187 count = 0; 188 for (i=0; i<a->mbs; i++) { 189 if (vary[i]!=0 && vary[i]!=bs) SETERRA(1,0,"MatGetSubmatrices_SeqBAIJ:"); 190 if (vary[i]==bs) iary[count++] = i; 191 } 192 ierr = ISCreateGeneral(PETSC_COMM_SELF, count, iary,&is2); CHKERRQ(ierr); 193 ierr = ISRestoreIndices(isrow,&irow); CHKERRQ(ierr); 194 ierr = ISRestoreIndices(iscol,&icol); CHKERRQ(ierr); 195 PetscFree(vary); 196 197 ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,cs,scall,B); CHKERRQ(ierr); 198 ISDestroy(is1); 199 ISDestroy(is2); 200 PetscFunctionReturn(0); 201 } 202 203 #undef __FUNC__ 204 #define __FUNC__ "MatGetSubMatrices_SeqBAIJ" 205 int MatGetSubMatrices_SeqBAIJ(Mat A,int n, IS *irow,IS *icol,MatGetSubMatrixCall scall,Mat **B) 206 { 207 int ierr,i; 208 209 PetscFunctionBegin; 210 if (scall == MAT_INITIAL_MATRIX) { 211 *B = (Mat *) PetscMalloc( (n+1)*sizeof(Mat) ); CHKPTRQ(*B); 212 } 213 214 for ( i=0; i<n; i++ ) { 215 ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr); 216 } 217 PetscFunctionReturn(0); 218 } 219 220 221 /* -------------------------------------------------------*/ 222 /* Should check that shapes of vectors and matrices match */ 223 /* -------------------------------------------------------*/ 224 #include "pinclude/plapack.h" 225 226 #undef __FUNC__ 227 #define __FUNC__ "MatMult_SeqBAIJ_1" 228 int MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz) 229 { 230 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 231 Scalar *x,*z,*v,sum; 232 int mbs=a->mbs,i,*idx,*ii,n,ierr; 233 234 PetscFunctionBegin; 235 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 236 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 237 238 idx = a->j; 239 v = a->a; 240 ii = a->i; 241 242 for ( i=0; i<mbs; i++ ) { 243 n = ii[1] - ii[0]; ii++; 244 sum = 0.0; 245 while (n--) sum += *v++ * x[*idx++]; 246 z[i] = sum; 247 } 248 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 249 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 250 PLogFlops(2*a->nz - a->m); 251 PetscFunctionReturn(0); 252 } 253 254 #undef __FUNC__ 255 #define __FUNC__ "MatMult_SeqBAIJ_2" 256 int MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) 257 { 258 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 259 Scalar *x,*z,*v,*xb,sum1,sum2; 260 Scalar x1,x2; 261 int ierr,mbs=a->mbs,i,*idx,*ii,j,n; 262 263 PetscFunctionBegin; 264 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 265 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 266 267 idx = a->j; 268 v = a->a; 269 ii = a->i; 270 271 for ( i=0; i<mbs; i++ ) { 272 n = ii[1] - ii[0]; ii++; 273 sum1 = 0.0; sum2 = 0.0; 274 for ( j=0; j<n; j++ ) { 275 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 276 sum1 += v[0]*x1 + v[2]*x2; 277 sum2 += v[1]*x1 + v[3]*x2; 278 v += 4; 279 } 280 z[0] = sum1; z[1] = sum2; 281 z += 2; 282 } 283 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 284 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 285 PLogFlops(4*a->nz - a->m); 286 PetscFunctionReturn(0); 287 } 288 289 #undef __FUNC__ 290 #define __FUNC__ "MatMult_SeqBAIJ_3" 291 int MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) 292 { 293 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 294 Scalar *x,*z,*v,*xb,sum1,sum2,sum3,x1,x2,x3; 295 int ierr,mbs=a->mbs,i,*idx,*ii,j,n; 296 297 PetscFunctionBegin; 298 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 299 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 300 301 idx = a->j; 302 v = a->a; 303 ii = a->i; 304 305 for ( i=0; i<mbs; i++ ) { 306 n = ii[1] - ii[0]; ii++; 307 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 308 for ( j=0; j<n; j++ ) { 309 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 310 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 311 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 312 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 313 v += 9; 314 } 315 z[0] = sum1; z[1] = sum2; z[2] = sum3; 316 z += 3; 317 } 318 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 319 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 320 PLogFlops(18*a->nz - a->m); 321 PetscFunctionReturn(0); 322 } 323 324 #undef __FUNC__ 325 #define __FUNC__ "MatMult_SeqBAIJ_4" 326 int MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) 327 { 328 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 329 Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4; 330 int ierr,mbs=a->mbs,i,*idx,*ii,j,n; 331 332 PetscFunctionBegin; 333 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 334 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 335 336 idx = a->j; 337 v = a->a; 338 ii = a->i; 339 340 for ( i=0; i<mbs; i++ ) { 341 n = ii[1] - ii[0]; ii++; 342 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; 343 for ( j=0; j<n; j++ ) { 344 xb = x + 4*(*idx++); 345 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 346 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 347 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 348 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 349 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 350 v += 16; 351 } 352 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 353 z += 4; 354 } 355 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 356 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 357 PLogFlops(32*a->nz - a->m); 358 PetscFunctionReturn(0); 359 } 360 361 #undef __FUNC__ 362 #define __FUNC__ "MatMult_SeqBAIJ_5" 363 int MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) 364 { 365 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 366 Scalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5; 367 Scalar * v,* xb,* z, * x; 368 int ierr,mbs=a->mbs,i,*idx,*ii,j,n; 369 370 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 371 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 372 373 idx = a->j; 374 v = a->a; 375 ii = a->i; 376 377 for ( i=0; i<mbs; i++ ) { 378 n = ii[1] - ii[0]; ii++; 379 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 380 for ( j=0; j<n; j++ ) { 381 xb = x + 5*(*idx++); 382 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 383 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 384 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 385 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 386 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 387 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 388 v += 25; 389 } 390 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 391 z += 5; 392 } 393 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 394 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 395 PLogFlops(50*a->nz - a->m); 396 PetscFunctionReturn(0); 397 } 398 399 #undef __FUNC__ 400 #define __FUNC__ "MatMult_SeqBAIJ_7" 401 int MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) 402 { 403 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 404 Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 405 Scalar x1,x2,x3,x4,x5,x6,x7; 406 int ierr,mbs=a->mbs,i,*idx,*ii,j,n; 407 408 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 409 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 410 411 idx = a->j; 412 v = a->a; 413 ii = a->i; 414 415 for ( i=0; i<mbs; i++ ) { 416 n = ii[1] - ii[0]; ii++; 417 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 418 for ( j=0; j<n; j++ ) { 419 xb = x + 7*(*idx++); 420 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 421 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 422 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 423 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 424 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 425 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 426 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 427 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 428 v += 49; 429 } 430 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 431 z += 7; 432 } 433 434 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 435 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 436 PLogFlops(98*a->nz - a->m); 437 PetscFunctionReturn(0); 438 } 439 440 #undef __FUNC__ 441 #define __FUNC__ "MatMult_SeqBAIJ_N" 442 int MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz) 443 { 444 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 445 Scalar *x,*z,*v,*xb; 446 Scalar _DOne = 1.0,*work,*workt,_DZero = 0.0; 447 int ierr,mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2; 448 int _One = 1,ncols,k; 449 450 PetscFunctionBegin; 451 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 452 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 453 454 idx = a->j; 455 v = a->a; 456 ii = a->i; 457 458 459 if (!a->mult_work) { 460 k = PetscMax(a->m,a->n); 461 a->mult_work = (Scalar *) PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work); 462 } 463 work = a->mult_work; 464 for ( i=0; i<mbs; i++ ) { 465 n = ii[1] - ii[0]; ii++; 466 ncols = n*bs; 467 workt = work; 468 for ( j=0; j<n; j++ ) { 469 xb = x + bs*(*idx++); 470 for ( k=0; k<bs; k++ ) workt[k] = xb[k]; 471 workt += bs; 472 } 473 LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); 474 v += n*bs2; 475 z += bs; 476 } 477 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 478 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 479 PLogFlops(2*a->nz*bs2 - a->m); 480 PetscFunctionReturn(0); 481 } 482 483 #undef __FUNC__ 484 #define __FUNC__ "MatMultAdd_SeqBAIJ_1" 485 int MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 486 { 487 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 488 Scalar *x,*y,*z,*v,sum; 489 int ierr,mbs=a->mbs,i,*idx,*ii,n; 490 491 PetscFunctionBegin; 492 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 493 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 494 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 495 496 idx = a->j; 497 v = a->a; 498 ii = a->i; 499 500 for ( i=0; i<mbs; i++ ) { 501 n = ii[1] - ii[0]; ii++; 502 sum = y[i]; 503 while (n--) sum += *v++ * x[*idx++]; 504 z[i] = sum; 505 } 506 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 507 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 508 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 509 PLogFlops(2*a->nz); 510 PetscFunctionReturn(0); 511 } 512 513 #undef __FUNC__ 514 #define __FUNC__ "MatMultAdd_SeqBAIJ_2" 515 int MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 516 { 517 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 518 Scalar *x,*y,*z,*v,*xb,sum1,sum2; 519 Scalar x1,x2; 520 int ierr,mbs=a->mbs,i,*idx,*ii,j,n; 521 522 PetscFunctionBegin; 523 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 524 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 525 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 526 527 idx = a->j; 528 v = a->a; 529 ii = a->i; 530 531 for ( i=0; i<mbs; i++ ) { 532 n = ii[1] - ii[0]; ii++; 533 sum1 = y[0]; sum2 = y[1]; 534 for ( j=0; j<n; j++ ) { 535 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 536 sum1 += v[0]*x1 + v[2]*x2; 537 sum2 += v[1]*x1 + v[3]*x2; 538 v += 4; 539 } 540 z[0] = sum1; z[1] = sum2; 541 z += 2; y += 2; 542 } 543 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 544 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 545 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 546 PLogFlops(4*a->nz); 547 PetscFunctionReturn(0); 548 } 549 550 #undef __FUNC__ 551 #define __FUNC__ "MatMultAdd_SeqBAIJ_3" 552 int MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 553 { 554 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 555 Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,x1,x2,x3; 556 int ierr,mbs=a->mbs,i,*idx,*ii,j,n; 557 558 PetscFunctionBegin; 559 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 560 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 561 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 562 563 idx = a->j; 564 v = a->a; 565 ii = a->i; 566 567 for ( i=0; i<mbs; i++ ) { 568 n = ii[1] - ii[0]; ii++; 569 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; 570 for ( j=0; j<n; j++ ) { 571 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 572 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 573 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 574 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 575 v += 9; 576 } 577 z[0] = sum1; z[1] = sum2; z[2] = sum3; 578 z += 3; y += 3; 579 } 580 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 581 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 582 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 583 PLogFlops(18*a->nz); 584 PetscFunctionReturn(0); 585 } 586 587 #undef __FUNC__ 588 #define __FUNC__ "MatMultAdd_SeqBAIJ_4" 589 int MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 590 { 591 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 592 Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4; 593 int ierr,mbs=a->mbs,i,*idx,*ii; 594 int j,n; 595 596 PetscFunctionBegin; 597 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 598 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 599 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 600 601 idx = a->j; 602 v = a->a; 603 ii = a->i; 604 605 for ( i=0; i<mbs; i++ ) { 606 n = ii[1] - ii[0]; ii++; 607 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; 608 for ( j=0; j<n; j++ ) { 609 xb = x + 4*(*idx++); 610 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 611 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 612 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 613 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 614 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 615 v += 16; 616 } 617 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 618 z += 4; y += 4; 619 } 620 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 621 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 622 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 623 PLogFlops(32*a->nz); 624 PetscFunctionReturn(0); 625 } 626 627 #undef __FUNC__ 628 #define __FUNC__ "MatMultAdd_SeqBAIJ_5" 629 int MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 630 { 631 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 632 Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5; 633 int ierr,mbs=a->mbs,i,*idx,*ii,j,n; 634 635 PetscFunctionBegin; 636 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 637 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 638 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 639 640 idx = a->j; 641 v = a->a; 642 ii = a->i; 643 644 for ( i=0; i<mbs; i++ ) { 645 n = ii[1] - ii[0]; ii++; 646 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; 647 for ( j=0; j<n; j++ ) { 648 xb = x + 5*(*idx++); 649 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 650 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 651 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 652 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 653 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 654 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 655 v += 25; 656 } 657 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 658 z += 5; y += 5; 659 } 660 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 661 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 662 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 663 PLogFlops(50*a->nz); 664 PetscFunctionReturn(0); 665 } 666 667 #undef __FUNC__ 668 #define __FUNC__ "MatMultAdd_SeqBAIJ_7" 669 int MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 670 { 671 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 672 Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 673 Scalar x1,x2,x3,x4,x5,x6,x7; 674 int ierr,mbs=a->mbs,i,*idx,*ii,j,n; 675 676 PetscFunctionBegin; 677 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 678 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 679 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 680 681 idx = a->j; 682 v = a->a; 683 ii = a->i; 684 685 for ( i=0; i<mbs; i++ ) { 686 n = ii[1] - ii[0]; ii++; 687 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 688 for ( j=0; j<n; j++ ) { 689 xb = x + 7*(*idx++); 690 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 691 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 692 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 693 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 694 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 695 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 696 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 697 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 698 v += 49; 699 } 700 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 701 z += 7; y += 7; 702 } 703 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 704 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 705 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 706 PLogFlops(98*a->nz); 707 PetscFunctionReturn(0); 708 } 709 710 711 #undef __FUNC__ 712 #define __FUNC__ "MatMultAdd_SeqBAIJ_N" 713 int MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 714 { 715 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 716 Scalar *x,*z,*v,*xb; 717 int mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2,ierr; 718 int _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt; 719 720 PetscFunctionBegin; 721 if ( xx != yy) { ierr = VecCopy(yy,zz); CHKERRQ(ierr); } 722 723 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 724 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 725 726 idx = a->j; 727 v = a->a; 728 ii = a->i; 729 730 731 if (!a->mult_work) { 732 k = PetscMax(a->m,a->n); 733 a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));CHKPTRQ(a->mult_work); 734 } 735 work = a->mult_work; 736 for ( i=0; i<mbs; i++ ) { 737 n = ii[1] - ii[0]; ii++; 738 ncols = n*bs; 739 workt = work; 740 for ( j=0; j<n; j++ ) { 741 xb = x + bs*(*idx++); 742 for ( k=0; k<bs; k++ ) workt[k] = xb[k]; 743 workt += bs; 744 } 745 LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); 746 v += n*bs2; 747 z += bs; 748 } 749 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 750 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 751 PLogFlops(2*a->nz*bs2 ); 752 PetscFunctionReturn(0); 753 } 754 755 #undef __FUNC__ 756 #define __FUNC__ "MatMultTrans_SeqBAIJ" 757 int MatMultTrans_SeqBAIJ(Mat A,Vec xx,Vec zz) 758 { 759 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 760 Scalar *xg,*zg,*zb; 761 Scalar *x,*z,*v,*xb,x1,x2,x3,x4,x5; 762 int mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n; 763 int bs=a->bs,j,n,bs2=a->bs2,*ib,ierr; 764 765 766 PetscFunctionBegin; 767 ierr = VecGetArray(xx,&xg);CHKERRQ(ierr); x = xg; 768 ierr = VecGetArray(zz,&zg);CHKERRQ(ierr); z = zg; 769 PetscMemzero(z,N*sizeof(Scalar)); 770 771 idx = a->j; 772 v = a->a; 773 ii = a->i; 774 775 switch (bs) { 776 case 1: 777 for ( i=0; i<mbs; i++ ) { 778 n = ii[1] - ii[0]; ii++; 779 xb = x + i; x1 = xb[0]; 780 ib = idx + ai[i]; 781 for ( j=0; j<n; j++ ) { 782 rval = ib[j]; 783 z[rval] += *v++ * x1; 784 } 785 } 786 break; 787 case 2: 788 for ( i=0; i<mbs; i++ ) { 789 n = ii[1] - ii[0]; ii++; 790 xb = x + 2*i; x1 = xb[0]; x2 = xb[1]; 791 ib = idx + ai[i]; 792 for ( j=0; j<n; j++ ) { 793 rval = ib[j]*2; 794 z[rval++] += v[0]*x1 + v[1]*x2; 795 z[rval++] += v[2]*x1 + v[3]*x2; 796 v += 4; 797 } 798 } 799 break; 800 case 3: 801 for ( i=0; i<mbs; i++ ) { 802 n = ii[1] - ii[0]; ii++; 803 xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 804 ib = idx + ai[i]; 805 for ( j=0; j<n; j++ ) { 806 rval = ib[j]*3; 807 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3; 808 z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3; 809 z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3; 810 v += 9; 811 } 812 } 813 break; 814 case 4: 815 for ( i=0; i<mbs; i++ ) { 816 n = ii[1] - ii[0]; ii++; 817 xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 818 ib = idx + ai[i]; 819 for ( j=0; j<n; j++ ) { 820 rval = ib[j]*4; 821 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 822 z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 823 z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 824 z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 825 v += 16; 826 } 827 } 828 break; 829 case 5: 830 for ( i=0; i<mbs; i++ ) { 831 n = ii[1] - ii[0]; ii++; 832 xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 833 x4 = xb[3]; x5 = xb[4]; 834 ib = idx + ai[i]; 835 for ( j=0; j<n; j++ ) { 836 rval = ib[j]*5; 837 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 838 z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 839 z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 840 z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 841 z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 842 v += 25; 843 } 844 } 845 break; 846 /* block sizes larger then 5 by 5 are handled by BLAS */ 847 default: { 848 int _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt; 849 if (!a->mult_work) { 850 k = PetscMax(a->m,a->n); 851 a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar)); 852 CHKPTRQ(a->mult_work); 853 } 854 work = a->mult_work; 855 for ( i=0; i<mbs; i++ ) { 856 n = ii[1] - ii[0]; ii++; 857 ncols = n*bs; 858 PetscMemzero(work,ncols*sizeof(Scalar)); 859 LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One); 860 v += n*bs2; 861 x += bs; 862 workt = work; 863 for ( j=0; j<n; j++ ) { 864 zb = z + bs*(*idx++); 865 for ( k=0; k<bs; k++ ) zb[k] += workt[k] ; 866 workt += bs; 867 } 868 } 869 } 870 } 871 ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr); 872 ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr); 873 PLogFlops(2*a->nz*a->bs2 - a->n); 874 PetscFunctionReturn(0); 875 } 876 877 #undef __FUNC__ 878 #define __FUNC__ "MatMultTransAdd_SeqBAIJ" 879 int MatMultTransAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 880 881 { 882 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 883 Scalar *xg,*zg,*zb; 884 Scalar *x,*z,*v,*xb,x1,x2,x3,x4,x5; 885 int mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n; 886 int bs=a->bs,j,n,bs2=a->bs2,*ib,ierr; 887 888 PetscFunctionBegin; 889 ierr = VecGetArray(xx,&xg);CHKERRQ(ierr); x = xg; 890 ierr = VecGetArray(zz,&zg);CHKERRQ(ierr); z = zg; 891 892 if ( yy != zz ) { ierr = VecCopy(yy,zz); CHKERRQ(ierr); } 893 else PetscMemzero(z,N*sizeof(Scalar)); 894 895 idx = a->j; 896 v = a->a; 897 ii = a->i; 898 899 switch (bs) { 900 case 1: 901 for ( i=0; i<mbs; i++ ) { 902 n = ii[1] - ii[0]; ii++; 903 xb = x + i; x1 = xb[0]; 904 ib = idx + ai[i]; 905 for ( j=0; j<n; j++ ) { 906 rval = ib[j]; 907 z[rval] += *v++ * x1; 908 } 909 } 910 break; 911 case 2: 912 for ( i=0; i<mbs; i++ ) { 913 n = ii[1] - ii[0]; ii++; 914 xb = x + 2*i; x1 = xb[0]; x2 = xb[1]; 915 ib = idx + ai[i]; 916 for ( j=0; j<n; j++ ) { 917 rval = ib[j]*2; 918 z[rval++] += v[0]*x1 + v[1]*x2; 919 z[rval++] += v[2]*x1 + v[3]*x2; 920 v += 4; 921 } 922 } 923 break; 924 case 3: 925 for ( i=0; i<mbs; i++ ) { 926 n = ii[1] - ii[0]; ii++; 927 xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 928 ib = idx + ai[i]; 929 for ( j=0; j<n; j++ ) { 930 rval = ib[j]*3; 931 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3; 932 z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3; 933 z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3; 934 v += 9; 935 } 936 } 937 break; 938 case 4: 939 for ( i=0; i<mbs; i++ ) { 940 n = ii[1] - ii[0]; ii++; 941 xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 942 ib = idx + ai[i]; 943 for ( j=0; j<n; j++ ) { 944 rval = ib[j]*4; 945 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 946 z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 947 z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 948 z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 949 v += 16; 950 } 951 } 952 break; 953 case 5: 954 for ( i=0; i<mbs; i++ ) { 955 n = ii[1] - ii[0]; ii++; 956 xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 957 x4 = xb[3]; x5 = xb[4]; 958 ib = idx + ai[i]; 959 for ( j=0; j<n; j++ ) { 960 rval = ib[j]*5; 961 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 962 z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 963 z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 964 z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 965 z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 966 v += 25; 967 } 968 } 969 break; 970 /* block sizes larger then 5 by 5 are handled by BLAS */ 971 default: { 972 int _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt; 973 if (!a->mult_work) { 974 k = PetscMax(a->m,a->n); 975 a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar)); 976 CHKPTRQ(a->mult_work); 977 } 978 work = a->mult_work; 979 for ( i=0; i<mbs; i++ ) { 980 n = ii[1] - ii[0]; ii++; 981 ncols = n*bs; 982 PetscMemzero(work,ncols*sizeof(Scalar)); 983 LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One); 984 v += n*bs2; 985 x += bs; 986 workt = work; 987 for ( j=0; j<n; j++ ) { 988 zb = z + bs*(*idx++); 989 for ( k=0; k<bs; k++ ) zb[k] += workt[k] ; 990 workt += bs; 991 } 992 } 993 } 994 } 995 ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr); 996 ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr); 997 PLogFlops(2*a->nz*a->bs2); 998 PetscFunctionReturn(0); 999 } 1000 1001 #undef __FUNC__ 1002 #define __FUNC__ "MatScale_SeqBAIJ" 1003 int MatScale_SeqBAIJ(Scalar *alpha,Mat inA) 1004 { 1005 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) inA->data; 1006 int one = 1, totalnz = a->bs2*a->nz; 1007 1008 PetscFunctionBegin; 1009 BLscal_( &totalnz, alpha, a->a, &one ); 1010 PLogFlops(totalnz); 1011 PetscFunctionReturn(0); 1012 } 1013 1014 #undef __FUNC__ 1015 #define __FUNC__ "MatNorm_SeqBAIJ" 1016 int MatNorm_SeqBAIJ(Mat A,NormType type,double *norm) 1017 { 1018 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 1019 Scalar *v = a->a; 1020 double sum = 0.0; 1021 int i,nz=a->nz,bs2=a->bs2; 1022 1023 PetscFunctionBegin; 1024 if (type == NORM_FROBENIUS) { 1025 for (i=0; i< bs2*nz; i++ ) { 1026 #if defined(USE_PETSC_COMPLEX) 1027 sum += real(conj(*v)*(*v)); v++; 1028 #else 1029 sum += (*v)*(*v); v++; 1030 #endif 1031 } 1032 *norm = sqrt(sum); 1033 } 1034 else { 1035 SETERRQ(PETSC_ERR_SUP,0,"No support for this norm yet"); 1036 } 1037 PetscFunctionReturn(0); 1038 } 1039 1040 1041 #undef __FUNC__ 1042 #define __FUNC__ "MatEqual_SeqBAIJ" 1043 int MatEqual_SeqBAIJ(Mat A,Mat B, PetscTruth* flg) 1044 { 1045 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data; 1046 1047 PetscFunctionBegin; 1048 if (B->type !=MATSEQBAIJ) SETERRQ(PETSC_ERR_ARG_INCOMP,0,"Matrices must be same type"); 1049 1050 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1051 if ((a->m != b->m) || (a->n !=b->n) || (a->bs != b->bs)|| (a->nz != b->nz)) { 1052 *flg = PETSC_FALSE; PetscFunctionReturn(0); 1053 } 1054 1055 /* if the a->i are the same */ 1056 if (PetscMemcmp(a->i,b->i, (a->mbs+1)*sizeof(int))) { 1057 *flg = PETSC_FALSE; PetscFunctionReturn(0); 1058 } 1059 1060 /* if a->j are the same */ 1061 if (PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int))) { 1062 *flg = PETSC_FALSE; PetscFunctionReturn(0); 1063 } 1064 1065 /* if a->a are the same */ 1066 if (PetscMemcmp(a->a, b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(Scalar))) { 1067 *flg = PETSC_FALSE; PetscFunctionReturn(0); 1068 } 1069 *flg = PETSC_TRUE; 1070 PetscFunctionReturn(0); 1071 1072 } 1073 1074 #undef __FUNC__ 1075 #define __FUNC__ "MatGetDiagonal_SeqBAIJ" 1076 int MatGetDiagonal_SeqBAIJ(Mat A,Vec v) 1077 { 1078 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 1079 int ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 1080 Scalar *x, zero = 0.0,*aa,*aa_j; 1081 1082 PetscFunctionBegin; 1083 if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Not for factored matrix"); 1084 bs = a->bs; 1085 aa = a->a; 1086 ai = a->i; 1087 aj = a->j; 1088 ambs = a->mbs; 1089 bs2 = a->bs2; 1090 1091 ierr = VecSet(&zero,v);CHKERRQ(ierr); 1092 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1093 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1094 if (n != a->m) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Nonconforming matrix and vector"); 1095 for ( i=0; i<ambs; i++ ) { 1096 for ( j=ai[i]; j<ai[i+1]; j++ ) { 1097 if (aj[j] == i) { 1098 row = i*bs; 1099 aa_j = aa+j*bs2; 1100 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1101 break; 1102 } 1103 } 1104 } 1105 PetscFunctionReturn(0); 1106 } 1107 1108 #undef __FUNC__ 1109 #define __FUNC__ "MatDiagonalScale_SeqBAIJ" 1110 int MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr) 1111 { 1112 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 1113 Scalar *l,*r,x,*v,*aa,*li,*ri; 1114 int ierr,i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2; 1115 1116 PetscFunctionBegin; 1117 ai = a->i; 1118 aj = a->j; 1119 aa = a->a; 1120 m = a->m; 1121 n = a->n; 1122 bs = a->bs; 1123 mbs = a->mbs; 1124 bs2 = a->bs2; 1125 if (ll) { 1126 ierr = VecGetArray(ll,&l); CHKERRQ(ierr); 1127 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1128 if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Left scaling vector wrong length"); 1129 for ( i=0; i<mbs; i++ ) { /* for each block row */ 1130 M = ai[i+1] - ai[i]; 1131 li = l + i*bs; 1132 v = aa + bs2*ai[i]; 1133 for ( j=0; j<M; j++ ) { /* for each block */ 1134 for ( k=0; k<bs2; k++ ) { 1135 (*v++) *= li[k%bs]; 1136 } 1137 } 1138 } 1139 } 1140 1141 if (rr) { 1142 ierr = VecGetArray(rr,&r); CHKERRQ(ierr); 1143 ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr); 1144 if (rn != n) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Right scaling vector wrong length"); 1145 for ( i=0; i<mbs; i++ ) { /* for each block row */ 1146 M = ai[i+1] - ai[i]; 1147 v = aa + bs2*ai[i]; 1148 for ( j=0; j<M; j++ ) { /* for each block */ 1149 ri = r + bs*aj[ai[i]+j]; 1150 for ( k=0; k<bs; k++ ) { 1151 x = ri[k]; 1152 for ( tmp=0; tmp<bs; tmp++ ) (*v++) *= x; 1153 } 1154 } 1155 } 1156 } 1157 PetscFunctionReturn(0); 1158 } 1159 1160 1161 #undef __FUNC__ 1162 #define __FUNC__ "MatGetInfo_SeqBAIJ" 1163 int MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info) 1164 { 1165 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 1166 1167 PetscFunctionBegin; 1168 info->rows_global = (double)a->m; 1169 info->columns_global = (double)a->n; 1170 info->rows_local = (double)a->m; 1171 info->columns_local = (double)a->n; 1172 info->block_size = a->bs2; 1173 info->nz_allocated = a->maxnz; 1174 info->nz_used = a->bs2*a->nz; 1175 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 1176 /* if (info->nz_unneeded != A->info.nz_unneeded) 1177 printf("space descrepancy: maxnz-nz = %d, nz_unneeded = %d\n",(int)info->nz_unneeded,(int)A->info.nz_unneeded); */ 1178 info->assemblies = A->num_ass; 1179 info->mallocs = a->reallocs; 1180 info->memory = A->mem; 1181 if (A->factor) { 1182 info->fill_ratio_given = A->info.fill_ratio_given; 1183 info->fill_ratio_needed = A->info.fill_ratio_needed; 1184 info->factor_mallocs = A->info.factor_mallocs; 1185 } else { 1186 info->fill_ratio_given = 0; 1187 info->fill_ratio_needed = 0; 1188 info->factor_mallocs = 0; 1189 } 1190 PetscFunctionReturn(0); 1191 } 1192 1193 1194 #undef __FUNC__ 1195 #define __FUNC__ "MatZeroEntries_SeqBAIJ" 1196 int MatZeroEntries_SeqBAIJ(Mat A) 1197 { 1198 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 1199 1200 PetscFunctionBegin; 1201 PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(Scalar)); 1202 PetscFunctionReturn(0); 1203 } 1204