1 #define PETSCMAT_DLL 2 3 #include "../src/mat/impls/baij/seq/baij.h" 4 #include "../src/mat/blockinvert.h" 5 #include "petscbt.h" 6 7 #undef __FUNCT__ 8 #define __FUNCT__ "MatIncreaseOverlap_SeqBAIJ" 9 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 10 { 11 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 12 PetscErrorCode ierr; 13 PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival; 14 const PetscInt *idx; 15 PetscInt start,end,*ai,*aj,bs,*nidx2; 16 PetscBT table; 17 18 PetscFunctionBegin; 19 m = a->mbs; 20 ai = a->i; 21 aj = a->j; 22 bs = A->rmap->bs; 23 24 if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 25 26 ierr = PetscBTCreate(m,table);CHKERRQ(ierr); 27 ierr = PetscMalloc((m+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr); 28 ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr); 29 30 for (i=0; i<is_max; i++) { 31 /* Initialise the two local arrays */ 32 isz = 0; 33 ierr = PetscBTMemzero(m,table);CHKERRQ(ierr); 34 35 /* Extract the indices, assume there can be duplicate entries */ 36 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 37 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 38 39 /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 40 for (j=0; j<n ; ++j){ 41 ival = idx[j]/bs; /* convert the indices into block indices */ 42 if (ival>=m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 43 if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;} 44 } 45 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 46 ierr = ISDestroy(is[i]);CHKERRQ(ierr); 47 48 k = 0; 49 for (j=0; j<ov; j++){ /* for each overlap*/ 50 n = isz; 51 for (; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */ 52 row = nidx[k]; 53 start = ai[row]; 54 end = ai[row+1]; 55 for (l = start; l<end ; l++){ 56 val = aj[l]; 57 if (!PetscBTLookupSet(table,val)) {nidx[isz++] = val;} 58 } 59 } 60 } 61 /* expand the Index Set */ 62 for (j=0; j<isz; j++) { 63 for (k=0; k<bs; k++) 64 nidx2[j*bs+k] = nidx[j]*bs+k; 65 } 66 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr); 67 } 68 ierr = PetscBTDestroy(table);CHKERRQ(ierr); 69 ierr = PetscFree(nidx);CHKERRQ(ierr); 70 ierr = PetscFree(nidx2);CHKERRQ(ierr); 71 PetscFunctionReturn(0); 72 } 73 74 #undef __FUNCT__ 75 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ_Private" 76 PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 77 { 78 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c; 79 PetscErrorCode ierr; 80 PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 81 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 82 const PetscInt *irow,*icol; 83 PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 84 PetscInt *aj = a->j,*ai = a->i; 85 MatScalar *mat_a; 86 Mat C; 87 PetscTruth flag,sorted; 88 89 PetscFunctionBegin; 90 ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr); 91 if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 92 93 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 94 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 95 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 96 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 97 98 ierr = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr); 99 ssmap = smap; 100 ierr = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr); 101 ierr = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr); 102 for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 103 /* determine lens of each row */ 104 for (i=0; i<nrows; i++) { 105 kstart = ai[irow[i]]; 106 kend = kstart + a->ilen[irow[i]]; 107 lens[i] = 0; 108 for (k=kstart; k<kend; k++) { 109 if (ssmap[aj[k]]) { 110 lens[i]++; 111 } 112 } 113 } 114 /* Create and fill new matrix */ 115 if (scall == MAT_REUSE_MATRIX) { 116 c = (Mat_SeqBAIJ *)((*B)->data); 117 118 if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 119 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 120 if (!flag) { 121 SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 122 } 123 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 124 C = *B; 125 } else { 126 ierr = MatCreate(((PetscObject)A)->comm,&C);CHKERRQ(ierr); 127 ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 128 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 129 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);CHKERRQ(ierr); 130 } 131 c = (Mat_SeqBAIJ *)(C->data); 132 for (i=0; i<nrows; i++) { 133 row = irow[i]; 134 kstart = ai[row]; 135 kend = kstart + a->ilen[row]; 136 mat_i = c->i[i]; 137 mat_j = c->j + mat_i; 138 mat_a = c->a + mat_i*bs2; 139 mat_ilen = c->ilen + i; 140 for (k=kstart; k<kend; k++) { 141 if ((tcol=ssmap[a->j[k]])) { 142 *mat_j++ = tcol - 1; 143 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 144 mat_a += bs2; 145 (*mat_ilen)++; 146 } 147 } 148 } 149 150 /* Free work space */ 151 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 152 ierr = PetscFree(smap);CHKERRQ(ierr); 153 ierr = PetscFree(lens);CHKERRQ(ierr); 154 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 155 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 156 157 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 158 *B = C; 159 PetscFunctionReturn(0); 160 } 161 162 #undef __FUNCT__ 163 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ" 164 PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 165 { 166 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 167 IS is1,is2; 168 PetscErrorCode ierr; 169 PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count; 170 const PetscInt *irow,*icol; 171 172 PetscFunctionBegin; 173 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 174 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 175 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 176 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 177 178 /* Verify if the indices corespond to each element in a block 179 and form the IS with compressed IS */ 180 ierr = PetscMalloc2(a->mbs,PetscInt,&vary,a->mbs,PetscInt,&iary);CHKERRQ(ierr); 181 ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr); 182 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 183 count = 0; 184 for (i=0; i<a->mbs; i++) { 185 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 186 if (vary[i]==bs) iary[count++] = i; 187 } 188 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr); 189 190 ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr); 191 for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 192 count = 0; 193 for (i=0; i<a->mbs; i++) { 194 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_PLIB,"Internal error in PETSc"); 195 if (vary[i]==bs) iary[count++] = i; 196 } 197 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is2);CHKERRQ(ierr); 198 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 199 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 200 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 201 202 ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr); 203 ISDestroy(is1); 204 ISDestroy(is2); 205 PetscFunctionReturn(0); 206 } 207 208 #undef __FUNCT__ 209 #define __FUNCT__ "MatGetSubMatrices_SeqBAIJ" 210 PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 211 { 212 PetscErrorCode ierr; 213 PetscInt i; 214 215 PetscFunctionBegin; 216 if (scall == MAT_INITIAL_MATRIX) { 217 ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr); 218 } 219 220 for (i=0; i<n; i++) { 221 ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 222 } 223 PetscFunctionReturn(0); 224 } 225 226 227 /* -------------------------------------------------------*/ 228 /* Should check that shapes of vectors and matrices match */ 229 /* -------------------------------------------------------*/ 230 #include "petscblaslapack.h" 231 232 #undef __FUNCT__ 233 #define __FUNCT__ "MatMult_SeqBAIJ_1" 234 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz) 235 { 236 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 237 PetscScalar *z,sum; 238 const PetscScalar *x; 239 const MatScalar *v; 240 PetscErrorCode ierr; 241 PetscInt mbs,i,n,nonzerorow=0; 242 const PetscInt *idx,*ii,*ridx=PETSC_NULL; 243 PetscTruth usecprow=a->compressedrow.use; 244 245 PetscFunctionBegin; 246 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 247 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 248 249 if (usecprow){ 250 mbs = a->compressedrow.nrows; 251 ii = a->compressedrow.i; 252 ridx = a->compressedrow.rindex; 253 ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 254 } else { 255 mbs = a->mbs; 256 ii = a->i; 257 } 258 259 for (i=0; i<mbs; i++) { 260 n = ii[1] - ii[0]; 261 v = a->a + ii[0]; 262 idx = a->j + ii[0]; 263 ii++; 264 sum = 0.0; 265 PetscSparseDensePlusDot(sum,x,v,idx,n); 266 if (usecprow){ 267 z[ridx[i]] = sum; 268 } else { 269 nonzerorow += (n>0); 270 z[i] = sum; 271 } 272 } 273 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 274 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 275 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 276 PetscFunctionReturn(0); 277 } 278 279 #undef __FUNCT__ 280 #define __FUNCT__ "MatMult_SeqBAIJ_2" 281 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) 282 { 283 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 284 PetscScalar *z = 0,sum1,sum2,*zarray; 285 const PetscScalar *x,*xb; 286 PetscScalar x1,x2; 287 const MatScalar *v; 288 PetscErrorCode ierr; 289 PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 290 PetscTruth usecprow=a->compressedrow.use; 291 292 PetscFunctionBegin; 293 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 294 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 295 296 idx = a->j; 297 v = a->a; 298 if (usecprow){ 299 mbs = a->compressedrow.nrows; 300 ii = a->compressedrow.i; 301 ridx = a->compressedrow.rindex; 302 } else { 303 mbs = a->mbs; 304 ii = a->i; 305 z = zarray; 306 } 307 308 for (i=0; i<mbs; i++) { 309 n = ii[1] - ii[0]; ii++; 310 sum1 = 0.0; sum2 = 0.0; 311 nonzerorow += (n>0); 312 for (j=0; j<n; j++) { 313 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 314 sum1 += v[0]*x1 + v[2]*x2; 315 sum2 += v[1]*x1 + v[3]*x2; 316 v += 4; 317 } 318 if (usecprow) z = zarray + 2*ridx[i]; 319 z[0] = sum1; z[1] = sum2; 320 if (!usecprow) z += 2; 321 } 322 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 323 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 324 ierr = PetscLogFlops(8.0*a->nz - 2.0*nonzerorow);CHKERRQ(ierr); 325 PetscFunctionReturn(0); 326 } 327 328 #undef __FUNCT__ 329 #define __FUNCT__ "MatMult_SeqBAIJ_3" 330 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) 331 { 332 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 333 PetscScalar *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray; 334 const PetscScalar *x,*xb; 335 const MatScalar *v; 336 PetscErrorCode ierr; 337 PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 338 PetscTruth usecprow=a->compressedrow.use; 339 340 341 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 342 #pragma disjoint(*v,*z,*xb) 343 #endif 344 345 PetscFunctionBegin; 346 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 347 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 348 349 idx = a->j; 350 v = a->a; 351 if (usecprow){ 352 mbs = a->compressedrow.nrows; 353 ii = a->compressedrow.i; 354 ridx = a->compressedrow.rindex; 355 } else { 356 mbs = a->mbs; 357 ii = a->i; 358 z = zarray; 359 } 360 361 for (i=0; i<mbs; i++) { 362 n = ii[1] - ii[0]; ii++; 363 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 364 nonzerorow += (n>0); 365 for (j=0; j<n; j++) { 366 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 367 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 368 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 369 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 370 v += 9; 371 } 372 if (usecprow) z = zarray + 3*ridx[i]; 373 z[0] = sum1; z[1] = sum2; z[2] = sum3; 374 if (!usecprow) z += 3; 375 } 376 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 377 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 378 ierr = PetscLogFlops(18.0*a->nz - 3.0*nonzerorow);CHKERRQ(ierr); 379 PetscFunctionReturn(0); 380 } 381 382 #undef __FUNCT__ 383 #define __FUNCT__ "MatMult_SeqBAIJ_4" 384 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) 385 { 386 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 387 PetscScalar *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray; 388 const PetscScalar *x,*xb; 389 const MatScalar *v; 390 PetscErrorCode ierr; 391 PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 392 PetscTruth usecprow=a->compressedrow.use; 393 394 PetscFunctionBegin; 395 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 396 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 397 398 idx = a->j; 399 v = a->a; 400 if (usecprow){ 401 mbs = a->compressedrow.nrows; 402 ii = a->compressedrow.i; 403 ridx = a->compressedrow.rindex; 404 } else { 405 mbs = a->mbs; 406 ii = a->i; 407 z = zarray; 408 } 409 410 for (i=0; i<mbs; i++) { 411 n = ii[1] - ii[0]; ii++; 412 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; 413 nonzerorow += (n>0); 414 for (j=0; j<n; j++) { 415 xb = x + 4*(*idx++); 416 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 417 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 418 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 419 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 420 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 421 v += 16; 422 } 423 if (usecprow) z = zarray + 4*ridx[i]; 424 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 425 if (!usecprow) z += 4; 426 } 427 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 428 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 429 ierr = PetscLogFlops(32.0*a->nz - 4.0*nonzerorow);CHKERRQ(ierr); 430 PetscFunctionReturn(0); 431 } 432 433 #undef __FUNCT__ 434 #define __FUNCT__ "MatMult_SeqBAIJ_5" 435 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) 436 { 437 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 438 PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray; 439 const PetscScalar *xb,*x; 440 const MatScalar *v; 441 PetscErrorCode ierr; 442 const PetscInt *idx,*ii,*ridx=PETSC_NULL; 443 PetscInt mbs,i,j,n,nonzerorow=0; 444 PetscTruth usecprow=a->compressedrow.use; 445 446 PetscFunctionBegin; 447 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 448 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 449 450 idx = a->j; 451 v = a->a; 452 if (usecprow){ 453 mbs = a->compressedrow.nrows; 454 ii = a->compressedrow.i; 455 ridx = a->compressedrow.rindex; 456 } else { 457 mbs = a->mbs; 458 ii = a->i; 459 z = zarray; 460 } 461 462 for (i=0; i<mbs; i++) { 463 n = ii[1] - ii[0]; ii++; 464 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 465 nonzerorow += (n>0); 466 for (j=0; j<n; j++) { 467 xb = x + 5*(*idx++); 468 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 469 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 470 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 471 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 472 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 473 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 474 v += 25; 475 } 476 if (usecprow) z = zarray + 5*ridx[i]; 477 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 478 if (!usecprow) z += 5; 479 } 480 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 481 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 482 ierr = PetscLogFlops(50.0*a->nz - 5.0*nonzerorow);CHKERRQ(ierr); 483 PetscFunctionReturn(0); 484 } 485 486 487 #undef __FUNCT__ 488 #define __FUNCT__ "MatMult_SeqBAIJ_6" 489 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz) 490 { 491 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 492 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6; 493 const PetscScalar *x,*xb; 494 PetscScalar x1,x2,x3,x4,x5,x6,*zarray; 495 const MatScalar *v; 496 PetscErrorCode ierr; 497 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 498 PetscTruth usecprow=a->compressedrow.use; 499 500 PetscFunctionBegin; 501 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 502 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 503 504 idx = a->j; 505 v = a->a; 506 if (usecprow){ 507 mbs = a->compressedrow.nrows; 508 ii = a->compressedrow.i; 509 ridx = a->compressedrow.rindex; 510 } else { 511 mbs = a->mbs; 512 ii = a->i; 513 z = zarray; 514 } 515 516 for (i=0; i<mbs; i++) { 517 n = ii[1] - ii[0]; ii++; 518 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; 519 nonzerorow += (n>0); 520 for (j=0; j<n; j++) { 521 xb = x + 6*(*idx++); 522 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 523 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 524 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 525 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 526 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 527 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 528 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 529 v += 36; 530 } 531 if (usecprow) z = zarray + 6*ridx[i]; 532 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 533 if (!usecprow) z += 6; 534 } 535 536 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 537 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 538 ierr = PetscLogFlops(72.0*a->nz - 6.0*nonzerorow);CHKERRQ(ierr); 539 PetscFunctionReturn(0); 540 } 541 542 #undef __FUNCT__ 543 #define __FUNCT__ "MatMult_SeqBAIJ_7" 544 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) 545 { 546 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 547 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 548 const PetscScalar *x,*xb; 549 PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray; 550 const MatScalar *v; 551 PetscErrorCode ierr; 552 PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 553 PetscTruth usecprow=a->compressedrow.use; 554 555 PetscFunctionBegin; 556 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 557 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 558 559 idx = a->j; 560 v = a->a; 561 if (usecprow){ 562 mbs = a->compressedrow.nrows; 563 ii = a->compressedrow.i; 564 ridx = a->compressedrow.rindex; 565 } else { 566 mbs = a->mbs; 567 ii = a->i; 568 z = zarray; 569 } 570 571 for (i=0; i<mbs; i++) { 572 n = ii[1] - ii[0]; ii++; 573 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 574 nonzerorow += (n>0); 575 for (j=0; j<n; j++) { 576 xb = x + 7*(*idx++); 577 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 578 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 579 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 580 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 581 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 582 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 583 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 584 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 585 v += 49; 586 } 587 if (usecprow) z = zarray + 7*ridx[i]; 588 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 589 if (!usecprow) z += 7; 590 } 591 592 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 593 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 594 ierr = PetscLogFlops(98.0*a->nz - 7.0*nonzerorow);CHKERRQ(ierr); 595 PetscFunctionReturn(0); 596 } 597 598 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 599 600 #undef __FUNCT__ 601 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver1" 602 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz) 603 { 604 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 605 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 606 const PetscScalar *x,*xb; 607 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray; 608 const MatScalar *v; 609 PetscErrorCode ierr; 610 const PetscInt *ii,*ij=a->j,*idx; 611 PetscInt mbs,i,j,k,n,*ridx=PETSC_NULL,nonzerorow=0; 612 PetscTruth usecprow=a->compressedrow.use; 613 614 PetscFunctionBegin; 615 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 616 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 617 618 v = a->a; 619 if (usecprow){ 620 mbs = a->compressedrow.nrows; 621 ii = a->compressedrow.i; 622 ridx = a->compressedrow.rindex; 623 } else { 624 mbs = a->mbs; 625 ii = a->i; 626 z = zarray; 627 } 628 629 for (i=0; i<mbs; i++) { 630 n = ii[i+1] - ii[i]; 631 idx = ij + ii[i]; 632 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 633 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 634 635 nonzerorow += (n>0); 636 for (j=0; j<n; j++) { 637 xb = x + 15*(idx[j]); 638 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 639 x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14]; 640 641 for(k=0;k<15;k++){ 642 sum1 += v[0]*xb[k]; 643 sum2 += v[1]*xb[k]; 644 sum3 += v[2]*xb[k]; 645 sum4 += v[3]*xb[k]; 646 sum5 += v[4]*xb[k]; 647 sum6 += v[5]*xb[k]; 648 sum7 += v[6]*xb[k]; 649 sum8 += v[7]*xb[k]; 650 sum9 += v[8]*xb[k]; 651 sum10 += v[9]*xb[k]; 652 sum11 += v[10]*xb[k]; 653 sum12 += v[11]*xb[k]; 654 sum13 += v[12]*xb[k]; 655 sum14 += v[13]*xb[k]; 656 sum15 += v[14]*xb[k]; 657 v += 15; 658 } 659 } 660 if (usecprow) z = zarray + 15*ridx[i]; 661 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 662 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 663 664 if (!usecprow) z += 15; 665 } 666 667 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 668 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 669 ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr); 670 PetscFunctionReturn(0); 671 } 672 673 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 674 #undef __FUNCT__ 675 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver2" 676 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz) 677 { 678 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 679 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 680 const PetscScalar *x,*xb; 681 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray; 682 const MatScalar *v; 683 PetscErrorCode ierr; 684 const PetscInt *ii,*ij=a->j,*idx; 685 PetscInt mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0; 686 PetscTruth usecprow=a->compressedrow.use; 687 688 PetscFunctionBegin; 689 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 690 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 691 692 v = a->a; 693 if (usecprow){ 694 mbs = a->compressedrow.nrows; 695 ii = a->compressedrow.i; 696 ridx = a->compressedrow.rindex; 697 } else { 698 mbs = a->mbs; 699 ii = a->i; 700 z = zarray; 701 } 702 703 for (i=0; i<mbs; i++) { 704 n = ii[i+1] - ii[i]; 705 idx = ij + ii[i]; 706 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 707 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 708 709 nonzerorow += (n>0); 710 for (j=0; j<n; j++) { 711 xb = x + 15*(idx[j]); 712 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 713 x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14]; 714 715 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 716 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 717 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 718 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 719 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 720 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 721 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 722 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 723 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 724 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 725 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 726 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 727 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 728 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 729 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 730 731 v += 60; 732 733 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 734 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 735 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 736 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 737 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 738 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 739 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 740 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 741 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 742 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 743 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 744 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 745 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 746 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 747 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 748 v += 60; 749 750 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 751 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 752 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 753 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 754 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 755 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 756 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 757 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 758 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 759 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 760 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 761 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 762 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 763 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 764 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 765 v += 60; 766 767 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3; 768 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3; 769 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3; 770 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3; 771 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3; 772 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3; 773 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3; 774 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3; 775 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3; 776 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3; 777 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3; 778 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3; 779 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3; 780 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3; 781 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3; 782 v += 45; 783 } 784 if (usecprow) z = zarray + 15*ridx[i]; 785 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 786 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 787 788 if (!usecprow) z += 15; 789 } 790 791 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 792 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 793 ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr); 794 PetscFunctionReturn(0); 795 } 796 797 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 798 #undef __FUNCT__ 799 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver3" 800 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz) 801 { 802 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 803 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 804 const PetscScalar *x,*xb; 805 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray; 806 const MatScalar *v; 807 PetscErrorCode ierr; 808 const PetscInt *ii,*ij=a->j,*idx; 809 PetscInt mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0; 810 PetscTruth usecprow=a->compressedrow.use; 811 812 PetscFunctionBegin; 813 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 814 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 815 816 v = a->a; 817 if (usecprow){ 818 mbs = a->compressedrow.nrows; 819 ii = a->compressedrow.i; 820 ridx = a->compressedrow.rindex; 821 } else { 822 mbs = a->mbs; 823 ii = a->i; 824 z = zarray; 825 } 826 827 for (i=0; i<mbs; i++) { 828 n = ii[i+1] - ii[i]; 829 idx = ij + ii[i]; 830 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 831 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 832 833 nonzerorow += (n>0); 834 for (j=0; j<n; j++) { 835 xb = x + 15*(idx[j]); 836 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 837 x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14]; 838 839 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8; 840 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8; 841 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8; 842 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8; 843 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8; 844 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8; 845 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8; 846 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8; 847 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8; 848 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8; 849 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8; 850 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8; 851 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8; 852 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8; 853 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8; 854 v += 120; 855 856 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7; 857 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7; 858 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7; 859 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7; 860 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7; 861 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7; 862 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7; 863 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7; 864 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7; 865 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7; 866 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7; 867 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7; 868 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7; 869 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7; 870 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7; 871 v += 105; 872 } 873 if (usecprow) z = zarray + 15*ridx[i]; 874 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 875 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 876 877 if (!usecprow) z += 15; 878 } 879 880 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 881 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 882 ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr); 883 PetscFunctionReturn(0); 884 } 885 886 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ 887 888 #undef __FUNCT__ 889 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver4" 890 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz) 891 { 892 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 893 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 894 const PetscScalar *x,*xb; 895 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray; 896 const MatScalar *v; 897 PetscErrorCode ierr; 898 const PetscInt *ii,*ij=a->j,*idx; 899 PetscInt mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0; 900 PetscTruth usecprow=a->compressedrow.use; 901 902 PetscFunctionBegin; 903 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 904 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 905 906 v = a->a; 907 if (usecprow){ 908 mbs = a->compressedrow.nrows; 909 ii = a->compressedrow.i; 910 ridx = a->compressedrow.rindex; 911 } else { 912 mbs = a->mbs; 913 ii = a->i; 914 z = zarray; 915 } 916 917 for (i=0; i<mbs; i++) { 918 n = ii[i+1] - ii[i]; 919 idx = ij + ii[i]; 920 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 921 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 922 923 nonzerorow += (n>0); 924 for (j=0; j<n; j++) { 925 xb = x + 15*(idx[j]); 926 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 927 x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14]; 928 929 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 930 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 931 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 932 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 933 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 934 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 935 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 936 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 937 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 938 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 939 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 940 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 941 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 942 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 943 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 944 v += 225; 945 } 946 if (usecprow) z = zarray + 15*ridx[i]; 947 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 948 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 949 950 if (!usecprow) z += 15; 951 } 952 953 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 954 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 955 ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr); 956 PetscFunctionReturn(0); 957 } 958 959 960 /* 961 This will not work with MatScalar == float because it calls the BLAS 962 */ 963 #undef __FUNCT__ 964 #define __FUNCT__ "MatMult_SeqBAIJ_N" 965 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz) 966 { 967 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 968 PetscScalar *x,*z = 0,*xb,*work,*workt,*zarray; 969 MatScalar *v; 970 PetscErrorCode ierr; 971 PetscInt mbs=a->mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2; 972 PetscInt ncols,k,*ridx=PETSC_NULL,nonzerorow=0; 973 PetscTruth usecprow=a->compressedrow.use; 974 975 PetscFunctionBegin; 976 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 977 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 978 979 idx = a->j; 980 v = a->a; 981 if (usecprow){ 982 mbs = a->compressedrow.nrows; 983 ii = a->compressedrow.i; 984 ridx = a->compressedrow.rindex; 985 } else { 986 mbs = a->mbs; 987 ii = a->i; 988 z = zarray; 989 } 990 991 if (!a->mult_work) { 992 k = PetscMax(A->rmap->n,A->cmap->n); 993 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 994 } 995 work = a->mult_work; 996 for (i=0; i<mbs; i++) { 997 n = ii[1] - ii[0]; ii++; 998 ncols = n*bs; 999 workt = work; 1000 nonzerorow += (n>0); 1001 for (j=0; j<n; j++) { 1002 xb = x + bs*(*idx++); 1003 for (k=0; k<bs; k++) workt[k] = xb[k]; 1004 workt += bs; 1005 } 1006 if (usecprow) z = zarray + bs*ridx[i]; 1007 Kernel_w_gets_Ar_times_v(bs,ncols,work,v,z); 1008 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */ 1009 v += n*bs2; 1010 if (!usecprow) z += bs; 1011 } 1012 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1013 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1014 ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*nonzerorow);CHKERRQ(ierr); 1015 PetscFunctionReturn(0); 1016 } 1017 1018 extern PetscErrorCode VecCopy_Seq(Vec,Vec); 1019 #undef __FUNCT__ 1020 #define __FUNCT__ "MatMultAdd_SeqBAIJ_1" 1021 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 1022 { 1023 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1024 const PetscScalar *x; 1025 PetscScalar *y,*z,sum; 1026 const MatScalar *v; 1027 PetscErrorCode ierr; 1028 PetscInt mbs=a->mbs,i,n,*ridx=PETSC_NULL,nonzerorow=0; 1029 const PetscInt *idx,*ii; 1030 PetscTruth usecprow=a->compressedrow.use; 1031 1032 PetscFunctionBegin; 1033 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 1034 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 1035 if (zz != yy) { 1036 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1037 } else { 1038 z = y; 1039 } 1040 1041 idx = a->j; 1042 v = a->a; 1043 if (usecprow){ 1044 if (zz != yy){ 1045 ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1046 } 1047 mbs = a->compressedrow.nrows; 1048 ii = a->compressedrow.i; 1049 ridx = a->compressedrow.rindex; 1050 } else { 1051 ii = a->i; 1052 } 1053 1054 for (i=0; i<mbs; i++) { 1055 n = ii[1] - ii[0]; 1056 ii++; 1057 if (!usecprow){ 1058 nonzerorow += (n>0); 1059 sum = y[i]; 1060 } else { 1061 sum = y[ridx[i]]; 1062 } 1063 PetscSparseDensePlusDot(sum,x,v,idx,n); 1064 v += n; 1065 idx += n; 1066 if (usecprow){ 1067 z[ridx[i]] = sum; 1068 } else { 1069 z[i] = sum; 1070 } 1071 } 1072 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 1073 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 1074 if (zz != yy) { 1075 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1076 } 1077 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 1078 PetscFunctionReturn(0); 1079 } 1080 1081 #undef __FUNCT__ 1082 #define __FUNCT__ "MatMultAdd_SeqBAIJ_2" 1083 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 1084 { 1085 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1086 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2; 1087 PetscScalar x1,x2,*yarray,*zarray; 1088 MatScalar *v; 1089 PetscErrorCode ierr; 1090 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1091 PetscTruth usecprow=a->compressedrow.use; 1092 1093 PetscFunctionBegin; 1094 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1095 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1096 if (zz != yy) { 1097 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1098 } else { 1099 zarray = yarray; 1100 } 1101 1102 idx = a->j; 1103 v = a->a; 1104 if (usecprow){ 1105 if (zz != yy){ 1106 ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1107 } 1108 mbs = a->compressedrow.nrows; 1109 ii = a->compressedrow.i; 1110 ridx = a->compressedrow.rindex; 1111 if (zz != yy){ 1112 ierr = PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1113 } 1114 } else { 1115 ii = a->i; 1116 y = yarray; 1117 z = zarray; 1118 } 1119 1120 for (i=0; i<mbs; i++) { 1121 n = ii[1] - ii[0]; ii++; 1122 if (usecprow){ 1123 z = zarray + 2*ridx[i]; 1124 y = yarray + 2*ridx[i]; 1125 } 1126 sum1 = y[0]; sum2 = y[1]; 1127 for (j=0; j<n; j++) { 1128 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 1129 sum1 += v[0]*x1 + v[2]*x2; 1130 sum2 += v[1]*x1 + v[3]*x2; 1131 v += 4; 1132 } 1133 z[0] = sum1; z[1] = sum2; 1134 if (!usecprow){ 1135 z += 2; y += 2; 1136 } 1137 } 1138 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1139 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1140 if (zz != yy) { 1141 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1142 } 1143 ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr); 1144 PetscFunctionReturn(0); 1145 } 1146 1147 #undef __FUNCT__ 1148 #define __FUNCT__ "MatMultAdd_SeqBAIJ_3" 1149 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 1150 { 1151 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1152 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray; 1153 MatScalar *v; 1154 PetscErrorCode ierr; 1155 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1156 PetscTruth usecprow=a->compressedrow.use; 1157 1158 PetscFunctionBegin; 1159 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1160 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1161 if (zz != yy) { 1162 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1163 } else { 1164 zarray = yarray; 1165 } 1166 1167 idx = a->j; 1168 v = a->a; 1169 if (usecprow){ 1170 if (zz != yy){ 1171 ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1172 } 1173 mbs = a->compressedrow.nrows; 1174 ii = a->compressedrow.i; 1175 ridx = a->compressedrow.rindex; 1176 } else { 1177 ii = a->i; 1178 y = yarray; 1179 z = zarray; 1180 } 1181 1182 for (i=0; i<mbs; i++) { 1183 n = ii[1] - ii[0]; ii++; 1184 if (usecprow){ 1185 z = zarray + 3*ridx[i]; 1186 y = yarray + 3*ridx[i]; 1187 } 1188 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; 1189 for (j=0; j<n; j++) { 1190 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1191 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 1192 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 1193 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 1194 v += 9; 1195 } 1196 z[0] = sum1; z[1] = sum2; z[2] = sum3; 1197 if (!usecprow){ 1198 z += 3; y += 3; 1199 } 1200 } 1201 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1202 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1203 if (zz != yy) { 1204 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1205 } 1206 ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr); 1207 PetscFunctionReturn(0); 1208 } 1209 1210 #undef __FUNCT__ 1211 #define __FUNCT__ "MatMultAdd_SeqBAIJ_4" 1212 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 1213 { 1214 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1215 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray; 1216 MatScalar *v; 1217 PetscErrorCode ierr; 1218 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1219 PetscTruth usecprow=a->compressedrow.use; 1220 1221 PetscFunctionBegin; 1222 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1223 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1224 if (zz != yy) { 1225 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1226 } else { 1227 zarray = yarray; 1228 } 1229 1230 idx = a->j; 1231 v = a->a; 1232 if (usecprow){ 1233 if (zz != yy){ 1234 ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1235 } 1236 mbs = a->compressedrow.nrows; 1237 ii = a->compressedrow.i; 1238 ridx = a->compressedrow.rindex; 1239 } else { 1240 ii = a->i; 1241 y = yarray; 1242 z = zarray; 1243 } 1244 1245 for (i=0; i<mbs; i++) { 1246 n = ii[1] - ii[0]; ii++; 1247 if (usecprow){ 1248 z = zarray + 4*ridx[i]; 1249 y = yarray + 4*ridx[i]; 1250 } 1251 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; 1252 for (j=0; j<n; j++) { 1253 xb = x + 4*(*idx++); 1254 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1255 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1256 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1257 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1258 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1259 v += 16; 1260 } 1261 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 1262 if (!usecprow){ 1263 z += 4; y += 4; 1264 } 1265 } 1266 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1267 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1268 if (zz != yy) { 1269 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1270 } 1271 ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr); 1272 PetscFunctionReturn(0); 1273 } 1274 1275 #undef __FUNCT__ 1276 #define __FUNCT__ "MatMultAdd_SeqBAIJ_5" 1277 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 1278 { 1279 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1280 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5; 1281 PetscScalar *yarray,*zarray; 1282 MatScalar *v; 1283 PetscErrorCode ierr; 1284 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1285 PetscTruth usecprow=a->compressedrow.use; 1286 1287 PetscFunctionBegin; 1288 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1289 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1290 if (zz != yy) { 1291 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1292 } else { 1293 zarray = yarray; 1294 } 1295 1296 idx = a->j; 1297 v = a->a; 1298 if (usecprow){ 1299 if (zz != yy){ 1300 ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1301 } 1302 mbs = a->compressedrow.nrows; 1303 ii = a->compressedrow.i; 1304 ridx = a->compressedrow.rindex; 1305 } else { 1306 ii = a->i; 1307 y = yarray; 1308 z = zarray; 1309 } 1310 1311 for (i=0; i<mbs; i++) { 1312 n = ii[1] - ii[0]; ii++; 1313 if (usecprow){ 1314 z = zarray + 5*ridx[i]; 1315 y = yarray + 5*ridx[i]; 1316 } 1317 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; 1318 for (j=0; j<n; j++) { 1319 xb = x + 5*(*idx++); 1320 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 1321 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1322 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1323 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1324 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1325 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1326 v += 25; 1327 } 1328 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 1329 if (!usecprow){ 1330 z += 5; y += 5; 1331 } 1332 } 1333 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1334 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1335 if (zz != yy) { 1336 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1337 } 1338 ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr); 1339 PetscFunctionReturn(0); 1340 } 1341 #undef __FUNCT__ 1342 #define __FUNCT__ "MatMultAdd_SeqBAIJ_6" 1343 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 1344 { 1345 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1346 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6; 1347 PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray; 1348 MatScalar *v; 1349 PetscErrorCode ierr; 1350 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1351 PetscTruth usecprow=a->compressedrow.use; 1352 1353 PetscFunctionBegin; 1354 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1355 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1356 if (zz != yy) { 1357 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1358 } else { 1359 zarray = yarray; 1360 } 1361 1362 idx = a->j; 1363 v = a->a; 1364 if (usecprow){ 1365 if (zz != yy){ 1366 ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1367 } 1368 mbs = a->compressedrow.nrows; 1369 ii = a->compressedrow.i; 1370 ridx = a->compressedrow.rindex; 1371 } else { 1372 ii = a->i; 1373 y = yarray; 1374 z = zarray; 1375 } 1376 1377 for (i=0; i<mbs; i++) { 1378 n = ii[1] - ii[0]; ii++; 1379 if (usecprow){ 1380 z = zarray + 6*ridx[i]; 1381 y = yarray + 6*ridx[i]; 1382 } 1383 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; 1384 for (j=0; j<n; j++) { 1385 xb = x + 6*(*idx++); 1386 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 1387 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1388 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1389 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1390 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1391 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1392 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1393 v += 36; 1394 } 1395 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 1396 if (!usecprow){ 1397 z += 6; y += 6; 1398 } 1399 } 1400 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1401 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1402 if (zz != yy) { 1403 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1404 } 1405 ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr); 1406 PetscFunctionReturn(0); 1407 } 1408 1409 #undef __FUNCT__ 1410 #define __FUNCT__ "MatMultAdd_SeqBAIJ_7" 1411 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1412 { 1413 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1414 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 1415 PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray; 1416 MatScalar *v; 1417 PetscErrorCode ierr; 1418 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1419 PetscTruth usecprow=a->compressedrow.use; 1420 1421 PetscFunctionBegin; 1422 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1423 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1424 if (zz != yy) { 1425 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1426 } else { 1427 zarray = yarray; 1428 } 1429 1430 idx = a->j; 1431 v = a->a; 1432 if (usecprow){ 1433 if (zz != yy){ 1434 ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1435 } 1436 mbs = a->compressedrow.nrows; 1437 ii = a->compressedrow.i; 1438 ridx = a->compressedrow.rindex; 1439 } else { 1440 ii = a->i; 1441 y = yarray; 1442 z = zarray; 1443 } 1444 1445 for (i=0; i<mbs; i++) { 1446 n = ii[1] - ii[0]; ii++; 1447 if (usecprow){ 1448 z = zarray + 7*ridx[i]; 1449 y = yarray + 7*ridx[i]; 1450 } 1451 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 1452 for (j=0; j<n; j++) { 1453 xb = x + 7*(*idx++); 1454 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1455 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1456 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1457 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1458 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1459 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1460 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1461 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1462 v += 49; 1463 } 1464 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1465 if (!usecprow){ 1466 z += 7; y += 7; 1467 } 1468 } 1469 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1470 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1471 if (zz != yy) { 1472 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1473 } 1474 ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr); 1475 PetscFunctionReturn(0); 1476 } 1477 1478 #undef __FUNCT__ 1479 #define __FUNCT__ "MatMultAdd_SeqBAIJ_N" 1480 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1481 { 1482 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1483 PetscScalar *x,*z = 0,*xb,*work,*workt,*zarray; 1484 MatScalar *v; 1485 PetscErrorCode ierr; 1486 PetscInt mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2; 1487 PetscInt ncols,k,*ridx=PETSC_NULL; 1488 PetscTruth usecprow=a->compressedrow.use; 1489 1490 PetscFunctionBegin; 1491 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 1492 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1493 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1494 1495 idx = a->j; 1496 v = a->a; 1497 if (usecprow){ 1498 mbs = a->compressedrow.nrows; 1499 ii = a->compressedrow.i; 1500 ridx = a->compressedrow.rindex; 1501 } else { 1502 mbs = a->mbs; 1503 ii = a->i; 1504 z = zarray; 1505 } 1506 1507 if (!a->mult_work) { 1508 k = PetscMax(A->rmap->n,A->cmap->n); 1509 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1510 } 1511 work = a->mult_work; 1512 for (i=0; i<mbs; i++) { 1513 n = ii[1] - ii[0]; ii++; 1514 ncols = n*bs; 1515 workt = work; 1516 for (j=0; j<n; j++) { 1517 xb = x + bs*(*idx++); 1518 for (k=0; k<bs; k++) workt[k] = xb[k]; 1519 workt += bs; 1520 } 1521 if (usecprow) z = zarray + bs*ridx[i]; 1522 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1523 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */ 1524 v += n*bs2; 1525 if (!usecprow){ 1526 z += bs; 1527 } 1528 } 1529 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1530 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1531 ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr); 1532 PetscFunctionReturn(0); 1533 } 1534 1535 #undef __FUNCT__ 1536 #define __FUNCT__ "MatMultHermitianTranspose_SeqBAIJ" 1537 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1538 { 1539 PetscScalar zero = 0.0; 1540 PetscErrorCode ierr; 1541 1542 PetscFunctionBegin; 1543 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1544 ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1545 PetscFunctionReturn(0); 1546 } 1547 1548 #undef __FUNCT__ 1549 #define __FUNCT__ "MatMultTranspose_SeqBAIJ" 1550 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1551 { 1552 PetscScalar zero = 0.0; 1553 PetscErrorCode ierr; 1554 1555 PetscFunctionBegin; 1556 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1557 ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1558 PetscFunctionReturn(0); 1559 } 1560 1561 #undef __FUNCT__ 1562 #define __FUNCT__ "MatMultHermitianTransposeAdd_SeqBAIJ" 1563 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1564 1565 { 1566 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1567 PetscScalar *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5; 1568 MatScalar *v; 1569 PetscErrorCode ierr; 1570 PetscInt mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL; 1571 Mat_CompressedRow cprow = a->compressedrow; 1572 PetscTruth usecprow=cprow.use; 1573 1574 PetscFunctionBegin; 1575 if (yy != zz) { ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); } 1576 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1577 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1578 1579 idx = a->j; 1580 v = a->a; 1581 if (usecprow){ 1582 mbs = cprow.nrows; 1583 ii = cprow.i; 1584 ridx = cprow.rindex; 1585 } else { 1586 mbs=a->mbs; 1587 ii = a->i; 1588 xb = x; 1589 } 1590 1591 switch (bs) { 1592 case 1: 1593 for (i=0; i<mbs; i++) { 1594 if (usecprow) xb = x + ridx[i]; 1595 x1 = xb[0]; 1596 ib = idx + ii[0]; 1597 n = ii[1] - ii[0]; ii++; 1598 for (j=0; j<n; j++) { 1599 rval = ib[j]; 1600 z[rval] += PetscConj(*v) * x1; 1601 v++; 1602 } 1603 if (!usecprow) xb++; 1604 } 1605 break; 1606 case 2: 1607 for (i=0; i<mbs; i++) { 1608 if (usecprow) xb = x + 2*ridx[i]; 1609 x1 = xb[0]; x2 = xb[1]; 1610 ib = idx + ii[0]; 1611 n = ii[1] - ii[0]; ii++; 1612 for (j=0; j<n; j++) { 1613 rval = ib[j]*2; 1614 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2; 1615 z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2; 1616 v += 4; 1617 } 1618 if (!usecprow) xb += 2; 1619 } 1620 break; 1621 case 3: 1622 for (i=0; i<mbs; i++) { 1623 if (usecprow) xb = x + 3*ridx[i]; 1624 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1625 ib = idx + ii[0]; 1626 n = ii[1] - ii[0]; ii++; 1627 for (j=0; j<n; j++) { 1628 rval = ib[j]*3; 1629 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3; 1630 z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3; 1631 z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3; 1632 v += 9; 1633 } 1634 if (!usecprow) xb += 3; 1635 } 1636 break; 1637 case 4: 1638 for (i=0; i<mbs; i++) { 1639 if (usecprow) xb = x + 4*ridx[i]; 1640 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1641 ib = idx + ii[0]; 1642 n = ii[1] - ii[0]; ii++; 1643 for (j=0; j<n; j++) { 1644 rval = ib[j]*4; 1645 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4; 1646 z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4; 1647 z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4; 1648 z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4; 1649 v += 16; 1650 } 1651 if (!usecprow) xb += 4; 1652 } 1653 break; 1654 case 5: 1655 for (i=0; i<mbs; i++) { 1656 if (usecprow) xb = x + 5*ridx[i]; 1657 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1658 x4 = xb[3]; x5 = xb[4]; 1659 ib = idx + ii[0]; 1660 n = ii[1] - ii[0]; ii++; 1661 for (j=0; j<n; j++) { 1662 rval = ib[j]*5; 1663 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5; 1664 z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5; 1665 z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5; 1666 z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5; 1667 z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5; 1668 v += 25; 1669 } 1670 if (!usecprow) xb += 5; 1671 } 1672 break; 1673 default: { /* block sizes larger than 5 by 5 are handled by BLAS */ 1674 PetscInt ncols,k; 1675 PetscScalar *work,*workt,*xtmp; 1676 1677 SETERRQ(PETSC_ERR_SUP,"block size larger than 5 is not supported yet"); 1678 if (!a->mult_work) { 1679 k = PetscMax(A->rmap->n,A->cmap->n); 1680 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1681 } 1682 work = a->mult_work; 1683 xtmp = x; 1684 for (i=0; i<mbs; i++) { 1685 n = ii[1] - ii[0]; ii++; 1686 ncols = n*bs; 1687 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1688 if (usecprow) { 1689 xtmp = x + bs*ridx[i]; 1690 } 1691 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 1692 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 1693 v += n*bs2; 1694 if (!usecprow) xtmp += bs; 1695 workt = work; 1696 for (j=0; j<n; j++) { 1697 zb = z + bs*(*idx++); 1698 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1699 workt += bs; 1700 } 1701 } 1702 } 1703 } 1704 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1705 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1706 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 1707 PetscFunctionReturn(0); 1708 } 1709 1710 #undef __FUNCT__ 1711 #define __FUNCT__ "MatMultTransposeAdd_SeqBAIJ" 1712 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1713 { 1714 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1715 PetscScalar *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5; 1716 MatScalar *v; 1717 PetscErrorCode ierr; 1718 PetscInt mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL; 1719 Mat_CompressedRow cprow = a->compressedrow; 1720 PetscTruth usecprow=cprow.use; 1721 1722 PetscFunctionBegin; 1723 if (yy != zz) { ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); } 1724 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1725 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1726 1727 idx = a->j; 1728 v = a->a; 1729 if (usecprow){ 1730 mbs = cprow.nrows; 1731 ii = cprow.i; 1732 ridx = cprow.rindex; 1733 } else { 1734 mbs=a->mbs; 1735 ii = a->i; 1736 xb = x; 1737 } 1738 1739 switch (bs) { 1740 case 1: 1741 for (i=0; i<mbs; i++) { 1742 if (usecprow) xb = x + ridx[i]; 1743 x1 = xb[0]; 1744 ib = idx + ii[0]; 1745 n = ii[1] - ii[0]; ii++; 1746 for (j=0; j<n; j++) { 1747 rval = ib[j]; 1748 z[rval] += *v * x1; 1749 v++; 1750 } 1751 if (!usecprow) xb++; 1752 } 1753 break; 1754 case 2: 1755 for (i=0; i<mbs; i++) { 1756 if (usecprow) xb = x + 2*ridx[i]; 1757 x1 = xb[0]; x2 = xb[1]; 1758 ib = idx + ii[0]; 1759 n = ii[1] - ii[0]; ii++; 1760 for (j=0; j<n; j++) { 1761 rval = ib[j]*2; 1762 z[rval++] += v[0]*x1 + v[1]*x2; 1763 z[rval++] += v[2]*x1 + v[3]*x2; 1764 v += 4; 1765 } 1766 if (!usecprow) xb += 2; 1767 } 1768 break; 1769 case 3: 1770 for (i=0; i<mbs; i++) { 1771 if (usecprow) xb = x + 3*ridx[i]; 1772 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1773 ib = idx + ii[0]; 1774 n = ii[1] - ii[0]; ii++; 1775 for (j=0; j<n; j++) { 1776 rval = ib[j]*3; 1777 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3; 1778 z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3; 1779 z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3; 1780 v += 9; 1781 } 1782 if (!usecprow) xb += 3; 1783 } 1784 break; 1785 case 4: 1786 for (i=0; i<mbs; i++) { 1787 if (usecprow) xb = x + 4*ridx[i]; 1788 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1789 ib = idx + ii[0]; 1790 n = ii[1] - ii[0]; ii++; 1791 for (j=0; j<n; j++) { 1792 rval = ib[j]*4; 1793 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 1794 z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 1795 z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 1796 z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 1797 v += 16; 1798 } 1799 if (!usecprow) xb += 4; 1800 } 1801 break; 1802 case 5: 1803 for (i=0; i<mbs; i++) { 1804 if (usecprow) xb = x + 5*ridx[i]; 1805 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1806 x4 = xb[3]; x5 = xb[4]; 1807 ib = idx + ii[0]; 1808 n = ii[1] - ii[0]; ii++; 1809 for (j=0; j<n; j++) { 1810 rval = ib[j]*5; 1811 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1812 z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1813 z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 1814 z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 1815 z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 1816 v += 25; 1817 } 1818 if (!usecprow) xb += 5; 1819 } 1820 break; 1821 default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 1822 PetscInt ncols,k; 1823 PetscScalar *work,*workt,*xtmp; 1824 1825 if (!a->mult_work) { 1826 k = PetscMax(A->rmap->n,A->cmap->n); 1827 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1828 } 1829 work = a->mult_work; 1830 xtmp = x; 1831 for (i=0; i<mbs; i++) { 1832 n = ii[1] - ii[0]; ii++; 1833 ncols = n*bs; 1834 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1835 if (usecprow) { 1836 xtmp = x + bs*ridx[i]; 1837 } 1838 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 1839 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 1840 v += n*bs2; 1841 if (!usecprow) xtmp += bs; 1842 workt = work; 1843 for (j=0; j<n; j++) { 1844 zb = z + bs*(*idx++); 1845 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1846 workt += bs; 1847 } 1848 } 1849 } 1850 } 1851 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1852 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1853 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 1854 PetscFunctionReturn(0); 1855 } 1856 1857 #undef __FUNCT__ 1858 #define __FUNCT__ "MatScale_SeqBAIJ" 1859 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha) 1860 { 1861 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data; 1862 PetscInt totalnz = a->bs2*a->nz; 1863 PetscScalar oalpha = alpha; 1864 PetscErrorCode ierr; 1865 PetscBLASInt one = 1,tnz = PetscBLASIntCast(totalnz); 1866 1867 PetscFunctionBegin; 1868 BLASscal_(&tnz,&oalpha,a->a,&one); 1869 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 1870 PetscFunctionReturn(0); 1871 } 1872 1873 #undef __FUNCT__ 1874 #define __FUNCT__ "MatNorm_SeqBAIJ" 1875 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm) 1876 { 1877 PetscErrorCode ierr; 1878 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1879 MatScalar *v = a->a; 1880 PetscReal sum = 0.0; 1881 PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1; 1882 1883 PetscFunctionBegin; 1884 if (type == NORM_FROBENIUS) { 1885 for (i=0; i< bs2*nz; i++) { 1886 #if defined(PETSC_USE_COMPLEX) 1887 sum += PetscRealPart(PetscConj(*v)*(*v)); v++; 1888 #else 1889 sum += (*v)*(*v); v++; 1890 #endif 1891 } 1892 *norm = sqrt(sum); 1893 } else if (type == NORM_1) { /* maximum column sum */ 1894 PetscReal *tmp; 1895 PetscInt *bcol = a->j; 1896 ierr = PetscMalloc((A->cmap->n+1)*sizeof(PetscReal),&tmp);CHKERRQ(ierr); 1897 ierr = PetscMemzero(tmp,A->cmap->n*sizeof(PetscReal));CHKERRQ(ierr); 1898 for (i=0; i<nz; i++){ 1899 for (j=0; j<bs; j++){ 1900 k1 = bs*(*bcol) + j; /* column index */ 1901 for (k=0; k<bs; k++){ 1902 tmp[k1] += PetscAbsScalar(*v); v++; 1903 } 1904 } 1905 bcol++; 1906 } 1907 *norm = 0.0; 1908 for (j=0; j<A->cmap->n; j++) { 1909 if (tmp[j] > *norm) *norm = tmp[j]; 1910 } 1911 ierr = PetscFree(tmp);CHKERRQ(ierr); 1912 } else if (type == NORM_INFINITY) { /* maximum row sum */ 1913 *norm = 0.0; 1914 for (k=0; k<bs; k++) { 1915 for (j=0; j<a->mbs; j++) { 1916 v = a->a + bs2*a->i[j] + k; 1917 sum = 0.0; 1918 for (i=0; i<a->i[j+1]-a->i[j]; i++) { 1919 for (k1=0; k1<bs; k1++){ 1920 sum += PetscAbsScalar(*v); 1921 v += bs; 1922 } 1923 } 1924 if (sum > *norm) *norm = sum; 1925 } 1926 } 1927 } else { 1928 SETERRQ(PETSC_ERR_SUP,"No support for this norm yet"); 1929 } 1930 PetscFunctionReturn(0); 1931 } 1932 1933 1934 #undef __FUNCT__ 1935 #define __FUNCT__ "MatEqual_SeqBAIJ" 1936 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscTruth* flg) 1937 { 1938 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data,*b = (Mat_SeqBAIJ *)B->data; 1939 PetscErrorCode ierr; 1940 1941 PetscFunctionBegin; 1942 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1943 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1944 *flg = PETSC_FALSE; 1945 PetscFunctionReturn(0); 1946 } 1947 1948 /* if the a->i are the same */ 1949 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1950 if (!*flg) { 1951 PetscFunctionReturn(0); 1952 } 1953 1954 /* if a->j are the same */ 1955 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1956 if (!*flg) { 1957 PetscFunctionReturn(0); 1958 } 1959 /* if a->a are the same */ 1960 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1961 PetscFunctionReturn(0); 1962 1963 } 1964 1965 #undef __FUNCT__ 1966 #define __FUNCT__ "MatGetDiagonal_SeqBAIJ" 1967 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v) 1968 { 1969 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1970 PetscErrorCode ierr; 1971 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 1972 PetscScalar *x,zero = 0.0; 1973 MatScalar *aa,*aa_j; 1974 1975 PetscFunctionBegin; 1976 if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1977 bs = A->rmap->bs; 1978 aa = a->a; 1979 ai = a->i; 1980 aj = a->j; 1981 ambs = a->mbs; 1982 bs2 = a->bs2; 1983 1984 ierr = VecSet(v,zero);CHKERRQ(ierr); 1985 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1986 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1987 if (n != A->rmap->N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1988 for (i=0; i<ambs; i++) { 1989 for (j=ai[i]; j<ai[i+1]; j++) { 1990 if (aj[j] == i) { 1991 row = i*bs; 1992 aa_j = aa+j*bs2; 1993 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1994 break; 1995 } 1996 } 1997 } 1998 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1999 PetscFunctionReturn(0); 2000 } 2001 2002 #undef __FUNCT__ 2003 #define __FUNCT__ "MatDiagonalScale_SeqBAIJ" 2004 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr) 2005 { 2006 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2007 PetscScalar *l,*r,x,*li,*ri; 2008 MatScalar *aa,*v; 2009 PetscErrorCode ierr; 2010 PetscInt i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2; 2011 2012 PetscFunctionBegin; 2013 ai = a->i; 2014 aj = a->j; 2015 aa = a->a; 2016 m = A->rmap->n; 2017 n = A->cmap->n; 2018 bs = A->rmap->bs; 2019 mbs = a->mbs; 2020 bs2 = a->bs2; 2021 if (ll) { 2022 ierr = VecGetArray(ll,&l);CHKERRQ(ierr); 2023 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 2024 if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 2025 for (i=0; i<mbs; i++) { /* for each block row */ 2026 M = ai[i+1] - ai[i]; 2027 li = l + i*bs; 2028 v = aa + bs2*ai[i]; 2029 for (j=0; j<M; j++) { /* for each block */ 2030 for (k=0; k<bs2; k++) { 2031 (*v++) *= li[k%bs]; 2032 } 2033 } 2034 } 2035 ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr); 2036 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2037 } 2038 2039 if (rr) { 2040 ierr = VecGetArray(rr,&r);CHKERRQ(ierr); 2041 ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr); 2042 if (rn != n) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 2043 for (i=0; i<mbs; i++) { /* for each block row */ 2044 M = ai[i+1] - ai[i]; 2045 v = aa + bs2*ai[i]; 2046 for (j=0; j<M; j++) { /* for each block */ 2047 ri = r + bs*aj[ai[i]+j]; 2048 for (k=0; k<bs; k++) { 2049 x = ri[k]; 2050 for (tmp=0; tmp<bs; tmp++) (*v++) *= x; 2051 } 2052 } 2053 } 2054 ierr = VecRestoreArray(rr,&r);CHKERRQ(ierr); 2055 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2056 } 2057 PetscFunctionReturn(0); 2058 } 2059 2060 2061 #undef __FUNCT__ 2062 #define __FUNCT__ "MatGetInfo_SeqBAIJ" 2063 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info) 2064 { 2065 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2066 2067 PetscFunctionBegin; 2068 info->block_size = a->bs2; 2069 info->nz_allocated = a->maxnz; 2070 info->nz_used = a->bs2*a->nz; 2071 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 2072 info->assemblies = A->num_ass; 2073 info->mallocs = a->reallocs; 2074 info->memory = ((PetscObject)A)->mem; 2075 if (A->factor) { 2076 info->fill_ratio_given = A->info.fill_ratio_given; 2077 info->fill_ratio_needed = A->info.fill_ratio_needed; 2078 info->factor_mallocs = A->info.factor_mallocs; 2079 } else { 2080 info->fill_ratio_given = 0; 2081 info->fill_ratio_needed = 0; 2082 info->factor_mallocs = 0; 2083 } 2084 PetscFunctionReturn(0); 2085 } 2086 2087 2088 #undef __FUNCT__ 2089 #define __FUNCT__ "MatZeroEntries_SeqBAIJ" 2090 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 2091 { 2092 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2093 PetscErrorCode ierr; 2094 2095 PetscFunctionBegin; 2096 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 2097 PetscFunctionReturn(0); 2098 } 2099