1 2 #include <../src/mat/impls/baij/seq/baij.h> 3 #include <petsc-private/kernels/blockinvert.h> 4 #include <petscbt.h> 5 #include <petscblaslapack.h> 6 7 #undef __FUNCT__ 8 #define __FUNCT__ "MatIncreaseOverlap_SeqBAIJ" 9 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 10 { 11 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 12 PetscErrorCode ierr; 13 PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival; 14 const PetscInt *idx; 15 PetscInt start,end,*ai,*aj,bs,*nidx2; 16 PetscBT table; 17 18 PetscFunctionBegin; 19 m = a->mbs; 20 ai = a->i; 21 aj = a->j; 22 bs = A->rmap->bs; 23 24 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 25 26 ierr = PetscBTCreate(m,&table);CHKERRQ(ierr); 27 ierr = PetscMalloc((m+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr); 28 ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr); 29 30 for (i=0; i<is_max; i++) { 31 /* Initialise the two local arrays */ 32 isz = 0; 33 ierr = PetscBTMemzero(m,table);CHKERRQ(ierr); 34 35 /* Extract the indices, assume there can be duplicate entries */ 36 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 37 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 38 39 /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 40 for (j=0; j<n; ++j) { 41 ival = idx[j]/bs; /* convert the indices into block indices */ 42 if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 43 if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival; 44 } 45 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 46 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 47 48 k = 0; 49 for (j=0; j<ov; j++) { /* for each overlap*/ 50 n = isz; 51 for (; k<n; k++) { /* do only those rows in nidx[k], which are not done yet */ 52 row = nidx[k]; 53 start = ai[row]; 54 end = ai[row+1]; 55 for (l = start; l<end; l++) { 56 val = aj[l]; 57 if (!PetscBTLookupSet(table,val)) nidx[isz++] = val; 58 } 59 } 60 } 61 /* expand the Index Set */ 62 for (j=0; j<isz; j++) { 63 for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k; 64 } 65 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 66 } 67 ierr = PetscBTDestroy(&table);CHKERRQ(ierr); 68 ierr = PetscFree(nidx);CHKERRQ(ierr); 69 ierr = PetscFree(nidx2);CHKERRQ(ierr); 70 PetscFunctionReturn(0); 71 } 72 73 #undef __FUNCT__ 74 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ_Private" 75 PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 76 { 77 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c; 78 PetscErrorCode ierr; 79 PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 80 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 81 const PetscInt *irow,*icol; 82 PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 83 PetscInt *aj = a->j,*ai = a->i; 84 MatScalar *mat_a; 85 Mat C; 86 PetscBool flag,sorted; 87 88 PetscFunctionBegin; 89 ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr); 90 if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 91 92 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 93 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 94 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 95 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 96 97 ierr = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr); 98 ssmap = smap; 99 ierr = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr); 100 ierr = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr); 101 for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 102 /* determine lens of each row */ 103 for (i=0; i<nrows; i++) { 104 kstart = ai[irow[i]]; 105 kend = kstart + a->ilen[irow[i]]; 106 lens[i] = 0; 107 for (k=kstart; k<kend; k++) { 108 if (ssmap[aj[k]]) lens[i]++; 109 } 110 } 111 /* Create and fill new matrix */ 112 if (scall == MAT_REUSE_MATRIX) { 113 c = (Mat_SeqBAIJ*)((*B)->data); 114 115 if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 116 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 117 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 118 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 119 C = *B; 120 } else { 121 ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr); 122 ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 123 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 124 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);CHKERRQ(ierr); 125 } 126 c = (Mat_SeqBAIJ*)(C->data); 127 for (i=0; i<nrows; i++) { 128 row = irow[i]; 129 kstart = ai[row]; 130 kend = kstart + a->ilen[row]; 131 mat_i = c->i[i]; 132 mat_j = c->j + mat_i; 133 mat_a = c->a + mat_i*bs2; 134 mat_ilen = c->ilen + i; 135 for (k=kstart; k<kend; k++) { 136 if ((tcol=ssmap[a->j[k]])) { 137 *mat_j++ = tcol - 1; 138 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 139 mat_a += bs2; 140 (*mat_ilen)++; 141 } 142 } 143 } 144 145 /* Free work space */ 146 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 147 ierr = PetscFree(smap);CHKERRQ(ierr); 148 ierr = PetscFree(lens);CHKERRQ(ierr); 149 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 150 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 151 152 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 153 *B = C; 154 PetscFunctionReturn(0); 155 } 156 157 #undef __FUNCT__ 158 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ" 159 PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 160 { 161 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 162 IS is1,is2; 163 PetscErrorCode ierr; 164 PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count; 165 const PetscInt *irow,*icol; 166 167 PetscFunctionBegin; 168 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 169 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 170 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 171 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 172 173 /* Verify if the indices corespond to each element in a block 174 and form the IS with compressed IS */ 175 ierr = PetscMalloc2(a->mbs,PetscInt,&vary,a->mbs,PetscInt,&iary);CHKERRQ(ierr); 176 ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr); 177 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 178 count = 0; 179 for (i=0; i<a->mbs; i++) { 180 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 181 if (vary[i]==bs) iary[count++] = i; 182 } 183 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr); 184 185 ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr); 186 for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 187 count = 0; 188 for (i=0; i<a->mbs; i++) { 189 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc"); 190 if (vary[i]==bs) iary[count++] = i; 191 } 192 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr); 193 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 194 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 195 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 196 197 ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr); 198 ierr = ISDestroy(&is1);CHKERRQ(ierr); 199 ierr = ISDestroy(&is2);CHKERRQ(ierr); 200 PetscFunctionReturn(0); 201 } 202 203 #undef __FUNCT__ 204 #define __FUNCT__ "MatGetSubMatrices_SeqBAIJ" 205 PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 206 { 207 PetscErrorCode ierr; 208 PetscInt i; 209 210 PetscFunctionBegin; 211 if (scall == MAT_INITIAL_MATRIX) { 212 ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr); 213 } 214 215 for (i=0; i<n; i++) { 216 ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 217 } 218 PetscFunctionReturn(0); 219 } 220 221 222 /* -------------------------------------------------------*/ 223 /* Should check that shapes of vectors and matrices match */ 224 /* -------------------------------------------------------*/ 225 226 #undef __FUNCT__ 227 #define __FUNCT__ "MatMult_SeqBAIJ_1" 228 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz) 229 { 230 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 231 PetscScalar *z,sum; 232 const PetscScalar *x; 233 const MatScalar *v; 234 PetscErrorCode ierr; 235 PetscInt mbs,i,n; 236 const PetscInt *idx,*ii,*ridx=NULL; 237 PetscBool usecprow=a->compressedrow.use; 238 239 PetscFunctionBegin; 240 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 241 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 242 243 if (usecprow) { 244 mbs = a->compressedrow.nrows; 245 ii = a->compressedrow.i; 246 ridx = a->compressedrow.rindex; 247 ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 248 } else { 249 mbs = a->mbs; 250 ii = a->i; 251 } 252 253 for (i=0; i<mbs; i++) { 254 n = ii[1] - ii[0]; 255 v = a->a + ii[0]; 256 idx = a->j + ii[0]; 257 ii++; 258 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 259 PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 260 sum = 0.0; 261 PetscSparseDensePlusDot(sum,x,v,idx,n); 262 if (usecprow) { 263 z[ridx[i]] = sum; 264 } else { 265 z[i] = sum; 266 } 267 } 268 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 269 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 270 ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr); 271 PetscFunctionReturn(0); 272 } 273 274 #undef __FUNCT__ 275 #define __FUNCT__ "MatMult_SeqBAIJ_2" 276 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) 277 { 278 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 279 PetscScalar *z = 0,sum1,sum2,*zarray; 280 const PetscScalar *x,*xb; 281 PetscScalar x1,x2; 282 const MatScalar *v; 283 PetscErrorCode ierr; 284 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 285 PetscBool usecprow=a->compressedrow.use; 286 287 PetscFunctionBegin; 288 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 289 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 290 291 idx = a->j; 292 v = a->a; 293 if (usecprow) { 294 mbs = a->compressedrow.nrows; 295 ii = a->compressedrow.i; 296 ridx = a->compressedrow.rindex; 297 } else { 298 mbs = a->mbs; 299 ii = a->i; 300 z = zarray; 301 } 302 303 for (i=0; i<mbs; i++) { 304 n = ii[1] - ii[0]; ii++; 305 sum1 = 0.0; sum2 = 0.0; 306 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 307 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 308 for (j=0; j<n; j++) { 309 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 310 sum1 += v[0]*x1 + v[2]*x2; 311 sum2 += v[1]*x1 + v[3]*x2; 312 v += 4; 313 } 314 if (usecprow) z = zarray + 2*ridx[i]; 315 z[0] = sum1; z[1] = sum2; 316 if (!usecprow) z += 2; 317 } 318 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 319 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 320 ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr); 321 PetscFunctionReturn(0); 322 } 323 324 #undef __FUNCT__ 325 #define __FUNCT__ "MatMult_SeqBAIJ_3" 326 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) 327 { 328 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 329 PetscScalar *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray; 330 const PetscScalar *x,*xb; 331 const MatScalar *v; 332 PetscErrorCode ierr; 333 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 334 PetscBool usecprow=a->compressedrow.use; 335 336 337 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 338 #pragma disjoint(*v,*z,*xb) 339 #endif 340 341 PetscFunctionBegin; 342 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 343 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 344 345 idx = a->j; 346 v = a->a; 347 if (usecprow) { 348 mbs = a->compressedrow.nrows; 349 ii = a->compressedrow.i; 350 ridx = a->compressedrow.rindex; 351 } else { 352 mbs = a->mbs; 353 ii = a->i; 354 z = zarray; 355 } 356 357 for (i=0; i<mbs; i++) { 358 n = ii[1] - ii[0]; ii++; 359 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 360 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 361 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 362 for (j=0; j<n; j++) { 363 xb = x + 3*(*idx++); 364 x1 = xb[0]; 365 x2 = xb[1]; 366 x3 = xb[2]; 367 368 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 369 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 370 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 371 v += 9; 372 } 373 if (usecprow) z = zarray + 3*ridx[i]; 374 z[0] = sum1; z[1] = sum2; z[2] = sum3; 375 if (!usecprow) z += 3; 376 } 377 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 378 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 379 ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr); 380 PetscFunctionReturn(0); 381 } 382 383 #undef __FUNCT__ 384 #define __FUNCT__ "MatMult_SeqBAIJ_4" 385 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) 386 { 387 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 388 PetscScalar *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray; 389 const PetscScalar *x,*xb; 390 const MatScalar *v; 391 PetscErrorCode ierr; 392 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 393 PetscBool usecprow=a->compressedrow.use; 394 395 PetscFunctionBegin; 396 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 397 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 398 399 idx = a->j; 400 v = a->a; 401 if (usecprow) { 402 mbs = a->compressedrow.nrows; 403 ii = a->compressedrow.i; 404 ridx = a->compressedrow.rindex; 405 } else { 406 mbs = a->mbs; 407 ii = a->i; 408 z = zarray; 409 } 410 411 for (i=0; i<mbs; i++) { 412 n = ii[1] - ii[0]; 413 ii++; 414 sum1 = 0.0; 415 sum2 = 0.0; 416 sum3 = 0.0; 417 sum4 = 0.0; 418 419 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 420 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 421 for (j=0; j<n; j++) { 422 xb = x + 4*(*idx++); 423 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 424 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 425 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 426 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 427 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 428 v += 16; 429 } 430 if (usecprow) z = zarray + 4*ridx[i]; 431 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 432 if (!usecprow) z += 4; 433 } 434 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 435 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 436 ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr); 437 PetscFunctionReturn(0); 438 } 439 440 #undef __FUNCT__ 441 #define __FUNCT__ "MatMult_SeqBAIJ_5" 442 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) 443 { 444 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 445 PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray; 446 const PetscScalar *xb,*x; 447 const MatScalar *v; 448 PetscErrorCode ierr; 449 const PetscInt *idx,*ii,*ridx=NULL; 450 PetscInt mbs,i,j,n; 451 PetscBool usecprow=a->compressedrow.use; 452 453 PetscFunctionBegin; 454 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 455 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 456 457 idx = a->j; 458 v = a->a; 459 if (usecprow) { 460 mbs = a->compressedrow.nrows; 461 ii = a->compressedrow.i; 462 ridx = a->compressedrow.rindex; 463 } else { 464 mbs = a->mbs; 465 ii = a->i; 466 z = zarray; 467 } 468 469 for (i=0; i<mbs; i++) { 470 n = ii[1] - ii[0]; ii++; 471 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 472 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 473 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 474 for (j=0; j<n; j++) { 475 xb = x + 5*(*idx++); 476 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 477 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 478 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 479 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 480 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 481 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 482 v += 25; 483 } 484 if (usecprow) z = zarray + 5*ridx[i]; 485 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 486 if (!usecprow) z += 5; 487 } 488 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 489 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 490 ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr); 491 PetscFunctionReturn(0); 492 } 493 494 495 #undef __FUNCT__ 496 #define __FUNCT__ "MatMult_SeqBAIJ_6" 497 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz) 498 { 499 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 500 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6; 501 const PetscScalar *x,*xb; 502 PetscScalar x1,x2,x3,x4,x5,x6,*zarray; 503 const MatScalar *v; 504 PetscErrorCode ierr; 505 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 506 PetscBool usecprow=a->compressedrow.use; 507 508 PetscFunctionBegin; 509 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 510 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 511 512 idx = a->j; 513 v = a->a; 514 if (usecprow) { 515 mbs = a->compressedrow.nrows; 516 ii = a->compressedrow.i; 517 ridx = a->compressedrow.rindex; 518 } else { 519 mbs = a->mbs; 520 ii = a->i; 521 z = zarray; 522 } 523 524 for (i=0; i<mbs; i++) { 525 n = ii[1] - ii[0]; 526 ii++; 527 sum1 = 0.0; 528 sum2 = 0.0; 529 sum3 = 0.0; 530 sum4 = 0.0; 531 sum5 = 0.0; 532 sum6 = 0.0; 533 534 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 535 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 536 for (j=0; j<n; j++) { 537 xb = x + 6*(*idx++); 538 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 539 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 540 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 541 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 542 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 543 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 544 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 545 v += 36; 546 } 547 if (usecprow) z = zarray + 6*ridx[i]; 548 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 549 if (!usecprow) z += 6; 550 } 551 552 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 553 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 554 ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr); 555 PetscFunctionReturn(0); 556 } 557 558 #undef __FUNCT__ 559 #define __FUNCT__ "MatMult_SeqBAIJ_7" 560 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) 561 { 562 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 563 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 564 const PetscScalar *x,*xb; 565 PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray; 566 const MatScalar *v; 567 PetscErrorCode ierr; 568 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 569 PetscBool usecprow=a->compressedrow.use; 570 571 PetscFunctionBegin; 572 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 573 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 574 575 idx = a->j; 576 v = a->a; 577 if (usecprow) { 578 mbs = a->compressedrow.nrows; 579 ii = a->compressedrow.i; 580 ridx = a->compressedrow.rindex; 581 } else { 582 mbs = a->mbs; 583 ii = a->i; 584 z = zarray; 585 } 586 587 for (i=0; i<mbs; i++) { 588 n = ii[1] - ii[0]; 589 ii++; 590 sum1 = 0.0; 591 sum2 = 0.0; 592 sum3 = 0.0; 593 sum4 = 0.0; 594 sum5 = 0.0; 595 sum6 = 0.0; 596 sum7 = 0.0; 597 598 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 599 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 600 for (j=0; j<n; j++) { 601 xb = x + 7*(*idx++); 602 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 603 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 604 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 605 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 606 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 607 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 608 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 609 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 610 v += 49; 611 } 612 if (usecprow) z = zarray + 7*ridx[i]; 613 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 614 if (!usecprow) z += 7; 615 } 616 617 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 618 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 619 ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr); 620 PetscFunctionReturn(0); 621 } 622 623 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 624 /* Default MatMult for block size 15 */ 625 626 #undef __FUNCT__ 627 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver1" 628 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz) 629 { 630 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 631 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 632 const PetscScalar *x,*xb; 633 PetscScalar *zarray,xv; 634 const MatScalar *v; 635 PetscErrorCode ierr; 636 const PetscInt *ii,*ij=a->j,*idx; 637 PetscInt mbs,i,j,k,n,*ridx=NULL; 638 PetscBool usecprow=a->compressedrow.use; 639 640 PetscFunctionBegin; 641 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 642 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 643 644 v = a->a; 645 if (usecprow) { 646 mbs = a->compressedrow.nrows; 647 ii = a->compressedrow.i; 648 ridx = a->compressedrow.rindex; 649 } else { 650 mbs = a->mbs; 651 ii = a->i; 652 z = zarray; 653 } 654 655 for (i=0; i<mbs; i++) { 656 n = ii[i+1] - ii[i]; 657 idx = ij + ii[i]; 658 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 659 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 660 661 for (j=0; j<n; j++) { 662 xb = x + 15*(idx[j]); 663 664 for (k=0; k<15; k++) { 665 xv = xb[k]; 666 sum1 += v[0]*xv; 667 sum2 += v[1]*xv; 668 sum3 += v[2]*xv; 669 sum4 += v[3]*xv; 670 sum5 += v[4]*xv; 671 sum6 += v[5]*xv; 672 sum7 += v[6]*xv; 673 sum8 += v[7]*xv; 674 sum9 += v[8]*xv; 675 sum10 += v[9]*xv; 676 sum11 += v[10]*xv; 677 sum12 += v[11]*xv; 678 sum13 += v[12]*xv; 679 sum14 += v[13]*xv; 680 sum15 += v[14]*xv; 681 v += 15; 682 } 683 } 684 if (usecprow) z = zarray + 15*ridx[i]; 685 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 686 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 687 688 if (!usecprow) z += 15; 689 } 690 691 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 692 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 693 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 694 PetscFunctionReturn(0); 695 } 696 697 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 698 #undef __FUNCT__ 699 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver2" 700 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz) 701 { 702 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 703 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 704 const PetscScalar *x,*xb; 705 PetscScalar x1,x2,x3,x4,*zarray; 706 const MatScalar *v; 707 PetscErrorCode ierr; 708 const PetscInt *ii,*ij=a->j,*idx; 709 PetscInt mbs,i,j,n,*ridx=NULL; 710 PetscBool usecprow=a->compressedrow.use; 711 712 PetscFunctionBegin; 713 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 714 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 715 716 v = a->a; 717 if (usecprow) { 718 mbs = a->compressedrow.nrows; 719 ii = a->compressedrow.i; 720 ridx = a->compressedrow.rindex; 721 } else { 722 mbs = a->mbs; 723 ii = a->i; 724 z = zarray; 725 } 726 727 for (i=0; i<mbs; i++) { 728 n = ii[i+1] - ii[i]; 729 idx = ij + ii[i]; 730 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 731 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 732 733 for (j=0; j<n; j++) { 734 xb = x + 15*(idx[j]); 735 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 736 737 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 738 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 739 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 740 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 741 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 742 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 743 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 744 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 745 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 746 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 747 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 748 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 749 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 750 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 751 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 752 753 v += 60; 754 755 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 756 757 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 758 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 759 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 760 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 761 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 762 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 763 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 764 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 765 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 766 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 767 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 768 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 769 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 770 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 771 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 772 v += 60; 773 774 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 775 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 776 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 777 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 778 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 779 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 780 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 781 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 782 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 783 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 784 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 785 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 786 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 787 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 788 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 789 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 790 v += 60; 791 792 x1 = xb[12]; x2 = xb[13]; x3 = xb[14]; 793 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3; 794 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3; 795 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3; 796 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3; 797 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3; 798 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3; 799 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3; 800 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3; 801 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3; 802 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3; 803 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3; 804 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3; 805 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3; 806 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3; 807 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3; 808 v += 45; 809 } 810 if (usecprow) z = zarray + 15*ridx[i]; 811 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 812 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 813 814 if (!usecprow) z += 15; 815 } 816 817 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 818 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 819 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 820 PetscFunctionReturn(0); 821 } 822 823 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 824 #undef __FUNCT__ 825 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver3" 826 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz) 827 { 828 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 829 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 830 const PetscScalar *x,*xb; 831 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray; 832 const MatScalar *v; 833 PetscErrorCode ierr; 834 const PetscInt *ii,*ij=a->j,*idx; 835 PetscInt mbs,i,j,n,*ridx=NULL; 836 PetscBool usecprow=a->compressedrow.use; 837 838 PetscFunctionBegin; 839 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 840 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 841 842 v = a->a; 843 if (usecprow) { 844 mbs = a->compressedrow.nrows; 845 ii = a->compressedrow.i; 846 ridx = a->compressedrow.rindex; 847 } else { 848 mbs = a->mbs; 849 ii = a->i; 850 z = zarray; 851 } 852 853 for (i=0; i<mbs; i++) { 854 n = ii[i+1] - ii[i]; 855 idx = ij + ii[i]; 856 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 857 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 858 859 for (j=0; j<n; j++) { 860 xb = x + 15*(idx[j]); 861 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 862 x8 = xb[7]; 863 864 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8; 865 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8; 866 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8; 867 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8; 868 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8; 869 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8; 870 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8; 871 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8; 872 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8; 873 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8; 874 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8; 875 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8; 876 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8; 877 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8; 878 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8; 879 v += 120; 880 881 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14]; 882 883 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7; 884 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7; 885 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7; 886 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7; 887 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7; 888 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7; 889 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7; 890 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7; 891 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7; 892 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7; 893 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7; 894 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7; 895 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7; 896 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7; 897 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7; 898 v += 105; 899 } 900 if (usecprow) z = zarray + 15*ridx[i]; 901 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 902 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 903 904 if (!usecprow) z += 15; 905 } 906 907 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 908 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 909 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 910 PetscFunctionReturn(0); 911 } 912 913 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ 914 915 #undef __FUNCT__ 916 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver4" 917 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz) 918 { 919 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 920 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 921 const PetscScalar *x,*xb; 922 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray; 923 const MatScalar *v; 924 PetscErrorCode ierr; 925 const PetscInt *ii,*ij=a->j,*idx; 926 PetscInt mbs,i,j,n,*ridx=NULL; 927 PetscBool usecprow=a->compressedrow.use; 928 929 PetscFunctionBegin; 930 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 931 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 932 933 v = a->a; 934 if (usecprow) { 935 mbs = a->compressedrow.nrows; 936 ii = a->compressedrow.i; 937 ridx = a->compressedrow.rindex; 938 } else { 939 mbs = a->mbs; 940 ii = a->i; 941 z = zarray; 942 } 943 944 for (i=0; i<mbs; i++) { 945 n = ii[i+1] - ii[i]; 946 idx = ij + ii[i]; 947 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 948 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 949 950 for (j=0; j<n; j++) { 951 xb = x + 15*(idx[j]); 952 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 953 x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14]; 954 955 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 956 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 957 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 958 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 959 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 960 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 961 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 962 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 963 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 964 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 965 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 966 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 967 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 968 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 969 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 970 v += 225; 971 } 972 if (usecprow) z = zarray + 15*ridx[i]; 973 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 974 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 975 976 if (!usecprow) z += 15; 977 } 978 979 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 980 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 981 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 982 PetscFunctionReturn(0); 983 } 984 985 986 /* 987 This will not work with MatScalar == float because it calls the BLAS 988 */ 989 #undef __FUNCT__ 990 #define __FUNCT__ "MatMult_SeqBAIJ_N" 991 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz) 992 { 993 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 994 PetscScalar *x,*z = 0,*xb,*work,*workt,*zarray; 995 MatScalar *v; 996 PetscErrorCode ierr; 997 PetscInt mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2; 998 PetscInt ncols,k,*ridx=NULL; 999 PetscBool usecprow=a->compressedrow.use; 1000 1001 PetscFunctionBegin; 1002 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1003 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1004 1005 idx = a->j; 1006 v = a->a; 1007 if (usecprow) { 1008 mbs = a->compressedrow.nrows; 1009 ii = a->compressedrow.i; 1010 ridx = a->compressedrow.rindex; 1011 } else { 1012 mbs = a->mbs; 1013 ii = a->i; 1014 z = zarray; 1015 } 1016 1017 if (!a->mult_work) { 1018 k = PetscMax(A->rmap->n,A->cmap->n); 1019 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1020 } 1021 work = a->mult_work; 1022 for (i=0; i<mbs; i++) { 1023 n = ii[1] - ii[0]; ii++; 1024 ncols = n*bs; 1025 workt = work; 1026 for (j=0; j<n; j++) { 1027 xb = x + bs*(*idx++); 1028 for (k=0; k<bs; k++) workt[k] = xb[k]; 1029 workt += bs; 1030 } 1031 if (usecprow) z = zarray + bs*ridx[i]; 1032 PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z); 1033 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */ 1034 v += n*bs2; 1035 if (!usecprow) z += bs; 1036 } 1037 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1038 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1039 ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr); 1040 PetscFunctionReturn(0); 1041 } 1042 1043 #undef __FUNCT__ 1044 #define __FUNCT__ "MatMultAdd_SeqBAIJ_1" 1045 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 1046 { 1047 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1048 const PetscScalar *x; 1049 PetscScalar *y,*z,sum; 1050 const MatScalar *v; 1051 PetscErrorCode ierr; 1052 PetscInt mbs=a->mbs,i,n,*ridx=NULL; 1053 const PetscInt *idx,*ii; 1054 PetscBool usecprow=a->compressedrow.use; 1055 1056 PetscFunctionBegin; 1057 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1058 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 1059 if (zz != yy) { 1060 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1061 } else { 1062 z = y; 1063 } 1064 1065 idx = a->j; 1066 v = a->a; 1067 if (usecprow) { 1068 if (zz != yy) { 1069 ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1070 } 1071 mbs = a->compressedrow.nrows; 1072 ii = a->compressedrow.i; 1073 ridx = a->compressedrow.rindex; 1074 } else { 1075 ii = a->i; 1076 } 1077 1078 for (i=0; i<mbs; i++) { 1079 n = ii[1] - ii[0]; 1080 ii++; 1081 if (!usecprow) { 1082 sum = y[i]; 1083 } else { 1084 sum = y[ridx[i]]; 1085 } 1086 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1087 PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1088 PetscSparseDensePlusDot(sum,x,v,idx,n); 1089 v += n; 1090 idx += n; 1091 if (usecprow) { 1092 z[ridx[i]] = sum; 1093 } else { 1094 z[i] = sum; 1095 } 1096 } 1097 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1098 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 1099 if (zz != yy) { 1100 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1101 } 1102 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1103 PetscFunctionReturn(0); 1104 } 1105 1106 #undef __FUNCT__ 1107 #define __FUNCT__ "MatMultAdd_SeqBAIJ_2" 1108 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 1109 { 1110 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1111 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2; 1112 PetscScalar x1,x2,*yarray,*zarray; 1113 MatScalar *v; 1114 PetscErrorCode ierr; 1115 PetscInt mbs =a->mbs,i,*idx,*ii,j,n,*ridx=NULL; 1116 PetscBool usecprow=a->compressedrow.use; 1117 1118 PetscFunctionBegin; 1119 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1120 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1121 if (zz != yy) { 1122 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1123 } else { 1124 zarray = yarray; 1125 } 1126 1127 idx = a->j; 1128 v = a->a; 1129 if (usecprow) { 1130 if (zz != yy) { 1131 ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1132 } 1133 mbs = a->compressedrow.nrows; 1134 ii = a->compressedrow.i; 1135 ridx = a->compressedrow.rindex; 1136 if (zz != yy) { 1137 ierr = PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1138 } 1139 } else { 1140 ii = a->i; 1141 y = yarray; 1142 z = zarray; 1143 } 1144 1145 for (i=0; i<mbs; i++) { 1146 n = ii[1] - ii[0]; ii++; 1147 if (usecprow) { 1148 z = zarray + 2*ridx[i]; 1149 y = yarray + 2*ridx[i]; 1150 } 1151 sum1 = y[0]; sum2 = y[1]; 1152 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1153 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1154 for (j=0; j<n; j++) { 1155 xb = x + 2*(*idx++); 1156 x1 = xb[0]; 1157 x2 = xb[1]; 1158 1159 sum1 += v[0]*x1 + v[2]*x2; 1160 sum2 += v[1]*x1 + v[3]*x2; 1161 v += 4; 1162 } 1163 z[0] = sum1; z[1] = sum2; 1164 if (!usecprow) { 1165 z += 2; y += 2; 1166 } 1167 } 1168 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1169 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1170 if (zz != yy) { 1171 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1172 } 1173 ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr); 1174 PetscFunctionReturn(0); 1175 } 1176 1177 #undef __FUNCT__ 1178 #define __FUNCT__ "MatMultAdd_SeqBAIJ_3" 1179 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 1180 { 1181 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1182 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray; 1183 MatScalar *v; 1184 PetscErrorCode ierr; 1185 PetscInt mbs =a->mbs,i,*idx,*ii,j,n,*ridx=NULL; 1186 PetscBool usecprow=a->compressedrow.use; 1187 1188 PetscFunctionBegin; 1189 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1190 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1191 if (zz != yy) { 1192 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1193 } else { 1194 zarray = yarray; 1195 } 1196 1197 idx = a->j; 1198 v = a->a; 1199 if (usecprow) { 1200 if (zz != yy) { 1201 ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1202 } 1203 mbs = a->compressedrow.nrows; 1204 ii = a->compressedrow.i; 1205 ridx = a->compressedrow.rindex; 1206 } else { 1207 ii = a->i; 1208 y = yarray; 1209 z = zarray; 1210 } 1211 1212 for (i=0; i<mbs; i++) { 1213 n = ii[1] - ii[0]; ii++; 1214 if (usecprow) { 1215 z = zarray + 3*ridx[i]; 1216 y = yarray + 3*ridx[i]; 1217 } 1218 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; 1219 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1220 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1221 for (j=0; j<n; j++) { 1222 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1223 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 1224 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 1225 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 1226 v += 9; 1227 } 1228 z[0] = sum1; z[1] = sum2; z[2] = sum3; 1229 if (!usecprow) { 1230 z += 3; y += 3; 1231 } 1232 } 1233 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1234 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1235 if (zz != yy) { 1236 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1237 } 1238 ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr); 1239 PetscFunctionReturn(0); 1240 } 1241 1242 #undef __FUNCT__ 1243 #define __FUNCT__ "MatMultAdd_SeqBAIJ_4" 1244 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 1245 { 1246 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1247 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray; 1248 MatScalar *v; 1249 PetscErrorCode ierr; 1250 PetscInt mbs =a->mbs,i,*idx,*ii,j,n,*ridx=NULL; 1251 PetscBool usecprow=a->compressedrow.use; 1252 1253 PetscFunctionBegin; 1254 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1255 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1256 if (zz != yy) { 1257 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1258 } else { 1259 zarray = yarray; 1260 } 1261 1262 idx = a->j; 1263 v = a->a; 1264 if (usecprow) { 1265 if (zz != yy) { 1266 ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1267 } 1268 mbs = a->compressedrow.nrows; 1269 ii = a->compressedrow.i; 1270 ridx = a->compressedrow.rindex; 1271 } else { 1272 ii = a->i; 1273 y = yarray; 1274 z = zarray; 1275 } 1276 1277 for (i=0; i<mbs; i++) { 1278 n = ii[1] - ii[0]; ii++; 1279 if (usecprow) { 1280 z = zarray + 4*ridx[i]; 1281 y = yarray + 4*ridx[i]; 1282 } 1283 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; 1284 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1285 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1286 for (j=0; j<n; j++) { 1287 xb = x + 4*(*idx++); 1288 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1289 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1290 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1291 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1292 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1293 v += 16; 1294 } 1295 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 1296 if (!usecprow) { 1297 z += 4; y += 4; 1298 } 1299 } 1300 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1301 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1302 if (zz != yy) { 1303 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1304 } 1305 ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr); 1306 PetscFunctionReturn(0); 1307 } 1308 1309 #undef __FUNCT__ 1310 #define __FUNCT__ "MatMultAdd_SeqBAIJ_5" 1311 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 1312 { 1313 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1314 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5; 1315 PetscScalar *yarray,*zarray; 1316 MatScalar *v; 1317 PetscErrorCode ierr; 1318 PetscInt mbs =a->mbs,i,*idx,*ii,j,n,*ridx=NULL; 1319 PetscBool usecprow=a->compressedrow.use; 1320 1321 PetscFunctionBegin; 1322 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1323 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1324 if (zz != yy) { 1325 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1326 } else { 1327 zarray = yarray; 1328 } 1329 1330 idx = a->j; 1331 v = a->a; 1332 if (usecprow) { 1333 if (zz != yy) { 1334 ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1335 } 1336 mbs = a->compressedrow.nrows; 1337 ii = a->compressedrow.i; 1338 ridx = a->compressedrow.rindex; 1339 } else { 1340 ii = a->i; 1341 y = yarray; 1342 z = zarray; 1343 } 1344 1345 for (i=0; i<mbs; i++) { 1346 n = ii[1] - ii[0]; ii++; 1347 if (usecprow) { 1348 z = zarray + 5*ridx[i]; 1349 y = yarray + 5*ridx[i]; 1350 } 1351 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; 1352 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1353 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1354 for (j=0; j<n; j++) { 1355 xb = x + 5*(*idx++); 1356 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 1357 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1358 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1359 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1360 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1361 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1362 v += 25; 1363 } 1364 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 1365 if (!usecprow) { 1366 z += 5; y += 5; 1367 } 1368 } 1369 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1370 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1371 if (zz != yy) { 1372 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1373 } 1374 ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr); 1375 PetscFunctionReturn(0); 1376 } 1377 #undef __FUNCT__ 1378 #define __FUNCT__ "MatMultAdd_SeqBAIJ_6" 1379 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 1380 { 1381 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1382 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6; 1383 PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray; 1384 MatScalar *v; 1385 PetscErrorCode ierr; 1386 PetscInt mbs =a->mbs,i,*idx,*ii,j,n,*ridx=NULL; 1387 PetscBool usecprow=a->compressedrow.use; 1388 1389 PetscFunctionBegin; 1390 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1391 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1392 if (zz != yy) { 1393 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1394 } else { 1395 zarray = yarray; 1396 } 1397 1398 idx = a->j; 1399 v = a->a; 1400 if (usecprow) { 1401 if (zz != yy) { 1402 ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1403 } 1404 mbs = a->compressedrow.nrows; 1405 ii = a->compressedrow.i; 1406 ridx = a->compressedrow.rindex; 1407 } else { 1408 ii = a->i; 1409 y = yarray; 1410 z = zarray; 1411 } 1412 1413 for (i=0; i<mbs; i++) { 1414 n = ii[1] - ii[0]; ii++; 1415 if (usecprow) { 1416 z = zarray + 6*ridx[i]; 1417 y = yarray + 6*ridx[i]; 1418 } 1419 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; 1420 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1421 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1422 for (j=0; j<n; j++) { 1423 xb = x + 6*(*idx++); 1424 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 1425 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1426 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1427 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1428 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1429 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1430 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1431 v += 36; 1432 } 1433 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 1434 if (!usecprow) { 1435 z += 6; y += 6; 1436 } 1437 } 1438 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1439 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1440 if (zz != yy) { 1441 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1442 } 1443 ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr); 1444 PetscFunctionReturn(0); 1445 } 1446 1447 #undef __FUNCT__ 1448 #define __FUNCT__ "MatMultAdd_SeqBAIJ_7" 1449 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1450 { 1451 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1452 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 1453 PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray; 1454 MatScalar *v; 1455 PetscErrorCode ierr; 1456 PetscInt mbs =a->mbs,i,*idx,*ii,j,n,*ridx=NULL; 1457 PetscBool usecprow=a->compressedrow.use; 1458 1459 PetscFunctionBegin; 1460 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1461 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1462 if (zz != yy) { 1463 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1464 } else { 1465 zarray = yarray; 1466 } 1467 1468 idx = a->j; 1469 v = a->a; 1470 if (usecprow) { 1471 if (zz != yy) { 1472 ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1473 } 1474 mbs = a->compressedrow.nrows; 1475 ii = a->compressedrow.i; 1476 ridx = a->compressedrow.rindex; 1477 } else { 1478 ii = a->i; 1479 y = yarray; 1480 z = zarray; 1481 } 1482 1483 for (i=0; i<mbs; i++) { 1484 n = ii[1] - ii[0]; ii++; 1485 if (usecprow) { 1486 z = zarray + 7*ridx[i]; 1487 y = yarray + 7*ridx[i]; 1488 } 1489 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 1490 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1491 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1492 for (j=0; j<n; j++) { 1493 xb = x + 7*(*idx++); 1494 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1495 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1496 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1497 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1498 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1499 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1500 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1501 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1502 v += 49; 1503 } 1504 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1505 if (!usecprow) { 1506 z += 7; y += 7; 1507 } 1508 } 1509 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1510 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1511 if (zz != yy) { 1512 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1513 } 1514 ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr); 1515 PetscFunctionReturn(0); 1516 } 1517 1518 #undef __FUNCT__ 1519 #define __FUNCT__ "MatMultAdd_SeqBAIJ_N" 1520 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1521 { 1522 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1523 PetscScalar *x,*z = 0,*xb,*work,*workt,*zarray; 1524 MatScalar *v; 1525 PetscErrorCode ierr; 1526 PetscInt mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2; 1527 PetscInt ncols,k,*ridx=NULL; 1528 PetscBool usecprow=a->compressedrow.use; 1529 1530 PetscFunctionBegin; 1531 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1532 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1533 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1534 1535 idx = a->j; 1536 v = a->a; 1537 if (usecprow) { 1538 mbs = a->compressedrow.nrows; 1539 ii = a->compressedrow.i; 1540 ridx = a->compressedrow.rindex; 1541 } else { 1542 mbs = a->mbs; 1543 ii = a->i; 1544 z = zarray; 1545 } 1546 1547 if (!a->mult_work) { 1548 k = PetscMax(A->rmap->n,A->cmap->n); 1549 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1550 } 1551 work = a->mult_work; 1552 for (i=0; i<mbs; i++) { 1553 n = ii[1] - ii[0]; ii++; 1554 ncols = n*bs; 1555 workt = work; 1556 for (j=0; j<n; j++) { 1557 xb = x + bs*(*idx++); 1558 for (k=0; k<bs; k++) workt[k] = xb[k]; 1559 workt += bs; 1560 } 1561 if (usecprow) z = zarray + bs*ridx[i]; 1562 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1563 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */ 1564 v += n*bs2; 1565 if (!usecprow) z += bs; 1566 } 1567 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1568 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1569 ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr); 1570 PetscFunctionReturn(0); 1571 } 1572 1573 #undef __FUNCT__ 1574 #define __FUNCT__ "MatMultHermitianTranspose_SeqBAIJ" 1575 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1576 { 1577 PetscScalar zero = 0.0; 1578 PetscErrorCode ierr; 1579 1580 PetscFunctionBegin; 1581 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1582 ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1583 PetscFunctionReturn(0); 1584 } 1585 1586 #undef __FUNCT__ 1587 #define __FUNCT__ "MatMultTranspose_SeqBAIJ" 1588 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1589 { 1590 PetscScalar zero = 0.0; 1591 PetscErrorCode ierr; 1592 1593 PetscFunctionBegin; 1594 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1595 ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1596 PetscFunctionReturn(0); 1597 } 1598 1599 #undef __FUNCT__ 1600 #define __FUNCT__ "MatMultHermitianTransposeAdd_SeqBAIJ" 1601 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1602 { 1603 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1604 PetscScalar *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5; 1605 MatScalar *v; 1606 PetscErrorCode ierr; 1607 PetscInt mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=NULL; 1608 Mat_CompressedRow cprow = a->compressedrow; 1609 PetscBool usecprow=cprow.use; 1610 1611 PetscFunctionBegin; 1612 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 1613 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1614 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1615 1616 idx = a->j; 1617 v = a->a; 1618 if (usecprow) { 1619 mbs = cprow.nrows; 1620 ii = cprow.i; 1621 ridx = cprow.rindex; 1622 } else { 1623 mbs=a->mbs; 1624 ii = a->i; 1625 xb = x; 1626 } 1627 1628 switch (bs) { 1629 case 1: 1630 for (i=0; i<mbs; i++) { 1631 if (usecprow) xb = x + ridx[i]; 1632 x1 = xb[0]; 1633 ib = idx + ii[0]; 1634 n = ii[1] - ii[0]; ii++; 1635 for (j=0; j<n; j++) { 1636 rval = ib[j]; 1637 z[rval] += PetscConj(*v) * x1; 1638 v++; 1639 } 1640 if (!usecprow) xb++; 1641 } 1642 break; 1643 case 2: 1644 for (i=0; i<mbs; i++) { 1645 if (usecprow) xb = x + 2*ridx[i]; 1646 x1 = xb[0]; x2 = xb[1]; 1647 ib = idx + ii[0]; 1648 n = ii[1] - ii[0]; ii++; 1649 for (j=0; j<n; j++) { 1650 rval = ib[j]*2; 1651 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2; 1652 z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2; 1653 v += 4; 1654 } 1655 if (!usecprow) xb += 2; 1656 } 1657 break; 1658 case 3: 1659 for (i=0; i<mbs; i++) { 1660 if (usecprow) xb = x + 3*ridx[i]; 1661 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1662 ib = idx + ii[0]; 1663 n = ii[1] - ii[0]; ii++; 1664 for (j=0; j<n; j++) { 1665 rval = ib[j]*3; 1666 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3; 1667 z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3; 1668 z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3; 1669 v += 9; 1670 } 1671 if (!usecprow) xb += 3; 1672 } 1673 break; 1674 case 4: 1675 for (i=0; i<mbs; i++) { 1676 if (usecprow) xb = x + 4*ridx[i]; 1677 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1678 ib = idx + ii[0]; 1679 n = ii[1] - ii[0]; ii++; 1680 for (j=0; j<n; j++) { 1681 rval = ib[j]*4; 1682 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4; 1683 z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4; 1684 z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4; 1685 z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4; 1686 v += 16; 1687 } 1688 if (!usecprow) xb += 4; 1689 } 1690 break; 1691 case 5: 1692 for (i=0; i<mbs; i++) { 1693 if (usecprow) xb = x + 5*ridx[i]; 1694 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1695 x4 = xb[3]; x5 = xb[4]; 1696 ib = idx + ii[0]; 1697 n = ii[1] - ii[0]; ii++; 1698 for (j=0; j<n; j++) { 1699 rval = ib[j]*5; 1700 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5; 1701 z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5; 1702 z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5; 1703 z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5; 1704 z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5; 1705 v += 25; 1706 } 1707 if (!usecprow) xb += 5; 1708 } 1709 break; 1710 default: { /* block sizes larger than 5 by 5 are handled by BLAS */ 1711 PetscInt ncols,k; 1712 PetscScalar *work,*workt,*xtmp; 1713 1714 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet"); 1715 if (!a->mult_work) { 1716 k = PetscMax(A->rmap->n,A->cmap->n); 1717 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1718 } 1719 work = a->mult_work; 1720 xtmp = x; 1721 for (i=0; i<mbs; i++) { 1722 n = ii[1] - ii[0]; ii++; 1723 ncols = n*bs; 1724 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1725 if (usecprow) xtmp = x + bs*ridx[i]; 1726 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 1727 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 1728 v += n*bs2; 1729 if (!usecprow) xtmp += bs; 1730 workt = work; 1731 for (j=0; j<n; j++) { 1732 zb = z + bs*(*idx++); 1733 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1734 workt += bs; 1735 } 1736 } 1737 } 1738 } 1739 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1740 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1741 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 1742 PetscFunctionReturn(0); 1743 } 1744 1745 #undef __FUNCT__ 1746 #define __FUNCT__ "MatMultTransposeAdd_SeqBAIJ" 1747 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1748 { 1749 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1750 PetscScalar *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5; 1751 MatScalar *v; 1752 PetscErrorCode ierr; 1753 PetscInt mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=NULL; 1754 Mat_CompressedRow cprow = a->compressedrow; 1755 PetscBool usecprow=cprow.use; 1756 1757 PetscFunctionBegin; 1758 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 1759 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1760 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1761 1762 idx = a->j; 1763 v = a->a; 1764 if (usecprow) { 1765 mbs = cprow.nrows; 1766 ii = cprow.i; 1767 ridx = cprow.rindex; 1768 } else { 1769 mbs=a->mbs; 1770 ii = a->i; 1771 xb = x; 1772 } 1773 1774 switch (bs) { 1775 case 1: 1776 for (i=0; i<mbs; i++) { 1777 if (usecprow) xb = x + ridx[i]; 1778 x1 = xb[0]; 1779 ib = idx + ii[0]; 1780 n = ii[1] - ii[0]; ii++; 1781 for (j=0; j<n; j++) { 1782 rval = ib[j]; 1783 z[rval] += *v * x1; 1784 v++; 1785 } 1786 if (!usecprow) xb++; 1787 } 1788 break; 1789 case 2: 1790 for (i=0; i<mbs; i++) { 1791 if (usecprow) xb = x + 2*ridx[i]; 1792 x1 = xb[0]; x2 = xb[1]; 1793 ib = idx + ii[0]; 1794 n = ii[1] - ii[0]; ii++; 1795 for (j=0; j<n; j++) { 1796 rval = ib[j]*2; 1797 z[rval++] += v[0]*x1 + v[1]*x2; 1798 z[rval++] += v[2]*x1 + v[3]*x2; 1799 v += 4; 1800 } 1801 if (!usecprow) xb += 2; 1802 } 1803 break; 1804 case 3: 1805 for (i=0; i<mbs; i++) { 1806 if (usecprow) xb = x + 3*ridx[i]; 1807 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1808 ib = idx + ii[0]; 1809 n = ii[1] - ii[0]; ii++; 1810 for (j=0; j<n; j++) { 1811 rval = ib[j]*3; 1812 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3; 1813 z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3; 1814 z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3; 1815 v += 9; 1816 } 1817 if (!usecprow) xb += 3; 1818 } 1819 break; 1820 case 4: 1821 for (i=0; i<mbs; i++) { 1822 if (usecprow) xb = x + 4*ridx[i]; 1823 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1824 ib = idx + ii[0]; 1825 n = ii[1] - ii[0]; ii++; 1826 for (j=0; j<n; j++) { 1827 rval = ib[j]*4; 1828 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 1829 z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 1830 z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 1831 z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 1832 v += 16; 1833 } 1834 if (!usecprow) xb += 4; 1835 } 1836 break; 1837 case 5: 1838 for (i=0; i<mbs; i++) { 1839 if (usecprow) xb = x + 5*ridx[i]; 1840 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1841 x4 = xb[3]; x5 = xb[4]; 1842 ib = idx + ii[0]; 1843 n = ii[1] - ii[0]; ii++; 1844 for (j=0; j<n; j++) { 1845 rval = ib[j]*5; 1846 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1847 z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1848 z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 1849 z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 1850 z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 1851 v += 25; 1852 } 1853 if (!usecprow) xb += 5; 1854 } 1855 break; 1856 default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 1857 PetscInt ncols,k; 1858 PetscScalar *work,*workt,*xtmp; 1859 1860 if (!a->mult_work) { 1861 k = PetscMax(A->rmap->n,A->cmap->n); 1862 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1863 } 1864 work = a->mult_work; 1865 xtmp = x; 1866 for (i=0; i<mbs; i++) { 1867 n = ii[1] - ii[0]; ii++; 1868 ncols = n*bs; 1869 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1870 if (usecprow) xtmp = x + bs*ridx[i]; 1871 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 1872 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 1873 v += n*bs2; 1874 if (!usecprow) xtmp += bs; 1875 workt = work; 1876 for (j=0; j<n; j++) { 1877 zb = z + bs*(*idx++); 1878 for (k=0; k<bs; k++) zb[k] += workt[k]; 1879 workt += bs; 1880 } 1881 } 1882 } 1883 } 1884 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1885 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1886 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 1887 PetscFunctionReturn(0); 1888 } 1889 1890 #undef __FUNCT__ 1891 #define __FUNCT__ "MatScale_SeqBAIJ" 1892 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha) 1893 { 1894 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data; 1895 PetscInt totalnz = a->bs2*a->nz; 1896 PetscScalar oalpha = alpha; 1897 PetscErrorCode ierr; 1898 PetscBLASInt one = 1,tnz; 1899 1900 PetscFunctionBegin; 1901 ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr); 1902 PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one)); 1903 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 1904 PetscFunctionReturn(0); 1905 } 1906 1907 #undef __FUNCT__ 1908 #define __FUNCT__ "MatNorm_SeqBAIJ" 1909 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm) 1910 { 1911 PetscErrorCode ierr; 1912 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1913 MatScalar *v = a->a; 1914 PetscReal sum = 0.0; 1915 PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1; 1916 1917 PetscFunctionBegin; 1918 if (type == NORM_FROBENIUS) { 1919 for (i=0; i< bs2*nz; i++) { 1920 sum += PetscRealPart(PetscConj(*v)*(*v)); v++; 1921 } 1922 *norm = PetscSqrtReal(sum); 1923 } else if (type == NORM_1) { /* maximum column sum */ 1924 PetscReal *tmp; 1925 PetscInt *bcol = a->j; 1926 ierr = PetscMalloc((A->cmap->n+1)*sizeof(PetscReal),&tmp);CHKERRQ(ierr); 1927 ierr = PetscMemzero(tmp,A->cmap->n*sizeof(PetscReal));CHKERRQ(ierr); 1928 for (i=0; i<nz; i++) { 1929 for (j=0; j<bs; j++) { 1930 k1 = bs*(*bcol) + j; /* column index */ 1931 for (k=0; k<bs; k++) { 1932 tmp[k1] += PetscAbsScalar(*v); v++; 1933 } 1934 } 1935 bcol++; 1936 } 1937 *norm = 0.0; 1938 for (j=0; j<A->cmap->n; j++) { 1939 if (tmp[j] > *norm) *norm = tmp[j]; 1940 } 1941 ierr = PetscFree(tmp);CHKERRQ(ierr); 1942 } else if (type == NORM_INFINITY) { /* maximum row sum */ 1943 *norm = 0.0; 1944 for (k=0; k<bs; k++) { 1945 for (j=0; j<a->mbs; j++) { 1946 v = a->a + bs2*a->i[j] + k; 1947 sum = 0.0; 1948 for (i=0; i<a->i[j+1]-a->i[j]; i++) { 1949 for (k1=0; k1<bs; k1++) { 1950 sum += PetscAbsScalar(*v); 1951 v += bs; 1952 } 1953 } 1954 if (sum > *norm) *norm = sum; 1955 } 1956 } 1957 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 1958 PetscFunctionReturn(0); 1959 } 1960 1961 1962 #undef __FUNCT__ 1963 #define __FUNCT__ "MatEqual_SeqBAIJ" 1964 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg) 1965 { 1966 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 1967 PetscErrorCode ierr; 1968 1969 PetscFunctionBegin; 1970 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1971 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1972 *flg = PETSC_FALSE; 1973 PetscFunctionReturn(0); 1974 } 1975 1976 /* if the a->i are the same */ 1977 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1978 if (!*flg) PetscFunctionReturn(0); 1979 1980 /* if a->j are the same */ 1981 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1982 if (!*flg) PetscFunctionReturn(0); 1983 1984 /* if a->a are the same */ 1985 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1986 PetscFunctionReturn(0); 1987 1988 } 1989 1990 #undef __FUNCT__ 1991 #define __FUNCT__ "MatGetDiagonal_SeqBAIJ" 1992 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v) 1993 { 1994 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1995 PetscErrorCode ierr; 1996 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 1997 PetscScalar *x,zero = 0.0; 1998 MatScalar *aa,*aa_j; 1999 2000 PetscFunctionBegin; 2001 if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 2002 bs = A->rmap->bs; 2003 aa = a->a; 2004 ai = a->i; 2005 aj = a->j; 2006 ambs = a->mbs; 2007 bs2 = a->bs2; 2008 2009 ierr = VecSet(v,zero);CHKERRQ(ierr); 2010 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 2011 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 2012 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 2013 for (i=0; i<ambs; i++) { 2014 for (j=ai[i]; j<ai[i+1]; j++) { 2015 if (aj[j] == i) { 2016 row = i*bs; 2017 aa_j = aa+j*bs2; 2018 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 2019 break; 2020 } 2021 } 2022 } 2023 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 2024 PetscFunctionReturn(0); 2025 } 2026 2027 #undef __FUNCT__ 2028 #define __FUNCT__ "MatDiagonalScale_SeqBAIJ" 2029 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr) 2030 { 2031 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2032 const PetscScalar *l,*r,*li,*ri; 2033 PetscScalar x; 2034 MatScalar *aa, *v; 2035 PetscErrorCode ierr; 2036 PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai; 2037 const PetscInt *ai,*aj; 2038 2039 PetscFunctionBegin; 2040 ai = a->i; 2041 aj = a->j; 2042 aa = a->a; 2043 m = A->rmap->n; 2044 n = A->cmap->n; 2045 bs = A->rmap->bs; 2046 mbs = a->mbs; 2047 bs2 = a->bs2; 2048 if (ll) { 2049 ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr); 2050 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 2051 if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 2052 for (i=0; i<mbs; i++) { /* for each block row */ 2053 M = ai[i+1] - ai[i]; 2054 li = l + i*bs; 2055 v = aa + bs2*ai[i]; 2056 for (j=0; j<M; j++) { /* for each block */ 2057 for (k=0; k<bs2; k++) { 2058 (*v++) *= li[k%bs]; 2059 } 2060 } 2061 } 2062 ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr); 2063 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2064 } 2065 2066 if (rr) { 2067 ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr); 2068 ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr); 2069 if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 2070 for (i=0; i<mbs; i++) { /* for each block row */ 2071 iai = ai[i]; 2072 M = ai[i+1] - iai; 2073 v = aa + bs2*iai; 2074 for (j=0; j<M; j++) { /* for each block */ 2075 ri = r + bs*aj[iai+j]; 2076 for (k=0; k<bs; k++) { 2077 x = ri[k]; 2078 for (tmp=0; tmp<bs; tmp++) v[tmp] *= x; 2079 v += bs; 2080 } 2081 } 2082 } 2083 ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr); 2084 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2085 } 2086 PetscFunctionReturn(0); 2087 } 2088 2089 2090 #undef __FUNCT__ 2091 #define __FUNCT__ "MatGetInfo_SeqBAIJ" 2092 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info) 2093 { 2094 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2095 2096 PetscFunctionBegin; 2097 info->block_size = a->bs2; 2098 info->nz_allocated = a->bs2*a->maxnz; 2099 info->nz_used = a->bs2*a->nz; 2100 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 2101 info->assemblies = A->num_ass; 2102 info->mallocs = A->info.mallocs; 2103 info->memory = ((PetscObject)A)->mem; 2104 if (A->factortype) { 2105 info->fill_ratio_given = A->info.fill_ratio_given; 2106 info->fill_ratio_needed = A->info.fill_ratio_needed; 2107 info->factor_mallocs = A->info.factor_mallocs; 2108 } else { 2109 info->fill_ratio_given = 0; 2110 info->fill_ratio_needed = 0; 2111 info->factor_mallocs = 0; 2112 } 2113 PetscFunctionReturn(0); 2114 } 2115 2116 2117 #undef __FUNCT__ 2118 #define __FUNCT__ "MatZeroEntries_SeqBAIJ" 2119 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 2120 { 2121 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2122 PetscErrorCode ierr; 2123 2124 PetscFunctionBegin; 2125 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 2126 PetscFunctionReturn(0); 2127 } 2128