1 2 #include <../src/mat/impls/baij/seq/baij.h> 3 #include <petsc/private/kernels/blockinvert.h> 4 #include <petscbt.h> 5 #include <petscblaslapack.h> 6 7 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 8 { 9 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 10 PetscErrorCode ierr; 11 PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival; 12 const PetscInt *idx; 13 PetscInt start,end,*ai,*aj,bs,*nidx2; 14 PetscBT table; 15 16 PetscFunctionBegin; 17 m = a->mbs; 18 ai = a->i; 19 aj = a->j; 20 bs = A->rmap->bs; 21 22 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 23 24 ierr = PetscBTCreate(m,&table);CHKERRQ(ierr); 25 ierr = PetscMalloc1(m+1,&nidx);CHKERRQ(ierr); 26 ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr); 27 28 for (i=0; i<is_max; i++) { 29 /* Initialise the two local arrays */ 30 isz = 0; 31 ierr = PetscBTMemzero(m,table);CHKERRQ(ierr); 32 33 /* Extract the indices, assume there can be duplicate entries */ 34 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 35 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 36 37 /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 38 for (j=0; j<n; ++j) { 39 ival = idx[j]/bs; /* convert the indices into block indices */ 40 if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 41 if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival; 42 } 43 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 44 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 45 46 k = 0; 47 for (j=0; j<ov; j++) { /* for each overlap*/ 48 n = isz; 49 for (; k<n; k++) { /* do only those rows in nidx[k], which are not done yet */ 50 row = nidx[k]; 51 start = ai[row]; 52 end = ai[row+1]; 53 for (l = start; l<end; l++) { 54 val = aj[l]; 55 if (!PetscBTLookupSet(table,val)) nidx[isz++] = val; 56 } 57 } 58 } 59 /* expand the Index Set */ 60 for (j=0; j<isz; j++) { 61 for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k; 62 } 63 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 64 } 65 ierr = PetscBTDestroy(&table);CHKERRQ(ierr); 66 ierr = PetscFree(nidx);CHKERRQ(ierr); 67 ierr = PetscFree(nidx2);CHKERRQ(ierr); 68 PetscFunctionReturn(0); 69 } 70 71 PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 72 { 73 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c; 74 PetscErrorCode ierr; 75 PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 76 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 77 const PetscInt *irow,*icol; 78 PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 79 PetscInt *aj = a->j,*ai = a->i; 80 MatScalar *mat_a; 81 Mat C; 82 PetscBool flag; 83 84 PetscFunctionBegin; 85 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 86 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 87 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 88 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 89 90 ierr = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr); 91 ssmap = smap; 92 ierr = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr); 93 for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 94 /* determine lens of each row */ 95 for (i=0; i<nrows; i++) { 96 kstart = ai[irow[i]]; 97 kend = kstart + a->ilen[irow[i]]; 98 lens[i] = 0; 99 for (k=kstart; k<kend; k++) { 100 if (ssmap[aj[k]]) lens[i]++; 101 } 102 } 103 /* Create and fill new matrix */ 104 if (scall == MAT_REUSE_MATRIX) { 105 c = (Mat_SeqBAIJ*)((*B)->data); 106 107 if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 108 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 109 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 110 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 111 C = *B; 112 } else { 113 ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr); 114 ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 115 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 116 ierr = MatSeqBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr); 117 } 118 c = (Mat_SeqBAIJ*)(C->data); 119 for (i=0; i<nrows; i++) { 120 row = irow[i]; 121 kstart = ai[row]; 122 kend = kstart + a->ilen[row]; 123 mat_i = c->i[i]; 124 mat_j = c->j + mat_i; 125 mat_a = c->a + mat_i*bs2; 126 mat_ilen = c->ilen + i; 127 for (k=kstart; k<kend; k++) { 128 if ((tcol=ssmap[a->j[k]])) { 129 *mat_j++ = tcol - 1; 130 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 131 mat_a += bs2; 132 (*mat_ilen)++; 133 } 134 } 135 } 136 /* sort */ 137 { 138 MatScalar *work; 139 ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr); 140 for (i=0; i<nrows; i++) { 141 PetscInt ilen; 142 mat_i = c->i[i]; 143 mat_j = c->j + mat_i; 144 mat_a = c->a + mat_i*bs2; 145 ilen = c->ilen[i]; 146 ierr = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr); 147 } 148 ierr = PetscFree(work);CHKERRQ(ierr); 149 } 150 151 /* Free work space */ 152 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 153 ierr = PetscFree(smap);CHKERRQ(ierr); 154 ierr = PetscFree(lens);CHKERRQ(ierr); 155 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 156 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 157 158 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 159 *B = C; 160 PetscFunctionReturn(0); 161 } 162 163 PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 164 { 165 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 166 IS is1,is2; 167 PetscErrorCode ierr; 168 PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j; 169 const PetscInt *irow,*icol; 170 171 PetscFunctionBegin; 172 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 173 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 174 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 175 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 176 177 /* Verify if the indices corespond to each element in a block 178 and form the IS with compressed IS */ 179 maxmnbs = PetscMax(a->mbs,a->nbs); 180 ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr); 181 ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr); 182 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 183 for (i=0; i<a->mbs; i++) { 184 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 185 } 186 count = 0; 187 for (i=0; i<nrows; i++) { 188 j = irow[i] / bs; 189 if ((vary[j]--)==bs) iary[count++] = j; 190 } 191 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr); 192 193 ierr = PetscMemzero(vary,(a->nbs)*sizeof(PetscInt));CHKERRQ(ierr); 194 for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 195 for (i=0; i<a->nbs; i++) { 196 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc"); 197 } 198 count = 0; 199 for (i=0; i<ncols; i++) { 200 j = icol[i] / bs; 201 if ((vary[j]--)==bs) iary[count++] = j; 202 } 203 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr); 204 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 205 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 206 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 207 208 ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr); 209 ierr = ISDestroy(&is1);CHKERRQ(ierr); 210 ierr = ISDestroy(&is2);CHKERRQ(ierr); 211 PetscFunctionReturn(0); 212 } 213 214 PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 215 { 216 PetscErrorCode ierr; 217 PetscInt i; 218 219 PetscFunctionBegin; 220 if (scall == MAT_INITIAL_MATRIX) { 221 ierr = PetscMalloc1(n+1,B);CHKERRQ(ierr); 222 } 223 224 for (i=0; i<n; i++) { 225 ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 226 } 227 PetscFunctionReturn(0); 228 } 229 230 231 /* -------------------------------------------------------*/ 232 /* Should check that shapes of vectors and matrices match */ 233 /* -------------------------------------------------------*/ 234 235 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz) 236 { 237 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 238 PetscScalar *z,sum; 239 const PetscScalar *x; 240 const MatScalar *v; 241 PetscErrorCode ierr; 242 PetscInt mbs,i,n; 243 const PetscInt *idx,*ii,*ridx=NULL; 244 PetscBool usecprow=a->compressedrow.use; 245 246 PetscFunctionBegin; 247 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 248 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 249 250 if (usecprow) { 251 mbs = a->compressedrow.nrows; 252 ii = a->compressedrow.i; 253 ridx = a->compressedrow.rindex; 254 ierr = PetscMemzero(z,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 255 } else { 256 mbs = a->mbs; 257 ii = a->i; 258 } 259 260 for (i=0; i<mbs; i++) { 261 n = ii[1] - ii[0]; 262 v = a->a + ii[0]; 263 idx = a->j + ii[0]; 264 ii++; 265 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 266 PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 267 sum = 0.0; 268 PetscSparseDensePlusDot(sum,x,v,idx,n); 269 if (usecprow) { 270 z[ridx[i]] = sum; 271 } else { 272 z[i] = sum; 273 } 274 } 275 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 276 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 277 ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr); 278 PetscFunctionReturn(0); 279 } 280 281 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) 282 { 283 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 284 PetscScalar *z = 0,sum1,sum2,*zarray; 285 const PetscScalar *x,*xb; 286 PetscScalar x1,x2; 287 const MatScalar *v; 288 PetscErrorCode ierr; 289 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 290 PetscBool usecprow=a->compressedrow.use; 291 292 PetscFunctionBegin; 293 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 294 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 295 296 idx = a->j; 297 v = a->a; 298 if (usecprow) { 299 mbs = a->compressedrow.nrows; 300 ii = a->compressedrow.i; 301 ridx = a->compressedrow.rindex; 302 ierr = PetscMemzero(zarray,2*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 303 } else { 304 mbs = a->mbs; 305 ii = a->i; 306 z = zarray; 307 } 308 309 for (i=0; i<mbs; i++) { 310 n = ii[1] - ii[0]; ii++; 311 sum1 = 0.0; sum2 = 0.0; 312 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 313 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 314 for (j=0; j<n; j++) { 315 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 316 sum1 += v[0]*x1 + v[2]*x2; 317 sum2 += v[1]*x1 + v[3]*x2; 318 v += 4; 319 } 320 if (usecprow) z = zarray + 2*ridx[i]; 321 z[0] = sum1; z[1] = sum2; 322 if (!usecprow) z += 2; 323 } 324 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 325 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 326 ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr); 327 PetscFunctionReturn(0); 328 } 329 330 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) 331 { 332 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 333 PetscScalar *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray; 334 const PetscScalar *x,*xb; 335 const MatScalar *v; 336 PetscErrorCode ierr; 337 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 338 PetscBool usecprow=a->compressedrow.use; 339 340 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 341 #pragma disjoint(*v,*z,*xb) 342 #endif 343 344 PetscFunctionBegin; 345 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 346 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 347 348 idx = a->j; 349 v = a->a; 350 if (usecprow) { 351 mbs = a->compressedrow.nrows; 352 ii = a->compressedrow.i; 353 ridx = a->compressedrow.rindex; 354 ierr = PetscMemzero(zarray,3*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 355 } else { 356 mbs = a->mbs; 357 ii = a->i; 358 z = zarray; 359 } 360 361 for (i=0; i<mbs; i++) { 362 n = ii[1] - ii[0]; ii++; 363 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 364 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 365 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 366 for (j=0; j<n; j++) { 367 xb = x + 3*(*idx++); 368 x1 = xb[0]; 369 x2 = xb[1]; 370 x3 = xb[2]; 371 372 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 373 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 374 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 375 v += 9; 376 } 377 if (usecprow) z = zarray + 3*ridx[i]; 378 z[0] = sum1; z[1] = sum2; z[2] = sum3; 379 if (!usecprow) z += 3; 380 } 381 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 382 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 383 ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr); 384 PetscFunctionReturn(0); 385 } 386 387 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) 388 { 389 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 390 PetscScalar *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray; 391 const PetscScalar *x,*xb; 392 const MatScalar *v; 393 PetscErrorCode ierr; 394 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 395 PetscBool usecprow=a->compressedrow.use; 396 397 PetscFunctionBegin; 398 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 399 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 400 401 idx = a->j; 402 v = a->a; 403 if (usecprow) { 404 mbs = a->compressedrow.nrows; 405 ii = a->compressedrow.i; 406 ridx = a->compressedrow.rindex; 407 ierr = PetscMemzero(zarray,4*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 408 } else { 409 mbs = a->mbs; 410 ii = a->i; 411 z = zarray; 412 } 413 414 for (i=0; i<mbs; i++) { 415 n = ii[1] - ii[0]; 416 ii++; 417 sum1 = 0.0; 418 sum2 = 0.0; 419 sum3 = 0.0; 420 sum4 = 0.0; 421 422 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 423 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 424 for (j=0; j<n; j++) { 425 xb = x + 4*(*idx++); 426 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 427 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 428 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 429 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 430 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 431 v += 16; 432 } 433 if (usecprow) z = zarray + 4*ridx[i]; 434 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 435 if (!usecprow) z += 4; 436 } 437 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 438 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 439 ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr); 440 PetscFunctionReturn(0); 441 } 442 443 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) 444 { 445 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 446 PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray; 447 const PetscScalar *xb,*x; 448 const MatScalar *v; 449 PetscErrorCode ierr; 450 const PetscInt *idx,*ii,*ridx=NULL; 451 PetscInt mbs,i,j,n; 452 PetscBool usecprow=a->compressedrow.use; 453 454 PetscFunctionBegin; 455 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 456 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 457 458 idx = a->j; 459 v = a->a; 460 if (usecprow) { 461 mbs = a->compressedrow.nrows; 462 ii = a->compressedrow.i; 463 ridx = a->compressedrow.rindex; 464 ierr = PetscMemzero(zarray,5*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 465 } else { 466 mbs = a->mbs; 467 ii = a->i; 468 z = zarray; 469 } 470 471 for (i=0; i<mbs; i++) { 472 n = ii[1] - ii[0]; ii++; 473 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 474 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 475 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 476 for (j=0; j<n; j++) { 477 xb = x + 5*(*idx++); 478 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 479 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 480 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 481 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 482 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 483 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 484 v += 25; 485 } 486 if (usecprow) z = zarray + 5*ridx[i]; 487 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 488 if (!usecprow) z += 5; 489 } 490 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 491 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 492 ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr); 493 PetscFunctionReturn(0); 494 } 495 496 497 498 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz) 499 { 500 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 501 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6; 502 const PetscScalar *x,*xb; 503 PetscScalar x1,x2,x3,x4,x5,x6,*zarray; 504 const MatScalar *v; 505 PetscErrorCode ierr; 506 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 507 PetscBool usecprow=a->compressedrow.use; 508 509 PetscFunctionBegin; 510 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 511 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 512 513 idx = a->j; 514 v = a->a; 515 if (usecprow) { 516 mbs = a->compressedrow.nrows; 517 ii = a->compressedrow.i; 518 ridx = a->compressedrow.rindex; 519 ierr = PetscMemzero(zarray,6*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 520 } else { 521 mbs = a->mbs; 522 ii = a->i; 523 z = zarray; 524 } 525 526 for (i=0; i<mbs; i++) { 527 n = ii[1] - ii[0]; 528 ii++; 529 sum1 = 0.0; 530 sum2 = 0.0; 531 sum3 = 0.0; 532 sum4 = 0.0; 533 sum5 = 0.0; 534 sum6 = 0.0; 535 536 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 537 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 538 for (j=0; j<n; j++) { 539 xb = x + 6*(*idx++); 540 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 541 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 542 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 543 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 544 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 545 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 546 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 547 v += 36; 548 } 549 if (usecprow) z = zarray + 6*ridx[i]; 550 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 551 if (!usecprow) z += 6; 552 } 553 554 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 555 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 556 ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr); 557 PetscFunctionReturn(0); 558 } 559 560 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) 561 { 562 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 563 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 564 const PetscScalar *x,*xb; 565 PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray; 566 const MatScalar *v; 567 PetscErrorCode ierr; 568 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 569 PetscBool usecprow=a->compressedrow.use; 570 571 PetscFunctionBegin; 572 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 573 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 574 575 idx = a->j; 576 v = a->a; 577 if (usecprow) { 578 mbs = a->compressedrow.nrows; 579 ii = a->compressedrow.i; 580 ridx = a->compressedrow.rindex; 581 ierr = PetscMemzero(zarray,7*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 582 } else { 583 mbs = a->mbs; 584 ii = a->i; 585 z = zarray; 586 } 587 588 for (i=0; i<mbs; i++) { 589 n = ii[1] - ii[0]; 590 ii++; 591 sum1 = 0.0; 592 sum2 = 0.0; 593 sum3 = 0.0; 594 sum4 = 0.0; 595 sum5 = 0.0; 596 sum6 = 0.0; 597 sum7 = 0.0; 598 599 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 600 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 601 for (j=0; j<n; j++) { 602 xb = x + 7*(*idx++); 603 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 604 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 605 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 606 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 607 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 608 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 609 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 610 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 611 v += 49; 612 } 613 if (usecprow) z = zarray + 7*ridx[i]; 614 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 615 if (!usecprow) z += 7; 616 } 617 618 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 619 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 620 ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr); 621 PetscFunctionReturn(0); 622 } 623 624 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 625 /* Default MatMult for block size 15 */ 626 627 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz) 628 { 629 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 630 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 631 const PetscScalar *x,*xb; 632 PetscScalar *zarray,xv; 633 const MatScalar *v; 634 PetscErrorCode ierr; 635 const PetscInt *ii,*ij=a->j,*idx; 636 PetscInt mbs,i,j,k,n,*ridx=NULL; 637 PetscBool usecprow=a->compressedrow.use; 638 639 PetscFunctionBegin; 640 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 641 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 642 643 v = a->a; 644 if (usecprow) { 645 mbs = a->compressedrow.nrows; 646 ii = a->compressedrow.i; 647 ridx = a->compressedrow.rindex; 648 ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 649 } else { 650 mbs = a->mbs; 651 ii = a->i; 652 z = zarray; 653 } 654 655 for (i=0; i<mbs; i++) { 656 n = ii[i+1] - ii[i]; 657 idx = ij + ii[i]; 658 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 659 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 660 661 for (j=0; j<n; j++) { 662 xb = x + 15*(idx[j]); 663 664 for (k=0; k<15; k++) { 665 xv = xb[k]; 666 sum1 += v[0]*xv; 667 sum2 += v[1]*xv; 668 sum3 += v[2]*xv; 669 sum4 += v[3]*xv; 670 sum5 += v[4]*xv; 671 sum6 += v[5]*xv; 672 sum7 += v[6]*xv; 673 sum8 += v[7]*xv; 674 sum9 += v[8]*xv; 675 sum10 += v[9]*xv; 676 sum11 += v[10]*xv; 677 sum12 += v[11]*xv; 678 sum13 += v[12]*xv; 679 sum14 += v[13]*xv; 680 sum15 += v[14]*xv; 681 v += 15; 682 } 683 } 684 if (usecprow) z = zarray + 15*ridx[i]; 685 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 686 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 687 688 if (!usecprow) z += 15; 689 } 690 691 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 692 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 693 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 694 PetscFunctionReturn(0); 695 } 696 697 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 698 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz) 699 { 700 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 701 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 702 const PetscScalar *x,*xb; 703 PetscScalar x1,x2,x3,x4,*zarray; 704 const MatScalar *v; 705 PetscErrorCode ierr; 706 const PetscInt *ii,*ij=a->j,*idx; 707 PetscInt mbs,i,j,n,*ridx=NULL; 708 PetscBool usecprow=a->compressedrow.use; 709 710 PetscFunctionBegin; 711 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 712 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 713 714 v = a->a; 715 if (usecprow) { 716 mbs = a->compressedrow.nrows; 717 ii = a->compressedrow.i; 718 ridx = a->compressedrow.rindex; 719 ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 720 } else { 721 mbs = a->mbs; 722 ii = a->i; 723 z = zarray; 724 } 725 726 for (i=0; i<mbs; i++) { 727 n = ii[i+1] - ii[i]; 728 idx = ij + ii[i]; 729 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 730 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 731 732 for (j=0; j<n; j++) { 733 xb = x + 15*(idx[j]); 734 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 735 736 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 737 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 738 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 739 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 740 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 741 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 742 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 743 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 744 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 745 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 746 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 747 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 748 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 749 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 750 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 751 752 v += 60; 753 754 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 755 756 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 757 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 758 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 759 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 760 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 761 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 762 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 763 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 764 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 765 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 766 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 767 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 768 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 769 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 770 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 771 v += 60; 772 773 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 774 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 775 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 776 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 777 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 778 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 779 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 780 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 781 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 782 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 783 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 784 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 785 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 786 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 787 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 788 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 789 v += 60; 790 791 x1 = xb[12]; x2 = xb[13]; x3 = xb[14]; 792 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3; 793 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3; 794 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3; 795 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3; 796 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3; 797 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3; 798 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3; 799 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3; 800 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3; 801 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3; 802 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3; 803 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3; 804 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3; 805 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3; 806 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3; 807 v += 45; 808 } 809 if (usecprow) z = zarray + 15*ridx[i]; 810 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 811 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 812 813 if (!usecprow) z += 15; 814 } 815 816 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 817 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 818 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 819 PetscFunctionReturn(0); 820 } 821 822 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 823 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz) 824 { 825 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 826 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 827 const PetscScalar *x,*xb; 828 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray; 829 const MatScalar *v; 830 PetscErrorCode ierr; 831 const PetscInt *ii,*ij=a->j,*idx; 832 PetscInt mbs,i,j,n,*ridx=NULL; 833 PetscBool usecprow=a->compressedrow.use; 834 835 PetscFunctionBegin; 836 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 837 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 838 839 v = a->a; 840 if (usecprow) { 841 mbs = a->compressedrow.nrows; 842 ii = a->compressedrow.i; 843 ridx = a->compressedrow.rindex; 844 ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 845 } else { 846 mbs = a->mbs; 847 ii = a->i; 848 z = zarray; 849 } 850 851 for (i=0; i<mbs; i++) { 852 n = ii[i+1] - ii[i]; 853 idx = ij + ii[i]; 854 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 855 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 856 857 for (j=0; j<n; j++) { 858 xb = x + 15*(idx[j]); 859 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 860 x8 = xb[7]; 861 862 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8; 863 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8; 864 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8; 865 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8; 866 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8; 867 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8; 868 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8; 869 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8; 870 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8; 871 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8; 872 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8; 873 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8; 874 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8; 875 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8; 876 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8; 877 v += 120; 878 879 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14]; 880 881 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7; 882 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7; 883 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7; 884 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7; 885 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7; 886 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7; 887 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7; 888 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7; 889 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7; 890 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7; 891 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7; 892 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7; 893 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7; 894 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7; 895 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7; 896 v += 105; 897 } 898 if (usecprow) z = zarray + 15*ridx[i]; 899 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 900 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 901 902 if (!usecprow) z += 15; 903 } 904 905 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 906 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 907 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 908 PetscFunctionReturn(0); 909 } 910 911 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ 912 913 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz) 914 { 915 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 916 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 917 const PetscScalar *x,*xb; 918 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray; 919 const MatScalar *v; 920 PetscErrorCode ierr; 921 const PetscInt *ii,*ij=a->j,*idx; 922 PetscInt mbs,i,j,n,*ridx=NULL; 923 PetscBool usecprow=a->compressedrow.use; 924 925 PetscFunctionBegin; 926 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 927 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 928 929 v = a->a; 930 if (usecprow) { 931 mbs = a->compressedrow.nrows; 932 ii = a->compressedrow.i; 933 ridx = a->compressedrow.rindex; 934 ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 935 } else { 936 mbs = a->mbs; 937 ii = a->i; 938 z = zarray; 939 } 940 941 for (i=0; i<mbs; i++) { 942 n = ii[i+1] - ii[i]; 943 idx = ij + ii[i]; 944 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 945 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 946 947 for (j=0; j<n; j++) { 948 xb = x + 15*(idx[j]); 949 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 950 x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14]; 951 952 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 953 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 954 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 955 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 956 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 957 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 958 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 959 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 960 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 961 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 962 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 963 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 964 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 965 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 966 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 967 v += 225; 968 } 969 if (usecprow) z = zarray + 15*ridx[i]; 970 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 971 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 972 973 if (!usecprow) z += 15; 974 } 975 976 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 977 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 978 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 979 PetscFunctionReturn(0); 980 } 981 982 983 /* 984 This will not work with MatScalar == float because it calls the BLAS 985 */ 986 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz) 987 { 988 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 989 PetscScalar *z = 0,*work,*workt,*zarray; 990 const PetscScalar *x,*xb; 991 const MatScalar *v; 992 PetscErrorCode ierr; 993 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 994 const PetscInt *idx,*ii,*ridx=NULL; 995 PetscInt ncols,k; 996 PetscBool usecprow=a->compressedrow.use; 997 998 PetscFunctionBegin; 999 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1000 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1001 1002 idx = a->j; 1003 v = a->a; 1004 if (usecprow) { 1005 mbs = a->compressedrow.nrows; 1006 ii = a->compressedrow.i; 1007 ridx = a->compressedrow.rindex; 1008 ierr = PetscMemzero(zarray,bs*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1009 } else { 1010 mbs = a->mbs; 1011 ii = a->i; 1012 z = zarray; 1013 } 1014 1015 if (!a->mult_work) { 1016 k = PetscMax(A->rmap->n,A->cmap->n); 1017 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1018 } 1019 work = a->mult_work; 1020 for (i=0; i<mbs; i++) { 1021 n = ii[1] - ii[0]; ii++; 1022 ncols = n*bs; 1023 workt = work; 1024 for (j=0; j<n; j++) { 1025 xb = x + bs*(*idx++); 1026 for (k=0; k<bs; k++) workt[k] = xb[k]; 1027 workt += bs; 1028 } 1029 if (usecprow) z = zarray + bs*ridx[i]; 1030 PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z); 1031 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */ 1032 v += n*bs2; 1033 if (!usecprow) z += bs; 1034 } 1035 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1036 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1037 ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr); 1038 PetscFunctionReturn(0); 1039 } 1040 1041 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 1042 { 1043 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1044 const PetscScalar *x; 1045 PetscScalar *y,*z,sum; 1046 const MatScalar *v; 1047 PetscErrorCode ierr; 1048 PetscInt mbs=a->mbs,i,n,*ridx=NULL; 1049 const PetscInt *idx,*ii; 1050 PetscBool usecprow=a->compressedrow.use; 1051 1052 PetscFunctionBegin; 1053 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1054 ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 1055 1056 idx = a->j; 1057 v = a->a; 1058 if (usecprow) { 1059 if (zz != yy) { 1060 ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1061 } 1062 mbs = a->compressedrow.nrows; 1063 ii = a->compressedrow.i; 1064 ridx = a->compressedrow.rindex; 1065 } else { 1066 ii = a->i; 1067 } 1068 1069 for (i=0; i<mbs; i++) { 1070 n = ii[1] - ii[0]; 1071 ii++; 1072 if (!usecprow) { 1073 sum = y[i]; 1074 } else { 1075 sum = y[ridx[i]]; 1076 } 1077 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1078 PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1079 PetscSparseDensePlusDot(sum,x,v,idx,n); 1080 v += n; 1081 idx += n; 1082 if (usecprow) { 1083 z[ridx[i]] = sum; 1084 } else { 1085 z[i] = sum; 1086 } 1087 } 1088 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1089 ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 1090 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1091 PetscFunctionReturn(0); 1092 } 1093 1094 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 1095 { 1096 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1097 PetscScalar *y = 0,*z = 0,sum1,sum2; 1098 const PetscScalar *x,*xb; 1099 PetscScalar x1,x2,*yarray,*zarray; 1100 const MatScalar *v; 1101 PetscErrorCode ierr; 1102 PetscInt mbs = a->mbs,i,n,j; 1103 const PetscInt *idx,*ii,*ridx = NULL; 1104 PetscBool usecprow = a->compressedrow.use; 1105 1106 PetscFunctionBegin; 1107 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1108 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1109 1110 idx = a->j; 1111 v = a->a; 1112 if (usecprow) { 1113 if (zz != yy) { 1114 ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1115 } 1116 mbs = a->compressedrow.nrows; 1117 ii = a->compressedrow.i; 1118 ridx = a->compressedrow.rindex; 1119 } else { 1120 ii = a->i; 1121 y = yarray; 1122 z = zarray; 1123 } 1124 1125 for (i=0; i<mbs; i++) { 1126 n = ii[1] - ii[0]; ii++; 1127 if (usecprow) { 1128 z = zarray + 2*ridx[i]; 1129 y = yarray + 2*ridx[i]; 1130 } 1131 sum1 = y[0]; sum2 = y[1]; 1132 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1133 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1134 for (j=0; j<n; j++) { 1135 xb = x + 2*(*idx++); 1136 x1 = xb[0]; 1137 x2 = xb[1]; 1138 1139 sum1 += v[0]*x1 + v[2]*x2; 1140 sum2 += v[1]*x1 + v[3]*x2; 1141 v += 4; 1142 } 1143 z[0] = sum1; z[1] = sum2; 1144 if (!usecprow) { 1145 z += 2; y += 2; 1146 } 1147 } 1148 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1149 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1150 ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr); 1151 PetscFunctionReturn(0); 1152 } 1153 1154 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 1155 { 1156 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1157 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray; 1158 const PetscScalar *x,*xb; 1159 const MatScalar *v; 1160 PetscErrorCode ierr; 1161 PetscInt mbs = a->mbs,i,j,n; 1162 const PetscInt *idx,*ii,*ridx = NULL; 1163 PetscBool usecprow = a->compressedrow.use; 1164 1165 PetscFunctionBegin; 1166 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1167 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1168 1169 idx = a->j; 1170 v = a->a; 1171 if (usecprow) { 1172 if (zz != yy) { 1173 ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1174 } 1175 mbs = a->compressedrow.nrows; 1176 ii = a->compressedrow.i; 1177 ridx = a->compressedrow.rindex; 1178 } else { 1179 ii = a->i; 1180 y = yarray; 1181 z = zarray; 1182 } 1183 1184 for (i=0; i<mbs; i++) { 1185 n = ii[1] - ii[0]; ii++; 1186 if (usecprow) { 1187 z = zarray + 3*ridx[i]; 1188 y = yarray + 3*ridx[i]; 1189 } 1190 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; 1191 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1192 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1193 for (j=0; j<n; j++) { 1194 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1195 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 1196 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 1197 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 1198 v += 9; 1199 } 1200 z[0] = sum1; z[1] = sum2; z[2] = sum3; 1201 if (!usecprow) { 1202 z += 3; y += 3; 1203 } 1204 } 1205 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1206 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1207 ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr); 1208 PetscFunctionReturn(0); 1209 } 1210 1211 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 1212 { 1213 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1214 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray; 1215 const PetscScalar *x,*xb; 1216 const MatScalar *v; 1217 PetscErrorCode ierr; 1218 PetscInt mbs = a->mbs,i,j,n; 1219 const PetscInt *idx,*ii,*ridx=NULL; 1220 PetscBool usecprow=a->compressedrow.use; 1221 1222 PetscFunctionBegin; 1223 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1224 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1225 1226 idx = a->j; 1227 v = a->a; 1228 if (usecprow) { 1229 if (zz != yy) { 1230 ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1231 } 1232 mbs = a->compressedrow.nrows; 1233 ii = a->compressedrow.i; 1234 ridx = a->compressedrow.rindex; 1235 } else { 1236 ii = a->i; 1237 y = yarray; 1238 z = zarray; 1239 } 1240 1241 for (i=0; i<mbs; i++) { 1242 n = ii[1] - ii[0]; ii++; 1243 if (usecprow) { 1244 z = zarray + 4*ridx[i]; 1245 y = yarray + 4*ridx[i]; 1246 } 1247 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; 1248 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1249 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1250 for (j=0; j<n; j++) { 1251 xb = x + 4*(*idx++); 1252 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1253 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1254 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1255 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1256 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1257 v += 16; 1258 } 1259 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 1260 if (!usecprow) { 1261 z += 4; y += 4; 1262 } 1263 } 1264 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1265 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1266 ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr); 1267 PetscFunctionReturn(0); 1268 } 1269 1270 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 1271 { 1272 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1273 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5; 1274 const PetscScalar *x,*xb; 1275 PetscScalar *yarray,*zarray; 1276 const MatScalar *v; 1277 PetscErrorCode ierr; 1278 PetscInt mbs = a->mbs,i,j,n; 1279 const PetscInt *idx,*ii,*ridx = NULL; 1280 PetscBool usecprow=a->compressedrow.use; 1281 1282 PetscFunctionBegin; 1283 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1284 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1285 1286 idx = a->j; 1287 v = a->a; 1288 if (usecprow) { 1289 if (zz != yy) { 1290 ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1291 } 1292 mbs = a->compressedrow.nrows; 1293 ii = a->compressedrow.i; 1294 ridx = a->compressedrow.rindex; 1295 } else { 1296 ii = a->i; 1297 y = yarray; 1298 z = zarray; 1299 } 1300 1301 for (i=0; i<mbs; i++) { 1302 n = ii[1] - ii[0]; ii++; 1303 if (usecprow) { 1304 z = zarray + 5*ridx[i]; 1305 y = yarray + 5*ridx[i]; 1306 } 1307 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; 1308 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1309 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1310 for (j=0; j<n; j++) { 1311 xb = x + 5*(*idx++); 1312 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 1313 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1314 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1315 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1316 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1317 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1318 v += 25; 1319 } 1320 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 1321 if (!usecprow) { 1322 z += 5; y += 5; 1323 } 1324 } 1325 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1326 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1327 ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr); 1328 PetscFunctionReturn(0); 1329 } 1330 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 1331 { 1332 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1333 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6; 1334 const PetscScalar *x,*xb; 1335 PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray; 1336 const MatScalar *v; 1337 PetscErrorCode ierr; 1338 PetscInt mbs = a->mbs,i,j,n; 1339 const PetscInt *idx,*ii,*ridx=NULL; 1340 PetscBool usecprow=a->compressedrow.use; 1341 1342 PetscFunctionBegin; 1343 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1344 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1345 1346 idx = a->j; 1347 v = a->a; 1348 if (usecprow) { 1349 if (zz != yy) { 1350 ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1351 } 1352 mbs = a->compressedrow.nrows; 1353 ii = a->compressedrow.i; 1354 ridx = a->compressedrow.rindex; 1355 } else { 1356 ii = a->i; 1357 y = yarray; 1358 z = zarray; 1359 } 1360 1361 for (i=0; i<mbs; i++) { 1362 n = ii[1] - ii[0]; ii++; 1363 if (usecprow) { 1364 z = zarray + 6*ridx[i]; 1365 y = yarray + 6*ridx[i]; 1366 } 1367 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; 1368 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1369 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1370 for (j=0; j<n; j++) { 1371 xb = x + 6*(*idx++); 1372 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 1373 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1374 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1375 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1376 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1377 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1378 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1379 v += 36; 1380 } 1381 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 1382 if (!usecprow) { 1383 z += 6; y += 6; 1384 } 1385 } 1386 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1387 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1388 ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr); 1389 PetscFunctionReturn(0); 1390 } 1391 1392 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1393 { 1394 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1395 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 1396 const PetscScalar *x,*xb; 1397 PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray; 1398 const MatScalar *v; 1399 PetscErrorCode ierr; 1400 PetscInt mbs = a->mbs,i,j,n; 1401 const PetscInt *idx,*ii,*ridx = NULL; 1402 PetscBool usecprow=a->compressedrow.use; 1403 1404 PetscFunctionBegin; 1405 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1406 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1407 1408 idx = a->j; 1409 v = a->a; 1410 if (usecprow) { 1411 if (zz != yy) { 1412 ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1413 } 1414 mbs = a->compressedrow.nrows; 1415 ii = a->compressedrow.i; 1416 ridx = a->compressedrow.rindex; 1417 } else { 1418 ii = a->i; 1419 y = yarray; 1420 z = zarray; 1421 } 1422 1423 for (i=0; i<mbs; i++) { 1424 n = ii[1] - ii[0]; ii++; 1425 if (usecprow) { 1426 z = zarray + 7*ridx[i]; 1427 y = yarray + 7*ridx[i]; 1428 } 1429 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 1430 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1431 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1432 for (j=0; j<n; j++) { 1433 xb = x + 7*(*idx++); 1434 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1435 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1436 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1437 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1438 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1439 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1440 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1441 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1442 v += 49; 1443 } 1444 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1445 if (!usecprow) { 1446 z += 7; y += 7; 1447 } 1448 } 1449 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1450 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1451 ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr); 1452 PetscFunctionReturn(0); 1453 } 1454 1455 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1456 { 1457 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1458 PetscScalar *z = 0,*work,*workt,*zarray; 1459 const PetscScalar *x,*xb; 1460 const MatScalar *v; 1461 PetscErrorCode ierr; 1462 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 1463 PetscInt ncols,k; 1464 const PetscInt *ridx = NULL,*idx,*ii; 1465 PetscBool usecprow = a->compressedrow.use; 1466 1467 PetscFunctionBegin; 1468 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1469 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1470 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1471 1472 idx = a->j; 1473 v = a->a; 1474 if (usecprow) { 1475 mbs = a->compressedrow.nrows; 1476 ii = a->compressedrow.i; 1477 ridx = a->compressedrow.rindex; 1478 } else { 1479 mbs = a->mbs; 1480 ii = a->i; 1481 z = zarray; 1482 } 1483 1484 if (!a->mult_work) { 1485 k = PetscMax(A->rmap->n,A->cmap->n); 1486 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1487 } 1488 work = a->mult_work; 1489 for (i=0; i<mbs; i++) { 1490 n = ii[1] - ii[0]; ii++; 1491 ncols = n*bs; 1492 workt = work; 1493 for (j=0; j<n; j++) { 1494 xb = x + bs*(*idx++); 1495 for (k=0; k<bs; k++) workt[k] = xb[k]; 1496 workt += bs; 1497 } 1498 if (usecprow) z = zarray + bs*ridx[i]; 1499 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1500 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */ 1501 v += n*bs2; 1502 if (!usecprow) z += bs; 1503 } 1504 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1505 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1506 ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr); 1507 PetscFunctionReturn(0); 1508 } 1509 1510 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1511 { 1512 PetscScalar zero = 0.0; 1513 PetscErrorCode ierr; 1514 1515 PetscFunctionBegin; 1516 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1517 ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1518 PetscFunctionReturn(0); 1519 } 1520 1521 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1522 { 1523 PetscScalar zero = 0.0; 1524 PetscErrorCode ierr; 1525 1526 PetscFunctionBegin; 1527 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1528 ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1529 PetscFunctionReturn(0); 1530 } 1531 1532 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1533 { 1534 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1535 PetscScalar *z,x1,x2,x3,x4,x5; 1536 const PetscScalar *x,*xb = NULL; 1537 const MatScalar *v; 1538 PetscErrorCode ierr; 1539 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n; 1540 const PetscInt *idx,*ii,*ib,*ridx = NULL; 1541 Mat_CompressedRow cprow = a->compressedrow; 1542 PetscBool usecprow = cprow.use; 1543 1544 PetscFunctionBegin; 1545 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 1546 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1547 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1548 1549 idx = a->j; 1550 v = a->a; 1551 if (usecprow) { 1552 mbs = cprow.nrows; 1553 ii = cprow.i; 1554 ridx = cprow.rindex; 1555 } else { 1556 mbs=a->mbs; 1557 ii = a->i; 1558 xb = x; 1559 } 1560 1561 switch (bs) { 1562 case 1: 1563 for (i=0; i<mbs; i++) { 1564 if (usecprow) xb = x + ridx[i]; 1565 x1 = xb[0]; 1566 ib = idx + ii[0]; 1567 n = ii[1] - ii[0]; ii++; 1568 for (j=0; j<n; j++) { 1569 rval = ib[j]; 1570 z[rval] += PetscConj(*v) * x1; 1571 v++; 1572 } 1573 if (!usecprow) xb++; 1574 } 1575 break; 1576 case 2: 1577 for (i=0; i<mbs; i++) { 1578 if (usecprow) xb = x + 2*ridx[i]; 1579 x1 = xb[0]; x2 = xb[1]; 1580 ib = idx + ii[0]; 1581 n = ii[1] - ii[0]; ii++; 1582 for (j=0; j<n; j++) { 1583 rval = ib[j]*2; 1584 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2; 1585 z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2; 1586 v += 4; 1587 } 1588 if (!usecprow) xb += 2; 1589 } 1590 break; 1591 case 3: 1592 for (i=0; i<mbs; i++) { 1593 if (usecprow) xb = x + 3*ridx[i]; 1594 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1595 ib = idx + ii[0]; 1596 n = ii[1] - ii[0]; ii++; 1597 for (j=0; j<n; j++) { 1598 rval = ib[j]*3; 1599 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3; 1600 z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3; 1601 z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3; 1602 v += 9; 1603 } 1604 if (!usecprow) xb += 3; 1605 } 1606 break; 1607 case 4: 1608 for (i=0; i<mbs; i++) { 1609 if (usecprow) xb = x + 4*ridx[i]; 1610 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1611 ib = idx + ii[0]; 1612 n = ii[1] - ii[0]; ii++; 1613 for (j=0; j<n; j++) { 1614 rval = ib[j]*4; 1615 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4; 1616 z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4; 1617 z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4; 1618 z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4; 1619 v += 16; 1620 } 1621 if (!usecprow) xb += 4; 1622 } 1623 break; 1624 case 5: 1625 for (i=0; i<mbs; i++) { 1626 if (usecprow) xb = x + 5*ridx[i]; 1627 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1628 x4 = xb[3]; x5 = xb[4]; 1629 ib = idx + ii[0]; 1630 n = ii[1] - ii[0]; ii++; 1631 for (j=0; j<n; j++) { 1632 rval = ib[j]*5; 1633 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5; 1634 z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5; 1635 z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5; 1636 z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5; 1637 z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5; 1638 v += 25; 1639 } 1640 if (!usecprow) xb += 5; 1641 } 1642 break; 1643 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 1644 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet"); 1645 #if 0 1646 { 1647 PetscInt ncols,k,bs2=a->bs2; 1648 PetscScalar *work,*workt,zb; 1649 const PetscScalar *xtmp; 1650 if (!a->mult_work) { 1651 k = PetscMax(A->rmap->n,A->cmap->n); 1652 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1653 } 1654 work = a->mult_work; 1655 xtmp = x; 1656 for (i=0; i<mbs; i++) { 1657 n = ii[1] - ii[0]; ii++; 1658 ncols = n*bs; 1659 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1660 if (usecprow) xtmp = x + bs*ridx[i]; 1661 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 1662 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 1663 v += n*bs2; 1664 if (!usecprow) xtmp += bs; 1665 workt = work; 1666 for (j=0; j<n; j++) { 1667 zb = z + bs*(*idx++); 1668 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1669 workt += bs; 1670 } 1671 } 1672 } 1673 #endif 1674 } 1675 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1676 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1677 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 1678 PetscFunctionReturn(0); 1679 } 1680 1681 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1682 { 1683 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1684 PetscScalar *zb,*z,x1,x2,x3,x4,x5; 1685 const PetscScalar *x,*xb = 0; 1686 const MatScalar *v; 1687 PetscErrorCode ierr; 1688 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2; 1689 const PetscInt *idx,*ii,*ib,*ridx = NULL; 1690 Mat_CompressedRow cprow = a->compressedrow; 1691 PetscBool usecprow=cprow.use; 1692 1693 PetscFunctionBegin; 1694 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 1695 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1696 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1697 1698 idx = a->j; 1699 v = a->a; 1700 if (usecprow) { 1701 mbs = cprow.nrows; 1702 ii = cprow.i; 1703 ridx = cprow.rindex; 1704 } else { 1705 mbs=a->mbs; 1706 ii = a->i; 1707 xb = x; 1708 } 1709 1710 switch (bs) { 1711 case 1: 1712 for (i=0; i<mbs; i++) { 1713 if (usecprow) xb = x + ridx[i]; 1714 x1 = xb[0]; 1715 ib = idx + ii[0]; 1716 n = ii[1] - ii[0]; ii++; 1717 for (j=0; j<n; j++) { 1718 rval = ib[j]; 1719 z[rval] += *v * x1; 1720 v++; 1721 } 1722 if (!usecprow) xb++; 1723 } 1724 break; 1725 case 2: 1726 for (i=0; i<mbs; i++) { 1727 if (usecprow) xb = x + 2*ridx[i]; 1728 x1 = xb[0]; x2 = xb[1]; 1729 ib = idx + ii[0]; 1730 n = ii[1] - ii[0]; ii++; 1731 for (j=0; j<n; j++) { 1732 rval = ib[j]*2; 1733 z[rval++] += v[0]*x1 + v[1]*x2; 1734 z[rval++] += v[2]*x1 + v[3]*x2; 1735 v += 4; 1736 } 1737 if (!usecprow) xb += 2; 1738 } 1739 break; 1740 case 3: 1741 for (i=0; i<mbs; i++) { 1742 if (usecprow) xb = x + 3*ridx[i]; 1743 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1744 ib = idx + ii[0]; 1745 n = ii[1] - ii[0]; ii++; 1746 for (j=0; j<n; j++) { 1747 rval = ib[j]*3; 1748 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3; 1749 z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3; 1750 z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3; 1751 v += 9; 1752 } 1753 if (!usecprow) xb += 3; 1754 } 1755 break; 1756 case 4: 1757 for (i=0; i<mbs; i++) { 1758 if (usecprow) xb = x + 4*ridx[i]; 1759 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1760 ib = idx + ii[0]; 1761 n = ii[1] - ii[0]; ii++; 1762 for (j=0; j<n; j++) { 1763 rval = ib[j]*4; 1764 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 1765 z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 1766 z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 1767 z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 1768 v += 16; 1769 } 1770 if (!usecprow) xb += 4; 1771 } 1772 break; 1773 case 5: 1774 for (i=0; i<mbs; i++) { 1775 if (usecprow) xb = x + 5*ridx[i]; 1776 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1777 x4 = xb[3]; x5 = xb[4]; 1778 ib = idx + ii[0]; 1779 n = ii[1] - ii[0]; ii++; 1780 for (j=0; j<n; j++) { 1781 rval = ib[j]*5; 1782 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1783 z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1784 z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 1785 z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 1786 z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 1787 v += 25; 1788 } 1789 if (!usecprow) xb += 5; 1790 } 1791 break; 1792 default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 1793 PetscInt ncols,k; 1794 PetscScalar *work,*workt; 1795 const PetscScalar *xtmp; 1796 if (!a->mult_work) { 1797 k = PetscMax(A->rmap->n,A->cmap->n); 1798 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1799 } 1800 work = a->mult_work; 1801 xtmp = x; 1802 for (i=0; i<mbs; i++) { 1803 n = ii[1] - ii[0]; ii++; 1804 ncols = n*bs; 1805 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1806 if (usecprow) xtmp = x + bs*ridx[i]; 1807 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 1808 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 1809 v += n*bs2; 1810 if (!usecprow) xtmp += bs; 1811 workt = work; 1812 for (j=0; j<n; j++) { 1813 zb = z + bs*(*idx++); 1814 for (k=0; k<bs; k++) zb[k] += workt[k]; 1815 workt += bs; 1816 } 1817 } 1818 } 1819 } 1820 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1821 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1822 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 1823 PetscFunctionReturn(0); 1824 } 1825 1826 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha) 1827 { 1828 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data; 1829 PetscInt totalnz = a->bs2*a->nz; 1830 PetscScalar oalpha = alpha; 1831 PetscErrorCode ierr; 1832 PetscBLASInt one = 1,tnz; 1833 1834 PetscFunctionBegin; 1835 ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr); 1836 PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one)); 1837 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 1838 PetscFunctionReturn(0); 1839 } 1840 1841 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm) 1842 { 1843 PetscErrorCode ierr; 1844 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1845 MatScalar *v = a->a; 1846 PetscReal sum = 0.0; 1847 PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1; 1848 1849 PetscFunctionBegin; 1850 if (type == NORM_FROBENIUS) { 1851 #if defined(PETSC_USE_REAL___FP16) 1852 PetscBLASInt one = 1,cnt = bs2*nz; 1853 *norm = BLASnrm2_(&cnt,v,&one); 1854 #else 1855 for (i=0; i<bs2*nz; i++) { 1856 sum += PetscRealPart(PetscConj(*v)*(*v)); v++; 1857 } 1858 #endif 1859 *norm = PetscSqrtReal(sum); 1860 ierr = PetscLogFlops(2*bs2*nz);CHKERRQ(ierr); 1861 } else if (type == NORM_1) { /* maximum column sum */ 1862 PetscReal *tmp; 1863 PetscInt *bcol = a->j; 1864 ierr = PetscCalloc1(A->cmap->n+1,&tmp);CHKERRQ(ierr); 1865 for (i=0; i<nz; i++) { 1866 for (j=0; j<bs; j++) { 1867 k1 = bs*(*bcol) + j; /* column index */ 1868 for (k=0; k<bs; k++) { 1869 tmp[k1] += PetscAbsScalar(*v); v++; 1870 } 1871 } 1872 bcol++; 1873 } 1874 *norm = 0.0; 1875 for (j=0; j<A->cmap->n; j++) { 1876 if (tmp[j] > *norm) *norm = tmp[j]; 1877 } 1878 ierr = PetscFree(tmp);CHKERRQ(ierr); 1879 ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr); 1880 } else if (type == NORM_INFINITY) { /* maximum row sum */ 1881 *norm = 0.0; 1882 for (k=0; k<bs; k++) { 1883 for (j=0; j<a->mbs; j++) { 1884 v = a->a + bs2*a->i[j] + k; 1885 sum = 0.0; 1886 for (i=0; i<a->i[j+1]-a->i[j]; i++) { 1887 for (k1=0; k1<bs; k1++) { 1888 sum += PetscAbsScalar(*v); 1889 v += bs; 1890 } 1891 } 1892 if (sum > *norm) *norm = sum; 1893 } 1894 } 1895 ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr); 1896 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 1897 PetscFunctionReturn(0); 1898 } 1899 1900 1901 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg) 1902 { 1903 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 1904 PetscErrorCode ierr; 1905 1906 PetscFunctionBegin; 1907 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1908 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1909 *flg = PETSC_FALSE; 1910 PetscFunctionReturn(0); 1911 } 1912 1913 /* if the a->i are the same */ 1914 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1915 if (!*flg) PetscFunctionReturn(0); 1916 1917 /* if a->j are the same */ 1918 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1919 if (!*flg) PetscFunctionReturn(0); 1920 1921 /* if a->a are the same */ 1922 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1923 PetscFunctionReturn(0); 1924 1925 } 1926 1927 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v) 1928 { 1929 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1930 PetscErrorCode ierr; 1931 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 1932 PetscScalar *x,zero = 0.0; 1933 MatScalar *aa,*aa_j; 1934 1935 PetscFunctionBegin; 1936 if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1937 bs = A->rmap->bs; 1938 aa = a->a; 1939 ai = a->i; 1940 aj = a->j; 1941 ambs = a->mbs; 1942 bs2 = a->bs2; 1943 1944 ierr = VecSet(v,zero);CHKERRQ(ierr); 1945 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1946 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1947 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1948 for (i=0; i<ambs; i++) { 1949 for (j=ai[i]; j<ai[i+1]; j++) { 1950 if (aj[j] == i) { 1951 row = i*bs; 1952 aa_j = aa+j*bs2; 1953 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1954 break; 1955 } 1956 } 1957 } 1958 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1959 PetscFunctionReturn(0); 1960 } 1961 1962 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr) 1963 { 1964 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1965 const PetscScalar *l,*r,*li,*ri; 1966 PetscScalar x; 1967 MatScalar *aa, *v; 1968 PetscErrorCode ierr; 1969 PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai; 1970 const PetscInt *ai,*aj; 1971 1972 PetscFunctionBegin; 1973 ai = a->i; 1974 aj = a->j; 1975 aa = a->a; 1976 m = A->rmap->n; 1977 n = A->cmap->n; 1978 bs = A->rmap->bs; 1979 mbs = a->mbs; 1980 bs2 = a->bs2; 1981 if (ll) { 1982 ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr); 1983 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1984 if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 1985 for (i=0; i<mbs; i++) { /* for each block row */ 1986 M = ai[i+1] - ai[i]; 1987 li = l + i*bs; 1988 v = aa + bs2*ai[i]; 1989 for (j=0; j<M; j++) { /* for each block */ 1990 for (k=0; k<bs2; k++) { 1991 (*v++) *= li[k%bs]; 1992 } 1993 } 1994 } 1995 ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr); 1996 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 1997 } 1998 1999 if (rr) { 2000 ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr); 2001 ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr); 2002 if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 2003 for (i=0; i<mbs; i++) { /* for each block row */ 2004 iai = ai[i]; 2005 M = ai[i+1] - iai; 2006 v = aa + bs2*iai; 2007 for (j=0; j<M; j++) { /* for each block */ 2008 ri = r + bs*aj[iai+j]; 2009 for (k=0; k<bs; k++) { 2010 x = ri[k]; 2011 for (tmp=0; tmp<bs; tmp++) v[tmp] *= x; 2012 v += bs; 2013 } 2014 } 2015 } 2016 ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr); 2017 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2018 } 2019 PetscFunctionReturn(0); 2020 } 2021 2022 2023 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info) 2024 { 2025 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2026 2027 PetscFunctionBegin; 2028 info->block_size = a->bs2; 2029 info->nz_allocated = a->bs2*a->maxnz; 2030 info->nz_used = a->bs2*a->nz; 2031 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 2032 info->assemblies = A->num_ass; 2033 info->mallocs = A->info.mallocs; 2034 info->memory = ((PetscObject)A)->mem; 2035 if (A->factortype) { 2036 info->fill_ratio_given = A->info.fill_ratio_given; 2037 info->fill_ratio_needed = A->info.fill_ratio_needed; 2038 info->factor_mallocs = A->info.factor_mallocs; 2039 } else { 2040 info->fill_ratio_given = 0; 2041 info->fill_ratio_needed = 0; 2042 info->factor_mallocs = 0; 2043 } 2044 PetscFunctionReturn(0); 2045 } 2046 2047 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 2048 { 2049 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2050 PetscErrorCode ierr; 2051 2052 PetscFunctionBegin; 2053 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 2054 PetscFunctionReturn(0); 2055 } 2056