1 #include <../src/mat/impls/baij/seq/baij.h> 2 #include <../src/mat/impls/dense/seq/dense.h> 3 #include <petsc/private/kernels/blockinvert.h> 4 #include <petscbt.h> 5 #include <petscblaslapack.h> 6 7 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 8 #include <immintrin.h> 9 #endif 10 11 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 12 { 13 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 14 PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival; 15 const PetscInt *idx; 16 PetscInt start,end,*ai,*aj,bs,*nidx2; 17 PetscBT table; 18 19 PetscFunctionBegin; 20 m = a->mbs; 21 ai = a->i; 22 aj = a->j; 23 bs = A->rmap->bs; 24 25 PetscCheck(ov >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 26 27 PetscCall(PetscBTCreate(m,&table)); 28 PetscCall(PetscMalloc1(m+1,&nidx)); 29 PetscCall(PetscMalloc1(A->rmap->N+1,&nidx2)); 30 31 for (i=0; i<is_max; i++) { 32 /* Initialise the two local arrays */ 33 isz = 0; 34 PetscCall(PetscBTMemzero(m,table)); 35 36 /* Extract the indices, assume there can be duplicate entries */ 37 PetscCall(ISGetIndices(is[i],&idx)); 38 PetscCall(ISGetLocalSize(is[i],&n)); 39 40 /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 41 for (j=0; j<n; ++j) { 42 ival = idx[j]/bs; /* convert the indices into block indices */ 43 PetscCheck(ival<m,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 44 if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival; 45 } 46 PetscCall(ISRestoreIndices(is[i],&idx)); 47 PetscCall(ISDestroy(&is[i])); 48 49 k = 0; 50 for (j=0; j<ov; j++) { /* for each overlap*/ 51 n = isz; 52 for (; k<n; k++) { /* do only those rows in nidx[k], which are not done yet */ 53 row = nidx[k]; 54 start = ai[row]; 55 end = ai[row+1]; 56 for (l = start; l<end; l++) { 57 val = aj[l]; 58 if (!PetscBTLookupSet(table,val)) nidx[isz++] = val; 59 } 60 } 61 } 62 /* expand the Index Set */ 63 for (j=0; j<isz; j++) { 64 for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k; 65 } 66 PetscCall(ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i)); 67 } 68 PetscCall(PetscBTDestroy(&table)); 69 PetscCall(PetscFree(nidx)); 70 PetscCall(PetscFree(nidx2)); 71 PetscFunctionReturn(0); 72 } 73 74 PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 75 { 76 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c; 77 PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 78 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 79 const PetscInt *irow,*icol; 80 PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 81 PetscInt *aj = a->j,*ai = a->i; 82 MatScalar *mat_a; 83 Mat C; 84 PetscBool flag; 85 86 PetscFunctionBegin; 87 PetscCall(ISGetIndices(isrow,&irow)); 88 PetscCall(ISGetIndices(iscol,&icol)); 89 PetscCall(ISGetLocalSize(isrow,&nrows)); 90 PetscCall(ISGetLocalSize(iscol,&ncols)); 91 92 PetscCall(PetscCalloc1(1+oldcols,&smap)); 93 ssmap = smap; 94 PetscCall(PetscMalloc1(1+nrows,&lens)); 95 for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 96 /* determine lens of each row */ 97 for (i=0; i<nrows; i++) { 98 kstart = ai[irow[i]]; 99 kend = kstart + a->ilen[irow[i]]; 100 lens[i] = 0; 101 for (k=kstart; k<kend; k++) { 102 if (ssmap[aj[k]]) lens[i]++; 103 } 104 } 105 /* Create and fill new matrix */ 106 if (scall == MAT_REUSE_MATRIX) { 107 c = (Mat_SeqBAIJ*)((*B)->data); 108 109 PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 110 PetscCall(PetscArraycmp(c->ilen,lens,c->mbs,&flag)); 111 PetscCheck(flag,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 112 PetscCall(PetscArrayzero(c->ilen,c->mbs)); 113 C = *B; 114 } else { 115 PetscCall(MatCreate(PetscObjectComm((PetscObject)A),&C)); 116 PetscCall(MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE)); 117 PetscCall(MatSetType(C,((PetscObject)A)->type_name)); 118 PetscCall(MatSeqBAIJSetPreallocation(C,bs,0,lens)); 119 } 120 c = (Mat_SeqBAIJ*)(C->data); 121 for (i=0; i<nrows; i++) { 122 row = irow[i]; 123 kstart = ai[row]; 124 kend = kstart + a->ilen[row]; 125 mat_i = c->i[i]; 126 mat_j = c->j ? c->j + mat_i : NULL; /* mustn't add to NULL, that is UB */ 127 mat_a = c->a ? c->a + mat_i*bs2 : NULL; /* mustn't add to NULL, that is UB */ 128 mat_ilen = c->ilen + i; 129 for (k=kstart; k<kend; k++) { 130 if ((tcol=ssmap[a->j[k]])) { 131 *mat_j++ = tcol - 1; 132 PetscCall(PetscArraycpy(mat_a,a->a+k*bs2,bs2)); 133 mat_a += bs2; 134 (*mat_ilen)++; 135 } 136 } 137 } 138 /* sort */ 139 if (c->j && c->a) { 140 MatScalar *work; 141 PetscCall(PetscMalloc1(bs2,&work)); 142 for (i=0; i<nrows; i++) { 143 PetscInt ilen; 144 mat_i = c->i[i]; 145 mat_j = c->j + mat_i; 146 mat_a = c->a + mat_i*bs2; 147 ilen = c->ilen[i]; 148 PetscCall(PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work)); 149 } 150 PetscCall(PetscFree(work)); 151 } 152 153 /* Free work space */ 154 PetscCall(ISRestoreIndices(iscol,&icol)); 155 PetscCall(PetscFree(smap)); 156 PetscCall(PetscFree(lens)); 157 PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY)); 158 PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY)); 159 160 PetscCall(ISRestoreIndices(isrow,&irow)); 161 *B = C; 162 PetscFunctionReturn(0); 163 } 164 165 PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 166 { 167 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 168 IS is1,is2; 169 PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j; 170 const PetscInt *irow,*icol; 171 172 PetscFunctionBegin; 173 PetscCall(ISGetIndices(isrow,&irow)); 174 PetscCall(ISGetIndices(iscol,&icol)); 175 PetscCall(ISGetLocalSize(isrow,&nrows)); 176 PetscCall(ISGetLocalSize(iscol,&ncols)); 177 178 /* Verify if the indices corespond to each element in a block 179 and form the IS with compressed IS */ 180 maxmnbs = PetscMax(a->mbs,a->nbs); 181 PetscCall(PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary)); 182 PetscCall(PetscArrayzero(vary,a->mbs)); 183 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 184 for (i=0; i<a->mbs; i++) { 185 PetscCheck(vary[i] == 0 || vary[i] == bs,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 186 } 187 count = 0; 188 for (i=0; i<nrows; i++) { 189 j = irow[i] / bs; 190 if ((vary[j]--)==bs) iary[count++] = j; 191 } 192 PetscCall(ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1)); 193 194 PetscCall(PetscArrayzero(vary,a->nbs)); 195 for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 196 for (i=0; i<a->nbs; i++) { 197 PetscCheck(vary[i] == 0 || vary[i] == bs,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc"); 198 } 199 count = 0; 200 for (i=0; i<ncols; i++) { 201 j = icol[i] / bs; 202 if ((vary[j]--)==bs) iary[count++] = j; 203 } 204 PetscCall(ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2)); 205 PetscCall(ISRestoreIndices(isrow,&irow)); 206 PetscCall(ISRestoreIndices(iscol,&icol)); 207 PetscCall(PetscFree2(vary,iary)); 208 209 PetscCall(MatCreateSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B)); 210 PetscCall(ISDestroy(&is1)); 211 PetscCall(ISDestroy(&is2)); 212 PetscFunctionReturn(0); 213 } 214 215 PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C) 216 { 217 Mat_SeqBAIJ *c = (Mat_SeqBAIJ*)C->data; 218 Mat_SubSppt *submatj = c->submatis1; 219 220 PetscFunctionBegin; 221 PetscCall((*submatj->destroy)(C)); 222 PetscCall(MatDestroySubMatrix_Private(submatj)); 223 PetscFunctionReturn(0); 224 } 225 226 /* Note this has code duplication with MatDestroySubMatrices_SeqAIJ() */ 227 PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n,Mat *mat[]) 228 { 229 PetscInt i; 230 Mat C; 231 Mat_SeqBAIJ *c; 232 Mat_SubSppt *submatj; 233 234 PetscFunctionBegin; 235 for (i=0; i<n; i++) { 236 C = (*mat)[i]; 237 c = (Mat_SeqBAIJ*)C->data; 238 submatj = c->submatis1; 239 if (submatj) { 240 if (--((PetscObject)C)->refct <= 0) { 241 PetscCall((*submatj->destroy)(C)); 242 PetscCall(MatDestroySubMatrix_Private(submatj)); 243 PetscCall(PetscFree(C->defaultvectype)); 244 PetscCall(PetscLayoutDestroy(&C->rmap)); 245 PetscCall(PetscLayoutDestroy(&C->cmap)); 246 PetscCall(PetscHeaderDestroy(&C)); 247 } 248 } else { 249 PetscCall(MatDestroy(&C)); 250 } 251 } 252 253 /* Destroy Dummy submatrices created for reuse */ 254 PetscCall(MatDestroySubMatrices_Dummy(n,mat)); 255 256 PetscCall(PetscFree(*mat)); 257 PetscFunctionReturn(0); 258 } 259 260 PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 261 { 262 PetscInt i; 263 264 PetscFunctionBegin; 265 if (scall == MAT_INITIAL_MATRIX) { 266 PetscCall(PetscCalloc1(n+1,B)); 267 } 268 269 for (i=0; i<n; i++) { 270 PetscCall(MatCreateSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i])); 271 } 272 PetscFunctionReturn(0); 273 } 274 275 /* -------------------------------------------------------*/ 276 /* Should check that shapes of vectors and matrices match */ 277 /* -------------------------------------------------------*/ 278 279 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz) 280 { 281 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 282 PetscScalar *z,sum; 283 const PetscScalar *x; 284 const MatScalar *v; 285 PetscInt mbs,i,n; 286 const PetscInt *idx,*ii,*ridx=NULL; 287 PetscBool usecprow=a->compressedrow.use; 288 289 PetscFunctionBegin; 290 PetscCall(VecGetArrayRead(xx,&x)); 291 PetscCall(VecGetArrayWrite(zz,&z)); 292 293 if (usecprow) { 294 mbs = a->compressedrow.nrows; 295 ii = a->compressedrow.i; 296 ridx = a->compressedrow.rindex; 297 PetscCall(PetscArrayzero(z,a->mbs)); 298 } else { 299 mbs = a->mbs; 300 ii = a->i; 301 } 302 303 for (i=0; i<mbs; i++) { 304 n = ii[1] - ii[0]; 305 v = a->a + ii[0]; 306 idx = a->j + ii[0]; 307 ii++; 308 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 309 PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 310 sum = 0.0; 311 PetscSparseDensePlusDot(sum,x,v,idx,n); 312 if (usecprow) { 313 z[ridx[i]] = sum; 314 } else { 315 z[i] = sum; 316 } 317 } 318 PetscCall(VecRestoreArrayRead(xx,&x)); 319 PetscCall(VecRestoreArrayWrite(zz,&z)); 320 PetscCall(PetscLogFlops(2.0*a->nz - a->nonzerorowcnt)); 321 PetscFunctionReturn(0); 322 } 323 324 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) 325 { 326 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 327 PetscScalar *z = NULL,sum1,sum2,*zarray; 328 const PetscScalar *x,*xb; 329 PetscScalar x1,x2; 330 const MatScalar *v; 331 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 332 PetscBool usecprow=a->compressedrow.use; 333 334 PetscFunctionBegin; 335 PetscCall(VecGetArrayRead(xx,&x)); 336 PetscCall(VecGetArrayWrite(zz,&zarray)); 337 338 idx = a->j; 339 v = a->a; 340 if (usecprow) { 341 mbs = a->compressedrow.nrows; 342 ii = a->compressedrow.i; 343 ridx = a->compressedrow.rindex; 344 PetscCall(PetscArrayzero(zarray,2*a->mbs)); 345 } else { 346 mbs = a->mbs; 347 ii = a->i; 348 z = zarray; 349 } 350 351 for (i=0; i<mbs; i++) { 352 n = ii[1] - ii[0]; ii++; 353 sum1 = 0.0; sum2 = 0.0; 354 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 355 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 356 for (j=0; j<n; j++) { 357 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 358 sum1 += v[0]*x1 + v[2]*x2; 359 sum2 += v[1]*x1 + v[3]*x2; 360 v += 4; 361 } 362 if (usecprow) z = zarray + 2*ridx[i]; 363 z[0] = sum1; z[1] = sum2; 364 if (!usecprow) z += 2; 365 } 366 PetscCall(VecRestoreArrayRead(xx,&x)); 367 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 368 PetscCall(PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt)); 369 PetscFunctionReturn(0); 370 } 371 372 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) 373 { 374 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 375 PetscScalar *z = NULL,sum1,sum2,sum3,x1,x2,x3,*zarray; 376 const PetscScalar *x,*xb; 377 const MatScalar *v; 378 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 379 PetscBool usecprow=a->compressedrow.use; 380 381 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 382 #pragma disjoint(*v,*z,*xb) 383 #endif 384 385 PetscFunctionBegin; 386 PetscCall(VecGetArrayRead(xx,&x)); 387 PetscCall(VecGetArrayWrite(zz,&zarray)); 388 389 idx = a->j; 390 v = a->a; 391 if (usecprow) { 392 mbs = a->compressedrow.nrows; 393 ii = a->compressedrow.i; 394 ridx = a->compressedrow.rindex; 395 PetscCall(PetscArrayzero(zarray,3*a->mbs)); 396 } else { 397 mbs = a->mbs; 398 ii = a->i; 399 z = zarray; 400 } 401 402 for (i=0; i<mbs; i++) { 403 n = ii[1] - ii[0]; ii++; 404 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 405 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 406 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 407 for (j=0; j<n; j++) { 408 xb = x + 3*(*idx++); 409 x1 = xb[0]; 410 x2 = xb[1]; 411 x3 = xb[2]; 412 413 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 414 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 415 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 416 v += 9; 417 } 418 if (usecprow) z = zarray + 3*ridx[i]; 419 z[0] = sum1; z[1] = sum2; z[2] = sum3; 420 if (!usecprow) z += 3; 421 } 422 PetscCall(VecRestoreArrayRead(xx,&x)); 423 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 424 PetscCall(PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt)); 425 PetscFunctionReturn(0); 426 } 427 428 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) 429 { 430 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 431 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray; 432 const PetscScalar *x,*xb; 433 const MatScalar *v; 434 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 435 PetscBool usecprow=a->compressedrow.use; 436 437 PetscFunctionBegin; 438 PetscCall(VecGetArrayRead(xx,&x)); 439 PetscCall(VecGetArrayWrite(zz,&zarray)); 440 441 idx = a->j; 442 v = a->a; 443 if (usecprow) { 444 mbs = a->compressedrow.nrows; 445 ii = a->compressedrow.i; 446 ridx = a->compressedrow.rindex; 447 PetscCall(PetscArrayzero(zarray,4*a->mbs)); 448 } else { 449 mbs = a->mbs; 450 ii = a->i; 451 z = zarray; 452 } 453 454 for (i=0; i<mbs; i++) { 455 n = ii[1] - ii[0]; 456 ii++; 457 sum1 = 0.0; 458 sum2 = 0.0; 459 sum3 = 0.0; 460 sum4 = 0.0; 461 462 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 463 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 464 for (j=0; j<n; j++) { 465 xb = x + 4*(*idx++); 466 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 467 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 468 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 469 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 470 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 471 v += 16; 472 } 473 if (usecprow) z = zarray + 4*ridx[i]; 474 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 475 if (!usecprow) z += 4; 476 } 477 PetscCall(VecRestoreArrayRead(xx,&x)); 478 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 479 PetscCall(PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt)); 480 PetscFunctionReturn(0); 481 } 482 483 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) 484 { 485 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 486 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*zarray; 487 const PetscScalar *xb,*x; 488 const MatScalar *v; 489 const PetscInt *idx,*ii,*ridx=NULL; 490 PetscInt mbs,i,j,n; 491 PetscBool usecprow=a->compressedrow.use; 492 493 PetscFunctionBegin; 494 PetscCall(VecGetArrayRead(xx,&x)); 495 PetscCall(VecGetArrayWrite(zz,&zarray)); 496 497 idx = a->j; 498 v = a->a; 499 if (usecprow) { 500 mbs = a->compressedrow.nrows; 501 ii = a->compressedrow.i; 502 ridx = a->compressedrow.rindex; 503 PetscCall(PetscArrayzero(zarray,5*a->mbs)); 504 } else { 505 mbs = a->mbs; 506 ii = a->i; 507 z = zarray; 508 } 509 510 for (i=0; i<mbs; i++) { 511 n = ii[1] - ii[0]; ii++; 512 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 513 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 514 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 515 for (j=0; j<n; j++) { 516 xb = x + 5*(*idx++); 517 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 518 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 519 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 520 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 521 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 522 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 523 v += 25; 524 } 525 if (usecprow) z = zarray + 5*ridx[i]; 526 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 527 if (!usecprow) z += 5; 528 } 529 PetscCall(VecRestoreArrayRead(xx,&x)); 530 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 531 PetscCall(PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt)); 532 PetscFunctionReturn(0); 533 } 534 535 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz) 536 { 537 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 538 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6; 539 const PetscScalar *x,*xb; 540 PetscScalar x1,x2,x3,x4,x5,x6,*zarray; 541 const MatScalar *v; 542 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 543 PetscBool usecprow=a->compressedrow.use; 544 545 PetscFunctionBegin; 546 PetscCall(VecGetArrayRead(xx,&x)); 547 PetscCall(VecGetArrayWrite(zz,&zarray)); 548 549 idx = a->j; 550 v = a->a; 551 if (usecprow) { 552 mbs = a->compressedrow.nrows; 553 ii = a->compressedrow.i; 554 ridx = a->compressedrow.rindex; 555 PetscCall(PetscArrayzero(zarray,6*a->mbs)); 556 } else { 557 mbs = a->mbs; 558 ii = a->i; 559 z = zarray; 560 } 561 562 for (i=0; i<mbs; i++) { 563 n = ii[1] - ii[0]; 564 ii++; 565 sum1 = 0.0; 566 sum2 = 0.0; 567 sum3 = 0.0; 568 sum4 = 0.0; 569 sum5 = 0.0; 570 sum6 = 0.0; 571 572 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 573 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 574 for (j=0; j<n; j++) { 575 xb = x + 6*(*idx++); 576 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 577 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 578 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 579 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 580 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 581 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 582 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 583 v += 36; 584 } 585 if (usecprow) z = zarray + 6*ridx[i]; 586 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 587 if (!usecprow) z += 6; 588 } 589 590 PetscCall(VecRestoreArrayRead(xx,&x)); 591 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 592 PetscCall(PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt)); 593 PetscFunctionReturn(0); 594 } 595 596 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) 597 { 598 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 599 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 600 const PetscScalar *x,*xb; 601 PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray; 602 const MatScalar *v; 603 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 604 PetscBool usecprow=a->compressedrow.use; 605 606 PetscFunctionBegin; 607 PetscCall(VecGetArrayRead(xx,&x)); 608 PetscCall(VecGetArrayWrite(zz,&zarray)); 609 610 idx = a->j; 611 v = a->a; 612 if (usecprow) { 613 mbs = a->compressedrow.nrows; 614 ii = a->compressedrow.i; 615 ridx = a->compressedrow.rindex; 616 PetscCall(PetscArrayzero(zarray,7*a->mbs)); 617 } else { 618 mbs = a->mbs; 619 ii = a->i; 620 z = zarray; 621 } 622 623 for (i=0; i<mbs; i++) { 624 n = ii[1] - ii[0]; 625 ii++; 626 sum1 = 0.0; 627 sum2 = 0.0; 628 sum3 = 0.0; 629 sum4 = 0.0; 630 sum5 = 0.0; 631 sum6 = 0.0; 632 sum7 = 0.0; 633 634 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 635 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 636 for (j=0; j<n; j++) { 637 xb = x + 7*(*idx++); 638 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 639 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 640 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 641 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 642 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 643 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 644 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 645 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 646 v += 49; 647 } 648 if (usecprow) z = zarray + 7*ridx[i]; 649 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 650 if (!usecprow) z += 7; 651 } 652 653 PetscCall(VecRestoreArrayRead(xx,&x)); 654 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 655 PetscCall(PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt)); 656 PetscFunctionReturn(0); 657 } 658 659 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 660 PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec zz) 661 { 662 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 663 PetscScalar *z = NULL,*work,*workt,*zarray; 664 const PetscScalar *x,*xb; 665 const MatScalar *v; 666 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 667 const PetscInt *idx,*ii,*ridx=NULL; 668 PetscInt k; 669 PetscBool usecprow=a->compressedrow.use; 670 671 __m256d a0,a1,a2,a3,a4,a5; 672 __m256d w0,w1,w2,w3; 673 __m256d z0,z1,z2; 674 __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63); 675 676 PetscFunctionBegin; 677 PetscCall(VecGetArrayRead(xx,&x)); 678 PetscCall(VecGetArrayWrite(zz,&zarray)); 679 680 idx = a->j; 681 v = a->a; 682 if (usecprow) { 683 mbs = a->compressedrow.nrows; 684 ii = a->compressedrow.i; 685 ridx = a->compressedrow.rindex; 686 PetscCall(PetscArrayzero(zarray,bs*a->mbs)); 687 } else { 688 mbs = a->mbs; 689 ii = a->i; 690 z = zarray; 691 } 692 693 if (!a->mult_work) { 694 k = PetscMax(A->rmap->n,A->cmap->n); 695 PetscCall(PetscMalloc1(k+1,&a->mult_work)); 696 } 697 698 work = a->mult_work; 699 for (i=0; i<mbs; i++) { 700 n = ii[1] - ii[0]; ii++; 701 workt = work; 702 for (j=0; j<n; j++) { 703 xb = x + bs*(*idx++); 704 for (k=0; k<bs; k++) workt[k] = xb[k]; 705 workt += bs; 706 } 707 if (usecprow) z = zarray + bs*ridx[i]; 708 709 z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd(); 710 711 for (j=0; j<n; j++) { 712 /* first column of a */ 713 w0 = _mm256_set1_pd(work[j*9 ]); 714 a0 = _mm256_loadu_pd(&v[j*81 ]); z0 = _mm256_fmadd_pd(a0,w0,z0); 715 a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1); 716 a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2); 717 718 /* second column of a */ 719 w1 = _mm256_set1_pd(work[j*9+ 1]); 720 a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0); 721 a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1); 722 a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2); 723 724 /* third column of a */ 725 w2 = _mm256_set1_pd(work[j*9 +2]); 726 a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0); 727 a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1); 728 a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2); 729 730 /* fourth column of a */ 731 w3 = _mm256_set1_pd(work[j*9+ 3]); 732 a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0); 733 a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1); 734 a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2); 735 736 /* fifth column of a */ 737 w0 = _mm256_set1_pd(work[j*9+ 4]); 738 a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0); 739 a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1); 740 a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2); 741 742 /* sixth column of a */ 743 w1 = _mm256_set1_pd(work[j*9+ 5]); 744 a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0); 745 a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1); 746 a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2); 747 748 /* seventh column of a */ 749 w2 = _mm256_set1_pd(work[j*9+ 6]); 750 a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0); 751 a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1); 752 a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2); 753 754 /* eighth column of a */ 755 w3 = _mm256_set1_pd(work[j*9+ 7]); 756 a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0); 757 a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1); 758 a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2); 759 760 /* ninth column of a */ 761 w0 = _mm256_set1_pd(work[j*9+ 8]); 762 a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0); 763 a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1); 764 a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2); 765 } 766 767 _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2); 768 769 v += n*bs2; 770 if (!usecprow) z += bs; 771 } 772 PetscCall(VecRestoreArrayRead(xx,&x)); 773 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 774 PetscCall(PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt)); 775 PetscFunctionReturn(0); 776 } 777 #endif 778 779 PetscErrorCode MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz) 780 { 781 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 782 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11; 783 const PetscScalar *x,*xb; 784 PetscScalar *zarray,xv; 785 const MatScalar *v; 786 const PetscInt *ii,*ij=a->j,*idx; 787 PetscInt mbs,i,j,k,n,*ridx=NULL; 788 PetscBool usecprow=a->compressedrow.use; 789 790 PetscFunctionBegin; 791 PetscCall(VecGetArrayRead(xx,&x)); 792 PetscCall(VecGetArrayWrite(zz,&zarray)); 793 794 v = a->a; 795 if (usecprow) { 796 mbs = a->compressedrow.nrows; 797 ii = a->compressedrow.i; 798 ridx = a->compressedrow.rindex; 799 PetscCall(PetscArrayzero(zarray,11*a->mbs)); 800 } else { 801 mbs = a->mbs; 802 ii = a->i; 803 z = zarray; 804 } 805 806 for (i=0; i<mbs; i++) { 807 n = ii[i+1] - ii[i]; 808 idx = ij + ii[i]; 809 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 810 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; 811 812 for (j=0; j<n; j++) { 813 xb = x + 11*(idx[j]); 814 815 for (k=0; k<11; k++) { 816 xv = xb[k]; 817 sum1 += v[0]*xv; 818 sum2 += v[1]*xv; 819 sum3 += v[2]*xv; 820 sum4 += v[3]*xv; 821 sum5 += v[4]*xv; 822 sum6 += v[5]*xv; 823 sum7 += v[6]*xv; 824 sum8 += v[7]*xv; 825 sum9 += v[8]*xv; 826 sum10 += v[9]*xv; 827 sum11 += v[10]*xv; 828 v += 11; 829 } 830 } 831 if (usecprow) z = zarray + 11*ridx[i]; 832 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 833 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; 834 835 if (!usecprow) z += 11; 836 } 837 838 PetscCall(VecRestoreArrayRead(xx,&x)); 839 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 840 PetscCall(PetscLogFlops(242.0*a->nz - 11.0*a->nonzerorowcnt)); 841 PetscFunctionReturn(0); 842 } 843 844 /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */ 845 PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A,Vec xx,Vec zz) 846 { 847 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 848 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12; 849 const PetscScalar *x,*xb; 850 PetscScalar *zarray,xv; 851 const MatScalar *v; 852 const PetscInt *ii,*ij=a->j,*idx; 853 PetscInt mbs,i,j,k,n,*ridx=NULL; 854 PetscBool usecprow=a->compressedrow.use; 855 856 PetscFunctionBegin; 857 PetscCall(VecGetArrayRead(xx,&x)); 858 PetscCall(VecGetArrayWrite(zz,&zarray)); 859 860 v = a->a; 861 if (usecprow) { 862 mbs = a->compressedrow.nrows; 863 ii = a->compressedrow.i; 864 ridx = a->compressedrow.rindex; 865 PetscCall(PetscArrayzero(zarray,12*a->mbs)); 866 } else { 867 mbs = a->mbs; 868 ii = a->i; 869 z = zarray; 870 } 871 872 for (i=0; i<mbs; i++) { 873 n = ii[i+1] - ii[i]; 874 idx = ij + ii[i]; 875 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 876 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; 877 878 for (j=0; j<n; j++) { 879 xb = x + 12*(idx[j]); 880 881 for (k=0; k<12; k++) { 882 xv = xb[k]; 883 sum1 += v[0]*xv; 884 sum2 += v[1]*xv; 885 sum3 += v[2]*xv; 886 sum4 += v[3]*xv; 887 sum5 += v[4]*xv; 888 sum6 += v[5]*xv; 889 sum7 += v[6]*xv; 890 sum8 += v[7]*xv; 891 sum9 += v[8]*xv; 892 sum10 += v[9]*xv; 893 sum11 += v[10]*xv; 894 sum12 += v[11]*xv; 895 v += 12; 896 } 897 } 898 if (usecprow) z = zarray + 12*ridx[i]; 899 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 900 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; 901 if (!usecprow) z += 12; 902 } 903 PetscCall(VecRestoreArrayRead(xx,&x)); 904 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 905 PetscCall(PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt)); 906 PetscFunctionReturn(0); 907 } 908 909 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A,Vec xx,Vec yy,Vec zz) 910 { 911 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 912 PetscScalar *z = NULL,*y = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12; 913 const PetscScalar *x,*xb; 914 PetscScalar *zarray,*yarray,xv; 915 const MatScalar *v; 916 const PetscInt *ii,*ij=a->j,*idx; 917 PetscInt mbs = a->mbs,i,j,k,n,*ridx=NULL; 918 PetscBool usecprow=a->compressedrow.use; 919 920 PetscFunctionBegin; 921 PetscCall(VecGetArrayRead(xx,&x)); 922 PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray)); 923 924 v = a->a; 925 if (usecprow) { 926 if (zz != yy) { 927 PetscCall(PetscArraycpy(zarray,yarray,12*mbs)); 928 } 929 mbs = a->compressedrow.nrows; 930 ii = a->compressedrow.i; 931 ridx = a->compressedrow.rindex; 932 } else { 933 ii = a->i; 934 y = yarray; 935 z = zarray; 936 } 937 938 for (i=0; i<mbs; i++) { 939 n = ii[i+1] - ii[i]; 940 idx = ij + ii[i]; 941 942 if (usecprow) { 943 y = yarray + 12*ridx[i]; 944 z = zarray + 12*ridx[i]; 945 } 946 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 947 sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; sum12 = y[11]; 948 949 for (j=0; j<n; j++) { 950 xb = x + 12*(idx[j]); 951 952 for (k=0; k<12; k++) { 953 xv = xb[k]; 954 sum1 += v[0]*xv; 955 sum2 += v[1]*xv; 956 sum3 += v[2]*xv; 957 sum4 += v[3]*xv; 958 sum5 += v[4]*xv; 959 sum6 += v[5]*xv; 960 sum7 += v[6]*xv; 961 sum8 += v[7]*xv; 962 sum9 += v[8]*xv; 963 sum10 += v[9]*xv; 964 sum11 += v[10]*xv; 965 sum12 += v[11]*xv; 966 v += 12; 967 } 968 } 969 970 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 971 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; 972 if (!usecprow) { 973 y += 12; 974 z += 12; 975 } 976 } 977 PetscCall(VecRestoreArrayRead(xx,&x)); 978 PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray)); 979 PetscCall(PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt)); 980 PetscFunctionReturn(0); 981 } 982 983 /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */ 984 PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A,Vec xx,Vec zz) 985 { 986 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 987 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12; 988 const PetscScalar *x,*xb; 989 PetscScalar x1,x2,x3,x4,*zarray; 990 const MatScalar *v; 991 const PetscInt *ii,*ij=a->j,*idx,*ridx=NULL; 992 PetscInt mbs,i,j,n; 993 PetscBool usecprow=a->compressedrow.use; 994 995 PetscFunctionBegin; 996 PetscCall(VecGetArrayRead(xx,&x)); 997 PetscCall(VecGetArrayWrite(zz,&zarray)); 998 999 v = a->a; 1000 if (usecprow) { 1001 mbs = a->compressedrow.nrows; 1002 ii = a->compressedrow.i; 1003 ridx = a->compressedrow.rindex; 1004 PetscCall(PetscArrayzero(zarray,12*a->mbs)); 1005 } else { 1006 mbs = a->mbs; 1007 ii = a->i; 1008 z = zarray; 1009 } 1010 1011 for (i=0; i<mbs; i++) { 1012 n = ii[i+1] - ii[i]; 1013 idx = ij + ii[i]; 1014 1015 sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0; 1016 for (j=0; j<n; j++) { 1017 xb = x + 12*(idx[j]); 1018 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1019 1020 sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4; 1021 sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4; 1022 sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4; 1023 sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4; 1024 sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4; 1025 sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4; 1026 sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4; 1027 sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4; 1028 sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4; 1029 sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4; 1030 sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4; 1031 sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4; 1032 v += 48; 1033 1034 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 1035 1036 sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4; 1037 sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4; 1038 sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4; 1039 sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4; 1040 sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4; 1041 sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4; 1042 sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4; 1043 sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4; 1044 sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4; 1045 sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4; 1046 sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4; 1047 sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4; 1048 v += 48; 1049 1050 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 1051 sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4; 1052 sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4; 1053 sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4; 1054 sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4; 1055 sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4; 1056 sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4; 1057 sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4; 1058 sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4; 1059 sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4; 1060 sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4; 1061 sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4; 1062 sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4; 1063 v += 48; 1064 1065 } 1066 if (usecprow) z = zarray + 12*ridx[i]; 1067 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1068 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; 1069 if (!usecprow) z += 12; 1070 } 1071 PetscCall(VecRestoreArrayRead(xx,&x)); 1072 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 1073 PetscCall(PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt)); 1074 PetscFunctionReturn(0); 1075 } 1076 1077 /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */ 1078 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A,Vec xx,Vec yy,Vec zz) 1079 { 1080 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1081 PetscScalar *z = NULL,*y = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12; 1082 const PetscScalar *x,*xb; 1083 PetscScalar x1,x2,x3,x4,*zarray,*yarray; 1084 const MatScalar *v; 1085 const PetscInt *ii,*ij=a->j,*idx,*ridx=NULL; 1086 PetscInt mbs = a->mbs,i,j,n; 1087 PetscBool usecprow=a->compressedrow.use; 1088 1089 PetscFunctionBegin; 1090 PetscCall(VecGetArrayRead(xx,&x)); 1091 PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray)); 1092 1093 v = a->a; 1094 if (usecprow) { 1095 if (zz != yy) { 1096 PetscCall(PetscArraycpy(zarray,yarray,12*mbs)); 1097 } 1098 mbs = a->compressedrow.nrows; 1099 ii = a->compressedrow.i; 1100 ridx = a->compressedrow.rindex; 1101 } else { 1102 ii = a->i; 1103 y = yarray; 1104 z = zarray; 1105 } 1106 1107 for (i=0; i<mbs; i++) { 1108 n = ii[i+1] - ii[i]; 1109 idx = ij + ii[i]; 1110 1111 if (usecprow) { 1112 y = yarray + 12*ridx[i]; 1113 z = zarray + 12*ridx[i]; 1114 } 1115 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 1116 sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; sum12 = y[11]; 1117 1118 for (j=0; j<n; j++) { 1119 xb = x + 12*(idx[j]); 1120 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1121 1122 sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4; 1123 sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4; 1124 sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4; 1125 sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4; 1126 sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4; 1127 sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4; 1128 sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4; 1129 sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4; 1130 sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4; 1131 sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4; 1132 sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4; 1133 sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4; 1134 v += 48; 1135 1136 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 1137 1138 sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4; 1139 sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4; 1140 sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4; 1141 sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4; 1142 sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4; 1143 sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4; 1144 sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4; 1145 sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4; 1146 sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4; 1147 sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4; 1148 sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4; 1149 sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4; 1150 v += 48; 1151 1152 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 1153 sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4; 1154 sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4; 1155 sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4; 1156 sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4; 1157 sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4; 1158 sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4; 1159 sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4; 1160 sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4; 1161 sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4; 1162 sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4; 1163 sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4; 1164 sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4; 1165 v += 48; 1166 1167 } 1168 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1169 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; 1170 if (!usecprow) { 1171 y += 12; 1172 z += 12; 1173 } 1174 } 1175 PetscCall(VecRestoreArrayRead(xx,&x)); 1176 PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray)); 1177 PetscCall(PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt)); 1178 PetscFunctionReturn(0); 1179 } 1180 1181 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 1182 PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A,Vec xx,Vec zz) 1183 { 1184 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1185 PetscScalar *z = NULL,*zarray; 1186 const PetscScalar *x,*work; 1187 const MatScalar *v = a->a; 1188 PetscInt mbs,i,j,n; 1189 const PetscInt *idx = a->j,*ii,*ridx=NULL; 1190 PetscBool usecprow=a->compressedrow.use; 1191 const PetscInt bs = 12, bs2 = 144; 1192 1193 __m256d a0,a1,a2,a3,a4,a5; 1194 __m256d w0,w1,w2,w3; 1195 __m256d z0,z1,z2; 1196 1197 PetscFunctionBegin; 1198 PetscCall(VecGetArrayRead(xx,&x)); 1199 PetscCall(VecGetArrayWrite(zz,&zarray)); 1200 1201 if (usecprow) { 1202 mbs = a->compressedrow.nrows; 1203 ii = a->compressedrow.i; 1204 ridx = a->compressedrow.rindex; 1205 PetscCall(PetscArrayzero(zarray,bs*a->mbs)); 1206 } else { 1207 mbs = a->mbs; 1208 ii = a->i; 1209 z = zarray; 1210 } 1211 1212 for (i=0; i<mbs; i++) { 1213 z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd(); 1214 1215 n = ii[1] - ii[0]; ii++; 1216 for (j=0; j<n; j++) { 1217 work = x + bs*(*idx++); 1218 1219 /* first column of a */ 1220 w0 = _mm256_set1_pd(work[0]); 1221 a0 = _mm256_loadu_pd(v+0); z0 = _mm256_fmadd_pd(a0,w0,z0); 1222 a1 = _mm256_loadu_pd(v+4); z1 = _mm256_fmadd_pd(a1,w0,z1); 1223 a2 = _mm256_loadu_pd(v+8); z2 = _mm256_fmadd_pd(a2,w0,z2); 1224 1225 /* second column of a */ 1226 w1 = _mm256_set1_pd(work[1]); 1227 a3 = _mm256_loadu_pd(v+12); z0 = _mm256_fmadd_pd(a3,w1,z0); 1228 a4 = _mm256_loadu_pd(v+16); z1 = _mm256_fmadd_pd(a4,w1,z1); 1229 a5 = _mm256_loadu_pd(v+20); z2 = _mm256_fmadd_pd(a5,w1,z2); 1230 1231 /* third column of a */ 1232 w2 = _mm256_set1_pd(work[2]); 1233 a0 = _mm256_loadu_pd(v+24); z0 = _mm256_fmadd_pd(a0,w2,z0); 1234 a1 = _mm256_loadu_pd(v+28); z1 = _mm256_fmadd_pd(a1,w2,z1); 1235 a2 = _mm256_loadu_pd(v+32); z2 = _mm256_fmadd_pd(a2,w2,z2); 1236 1237 /* fourth column of a */ 1238 w3 = _mm256_set1_pd(work[3]); 1239 a3 = _mm256_loadu_pd(v+36); z0 = _mm256_fmadd_pd(a3,w3,z0); 1240 a4 = _mm256_loadu_pd(v+40); z1 = _mm256_fmadd_pd(a4,w3,z1); 1241 a5 = _mm256_loadu_pd(v+44); z2 = _mm256_fmadd_pd(a5,w3,z2); 1242 1243 /* fifth column of a */ 1244 w0 = _mm256_set1_pd(work[4]); 1245 a0 = _mm256_loadu_pd(v+48); z0 = _mm256_fmadd_pd(a0,w0,z0); 1246 a1 = _mm256_loadu_pd(v+52); z1 = _mm256_fmadd_pd(a1,w0,z1); 1247 a2 = _mm256_loadu_pd(v+56); z2 = _mm256_fmadd_pd(a2,w0,z2); 1248 1249 /* sixth column of a */ 1250 w1 = _mm256_set1_pd(work[5]); 1251 a3 = _mm256_loadu_pd(v+60); z0 = _mm256_fmadd_pd(a3,w1,z0); 1252 a4 = _mm256_loadu_pd(v+64); z1 = _mm256_fmadd_pd(a4,w1,z1); 1253 a5 = _mm256_loadu_pd(v+68); z2 = _mm256_fmadd_pd(a5,w1,z2); 1254 1255 /* seventh column of a */ 1256 w2 = _mm256_set1_pd(work[6]); 1257 a0 = _mm256_loadu_pd(v+72); z0 = _mm256_fmadd_pd(a0,w2,z0); 1258 a1 = _mm256_loadu_pd(v+76); z1 = _mm256_fmadd_pd(a1,w2,z1); 1259 a2 = _mm256_loadu_pd(v+80); z2 = _mm256_fmadd_pd(a2,w2,z2); 1260 1261 /* eighth column of a */ 1262 w3 = _mm256_set1_pd(work[7]); 1263 a3 = _mm256_loadu_pd(v+84); z0 = _mm256_fmadd_pd(a3,w3,z0); 1264 a4 = _mm256_loadu_pd(v+88); z1 = _mm256_fmadd_pd(a4,w3,z1); 1265 a5 = _mm256_loadu_pd(v+92); z2 = _mm256_fmadd_pd(a5,w3,z2); 1266 1267 /* ninth column of a */ 1268 w0 = _mm256_set1_pd(work[8]); 1269 a0 = _mm256_loadu_pd(v+96); z0 = _mm256_fmadd_pd(a0,w0,z0); 1270 a1 = _mm256_loadu_pd(v+100); z1 = _mm256_fmadd_pd(a1,w0,z1); 1271 a2 = _mm256_loadu_pd(v+104); z2 = _mm256_fmadd_pd(a2,w0,z2); 1272 1273 /* tenth column of a */ 1274 w1 = _mm256_set1_pd(work[9]); 1275 a3 = _mm256_loadu_pd(v+108); z0 = _mm256_fmadd_pd(a3,w1,z0); 1276 a4 = _mm256_loadu_pd(v+112); z1 = _mm256_fmadd_pd(a4,w1,z1); 1277 a5 = _mm256_loadu_pd(v+116); z2 = _mm256_fmadd_pd(a5,w1,z2); 1278 1279 /* eleventh column of a */ 1280 w2 = _mm256_set1_pd(work[10]); 1281 a0 = _mm256_loadu_pd(v+120); z0 = _mm256_fmadd_pd(a0,w2,z0); 1282 a1 = _mm256_loadu_pd(v+124); z1 = _mm256_fmadd_pd(a1,w2,z1); 1283 a2 = _mm256_loadu_pd(v+128); z2 = _mm256_fmadd_pd(a2,w2,z2); 1284 1285 /* twelveth column of a */ 1286 w3 = _mm256_set1_pd(work[11]); 1287 a3 = _mm256_loadu_pd(v+132); z0 = _mm256_fmadd_pd(a3,w3,z0); 1288 a4 = _mm256_loadu_pd(v+136); z1 = _mm256_fmadd_pd(a4,w3,z1); 1289 a5 = _mm256_loadu_pd(v+140); z2 = _mm256_fmadd_pd(a5,w3,z2); 1290 1291 v += bs2; 1292 } 1293 if (usecprow) z = zarray + bs*ridx[i]; 1294 _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_storeu_pd(&z[ 8], z2); 1295 if (!usecprow) z += bs; 1296 } 1297 PetscCall(VecRestoreArrayRead(xx,&x)); 1298 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 1299 PetscCall(PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt)); 1300 PetscFunctionReturn(0); 1301 } 1302 #endif 1303 1304 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 1305 /* Default MatMult for block size 15 */ 1306 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz) 1307 { 1308 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1309 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1310 const PetscScalar *x,*xb; 1311 PetscScalar *zarray,xv; 1312 const MatScalar *v; 1313 const PetscInt *ii,*ij=a->j,*idx; 1314 PetscInt mbs,i,j,k,n,*ridx=NULL; 1315 PetscBool usecprow=a->compressedrow.use; 1316 1317 PetscFunctionBegin; 1318 PetscCall(VecGetArrayRead(xx,&x)); 1319 PetscCall(VecGetArrayWrite(zz,&zarray)); 1320 1321 v = a->a; 1322 if (usecprow) { 1323 mbs = a->compressedrow.nrows; 1324 ii = a->compressedrow.i; 1325 ridx = a->compressedrow.rindex; 1326 PetscCall(PetscArrayzero(zarray,15*a->mbs)); 1327 } else { 1328 mbs = a->mbs; 1329 ii = a->i; 1330 z = zarray; 1331 } 1332 1333 for (i=0; i<mbs; i++) { 1334 n = ii[i+1] - ii[i]; 1335 idx = ij + ii[i]; 1336 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1337 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1338 1339 for (j=0; j<n; j++) { 1340 xb = x + 15*(idx[j]); 1341 1342 for (k=0; k<15; k++) { 1343 xv = xb[k]; 1344 sum1 += v[0]*xv; 1345 sum2 += v[1]*xv; 1346 sum3 += v[2]*xv; 1347 sum4 += v[3]*xv; 1348 sum5 += v[4]*xv; 1349 sum6 += v[5]*xv; 1350 sum7 += v[6]*xv; 1351 sum8 += v[7]*xv; 1352 sum9 += v[8]*xv; 1353 sum10 += v[9]*xv; 1354 sum11 += v[10]*xv; 1355 sum12 += v[11]*xv; 1356 sum13 += v[12]*xv; 1357 sum14 += v[13]*xv; 1358 sum15 += v[14]*xv; 1359 v += 15; 1360 } 1361 } 1362 if (usecprow) z = zarray + 15*ridx[i]; 1363 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1364 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1365 1366 if (!usecprow) z += 15; 1367 } 1368 1369 PetscCall(VecRestoreArrayRead(xx,&x)); 1370 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 1371 PetscCall(PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt)); 1372 PetscFunctionReturn(0); 1373 } 1374 1375 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 1376 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz) 1377 { 1378 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1379 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1380 const PetscScalar *x,*xb; 1381 PetscScalar x1,x2,x3,x4,*zarray; 1382 const MatScalar *v; 1383 const PetscInt *ii,*ij=a->j,*idx; 1384 PetscInt mbs,i,j,n,*ridx=NULL; 1385 PetscBool usecprow=a->compressedrow.use; 1386 1387 PetscFunctionBegin; 1388 PetscCall(VecGetArrayRead(xx,&x)); 1389 PetscCall(VecGetArrayWrite(zz,&zarray)); 1390 1391 v = a->a; 1392 if (usecprow) { 1393 mbs = a->compressedrow.nrows; 1394 ii = a->compressedrow.i; 1395 ridx = a->compressedrow.rindex; 1396 PetscCall(PetscArrayzero(zarray,15*a->mbs)); 1397 } else { 1398 mbs = a->mbs; 1399 ii = a->i; 1400 z = zarray; 1401 } 1402 1403 for (i=0; i<mbs; i++) { 1404 n = ii[i+1] - ii[i]; 1405 idx = ij + ii[i]; 1406 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1407 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1408 1409 for (j=0; j<n; j++) { 1410 xb = x + 15*(idx[j]); 1411 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1412 1413 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 1414 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 1415 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 1416 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 1417 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 1418 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 1419 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 1420 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 1421 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 1422 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 1423 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 1424 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 1425 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 1426 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 1427 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 1428 1429 v += 60; 1430 1431 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 1432 1433 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 1434 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 1435 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 1436 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 1437 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 1438 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 1439 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 1440 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 1441 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 1442 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 1443 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 1444 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 1445 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 1446 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 1447 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 1448 v += 60; 1449 1450 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 1451 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 1452 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 1453 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 1454 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 1455 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 1456 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 1457 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 1458 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 1459 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 1460 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 1461 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 1462 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 1463 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 1464 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 1465 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 1466 v += 60; 1467 1468 x1 = xb[12]; x2 = xb[13]; x3 = xb[14]; 1469 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3; 1470 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3; 1471 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3; 1472 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3; 1473 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3; 1474 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3; 1475 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3; 1476 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3; 1477 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3; 1478 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3; 1479 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3; 1480 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3; 1481 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3; 1482 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3; 1483 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3; 1484 v += 45; 1485 } 1486 if (usecprow) z = zarray + 15*ridx[i]; 1487 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1488 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1489 1490 if (!usecprow) z += 15; 1491 } 1492 1493 PetscCall(VecRestoreArrayRead(xx,&x)); 1494 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 1495 PetscCall(PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt)); 1496 PetscFunctionReturn(0); 1497 } 1498 1499 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 1500 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz) 1501 { 1502 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1503 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1504 const PetscScalar *x,*xb; 1505 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray; 1506 const MatScalar *v; 1507 const PetscInt *ii,*ij=a->j,*idx; 1508 PetscInt mbs,i,j,n,*ridx=NULL; 1509 PetscBool usecprow=a->compressedrow.use; 1510 1511 PetscFunctionBegin; 1512 PetscCall(VecGetArrayRead(xx,&x)); 1513 PetscCall(VecGetArrayWrite(zz,&zarray)); 1514 1515 v = a->a; 1516 if (usecprow) { 1517 mbs = a->compressedrow.nrows; 1518 ii = a->compressedrow.i; 1519 ridx = a->compressedrow.rindex; 1520 PetscCall(PetscArrayzero(zarray,15*a->mbs)); 1521 } else { 1522 mbs = a->mbs; 1523 ii = a->i; 1524 z = zarray; 1525 } 1526 1527 for (i=0; i<mbs; i++) { 1528 n = ii[i+1] - ii[i]; 1529 idx = ij + ii[i]; 1530 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1531 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1532 1533 for (j=0; j<n; j++) { 1534 xb = x + 15*(idx[j]); 1535 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1536 x8 = xb[7]; 1537 1538 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8; 1539 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8; 1540 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8; 1541 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8; 1542 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8; 1543 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8; 1544 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8; 1545 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8; 1546 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8; 1547 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8; 1548 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8; 1549 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8; 1550 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8; 1551 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8; 1552 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8; 1553 v += 120; 1554 1555 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14]; 1556 1557 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7; 1558 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7; 1559 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7; 1560 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7; 1561 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7; 1562 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7; 1563 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7; 1564 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7; 1565 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7; 1566 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7; 1567 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7; 1568 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7; 1569 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7; 1570 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7; 1571 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7; 1572 v += 105; 1573 } 1574 if (usecprow) z = zarray + 15*ridx[i]; 1575 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1576 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1577 1578 if (!usecprow) z += 15; 1579 } 1580 1581 PetscCall(VecRestoreArrayRead(xx,&x)); 1582 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 1583 PetscCall(PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt)); 1584 PetscFunctionReturn(0); 1585 } 1586 1587 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ 1588 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz) 1589 { 1590 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1591 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1592 const PetscScalar *x,*xb; 1593 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray; 1594 const MatScalar *v; 1595 const PetscInt *ii,*ij=a->j,*idx; 1596 PetscInt mbs,i,j,n,*ridx=NULL; 1597 PetscBool usecprow=a->compressedrow.use; 1598 1599 PetscFunctionBegin; 1600 PetscCall(VecGetArrayRead(xx,&x)); 1601 PetscCall(VecGetArrayWrite(zz,&zarray)); 1602 1603 v = a->a; 1604 if (usecprow) { 1605 mbs = a->compressedrow.nrows; 1606 ii = a->compressedrow.i; 1607 ridx = a->compressedrow.rindex; 1608 PetscCall(PetscArrayzero(zarray,15*a->mbs)); 1609 } else { 1610 mbs = a->mbs; 1611 ii = a->i; 1612 z = zarray; 1613 } 1614 1615 for (i=0; i<mbs; i++) { 1616 n = ii[i+1] - ii[i]; 1617 idx = ij + ii[i]; 1618 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1619 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1620 1621 for (j=0; j<n; j++) { 1622 xb = x + 15*(idx[j]); 1623 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1624 x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14]; 1625 1626 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 1627 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 1628 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 1629 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 1630 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 1631 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 1632 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 1633 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 1634 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 1635 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 1636 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 1637 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 1638 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 1639 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 1640 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 1641 v += 225; 1642 } 1643 if (usecprow) z = zarray + 15*ridx[i]; 1644 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1645 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1646 1647 if (!usecprow) z += 15; 1648 } 1649 1650 PetscCall(VecRestoreArrayRead(xx,&x)); 1651 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 1652 PetscCall(PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt)); 1653 PetscFunctionReturn(0); 1654 } 1655 1656 /* 1657 This will not work with MatScalar == float because it calls the BLAS 1658 */ 1659 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz) 1660 { 1661 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1662 PetscScalar *z = NULL,*work,*workt,*zarray; 1663 const PetscScalar *x,*xb; 1664 const MatScalar *v; 1665 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 1666 const PetscInt *idx,*ii,*ridx=NULL; 1667 PetscInt ncols,k; 1668 PetscBool usecprow=a->compressedrow.use; 1669 1670 PetscFunctionBegin; 1671 PetscCall(VecGetArrayRead(xx,&x)); 1672 PetscCall(VecGetArrayWrite(zz,&zarray)); 1673 1674 idx = a->j; 1675 v = a->a; 1676 if (usecprow) { 1677 mbs = a->compressedrow.nrows; 1678 ii = a->compressedrow.i; 1679 ridx = a->compressedrow.rindex; 1680 PetscCall(PetscArrayzero(zarray,bs*a->mbs)); 1681 } else { 1682 mbs = a->mbs; 1683 ii = a->i; 1684 z = zarray; 1685 } 1686 1687 if (!a->mult_work) { 1688 k = PetscMax(A->rmap->n,A->cmap->n); 1689 PetscCall(PetscMalloc1(k+1,&a->mult_work)); 1690 } 1691 work = a->mult_work; 1692 for (i=0; i<mbs; i++) { 1693 n = ii[1] - ii[0]; ii++; 1694 ncols = n*bs; 1695 workt = work; 1696 for (j=0; j<n; j++) { 1697 xb = x + bs*(*idx++); 1698 for (k=0; k<bs; k++) workt[k] = xb[k]; 1699 workt += bs; 1700 } 1701 if (usecprow) z = zarray + bs*ridx[i]; 1702 PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z); 1703 v += n*bs2; 1704 if (!usecprow) z += bs; 1705 } 1706 PetscCall(VecRestoreArrayRead(xx,&x)); 1707 PetscCall(VecRestoreArrayWrite(zz,&zarray)); 1708 PetscCall(PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt)); 1709 PetscFunctionReturn(0); 1710 } 1711 1712 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 1713 { 1714 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1715 const PetscScalar *x; 1716 PetscScalar *y,*z,sum; 1717 const MatScalar *v; 1718 PetscInt mbs=a->mbs,i,n,*ridx=NULL; 1719 const PetscInt *idx,*ii; 1720 PetscBool usecprow=a->compressedrow.use; 1721 1722 PetscFunctionBegin; 1723 PetscCall(VecGetArrayRead(xx,&x)); 1724 PetscCall(VecGetArrayPair(yy,zz,&y,&z)); 1725 1726 idx = a->j; 1727 v = a->a; 1728 if (usecprow) { 1729 if (zz != yy) { 1730 PetscCall(PetscArraycpy(z,y,mbs)); 1731 } 1732 mbs = a->compressedrow.nrows; 1733 ii = a->compressedrow.i; 1734 ridx = a->compressedrow.rindex; 1735 } else { 1736 ii = a->i; 1737 } 1738 1739 for (i=0; i<mbs; i++) { 1740 n = ii[1] - ii[0]; 1741 ii++; 1742 if (!usecprow) { 1743 sum = y[i]; 1744 } else { 1745 sum = y[ridx[i]]; 1746 } 1747 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1748 PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1749 PetscSparseDensePlusDot(sum,x,v,idx,n); 1750 v += n; 1751 idx += n; 1752 if (usecprow) { 1753 z[ridx[i]] = sum; 1754 } else { 1755 z[i] = sum; 1756 } 1757 } 1758 PetscCall(VecRestoreArrayRead(xx,&x)); 1759 PetscCall(VecRestoreArrayPair(yy,zz,&y,&z)); 1760 PetscCall(PetscLogFlops(2.0*a->nz)); 1761 PetscFunctionReturn(0); 1762 } 1763 1764 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 1765 { 1766 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1767 PetscScalar *y = NULL,*z = NULL,sum1,sum2; 1768 const PetscScalar *x,*xb; 1769 PetscScalar x1,x2,*yarray,*zarray; 1770 const MatScalar *v; 1771 PetscInt mbs = a->mbs,i,n,j; 1772 const PetscInt *idx,*ii,*ridx = NULL; 1773 PetscBool usecprow = a->compressedrow.use; 1774 1775 PetscFunctionBegin; 1776 PetscCall(VecGetArrayRead(xx,&x)); 1777 PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray)); 1778 1779 idx = a->j; 1780 v = a->a; 1781 if (usecprow) { 1782 if (zz != yy) { 1783 PetscCall(PetscArraycpy(zarray,yarray,2*mbs)); 1784 } 1785 mbs = a->compressedrow.nrows; 1786 ii = a->compressedrow.i; 1787 ridx = a->compressedrow.rindex; 1788 } else { 1789 ii = a->i; 1790 y = yarray; 1791 z = zarray; 1792 } 1793 1794 for (i=0; i<mbs; i++) { 1795 n = ii[1] - ii[0]; ii++; 1796 if (usecprow) { 1797 z = zarray + 2*ridx[i]; 1798 y = yarray + 2*ridx[i]; 1799 } 1800 sum1 = y[0]; sum2 = y[1]; 1801 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1802 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1803 for (j=0; j<n; j++) { 1804 xb = x + 2*(*idx++); 1805 x1 = xb[0]; 1806 x2 = xb[1]; 1807 1808 sum1 += v[0]*x1 + v[2]*x2; 1809 sum2 += v[1]*x1 + v[3]*x2; 1810 v += 4; 1811 } 1812 z[0] = sum1; z[1] = sum2; 1813 if (!usecprow) { 1814 z += 2; y += 2; 1815 } 1816 } 1817 PetscCall(VecRestoreArrayRead(xx,&x)); 1818 PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray)); 1819 PetscCall(PetscLogFlops(4.0*a->nz)); 1820 PetscFunctionReturn(0); 1821 } 1822 1823 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 1824 { 1825 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1826 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray; 1827 const PetscScalar *x,*xb; 1828 const MatScalar *v; 1829 PetscInt mbs = a->mbs,i,j,n; 1830 const PetscInt *idx,*ii,*ridx = NULL; 1831 PetscBool usecprow = a->compressedrow.use; 1832 1833 PetscFunctionBegin; 1834 PetscCall(VecGetArrayRead(xx,&x)); 1835 PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray)); 1836 1837 idx = a->j; 1838 v = a->a; 1839 if (usecprow) { 1840 if (zz != yy) { 1841 PetscCall(PetscArraycpy(zarray,yarray,3*mbs)); 1842 } 1843 mbs = a->compressedrow.nrows; 1844 ii = a->compressedrow.i; 1845 ridx = a->compressedrow.rindex; 1846 } else { 1847 ii = a->i; 1848 y = yarray; 1849 z = zarray; 1850 } 1851 1852 for (i=0; i<mbs; i++) { 1853 n = ii[1] - ii[0]; ii++; 1854 if (usecprow) { 1855 z = zarray + 3*ridx[i]; 1856 y = yarray + 3*ridx[i]; 1857 } 1858 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; 1859 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1860 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1861 for (j=0; j<n; j++) { 1862 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1863 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 1864 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 1865 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 1866 v += 9; 1867 } 1868 z[0] = sum1; z[1] = sum2; z[2] = sum3; 1869 if (!usecprow) { 1870 z += 3; y += 3; 1871 } 1872 } 1873 PetscCall(VecRestoreArrayRead(xx,&x)); 1874 PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray)); 1875 PetscCall(PetscLogFlops(18.0*a->nz)); 1876 PetscFunctionReturn(0); 1877 } 1878 1879 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 1880 { 1881 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1882 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray; 1883 const PetscScalar *x,*xb; 1884 const MatScalar *v; 1885 PetscInt mbs = a->mbs,i,j,n; 1886 const PetscInt *idx,*ii,*ridx=NULL; 1887 PetscBool usecprow=a->compressedrow.use; 1888 1889 PetscFunctionBegin; 1890 PetscCall(VecGetArrayRead(xx,&x)); 1891 PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray)); 1892 1893 idx = a->j; 1894 v = a->a; 1895 if (usecprow) { 1896 if (zz != yy) { 1897 PetscCall(PetscArraycpy(zarray,yarray,4*mbs)); 1898 } 1899 mbs = a->compressedrow.nrows; 1900 ii = a->compressedrow.i; 1901 ridx = a->compressedrow.rindex; 1902 } else { 1903 ii = a->i; 1904 y = yarray; 1905 z = zarray; 1906 } 1907 1908 for (i=0; i<mbs; i++) { 1909 n = ii[1] - ii[0]; ii++; 1910 if (usecprow) { 1911 z = zarray + 4*ridx[i]; 1912 y = yarray + 4*ridx[i]; 1913 } 1914 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; 1915 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1916 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1917 for (j=0; j<n; j++) { 1918 xb = x + 4*(*idx++); 1919 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1920 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1921 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1922 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1923 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1924 v += 16; 1925 } 1926 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 1927 if (!usecprow) { 1928 z += 4; y += 4; 1929 } 1930 } 1931 PetscCall(VecRestoreArrayRead(xx,&x)); 1932 PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray)); 1933 PetscCall(PetscLogFlops(32.0*a->nz)); 1934 PetscFunctionReturn(0); 1935 } 1936 1937 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 1938 { 1939 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1940 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5; 1941 const PetscScalar *x,*xb; 1942 PetscScalar *yarray,*zarray; 1943 const MatScalar *v; 1944 PetscInt mbs = a->mbs,i,j,n; 1945 const PetscInt *idx,*ii,*ridx = NULL; 1946 PetscBool usecprow=a->compressedrow.use; 1947 1948 PetscFunctionBegin; 1949 PetscCall(VecGetArrayRead(xx,&x)); 1950 PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray)); 1951 1952 idx = a->j; 1953 v = a->a; 1954 if (usecprow) { 1955 if (zz != yy) { 1956 PetscCall(PetscArraycpy(zarray,yarray,5*mbs)); 1957 } 1958 mbs = a->compressedrow.nrows; 1959 ii = a->compressedrow.i; 1960 ridx = a->compressedrow.rindex; 1961 } else { 1962 ii = a->i; 1963 y = yarray; 1964 z = zarray; 1965 } 1966 1967 for (i=0; i<mbs; i++) { 1968 n = ii[1] - ii[0]; ii++; 1969 if (usecprow) { 1970 z = zarray + 5*ridx[i]; 1971 y = yarray + 5*ridx[i]; 1972 } 1973 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; 1974 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1975 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1976 for (j=0; j<n; j++) { 1977 xb = x + 5*(*idx++); 1978 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 1979 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1980 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1981 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1982 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1983 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1984 v += 25; 1985 } 1986 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 1987 if (!usecprow) { 1988 z += 5; y += 5; 1989 } 1990 } 1991 PetscCall(VecRestoreArrayRead(xx,&x)); 1992 PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray)); 1993 PetscCall(PetscLogFlops(50.0*a->nz)); 1994 PetscFunctionReturn(0); 1995 } 1996 1997 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 1998 { 1999 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2000 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6; 2001 const PetscScalar *x,*xb; 2002 PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray; 2003 const MatScalar *v; 2004 PetscInt mbs = a->mbs,i,j,n; 2005 const PetscInt *idx,*ii,*ridx=NULL; 2006 PetscBool usecprow=a->compressedrow.use; 2007 2008 PetscFunctionBegin; 2009 PetscCall(VecGetArrayRead(xx,&x)); 2010 PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray)); 2011 2012 idx = a->j; 2013 v = a->a; 2014 if (usecprow) { 2015 if (zz != yy) { 2016 PetscCall(PetscArraycpy(zarray,yarray,6*mbs)); 2017 } 2018 mbs = a->compressedrow.nrows; 2019 ii = a->compressedrow.i; 2020 ridx = a->compressedrow.rindex; 2021 } else { 2022 ii = a->i; 2023 y = yarray; 2024 z = zarray; 2025 } 2026 2027 for (i=0; i<mbs; i++) { 2028 n = ii[1] - ii[0]; ii++; 2029 if (usecprow) { 2030 z = zarray + 6*ridx[i]; 2031 y = yarray + 6*ridx[i]; 2032 } 2033 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; 2034 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2035 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2036 for (j=0; j<n; j++) { 2037 xb = x + 6*(*idx++); 2038 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 2039 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2040 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2041 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2042 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2043 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2044 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2045 v += 36; 2046 } 2047 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 2048 if (!usecprow) { 2049 z += 6; y += 6; 2050 } 2051 } 2052 PetscCall(VecRestoreArrayRead(xx,&x)); 2053 PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray)); 2054 PetscCall(PetscLogFlops(72.0*a->nz)); 2055 PetscFunctionReturn(0); 2056 } 2057 2058 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 2059 { 2060 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2061 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 2062 const PetscScalar *x,*xb; 2063 PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray; 2064 const MatScalar *v; 2065 PetscInt mbs = a->mbs,i,j,n; 2066 const PetscInt *idx,*ii,*ridx = NULL; 2067 PetscBool usecprow=a->compressedrow.use; 2068 2069 PetscFunctionBegin; 2070 PetscCall(VecGetArrayRead(xx,&x)); 2071 PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray)); 2072 2073 idx = a->j; 2074 v = a->a; 2075 if (usecprow) { 2076 if (zz != yy) { 2077 PetscCall(PetscArraycpy(zarray,yarray,7*mbs)); 2078 } 2079 mbs = a->compressedrow.nrows; 2080 ii = a->compressedrow.i; 2081 ridx = a->compressedrow.rindex; 2082 } else { 2083 ii = a->i; 2084 y = yarray; 2085 z = zarray; 2086 } 2087 2088 for (i=0; i<mbs; i++) { 2089 n = ii[1] - ii[0]; ii++; 2090 if (usecprow) { 2091 z = zarray + 7*ridx[i]; 2092 y = yarray + 7*ridx[i]; 2093 } 2094 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 2095 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2096 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2097 for (j=0; j<n; j++) { 2098 xb = x + 7*(*idx++); 2099 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 2100 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2101 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2102 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2103 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2104 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2105 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2106 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2107 v += 49; 2108 } 2109 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 2110 if (!usecprow) { 2111 z += 7; y += 7; 2112 } 2113 } 2114 PetscCall(VecRestoreArrayRead(xx,&x)); 2115 PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray)); 2116 PetscCall(PetscLogFlops(98.0*a->nz)); 2117 PetscFunctionReturn(0); 2118 } 2119 2120 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 2121 PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec yy,Vec zz) 2122 { 2123 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2124 PetscScalar *z = NULL,*work,*workt,*zarray; 2125 const PetscScalar *x,*xb; 2126 const MatScalar *v; 2127 PetscInt mbs,i,j,n; 2128 PetscInt k; 2129 PetscBool usecprow=a->compressedrow.use; 2130 const PetscInt *idx,*ii,*ridx=NULL,bs = 9, bs2 = 81; 2131 2132 __m256d a0,a1,a2,a3,a4,a5; 2133 __m256d w0,w1,w2,w3; 2134 __m256d z0,z1,z2; 2135 __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63); 2136 2137 PetscFunctionBegin; 2138 PetscCall(VecCopy(yy,zz)); 2139 PetscCall(VecGetArrayRead(xx,&x)); 2140 PetscCall(VecGetArray(zz,&zarray)); 2141 2142 idx = a->j; 2143 v = a->a; 2144 if (usecprow) { 2145 mbs = a->compressedrow.nrows; 2146 ii = a->compressedrow.i; 2147 ridx = a->compressedrow.rindex; 2148 } else { 2149 mbs = a->mbs; 2150 ii = a->i; 2151 z = zarray; 2152 } 2153 2154 if (!a->mult_work) { 2155 k = PetscMax(A->rmap->n,A->cmap->n); 2156 PetscCall(PetscMalloc1(k+1,&a->mult_work)); 2157 } 2158 2159 work = a->mult_work; 2160 for (i=0; i<mbs; i++) { 2161 n = ii[1] - ii[0]; ii++; 2162 workt = work; 2163 for (j=0; j<n; j++) { 2164 xb = x + bs*(*idx++); 2165 for (k=0; k<bs; k++) workt[k] = xb[k]; 2166 workt += bs; 2167 } 2168 if (usecprow) z = zarray + bs*ridx[i]; 2169 2170 z0 = _mm256_loadu_pd(&z[ 0]); z1 = _mm256_loadu_pd(&z[ 4]); z2 = _mm256_set1_pd(z[ 8]); 2171 2172 for (j=0; j<n; j++) { 2173 /* first column of a */ 2174 w0 = _mm256_set1_pd(work[j*9 ]); 2175 a0 = _mm256_loadu_pd(&v[j*81 ]); z0 = _mm256_fmadd_pd(a0,w0,z0); 2176 a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1); 2177 a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2); 2178 2179 /* second column of a */ 2180 w1 = _mm256_set1_pd(work[j*9+ 1]); 2181 a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0); 2182 a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1); 2183 a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2); 2184 2185 /* third column of a */ 2186 w2 = _mm256_set1_pd(work[j*9+ 2]); 2187 a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0); 2188 a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1); 2189 a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2); 2190 2191 /* fourth column of a */ 2192 w3 = _mm256_set1_pd(work[j*9+ 3]); 2193 a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0); 2194 a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1); 2195 a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2); 2196 2197 /* fifth column of a */ 2198 w0 = _mm256_set1_pd(work[j*9+ 4]); 2199 a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0); 2200 a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1); 2201 a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2); 2202 2203 /* sixth column of a */ 2204 w1 = _mm256_set1_pd(work[j*9+ 5]); 2205 a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0); 2206 a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1); 2207 a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2); 2208 2209 /* seventh column of a */ 2210 w2 = _mm256_set1_pd(work[j*9+ 6]); 2211 a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0); 2212 a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1); 2213 a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2); 2214 2215 /* eighth column of a */ 2216 w3 = _mm256_set1_pd(work[j*9+ 7]); 2217 a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0); 2218 a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1); 2219 a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2); 2220 2221 /* ninth column of a */ 2222 w0 = _mm256_set1_pd(work[j*9+ 8]); 2223 a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0); 2224 a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1); 2225 a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2); 2226 } 2227 2228 _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2); 2229 2230 v += n*bs2; 2231 if (!usecprow) z += bs; 2232 } 2233 PetscCall(VecRestoreArrayRead(xx,&x)); 2234 PetscCall(VecRestoreArray(zz,&zarray)); 2235 PetscCall(PetscLogFlops(162.0*a->nz)); 2236 PetscFunctionReturn(0); 2237 } 2238 #endif 2239 2240 PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz) 2241 { 2242 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2243 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11; 2244 const PetscScalar *x,*xb; 2245 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,*yarray,*zarray; 2246 const MatScalar *v; 2247 PetscInt mbs = a->mbs,i,j,n; 2248 const PetscInt *idx,*ii,*ridx = NULL; 2249 PetscBool usecprow=a->compressedrow.use; 2250 2251 PetscFunctionBegin; 2252 PetscCall(VecGetArrayRead(xx,&x)); 2253 PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray)); 2254 2255 idx = a->j; 2256 v = a->a; 2257 if (usecprow) { 2258 if (zz != yy) { 2259 PetscCall(PetscArraycpy(zarray,yarray,7*mbs)); 2260 } 2261 mbs = a->compressedrow.nrows; 2262 ii = a->compressedrow.i; 2263 ridx = a->compressedrow.rindex; 2264 } else { 2265 ii = a->i; 2266 y = yarray; 2267 z = zarray; 2268 } 2269 2270 for (i=0; i<mbs; i++) { 2271 n = ii[1] - ii[0]; ii++; 2272 if (usecprow) { 2273 z = zarray + 11*ridx[i]; 2274 y = yarray + 11*ridx[i]; 2275 } 2276 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 2277 sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; 2278 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2279 PetscPrefetchBlock(v+121*n,121*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2280 for (j=0; j<n; j++) { 2281 xb = x + 11*(*idx++); 2282 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; 2283 sum1 += v[ 0]*x1 + v[ 11]*x2 + v[2*11]*x3 + v[3*11]*x4 + v[4*11]*x5 + v[5*11]*x6 + v[6*11]*x7+ v[7*11]*x8 + v[8*11]*x9 + v[9*11]*x10 + v[10*11]*x11; 2284 sum2 += v[1+0]*x1 + v[1+11]*x2 + v[1+2*11]*x3 + v[1+3*11]*x4 + v[1+4*11]*x5 + v[1+5*11]*x6 + v[1+6*11]*x7+ v[1+7*11]*x8 + v[1+8*11]*x9 + v[1+9*11]*x10 + v[1+10*11]*x11; 2285 sum3 += v[2+0]*x1 + v[2+11]*x2 + v[2+2*11]*x3 + v[2+3*11]*x4 + v[2+4*11]*x5 + v[2+5*11]*x6 + v[2+6*11]*x7+ v[2+7*11]*x8 + v[2+8*11]*x9 + v[2+9*11]*x10 + v[2+10*11]*x11; 2286 sum4 += v[3+0]*x1 + v[3+11]*x2 + v[3+2*11]*x3 + v[3+3*11]*x4 + v[3+4*11]*x5 + v[3+5*11]*x6 + v[3+6*11]*x7+ v[3+7*11]*x8 + v[3+8*11]*x9 + v[3+9*11]*x10 + v[3+10*11]*x11; 2287 sum5 += v[4+0]*x1 + v[4+11]*x2 + v[4+2*11]*x3 + v[4+3*11]*x4 + v[4+4*11]*x5 + v[4+5*11]*x6 + v[4+6*11]*x7+ v[4+7*11]*x8 + v[4+8*11]*x9 + v[4+9*11]*x10 + v[4+10*11]*x11; 2288 sum6 += v[5+0]*x1 + v[5+11]*x2 + v[5+2*11]*x3 + v[5+3*11]*x4 + v[5+4*11]*x5 + v[5+5*11]*x6 + v[5+6*11]*x7+ v[5+7*11]*x8 + v[5+8*11]*x9 + v[5+9*11]*x10 + v[5+10*11]*x11; 2289 sum7 += v[6+0]*x1 + v[6+11]*x2 + v[6+2*11]*x3 + v[6+3*11]*x4 + v[6+4*11]*x5 + v[6+5*11]*x6 + v[6+6*11]*x7+ v[6+7*11]*x8 + v[6+8*11]*x9 + v[6+9*11]*x10 + v[6+10*11]*x11; 2290 sum8 += v[7+0]*x1 + v[7+11]*x2 + v[7+2*11]*x3 + v[7+3*11]*x4 + v[7+4*11]*x5 + v[7+5*11]*x6 + v[7+6*11]*x7+ v[7+7*11]*x8 + v[7+8*11]*x9 + v[7+9*11]*x10 + v[7+10*11]*x11; 2291 sum9 += v[8+0]*x1 + v[8+11]*x2 + v[8+2*11]*x3 + v[8+3*11]*x4 + v[8+4*11]*x5 + v[8+5*11]*x6 + v[8+6*11]*x7+ v[8+7*11]*x8 + v[8+8*11]*x9 + v[8+9*11]*x10 + v[8+10*11]*x11; 2292 sum10 += v[9+0]*x1 + v[9+11]*x2 + v[9+2*11]*x3 + v[9+3*11]*x4 + v[9+4*11]*x5 + v[9+5*11]*x6 + v[9+6*11]*x7+ v[9+7*11]*x8 + v[9+8*11]*x9 + v[9+9*11]*x10 + v[9+10*11]*x11; 2293 sum11 += v[10+0]*x1 + v[10+11]*x2 + v[10+2*11]*x3 + v[10+3*11]*x4 + v[10+4*11]*x5 + v[10+5*11]*x6 + v[10+6*11]*x7+ v[10+7*11]*x8 + v[10+8*11]*x9 + v[10+9*11]*x10 + v[10+10*11]*x11; 2294 v += 121; 2295 } 2296 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 2297 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; 2298 if (!usecprow) { 2299 z += 11; y += 11; 2300 } 2301 } 2302 PetscCall(VecRestoreArrayRead(xx,&x)); 2303 PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray)); 2304 PetscCall(PetscLogFlops(242.0*a->nz)); 2305 PetscFunctionReturn(0); 2306 } 2307 2308 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 2309 { 2310 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2311 PetscScalar *z = NULL,*work,*workt,*zarray; 2312 const PetscScalar *x,*xb; 2313 const MatScalar *v; 2314 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 2315 PetscInt ncols,k; 2316 const PetscInt *ridx = NULL,*idx,*ii; 2317 PetscBool usecprow = a->compressedrow.use; 2318 2319 PetscFunctionBegin; 2320 PetscCall(VecCopy(yy,zz)); 2321 PetscCall(VecGetArrayRead(xx,&x)); 2322 PetscCall(VecGetArray(zz,&zarray)); 2323 2324 idx = a->j; 2325 v = a->a; 2326 if (usecprow) { 2327 mbs = a->compressedrow.nrows; 2328 ii = a->compressedrow.i; 2329 ridx = a->compressedrow.rindex; 2330 } else { 2331 mbs = a->mbs; 2332 ii = a->i; 2333 z = zarray; 2334 } 2335 2336 if (!a->mult_work) { 2337 k = PetscMax(A->rmap->n,A->cmap->n); 2338 PetscCall(PetscMalloc1(k+1,&a->mult_work)); 2339 } 2340 work = a->mult_work; 2341 for (i=0; i<mbs; i++) { 2342 n = ii[1] - ii[0]; ii++; 2343 ncols = n*bs; 2344 workt = work; 2345 for (j=0; j<n; j++) { 2346 xb = x + bs*(*idx++); 2347 for (k=0; k<bs; k++) workt[k] = xb[k]; 2348 workt += bs; 2349 } 2350 if (usecprow) z = zarray + bs*ridx[i]; 2351 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 2352 v += n*bs2; 2353 if (!usecprow) z += bs; 2354 } 2355 PetscCall(VecRestoreArrayRead(xx,&x)); 2356 PetscCall(VecRestoreArray(zz,&zarray)); 2357 PetscCall(PetscLogFlops(2.0*a->nz*bs2)); 2358 PetscFunctionReturn(0); 2359 } 2360 2361 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 2362 { 2363 PetscScalar zero = 0.0; 2364 2365 PetscFunctionBegin; 2366 PetscCall(VecSet(zz,zero)); 2367 PetscCall(MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz)); 2368 PetscFunctionReturn(0); 2369 } 2370 2371 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 2372 { 2373 PetscScalar zero = 0.0; 2374 2375 PetscFunctionBegin; 2376 PetscCall(VecSet(zz,zero)); 2377 PetscCall(MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz)); 2378 PetscFunctionReturn(0); 2379 } 2380 2381 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 2382 { 2383 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2384 PetscScalar *z,x1,x2,x3,x4,x5; 2385 const PetscScalar *x,*xb = NULL; 2386 const MatScalar *v; 2387 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n; 2388 const PetscInt *idx,*ii,*ib,*ridx = NULL; 2389 Mat_CompressedRow cprow = a->compressedrow; 2390 PetscBool usecprow = cprow.use; 2391 2392 PetscFunctionBegin; 2393 if (yy != zz) PetscCall(VecCopy(yy,zz)); 2394 PetscCall(VecGetArrayRead(xx,&x)); 2395 PetscCall(VecGetArray(zz,&z)); 2396 2397 idx = a->j; 2398 v = a->a; 2399 if (usecprow) { 2400 mbs = cprow.nrows; 2401 ii = cprow.i; 2402 ridx = cprow.rindex; 2403 } else { 2404 mbs=a->mbs; 2405 ii = a->i; 2406 xb = x; 2407 } 2408 2409 switch (bs) { 2410 case 1: 2411 for (i=0; i<mbs; i++) { 2412 if (usecprow) xb = x + ridx[i]; 2413 x1 = xb[0]; 2414 ib = idx + ii[0]; 2415 n = ii[1] - ii[0]; ii++; 2416 for (j=0; j<n; j++) { 2417 rval = ib[j]; 2418 z[rval] += PetscConj(*v) * x1; 2419 v++; 2420 } 2421 if (!usecprow) xb++; 2422 } 2423 break; 2424 case 2: 2425 for (i=0; i<mbs; i++) { 2426 if (usecprow) xb = x + 2*ridx[i]; 2427 x1 = xb[0]; x2 = xb[1]; 2428 ib = idx + ii[0]; 2429 n = ii[1] - ii[0]; ii++; 2430 for (j=0; j<n; j++) { 2431 rval = ib[j]*2; 2432 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2; 2433 z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2; 2434 v += 4; 2435 } 2436 if (!usecprow) xb += 2; 2437 } 2438 break; 2439 case 3: 2440 for (i=0; i<mbs; i++) { 2441 if (usecprow) xb = x + 3*ridx[i]; 2442 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2443 ib = idx + ii[0]; 2444 n = ii[1] - ii[0]; ii++; 2445 for (j=0; j<n; j++) { 2446 rval = ib[j]*3; 2447 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3; 2448 z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3; 2449 z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3; 2450 v += 9; 2451 } 2452 if (!usecprow) xb += 3; 2453 } 2454 break; 2455 case 4: 2456 for (i=0; i<mbs; i++) { 2457 if (usecprow) xb = x + 4*ridx[i]; 2458 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 2459 ib = idx + ii[0]; 2460 n = ii[1] - ii[0]; ii++; 2461 for (j=0; j<n; j++) { 2462 rval = ib[j]*4; 2463 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4; 2464 z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4; 2465 z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4; 2466 z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4; 2467 v += 16; 2468 } 2469 if (!usecprow) xb += 4; 2470 } 2471 break; 2472 case 5: 2473 for (i=0; i<mbs; i++) { 2474 if (usecprow) xb = x + 5*ridx[i]; 2475 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2476 x4 = xb[3]; x5 = xb[4]; 2477 ib = idx + ii[0]; 2478 n = ii[1] - ii[0]; ii++; 2479 for (j=0; j<n; j++) { 2480 rval = ib[j]*5; 2481 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5; 2482 z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5; 2483 z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5; 2484 z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5; 2485 z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5; 2486 v += 25; 2487 } 2488 if (!usecprow) xb += 5; 2489 } 2490 break; 2491 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 2492 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet"); 2493 #if 0 2494 { 2495 PetscInt ncols,k,bs2=a->bs2; 2496 PetscScalar *work,*workt,zb; 2497 const PetscScalar *xtmp; 2498 if (!a->mult_work) { 2499 k = PetscMax(A->rmap->n,A->cmap->n); 2500 PetscCall(PetscMalloc1(k+1,&a->mult_work)); 2501 } 2502 work = a->mult_work; 2503 xtmp = x; 2504 for (i=0; i<mbs; i++) { 2505 n = ii[1] - ii[0]; ii++; 2506 ncols = n*bs; 2507 PetscCall(PetscArrayzero(work,ncols)); 2508 if (usecprow) xtmp = x + bs*ridx[i]; 2509 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 2510 v += n*bs2; 2511 if (!usecprow) xtmp += bs; 2512 workt = work; 2513 for (j=0; j<n; j++) { 2514 zb = z + bs*(*idx++); 2515 for (k=0; k<bs; k++) zb[k] += workt[k] ; 2516 workt += bs; 2517 } 2518 } 2519 } 2520 #endif 2521 } 2522 PetscCall(VecRestoreArrayRead(xx,&x)); 2523 PetscCall(VecRestoreArray(zz,&z)); 2524 PetscCall(PetscLogFlops(2.0*a->nz*a->bs2)); 2525 PetscFunctionReturn(0); 2526 } 2527 2528 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 2529 { 2530 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2531 PetscScalar *zb,*z,x1,x2,x3,x4,x5; 2532 const PetscScalar *x,*xb = NULL; 2533 const MatScalar *v; 2534 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2; 2535 const PetscInt *idx,*ii,*ib,*ridx = NULL; 2536 Mat_CompressedRow cprow = a->compressedrow; 2537 PetscBool usecprow=cprow.use; 2538 2539 PetscFunctionBegin; 2540 if (yy != zz) PetscCall(VecCopy(yy,zz)); 2541 PetscCall(VecGetArrayRead(xx,&x)); 2542 PetscCall(VecGetArray(zz,&z)); 2543 2544 idx = a->j; 2545 v = a->a; 2546 if (usecprow) { 2547 mbs = cprow.nrows; 2548 ii = cprow.i; 2549 ridx = cprow.rindex; 2550 } else { 2551 mbs=a->mbs; 2552 ii = a->i; 2553 xb = x; 2554 } 2555 2556 switch (bs) { 2557 case 1: 2558 for (i=0; i<mbs; i++) { 2559 if (usecprow) xb = x + ridx[i]; 2560 x1 = xb[0]; 2561 ib = idx + ii[0]; 2562 n = ii[1] - ii[0]; ii++; 2563 for (j=0; j<n; j++) { 2564 rval = ib[j]; 2565 z[rval] += *v * x1; 2566 v++; 2567 } 2568 if (!usecprow) xb++; 2569 } 2570 break; 2571 case 2: 2572 for (i=0; i<mbs; i++) { 2573 if (usecprow) xb = x + 2*ridx[i]; 2574 x1 = xb[0]; x2 = xb[1]; 2575 ib = idx + ii[0]; 2576 n = ii[1] - ii[0]; ii++; 2577 for (j=0; j<n; j++) { 2578 rval = ib[j]*2; 2579 z[rval++] += v[0]*x1 + v[1]*x2; 2580 z[rval++] += v[2]*x1 + v[3]*x2; 2581 v += 4; 2582 } 2583 if (!usecprow) xb += 2; 2584 } 2585 break; 2586 case 3: 2587 for (i=0; i<mbs; i++) { 2588 if (usecprow) xb = x + 3*ridx[i]; 2589 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2590 ib = idx + ii[0]; 2591 n = ii[1] - ii[0]; ii++; 2592 for (j=0; j<n; j++) { 2593 rval = ib[j]*3; 2594 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3; 2595 z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3; 2596 z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3; 2597 v += 9; 2598 } 2599 if (!usecprow) xb += 3; 2600 } 2601 break; 2602 case 4: 2603 for (i=0; i<mbs; i++) { 2604 if (usecprow) xb = x + 4*ridx[i]; 2605 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 2606 ib = idx + ii[0]; 2607 n = ii[1] - ii[0]; ii++; 2608 for (j=0; j<n; j++) { 2609 rval = ib[j]*4; 2610 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 2611 z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 2612 z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 2613 z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 2614 v += 16; 2615 } 2616 if (!usecprow) xb += 4; 2617 } 2618 break; 2619 case 5: 2620 for (i=0; i<mbs; i++) { 2621 if (usecprow) xb = x + 5*ridx[i]; 2622 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2623 x4 = xb[3]; x5 = xb[4]; 2624 ib = idx + ii[0]; 2625 n = ii[1] - ii[0]; ii++; 2626 for (j=0; j<n; j++) { 2627 rval = ib[j]*5; 2628 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 2629 z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 2630 z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 2631 z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 2632 z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 2633 v += 25; 2634 } 2635 if (!usecprow) xb += 5; 2636 } 2637 break; 2638 default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 2639 PetscInt ncols,k; 2640 PetscScalar *work,*workt; 2641 const PetscScalar *xtmp; 2642 if (!a->mult_work) { 2643 k = PetscMax(A->rmap->n,A->cmap->n); 2644 PetscCall(PetscMalloc1(k+1,&a->mult_work)); 2645 } 2646 work = a->mult_work; 2647 xtmp = x; 2648 for (i=0; i<mbs; i++) { 2649 n = ii[1] - ii[0]; ii++; 2650 ncols = n*bs; 2651 PetscCall(PetscArrayzero(work,ncols)); 2652 if (usecprow) xtmp = x + bs*ridx[i]; 2653 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 2654 v += n*bs2; 2655 if (!usecprow) xtmp += bs; 2656 workt = work; 2657 for (j=0; j<n; j++) { 2658 zb = z + bs*(*idx++); 2659 for (k=0; k<bs; k++) zb[k] += workt[k]; 2660 workt += bs; 2661 } 2662 } 2663 } 2664 } 2665 PetscCall(VecRestoreArrayRead(xx,&x)); 2666 PetscCall(VecRestoreArray(zz,&z)); 2667 PetscCall(PetscLogFlops(2.0*a->nz*a->bs2)); 2668 PetscFunctionReturn(0); 2669 } 2670 2671 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha) 2672 { 2673 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data; 2674 PetscInt totalnz = a->bs2*a->nz; 2675 PetscScalar oalpha = alpha; 2676 PetscBLASInt one = 1,tnz; 2677 2678 PetscFunctionBegin; 2679 PetscCall(PetscBLASIntCast(totalnz,&tnz)); 2680 PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one)); 2681 PetscCall(PetscLogFlops(totalnz)); 2682 PetscFunctionReturn(0); 2683 } 2684 2685 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm) 2686 { 2687 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2688 MatScalar *v = a->a; 2689 PetscReal sum = 0.0; 2690 PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1; 2691 2692 PetscFunctionBegin; 2693 if (type == NORM_FROBENIUS) { 2694 #if defined(PETSC_USE_REAL___FP16) 2695 PetscBLASInt one = 1,cnt = bs2*nz; 2696 PetscStackCallBLAS("BLASnrm2",*norm = BLASnrm2_(&cnt,v,&one)); 2697 #else 2698 for (i=0; i<bs2*nz; i++) { 2699 sum += PetscRealPart(PetscConj(*v)*(*v)); v++; 2700 } 2701 #endif 2702 *norm = PetscSqrtReal(sum); 2703 PetscCall(PetscLogFlops(2.0*bs2*nz)); 2704 } else if (type == NORM_1) { /* maximum column sum */ 2705 PetscReal *tmp; 2706 PetscInt *bcol = a->j; 2707 PetscCall(PetscCalloc1(A->cmap->n+1,&tmp)); 2708 for (i=0; i<nz; i++) { 2709 for (j=0; j<bs; j++) { 2710 k1 = bs*(*bcol) + j; /* column index */ 2711 for (k=0; k<bs; k++) { 2712 tmp[k1] += PetscAbsScalar(*v); v++; 2713 } 2714 } 2715 bcol++; 2716 } 2717 *norm = 0.0; 2718 for (j=0; j<A->cmap->n; j++) { 2719 if (tmp[j] > *norm) *norm = tmp[j]; 2720 } 2721 PetscCall(PetscFree(tmp)); 2722 PetscCall(PetscLogFlops(PetscMax(bs2*nz-1,0))); 2723 } else if (type == NORM_INFINITY) { /* maximum row sum */ 2724 *norm = 0.0; 2725 for (k=0; k<bs; k++) { 2726 for (j=0; j<a->mbs; j++) { 2727 v = a->a + bs2*a->i[j] + k; 2728 sum = 0.0; 2729 for (i=0; i<a->i[j+1]-a->i[j]; i++) { 2730 for (k1=0; k1<bs; k1++) { 2731 sum += PetscAbsScalar(*v); 2732 v += bs; 2733 } 2734 } 2735 if (sum > *norm) *norm = sum; 2736 } 2737 } 2738 PetscCall(PetscLogFlops(PetscMax(bs2*nz-1,0))); 2739 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 2740 PetscFunctionReturn(0); 2741 } 2742 2743 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg) 2744 { 2745 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 2746 2747 PetscFunctionBegin; 2748 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 2749 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 2750 *flg = PETSC_FALSE; 2751 PetscFunctionReturn(0); 2752 } 2753 2754 /* if the a->i are the same */ 2755 PetscCall(PetscArraycmp(a->i,b->i,a->mbs+1,flg)); 2756 if (!*flg) PetscFunctionReturn(0); 2757 2758 /* if a->j are the same */ 2759 PetscCall(PetscArraycmp(a->j,b->j,a->nz,flg)); 2760 if (!*flg) PetscFunctionReturn(0); 2761 2762 /* if a->a are the same */ 2763 PetscCall(PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs),flg)); 2764 PetscFunctionReturn(0); 2765 2766 } 2767 2768 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v) 2769 { 2770 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2771 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 2772 PetscScalar *x,zero = 0.0; 2773 MatScalar *aa,*aa_j; 2774 2775 PetscFunctionBegin; 2776 PetscCheck(!A->factortype,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 2777 bs = A->rmap->bs; 2778 aa = a->a; 2779 ai = a->i; 2780 aj = a->j; 2781 ambs = a->mbs; 2782 bs2 = a->bs2; 2783 2784 PetscCall(VecSet(v,zero)); 2785 PetscCall(VecGetArray(v,&x)); 2786 PetscCall(VecGetLocalSize(v,&n)); 2787 PetscCheck(n == A->rmap->N,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 2788 for (i=0; i<ambs; i++) { 2789 for (j=ai[i]; j<ai[i+1]; j++) { 2790 if (aj[j] == i) { 2791 row = i*bs; 2792 aa_j = aa+j*bs2; 2793 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 2794 break; 2795 } 2796 } 2797 } 2798 PetscCall(VecRestoreArray(v,&x)); 2799 PetscFunctionReturn(0); 2800 } 2801 2802 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr) 2803 { 2804 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2805 const PetscScalar *l,*r,*li,*ri; 2806 PetscScalar x; 2807 MatScalar *aa, *v; 2808 PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai; 2809 const PetscInt *ai,*aj; 2810 2811 PetscFunctionBegin; 2812 ai = a->i; 2813 aj = a->j; 2814 aa = a->a; 2815 m = A->rmap->n; 2816 n = A->cmap->n; 2817 bs = A->rmap->bs; 2818 mbs = a->mbs; 2819 bs2 = a->bs2; 2820 if (ll) { 2821 PetscCall(VecGetArrayRead(ll,&l)); 2822 PetscCall(VecGetLocalSize(ll,&lm)); 2823 PetscCheck(lm == m,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 2824 for (i=0; i<mbs; i++) { /* for each block row */ 2825 M = ai[i+1] - ai[i]; 2826 li = l + i*bs; 2827 v = aa + bs2*ai[i]; 2828 for (j=0; j<M; j++) { /* for each block */ 2829 for (k=0; k<bs2; k++) { 2830 (*v++) *= li[k%bs]; 2831 } 2832 } 2833 } 2834 PetscCall(VecRestoreArrayRead(ll,&l)); 2835 PetscCall(PetscLogFlops(a->nz)); 2836 } 2837 2838 if (rr) { 2839 PetscCall(VecGetArrayRead(rr,&r)); 2840 PetscCall(VecGetLocalSize(rr,&rn)); 2841 PetscCheck(rn == n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 2842 for (i=0; i<mbs; i++) { /* for each block row */ 2843 iai = ai[i]; 2844 M = ai[i+1] - iai; 2845 v = aa + bs2*iai; 2846 for (j=0; j<M; j++) { /* for each block */ 2847 ri = r + bs*aj[iai+j]; 2848 for (k=0; k<bs; k++) { 2849 x = ri[k]; 2850 for (tmp=0; tmp<bs; tmp++) v[tmp] *= x; 2851 v += bs; 2852 } 2853 } 2854 } 2855 PetscCall(VecRestoreArrayRead(rr,&r)); 2856 PetscCall(PetscLogFlops(a->nz)); 2857 } 2858 PetscFunctionReturn(0); 2859 } 2860 2861 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info) 2862 { 2863 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2864 2865 PetscFunctionBegin; 2866 info->block_size = a->bs2; 2867 info->nz_allocated = a->bs2*a->maxnz; 2868 info->nz_used = a->bs2*a->nz; 2869 info->nz_unneeded = info->nz_allocated - info->nz_used; 2870 info->assemblies = A->num_ass; 2871 info->mallocs = A->info.mallocs; 2872 info->memory = ((PetscObject)A)->mem; 2873 if (A->factortype) { 2874 info->fill_ratio_given = A->info.fill_ratio_given; 2875 info->fill_ratio_needed = A->info.fill_ratio_needed; 2876 info->factor_mallocs = A->info.factor_mallocs; 2877 } else { 2878 info->fill_ratio_given = 0; 2879 info->fill_ratio_needed = 0; 2880 info->factor_mallocs = 0; 2881 } 2882 PetscFunctionReturn(0); 2883 } 2884 2885 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 2886 { 2887 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2888 2889 PetscFunctionBegin; 2890 PetscCall(PetscArrayzero(a->a,a->bs2*a->i[a->mbs])); 2891 PetscFunctionReturn(0); 2892 } 2893 2894 PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C) 2895 { 2896 PetscFunctionBegin; 2897 PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C)); 2898 C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense; 2899 PetscFunctionReturn(0); 2900 } 2901 2902 PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 2903 { 2904 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2905 PetscScalar *z = NULL,sum1; 2906 const PetscScalar *xb; 2907 PetscScalar x1; 2908 const MatScalar *v,*vv; 2909 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; 2910 PetscBool usecprow=a->compressedrow.use; 2911 2912 PetscFunctionBegin; 2913 idx = a->j; 2914 v = a->a; 2915 if (usecprow) { 2916 mbs = a->compressedrow.nrows; 2917 ii = a->compressedrow.i; 2918 ridx = a->compressedrow.rindex; 2919 } else { 2920 mbs = a->mbs; 2921 ii = a->i; 2922 z = c; 2923 } 2924 2925 for (i=0; i<mbs; i++) { 2926 n = ii[1] - ii[0]; ii++; 2927 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2928 PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2929 if (usecprow) z = c + ridx[i]; 2930 jj = idx; 2931 vv = v; 2932 for (k=0; k<cn; k++) { 2933 idx = jj; 2934 v = vv; 2935 sum1 = 0.0; 2936 for (j=0; j<n; j++) { 2937 xb = b + (*idx++); x1 = xb[0+k*bm]; 2938 sum1 += v[0]*x1; 2939 v += 1; 2940 } 2941 z[0+k*cm] = sum1; 2942 } 2943 if (!usecprow) z += 1; 2944 } 2945 PetscFunctionReturn(0); 2946 } 2947 2948 PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 2949 { 2950 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2951 PetscScalar *z = NULL,sum1,sum2; 2952 const PetscScalar *xb; 2953 PetscScalar x1,x2; 2954 const MatScalar *v,*vv; 2955 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; 2956 PetscBool usecprow=a->compressedrow.use; 2957 2958 PetscFunctionBegin; 2959 idx = a->j; 2960 v = a->a; 2961 if (usecprow) { 2962 mbs = a->compressedrow.nrows; 2963 ii = a->compressedrow.i; 2964 ridx = a->compressedrow.rindex; 2965 } else { 2966 mbs = a->mbs; 2967 ii = a->i; 2968 z = c; 2969 } 2970 2971 for (i=0; i<mbs; i++) { 2972 n = ii[1] - ii[0]; ii++; 2973 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2974 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2975 if (usecprow) z = c + 2*ridx[i]; 2976 jj = idx; 2977 vv = v; 2978 for (k=0; k<cn; k++) { 2979 idx = jj; 2980 v = vv; 2981 sum1 = 0.0; sum2 = 0.0; 2982 for (j=0; j<n; j++) { 2983 xb = b + 2*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; 2984 sum1 += v[0]*x1 + v[2]*x2; 2985 sum2 += v[1]*x1 + v[3]*x2; 2986 v += 4; 2987 } 2988 z[0+k*cm] = sum1; z[1+k*cm] = sum2; 2989 } 2990 if (!usecprow) z += 2; 2991 } 2992 PetscFunctionReturn(0); 2993 } 2994 2995 PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 2996 { 2997 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2998 PetscScalar *z = NULL,sum1,sum2,sum3; 2999 const PetscScalar *xb; 3000 PetscScalar x1,x2,x3; 3001 const MatScalar *v,*vv; 3002 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; 3003 PetscBool usecprow=a->compressedrow.use; 3004 3005 PetscFunctionBegin; 3006 idx = a->j; 3007 v = a->a; 3008 if (usecprow) { 3009 mbs = a->compressedrow.nrows; 3010 ii = a->compressedrow.i; 3011 ridx = a->compressedrow.rindex; 3012 } else { 3013 mbs = a->mbs; 3014 ii = a->i; 3015 z = c; 3016 } 3017 3018 for (i=0; i<mbs; i++) { 3019 n = ii[1] - ii[0]; ii++; 3020 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3021 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3022 if (usecprow) z = c + 3*ridx[i]; 3023 jj = idx; 3024 vv = v; 3025 for (k=0; k<cn; k++) { 3026 idx = jj; 3027 v = vv; 3028 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 3029 for (j=0; j<n; j++) { 3030 xb = b + 3*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; 3031 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 3032 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 3033 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 3034 v += 9; 3035 } 3036 z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; 3037 } 3038 if (!usecprow) z += 3; 3039 } 3040 PetscFunctionReturn(0); 3041 } 3042 3043 PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 3044 { 3045 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 3046 PetscScalar *z = NULL,sum1,sum2,sum3,sum4; 3047 const PetscScalar *xb; 3048 PetscScalar x1,x2,x3,x4; 3049 const MatScalar *v,*vv; 3050 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; 3051 PetscBool usecprow=a->compressedrow.use; 3052 3053 PetscFunctionBegin; 3054 idx = a->j; 3055 v = a->a; 3056 if (usecprow) { 3057 mbs = a->compressedrow.nrows; 3058 ii = a->compressedrow.i; 3059 ridx = a->compressedrow.rindex; 3060 } else { 3061 mbs = a->mbs; 3062 ii = a->i; 3063 z = c; 3064 } 3065 3066 for (i=0; i<mbs; i++) { 3067 n = ii[1] - ii[0]; ii++; 3068 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3069 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3070 if (usecprow) z = c + 4*ridx[i]; 3071 jj = idx; 3072 vv = v; 3073 for (k=0; k<cn; k++) { 3074 idx = jj; 3075 v = vv; 3076 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; 3077 for (j=0; j<n; j++) { 3078 xb = b + 4*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; 3079 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3080 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3081 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3082 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3083 v += 16; 3084 } 3085 z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4; 3086 } 3087 if (!usecprow) z += 4; 3088 } 3089 PetscFunctionReturn(0); 3090 } 3091 3092 PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 3093 { 3094 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 3095 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5; 3096 const PetscScalar *xb; 3097 PetscScalar x1,x2,x3,x4,x5; 3098 const MatScalar *v,*vv; 3099 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL; 3100 PetscBool usecprow=a->compressedrow.use; 3101 3102 PetscFunctionBegin; 3103 idx = a->j; 3104 v = a->a; 3105 if (usecprow) { 3106 mbs = a->compressedrow.nrows; 3107 ii = a->compressedrow.i; 3108 ridx = a->compressedrow.rindex; 3109 } else { 3110 mbs = a->mbs; 3111 ii = a->i; 3112 z = c; 3113 } 3114 3115 for (i=0; i<mbs; i++) { 3116 n = ii[1] - ii[0]; ii++; 3117 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 3118 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 3119 if (usecprow) z = c + 5*ridx[i]; 3120 jj = idx; 3121 vv = v; 3122 for (k=0; k<cn; k++) { 3123 idx = jj; 3124 v = vv; 3125 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 3126 for (j=0; j<n; j++) { 3127 xb = b + 5*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; x5 = xb[4+k*bm]; 3128 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3129 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3130 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3131 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3132 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3133 v += 25; 3134 } 3135 z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4; z[4+k*cm] = sum5; 3136 } 3137 if (!usecprow) z += 5; 3138 } 3139 PetscFunctionReturn(0); 3140 } 3141 3142 PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A,Mat B,Mat C) 3143 { 3144 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 3145 Mat_SeqDense *bd = (Mat_SeqDense*)B->data; 3146 Mat_SeqDense *cd = (Mat_SeqDense*)C->data; 3147 PetscInt cm=cd->lda,cn=B->cmap->n,bm=bd->lda; 3148 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 3149 PetscBLASInt bbs,bcn,bbm,bcm; 3150 PetscScalar *z = NULL; 3151 PetscScalar *c,*b; 3152 const MatScalar *v; 3153 const PetscInt *idx,*ii,*ridx=NULL; 3154 PetscScalar _DZero=0.0,_DOne=1.0; 3155 PetscBool usecprow=a->compressedrow.use; 3156 3157 PetscFunctionBegin; 3158 if (!cm || !cn) PetscFunctionReturn(0); 3159 PetscCheck(B->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT,A->cmap->n,B->rmap->n); 3160 PetscCheck(A->rmap->n == C->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT,C->rmap->n,A->rmap->n); 3161 PetscCheck(B->cmap->n == C->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT,B->cmap->n,C->cmap->n); 3162 b = bd->v; 3163 if (a->nonzerorowcnt != A->rmap->n) { 3164 PetscCall(MatZeroEntries(C)); 3165 } 3166 PetscCall(MatDenseGetArray(C,&c)); 3167 switch (bs) { 3168 case 1: 3169 PetscCall(MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn)); 3170 break; 3171 case 2: 3172 PetscCall(MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn)); 3173 break; 3174 case 3: 3175 PetscCall(MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn)); 3176 break; 3177 case 4: 3178 PetscCall(MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn)); 3179 break; 3180 case 5: 3181 PetscCall(MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn)); 3182 break; 3183 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 3184 PetscCall(PetscBLASIntCast(bs,&bbs)); 3185 PetscCall(PetscBLASIntCast(cn,&bcn)); 3186 PetscCall(PetscBLASIntCast(bm,&bbm)); 3187 PetscCall(PetscBLASIntCast(cm,&bcm)); 3188 idx = a->j; 3189 v = a->a; 3190 if (usecprow) { 3191 mbs = a->compressedrow.nrows; 3192 ii = a->compressedrow.i; 3193 ridx = a->compressedrow.rindex; 3194 } else { 3195 mbs = a->mbs; 3196 ii = a->i; 3197 z = c; 3198 } 3199 for (i=0; i<mbs; i++) { 3200 n = ii[1] - ii[0]; ii++; 3201 if (usecprow) z = c + bs*ridx[i]; 3202 if (n) { 3203 PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DZero,z,&bcm)); 3204 v += bs2; 3205 } 3206 for (j=1; j<n; j++) { 3207 PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm)); 3208 v += bs2; 3209 } 3210 if (!usecprow) z += bs; 3211 } 3212 } 3213 PetscCall(MatDenseRestoreArray(C,&c)); 3214 PetscCall(PetscLogFlops((2.0*a->nz*bs2 - bs*a->nonzerorowcnt)*cn)); 3215 PetscFunctionReturn(0); 3216 } 3217