1 2 #include <../src/mat/impls/baij/seq/baij.h> 3 #include <../src/mat/impls/dense/seq/dense.h> 4 #include <../src/mat/impls/sbaij/seq/sbaij.h> 5 #include <petsc/private/kernels/blockinvert.h> 6 #include <petscbt.h> 7 #include <petscblaslapack.h> 8 9 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 10 { 11 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 12 PetscInt brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2; 13 const PetscInt *idx; 14 PetscBT table_out,table_in; 15 16 PetscFunctionBegin; 17 PetscCheck(ov >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 18 mbs = a->mbs; 19 ai = a->i; 20 aj = a->j; 21 bs = A->rmap->bs; 22 PetscCall(PetscBTCreate(mbs,&table_out)); 23 PetscCall(PetscMalloc1(mbs+1,&nidx)); 24 PetscCall(PetscMalloc1(A->rmap->N+1,&nidx2)); 25 PetscCall(PetscBTCreate(mbs,&table_in)); 26 27 for (i=0; i<is_max; i++) { /* for each is */ 28 isz = 0; 29 PetscCall(PetscBTMemzero(mbs,table_out)); 30 31 /* Extract the indices, assume there can be duplicate entries */ 32 PetscCall(ISGetIndices(is[i],&idx)); 33 PetscCall(ISGetLocalSize(is[i],&n)); 34 35 /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */ 36 bcol_max = 0; 37 for (j=0; j<n; ++j) { 38 brow = idx[j]/bs; /* convert the indices into block indices */ 39 PetscCheck(brow < mbs,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 40 if (!PetscBTLookupSet(table_out,brow)) { 41 nidx[isz++] = brow; 42 if (bcol_max < brow) bcol_max = brow; 43 } 44 } 45 PetscCall(ISRestoreIndices(is[i],&idx)); 46 PetscCall(ISDestroy(&is[i])); 47 48 k = 0; 49 for (j=0; j<ov; j++) { /* for each overlap */ 50 /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 51 PetscCall(PetscBTMemzero(mbs,table_in)); 52 for (l=k; l<isz; l++) PetscCall(PetscBTSet(table_in,nidx[l])); 53 54 n = isz; /* length of the updated is[i] */ 55 for (brow=0; brow<mbs; brow++) { 56 start = ai[brow]; end = ai[brow+1]; 57 if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */ 58 for (l = start; l<end; l++) { 59 bcol = aj[l]; 60 if (!PetscBTLookupSet(table_out,bcol)) { 61 nidx[isz++] = bcol; 62 if (bcol_max < bcol) bcol_max = bcol; 63 } 64 } 65 k++; 66 if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 67 } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 68 for (l = start; l<end; l++) { 69 bcol = aj[l]; 70 if (bcol > bcol_max) break; 71 if (PetscBTLookup(table_in,bcol)) { 72 if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow; 73 break; /* for l = start; l<end ; l++) */ 74 } 75 } 76 } 77 } 78 } /* for each overlap */ 79 80 /* expand the Index Set */ 81 for (j=0; j<isz; j++) { 82 for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k; 83 } 84 PetscCall(ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i)); 85 } 86 PetscCall(PetscBTDestroy(&table_out)); 87 PetscCall(PetscFree(nidx)); 88 PetscCall(PetscFree(nidx2)); 89 PetscCall(PetscBTDestroy(&table_in)); 90 PetscFunctionReturn(0); 91 } 92 93 /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc. 94 Zero some ops' to avoid invalid usse */ 95 PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq) 96 { 97 PetscFunctionBegin; 98 PetscCall(MatSetOption(Bseq,MAT_SYMMETRIC,PETSC_FALSE)); 99 Bseq->ops->mult = NULL; 100 Bseq->ops->multadd = NULL; 101 Bseq->ops->multtranspose = NULL; 102 Bseq->ops->multtransposeadd = NULL; 103 Bseq->ops->lufactor = NULL; 104 Bseq->ops->choleskyfactor = NULL; 105 Bseq->ops->lufactorsymbolic = NULL; 106 Bseq->ops->choleskyfactorsymbolic = NULL; 107 Bseq->ops->getinertia = NULL; 108 PetscFunctionReturn(0); 109 } 110 111 /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */ 112 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 113 { 114 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c; 115 PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 116 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 117 const PetscInt *irow,*icol; 118 PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 119 PetscInt *aj = a->j,*ai = a->i; 120 MatScalar *mat_a; 121 Mat C; 122 PetscBool flag; 123 124 PetscFunctionBegin; 125 126 PetscCall(ISGetIndices(isrow,&irow)); 127 PetscCall(ISGetIndices(iscol,&icol)); 128 PetscCall(ISGetLocalSize(isrow,&nrows)); 129 PetscCall(ISGetLocalSize(iscol,&ncols)); 130 131 PetscCall(PetscCalloc1(1+oldcols,&smap)); 132 ssmap = smap; 133 PetscCall(PetscMalloc1(1+nrows,&lens)); 134 for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 135 /* determine lens of each row */ 136 for (i=0; i<nrows; i++) { 137 kstart = ai[irow[i]]; 138 kend = kstart + a->ilen[irow[i]]; 139 lens[i] = 0; 140 for (k=kstart; k<kend; k++) { 141 if (ssmap[aj[k]]) lens[i]++; 142 } 143 } 144 /* Create and fill new matrix */ 145 if (scall == MAT_REUSE_MATRIX) { 146 c = (Mat_SeqSBAIJ*)((*B)->data); 147 148 PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 149 PetscCall(PetscArraycmp(c->ilen,lens,c->mbs,&flag)); 150 PetscCheck(flag,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 151 PetscCall(PetscArrayzero(c->ilen,c->mbs)); 152 C = *B; 153 } else { 154 PetscCall(MatCreate(PetscObjectComm((PetscObject)A),&C)); 155 PetscCall(MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE)); 156 PetscCall(MatSetType(C,((PetscObject)A)->type_name)); 157 PetscCall(MatSeqSBAIJSetPreallocation(C,bs,0,lens)); 158 } 159 c = (Mat_SeqSBAIJ*)(C->data); 160 for (i=0; i<nrows; i++) { 161 row = irow[i]; 162 kstart = ai[row]; 163 kend = kstart + a->ilen[row]; 164 mat_i = c->i[i]; 165 mat_j = c->j + mat_i; 166 mat_a = c->a + mat_i*bs2; 167 mat_ilen = c->ilen + i; 168 for (k=kstart; k<kend; k++) { 169 if ((tcol=ssmap[a->j[k]])) { 170 *mat_j++ = tcol - 1; 171 PetscCall(PetscArraycpy(mat_a,a->a+k*bs2,bs2)); 172 mat_a += bs2; 173 (*mat_ilen)++; 174 } 175 } 176 } 177 /* sort */ 178 { 179 MatScalar *work; 180 181 PetscCall(PetscMalloc1(bs2,&work)); 182 for (i=0; i<nrows; i++) { 183 PetscInt ilen; 184 mat_i = c->i[i]; 185 mat_j = c->j + mat_i; 186 mat_a = c->a + mat_i*bs2; 187 ilen = c->ilen[i]; 188 PetscCall(PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work)); 189 } 190 PetscCall(PetscFree(work)); 191 } 192 193 /* Free work space */ 194 PetscCall(ISRestoreIndices(iscol,&icol)); 195 PetscCall(PetscFree(smap)); 196 PetscCall(PetscFree(lens)); 197 PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY)); 198 PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY)); 199 200 PetscCall(ISRestoreIndices(isrow,&irow)); 201 *B = C; 202 PetscFunctionReturn(0); 203 } 204 205 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 206 { 207 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 208 IS is1,is2; 209 PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs; 210 const PetscInt *irow,*icol; 211 212 PetscFunctionBegin; 213 PetscCall(ISGetIndices(isrow,&irow)); 214 PetscCall(ISGetIndices(iscol,&icol)); 215 PetscCall(ISGetLocalSize(isrow,&nrows)); 216 PetscCall(ISGetLocalSize(iscol,&ncols)); 217 218 /* Verify if the indices corespond to each element in a block 219 and form the IS with compressed IS */ 220 maxmnbs = PetscMax(a->mbs,a->nbs); 221 PetscCall(PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary)); 222 PetscCall(PetscArrayzero(vary,a->mbs)); 223 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 224 for (i=0; i<a->mbs; i++) { 225 PetscCheck(vary[i] == 0 || vary[i] == bs,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 226 } 227 count = 0; 228 for (i=0; i<nrows; i++) { 229 PetscInt j = irow[i] / bs; 230 if ((vary[j]--)==bs) iary[count++] = j; 231 } 232 PetscCall(ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1)); 233 234 PetscCall(PetscArrayzero(vary,a->nbs)); 235 for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 236 for (i=0; i<a->nbs; i++) { 237 PetscCheck(vary[i] == 0 || vary[i] == bs,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc"); 238 } 239 count = 0; 240 for (i=0; i<ncols; i++) { 241 PetscInt j = icol[i] / bs; 242 if ((vary[j]--)==bs) iary[count++] = j; 243 } 244 PetscCall(ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2)); 245 PetscCall(ISRestoreIndices(isrow,&irow)); 246 PetscCall(ISRestoreIndices(iscol,&icol)); 247 PetscCall(PetscFree2(vary,iary)); 248 249 PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A,is1,is2,scall,B)); 250 PetscCall(ISDestroy(&is1)); 251 PetscCall(ISDestroy(&is2)); 252 253 if (isrow != iscol) { 254 PetscBool isequal; 255 PetscCall(ISEqual(isrow,iscol,&isequal)); 256 if (!isequal) { 257 PetscCall(MatSeqSBAIJZeroOps_Private(*B)); 258 } 259 } 260 PetscFunctionReturn(0); 261 } 262 263 PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 264 { 265 PetscInt i; 266 267 PetscFunctionBegin; 268 if (scall == MAT_INITIAL_MATRIX) { 269 PetscCall(PetscCalloc1(n+1,B)); 270 } 271 272 for (i=0; i<n; i++) { 273 PetscCall(MatCreateSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i])); 274 } 275 PetscFunctionReturn(0); 276 } 277 278 /* -------------------------------------------------------*/ 279 /* Should check that shapes of vectors and matrices match */ 280 /* -------------------------------------------------------*/ 281 282 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz) 283 { 284 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 285 PetscScalar *z,x1,x2,zero=0.0; 286 const PetscScalar *x,*xb; 287 const MatScalar *v; 288 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 289 const PetscInt *aj=a->j,*ai=a->i,*ib; 290 PetscInt nonzerorow=0; 291 292 PetscFunctionBegin; 293 PetscCall(VecSet(zz,zero)); 294 if (!a->nz) PetscFunctionReturn(0); 295 PetscCall(VecGetArrayRead(xx,&x)); 296 PetscCall(VecGetArray(zz,&z)); 297 298 v = a->a; 299 xb = x; 300 301 for (i=0; i<mbs; i++) { 302 n = ai[1] - ai[0]; /* length of i_th block row of A */ 303 x1 = xb[0]; x2 = xb[1]; 304 ib = aj + *ai; 305 jmin = 0; 306 nonzerorow += (n>0); 307 if (*ib == i) { /* (diag of A)*x */ 308 z[2*i] += v[0]*x1 + v[2]*x2; 309 z[2*i+1] += v[2]*x1 + v[3]*x2; 310 v += 4; jmin++; 311 } 312 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 313 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 314 for (j=jmin; j<n; j++) { 315 /* (strict lower triangular part of A)*x */ 316 cval = ib[j]*2; 317 z[cval] += v[0]*x1 + v[1]*x2; 318 z[cval+1] += v[2]*x1 + v[3]*x2; 319 /* (strict upper triangular part of A)*x */ 320 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 321 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 322 v += 4; 323 } 324 xb +=2; ai++; 325 } 326 327 PetscCall(VecRestoreArrayRead(xx,&x)); 328 PetscCall(VecRestoreArray(zz,&z)); 329 PetscCall(PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow)); 330 PetscFunctionReturn(0); 331 } 332 333 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz) 334 { 335 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 336 PetscScalar *z,x1,x2,x3,zero=0.0; 337 const PetscScalar *x,*xb; 338 const MatScalar *v; 339 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 340 const PetscInt *aj = a->j,*ai = a->i,*ib; 341 PetscInt nonzerorow=0; 342 343 PetscFunctionBegin; 344 PetscCall(VecSet(zz,zero)); 345 if (!a->nz) PetscFunctionReturn(0); 346 PetscCall(VecGetArrayRead(xx,&x)); 347 PetscCall(VecGetArray(zz,&z)); 348 349 v = a->a; 350 xb = x; 351 352 for (i=0; i<mbs; i++) { 353 n = ai[1] - ai[0]; /* length of i_th block row of A */ 354 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 355 ib = aj + *ai; 356 jmin = 0; 357 nonzerorow += (n>0); 358 if (*ib == i) { /* (diag of A)*x */ 359 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 360 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 361 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 362 v += 9; jmin++; 363 } 364 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 365 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 366 for (j=jmin; j<n; j++) { 367 /* (strict lower triangular part of A)*x */ 368 cval = ib[j]*3; 369 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 370 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 371 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 372 /* (strict upper triangular part of A)*x */ 373 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 374 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 375 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 376 v += 9; 377 } 378 xb +=3; ai++; 379 } 380 381 PetscCall(VecRestoreArrayRead(xx,&x)); 382 PetscCall(VecRestoreArray(zz,&z)); 383 PetscCall(PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow)); 384 PetscFunctionReturn(0); 385 } 386 387 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz) 388 { 389 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 390 PetscScalar *z,x1,x2,x3,x4,zero=0.0; 391 const PetscScalar *x,*xb; 392 const MatScalar *v; 393 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 394 const PetscInt *aj = a->j,*ai = a->i,*ib; 395 PetscInt nonzerorow = 0; 396 397 PetscFunctionBegin; 398 PetscCall(VecSet(zz,zero)); 399 if (!a->nz) PetscFunctionReturn(0); 400 PetscCall(VecGetArrayRead(xx,&x)); 401 PetscCall(VecGetArray(zz,&z)); 402 403 v = a->a; 404 xb = x; 405 406 for (i=0; i<mbs; i++) { 407 n = ai[1] - ai[0]; /* length of i_th block row of A */ 408 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 409 ib = aj + *ai; 410 jmin = 0; 411 nonzerorow += (n>0); 412 if (*ib == i) { /* (diag of A)*x */ 413 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 414 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 415 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 416 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 417 v += 16; jmin++; 418 } 419 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 420 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 421 for (j=jmin; j<n; j++) { 422 /* (strict lower triangular part of A)*x */ 423 cval = ib[j]*4; 424 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 425 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 426 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 427 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 428 /* (strict upper triangular part of A)*x */ 429 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 430 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 431 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 432 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 433 v += 16; 434 } 435 xb +=4; ai++; 436 } 437 438 PetscCall(VecRestoreArrayRead(xx,&x)); 439 PetscCall(VecRestoreArray(zz,&z)); 440 PetscCall(PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow)); 441 PetscFunctionReturn(0); 442 } 443 444 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz) 445 { 446 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 447 PetscScalar *z,x1,x2,x3,x4,x5,zero=0.0; 448 const PetscScalar *x,*xb; 449 const MatScalar *v; 450 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 451 const PetscInt *aj = a->j,*ai = a->i,*ib; 452 PetscInt nonzerorow=0; 453 454 PetscFunctionBegin; 455 PetscCall(VecSet(zz,zero)); 456 if (!a->nz) PetscFunctionReturn(0); 457 PetscCall(VecGetArrayRead(xx,&x)); 458 PetscCall(VecGetArray(zz,&z)); 459 460 v = a->a; 461 xb = x; 462 463 for (i=0; i<mbs; i++) { 464 n = ai[1] - ai[0]; /* length of i_th block row of A */ 465 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 466 ib = aj + *ai; 467 jmin = 0; 468 nonzerorow += (n>0); 469 if (*ib == i) { /* (diag of A)*x */ 470 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 471 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 472 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 473 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 474 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 475 v += 25; jmin++; 476 } 477 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 478 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 479 for (j=jmin; j<n; j++) { 480 /* (strict lower triangular part of A)*x */ 481 cval = ib[j]*5; 482 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 483 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 484 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 485 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 486 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 487 /* (strict upper triangular part of A)*x */ 488 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 489 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 490 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 491 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 492 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 493 v += 25; 494 } 495 xb +=5; ai++; 496 } 497 498 PetscCall(VecRestoreArrayRead(xx,&x)); 499 PetscCall(VecRestoreArray(zz,&z)); 500 PetscCall(PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow)); 501 PetscFunctionReturn(0); 502 } 503 504 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz) 505 { 506 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 507 PetscScalar *z,x1,x2,x3,x4,x5,x6,zero=0.0; 508 const PetscScalar *x,*xb; 509 const MatScalar *v; 510 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 511 const PetscInt *aj=a->j,*ai=a->i,*ib; 512 PetscInt nonzerorow=0; 513 514 PetscFunctionBegin; 515 PetscCall(VecSet(zz,zero)); 516 if (!a->nz) PetscFunctionReturn(0); 517 PetscCall(VecGetArrayRead(xx,&x)); 518 PetscCall(VecGetArray(zz,&z)); 519 520 v = a->a; 521 xb = x; 522 523 for (i=0; i<mbs; i++) { 524 n = ai[1] - ai[0]; /* length of i_th block row of A */ 525 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 526 ib = aj + *ai; 527 jmin = 0; 528 nonzerorow += (n>0); 529 if (*ib == i) { /* (diag of A)*x */ 530 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 531 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 532 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 533 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 534 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 535 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 536 v += 36; jmin++; 537 } 538 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 539 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 540 for (j=jmin; j<n; j++) { 541 /* (strict lower triangular part of A)*x */ 542 cval = ib[j]*6; 543 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 544 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 545 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 546 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 547 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 548 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 549 /* (strict upper triangular part of A)*x */ 550 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 551 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 552 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 553 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 554 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 555 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 556 v += 36; 557 } 558 xb +=6; ai++; 559 } 560 561 PetscCall(VecRestoreArrayRead(xx,&x)); 562 PetscCall(VecRestoreArray(zz,&z)); 563 PetscCall(PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow)); 564 PetscFunctionReturn(0); 565 } 566 567 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz) 568 { 569 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 570 PetscScalar *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0; 571 const PetscScalar *x,*xb; 572 const MatScalar *v; 573 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 574 const PetscInt *aj=a->j,*ai=a->i,*ib; 575 PetscInt nonzerorow=0; 576 577 PetscFunctionBegin; 578 PetscCall(VecSet(zz,zero)); 579 if (!a->nz) PetscFunctionReturn(0); 580 PetscCall(VecGetArrayRead(xx,&x)); 581 PetscCall(VecGetArray(zz,&z)); 582 583 v = a->a; 584 xb = x; 585 586 for (i=0; i<mbs; i++) { 587 n = ai[1] - ai[0]; /* length of i_th block row of A */ 588 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 589 ib = aj + *ai; 590 jmin = 0; 591 nonzerorow += (n>0); 592 if (*ib == i) { /* (diag of A)*x */ 593 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 594 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 595 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 596 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 597 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 598 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 599 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 600 v += 49; jmin++; 601 } 602 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 603 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 604 for (j=jmin; j<n; j++) { 605 /* (strict lower triangular part of A)*x */ 606 cval = ib[j]*7; 607 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 608 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 609 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 610 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 611 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 612 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 613 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 614 /* (strict upper triangular part of A)*x */ 615 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 616 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 617 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 618 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 619 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 620 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 621 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 622 v += 49; 623 } 624 xb +=7; ai++; 625 } 626 PetscCall(VecRestoreArrayRead(xx,&x)); 627 PetscCall(VecRestoreArray(zz,&z)); 628 PetscCall(PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow)); 629 PetscFunctionReturn(0); 630 } 631 632 /* 633 This will not work with MatScalar == float because it calls the BLAS 634 */ 635 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz) 636 { 637 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 638 PetscScalar *z,*z_ptr,*zb,*work,*workt,zero=0.0; 639 const PetscScalar *x,*x_ptr,*xb; 640 const MatScalar *v; 641 PetscInt mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 642 const PetscInt *idx,*aj,*ii; 643 PetscInt nonzerorow=0; 644 645 PetscFunctionBegin; 646 PetscCall(VecSet(zz,zero)); 647 if (!a->nz) PetscFunctionReturn(0); 648 PetscCall(VecGetArrayRead(xx,&x)); 649 PetscCall(VecGetArray(zz,&z)); 650 651 x_ptr = x; 652 z_ptr = z; 653 654 aj = a->j; 655 v = a->a; 656 ii = a->i; 657 658 if (!a->mult_work) { 659 PetscCall(PetscMalloc1(A->rmap->N+1,&a->mult_work)); 660 } 661 work = a->mult_work; 662 663 for (i=0; i<mbs; i++) { 664 n = ii[1] - ii[0]; ncols = n*bs; 665 workt = work; idx=aj+ii[0]; 666 nonzerorow += (n>0); 667 668 /* upper triangular part */ 669 for (j=0; j<n; j++) { 670 xb = x_ptr + bs*(*idx++); 671 for (k=0; k<bs; k++) workt[k] = xb[k]; 672 workt += bs; 673 } 674 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 675 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 676 677 /* strict lower triangular part */ 678 idx = aj+ii[0]; 679 if (n && *idx == i) { 680 ncols -= bs; v += bs2; idx++; n--; 681 } 682 683 if (ncols > 0) { 684 workt = work; 685 PetscCall(PetscArrayzero(workt,ncols)); 686 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 687 for (j=0; j<n; j++) { 688 zb = z_ptr + bs*(*idx++); 689 for (k=0; k<bs; k++) zb[k] += workt[k]; 690 workt += bs; 691 } 692 } 693 x += bs; v += n*bs2; z += bs; ii++; 694 } 695 696 PetscCall(VecRestoreArrayRead(xx,&x)); 697 PetscCall(VecRestoreArray(zz,&z)); 698 PetscCall(PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow)); 699 PetscFunctionReturn(0); 700 } 701 702 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 703 { 704 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 705 PetscScalar *z,x1; 706 const PetscScalar *x,*xb; 707 const MatScalar *v; 708 PetscInt mbs =a->mbs,i,n,cval,j,jmin; 709 const PetscInt *aj=a->j,*ai=a->i,*ib; 710 PetscInt nonzerorow=0; 711 #if defined(PETSC_USE_COMPLEX) 712 const int aconj = A->hermitian == PETSC_BOOL3_TRUE; 713 #else 714 const int aconj = 0; 715 #endif 716 717 PetscFunctionBegin; 718 PetscCall(VecCopy(yy,zz)); 719 if (!a->nz) PetscFunctionReturn(0); 720 PetscCall(VecGetArrayRead(xx,&x)); 721 PetscCall(VecGetArray(zz,&z)); 722 v = a->a; 723 xb = x; 724 725 for (i=0; i<mbs; i++) { 726 n = ai[1] - ai[0]; /* length of i_th row of A */ 727 x1 = xb[0]; 728 ib = aj + *ai; 729 jmin = 0; 730 nonzerorow += (n>0); 731 if (n && *ib == i) { /* (diag of A)*x */ 732 z[i] += *v++ * x[*ib++]; jmin++; 733 } 734 if (aconj) { 735 for (j=jmin; j<n; j++) { 736 cval = *ib; 737 z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x */ 738 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 739 } 740 } else { 741 for (j=jmin; j<n; j++) { 742 cval = *ib; 743 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 744 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 745 } 746 } 747 xb++; ai++; 748 } 749 750 PetscCall(VecRestoreArrayRead(xx,&x)); 751 PetscCall(VecRestoreArray(zz,&z)); 752 753 PetscCall(PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow))); 754 PetscFunctionReturn(0); 755 } 756 757 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 758 { 759 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 760 PetscScalar *z,x1,x2; 761 const PetscScalar *x,*xb; 762 const MatScalar *v; 763 PetscInt mbs =a->mbs,i,n,cval,j,jmin; 764 const PetscInt *aj=a->j,*ai=a->i,*ib; 765 PetscInt nonzerorow=0; 766 767 PetscFunctionBegin; 768 PetscCall(VecCopy(yy,zz)); 769 if (!a->nz) PetscFunctionReturn(0); 770 PetscCall(VecGetArrayRead(xx,&x)); 771 PetscCall(VecGetArray(zz,&z)); 772 773 v = a->a; 774 xb = x; 775 776 for (i=0; i<mbs; i++) { 777 n = ai[1] - ai[0]; /* length of i_th block row of A */ 778 x1 = xb[0]; x2 = xb[1]; 779 ib = aj + *ai; 780 jmin = 0; 781 nonzerorow += (n>0); 782 if (n && *ib == i) { /* (diag of A)*x */ 783 z[2*i] += v[0]*x1 + v[2]*x2; 784 z[2*i+1] += v[2]*x1 + v[3]*x2; 785 v += 4; jmin++; 786 } 787 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 788 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 789 for (j=jmin; j<n; j++) { 790 /* (strict lower triangular part of A)*x */ 791 cval = ib[j]*2; 792 z[cval] += v[0]*x1 + v[1]*x2; 793 z[cval+1] += v[2]*x1 + v[3]*x2; 794 /* (strict upper triangular part of A)*x */ 795 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 796 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 797 v += 4; 798 } 799 xb +=2; ai++; 800 } 801 PetscCall(VecRestoreArrayRead(xx,&x)); 802 PetscCall(VecRestoreArray(zz,&z)); 803 804 PetscCall(PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow))); 805 PetscFunctionReturn(0); 806 } 807 808 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 809 { 810 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 811 PetscScalar *z,x1,x2,x3; 812 const PetscScalar *x,*xb; 813 const MatScalar *v; 814 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 815 const PetscInt *aj=a->j,*ai=a->i,*ib; 816 PetscInt nonzerorow=0; 817 818 PetscFunctionBegin; 819 PetscCall(VecCopy(yy,zz)); 820 if (!a->nz) PetscFunctionReturn(0); 821 PetscCall(VecGetArrayRead(xx,&x)); 822 PetscCall(VecGetArray(zz,&z)); 823 824 v = a->a; 825 xb = x; 826 827 for (i=0; i<mbs; i++) { 828 n = ai[1] - ai[0]; /* length of i_th block row of A */ 829 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 830 ib = aj + *ai; 831 jmin = 0; 832 nonzerorow += (n>0); 833 if (n && *ib == i) { /* (diag of A)*x */ 834 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 835 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 836 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 837 v += 9; jmin++; 838 } 839 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 840 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 841 for (j=jmin; j<n; j++) { 842 /* (strict lower triangular part of A)*x */ 843 cval = ib[j]*3; 844 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 845 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 846 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 847 /* (strict upper triangular part of A)*x */ 848 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 849 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 850 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 851 v += 9; 852 } 853 xb +=3; ai++; 854 } 855 856 PetscCall(VecRestoreArrayRead(xx,&x)); 857 PetscCall(VecRestoreArray(zz,&z)); 858 859 PetscCall(PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow))); 860 PetscFunctionReturn(0); 861 } 862 863 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 864 { 865 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 866 PetscScalar *z,x1,x2,x3,x4; 867 const PetscScalar *x,*xb; 868 const MatScalar *v; 869 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 870 const PetscInt *aj=a->j,*ai=a->i,*ib; 871 PetscInt nonzerorow=0; 872 873 PetscFunctionBegin; 874 PetscCall(VecCopy(yy,zz)); 875 if (!a->nz) PetscFunctionReturn(0); 876 PetscCall(VecGetArrayRead(xx,&x)); 877 PetscCall(VecGetArray(zz,&z)); 878 879 v = a->a; 880 xb = x; 881 882 for (i=0; i<mbs; i++) { 883 n = ai[1] - ai[0]; /* length of i_th block row of A */ 884 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 885 ib = aj + *ai; 886 jmin = 0; 887 nonzerorow += (n>0); 888 if (n && *ib == i) { /* (diag of A)*x */ 889 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 890 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 891 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 892 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 893 v += 16; jmin++; 894 } 895 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 896 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 897 for (j=jmin; j<n; j++) { 898 /* (strict lower triangular part of A)*x */ 899 cval = ib[j]*4; 900 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 901 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 902 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 903 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 904 /* (strict upper triangular part of A)*x */ 905 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 906 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 907 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 908 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 909 v += 16; 910 } 911 xb +=4; ai++; 912 } 913 914 PetscCall(VecRestoreArrayRead(xx,&x)); 915 PetscCall(VecRestoreArray(zz,&z)); 916 917 PetscCall(PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow))); 918 PetscFunctionReturn(0); 919 } 920 921 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 922 { 923 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 924 PetscScalar *z,x1,x2,x3,x4,x5; 925 const PetscScalar *x,*xb; 926 const MatScalar *v; 927 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 928 const PetscInt *aj=a->j,*ai=a->i,*ib; 929 PetscInt nonzerorow=0; 930 931 PetscFunctionBegin; 932 PetscCall(VecCopy(yy,zz)); 933 if (!a->nz) PetscFunctionReturn(0); 934 PetscCall(VecGetArrayRead(xx,&x)); 935 PetscCall(VecGetArray(zz,&z)); 936 937 v = a->a; 938 xb = x; 939 940 for (i=0; i<mbs; i++) { 941 n = ai[1] - ai[0]; /* length of i_th block row of A */ 942 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 943 ib = aj + *ai; 944 jmin = 0; 945 nonzerorow += (n>0); 946 if (n && *ib == i) { /* (diag of A)*x */ 947 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 948 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 949 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 950 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 951 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 952 v += 25; jmin++; 953 } 954 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 955 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 956 for (j=jmin; j<n; j++) { 957 /* (strict lower triangular part of A)*x */ 958 cval = ib[j]*5; 959 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 960 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 961 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 962 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 963 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 964 /* (strict upper triangular part of A)*x */ 965 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 966 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 967 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 968 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 969 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 970 v += 25; 971 } 972 xb +=5; ai++; 973 } 974 975 PetscCall(VecRestoreArrayRead(xx,&x)); 976 PetscCall(VecRestoreArray(zz,&z)); 977 978 PetscCall(PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow))); 979 PetscFunctionReturn(0); 980 } 981 982 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 983 { 984 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 985 PetscScalar *z,x1,x2,x3,x4,x5,x6; 986 const PetscScalar *x,*xb; 987 const MatScalar *v; 988 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 989 const PetscInt *aj=a->j,*ai=a->i,*ib; 990 PetscInt nonzerorow=0; 991 992 PetscFunctionBegin; 993 PetscCall(VecCopy(yy,zz)); 994 if (!a->nz) PetscFunctionReturn(0); 995 PetscCall(VecGetArrayRead(xx,&x)); 996 PetscCall(VecGetArray(zz,&z)); 997 998 v = a->a; 999 xb = x; 1000 1001 for (i=0; i<mbs; i++) { 1002 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1003 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 1004 ib = aj + *ai; 1005 jmin = 0; 1006 nonzerorow += (n>0); 1007 if (n && *ib == i) { /* (diag of A)*x */ 1008 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 1009 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 1010 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 1011 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 1012 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 1013 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 1014 v += 36; jmin++; 1015 } 1016 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1017 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1018 for (j=jmin; j<n; j++) { 1019 /* (strict lower triangular part of A)*x */ 1020 cval = ib[j]*6; 1021 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 1022 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 1023 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 1024 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 1025 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 1026 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 1027 /* (strict upper triangular part of A)*x */ 1028 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 1029 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 1030 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 1031 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 1032 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 1033 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 1034 v += 36; 1035 } 1036 xb +=6; ai++; 1037 } 1038 1039 PetscCall(VecRestoreArrayRead(xx,&x)); 1040 PetscCall(VecRestoreArray(zz,&z)); 1041 1042 PetscCall(PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow))); 1043 PetscFunctionReturn(0); 1044 } 1045 1046 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1047 { 1048 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1049 PetscScalar *z,x1,x2,x3,x4,x5,x6,x7; 1050 const PetscScalar *x,*xb; 1051 const MatScalar *v; 1052 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 1053 const PetscInt *aj=a->j,*ai=a->i,*ib; 1054 PetscInt nonzerorow=0; 1055 1056 PetscFunctionBegin; 1057 PetscCall(VecCopy(yy,zz)); 1058 if (!a->nz) PetscFunctionReturn(0); 1059 PetscCall(VecGetArrayRead(xx,&x)); 1060 PetscCall(VecGetArray(zz,&z)); 1061 1062 v = a->a; 1063 xb = x; 1064 1065 for (i=0; i<mbs; i++) { 1066 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1067 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 1068 ib = aj + *ai; 1069 jmin = 0; 1070 nonzerorow += (n>0); 1071 if (n && *ib == i) { /* (diag of A)*x */ 1072 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 1073 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 1074 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 1075 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 1076 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 1077 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 1078 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1079 v += 49; jmin++; 1080 } 1081 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1082 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1083 for (j=jmin; j<n; j++) { 1084 /* (strict lower triangular part of A)*x */ 1085 cval = ib[j]*7; 1086 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 1087 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 1088 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 1089 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 1090 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 1091 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 1092 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1093 /* (strict upper triangular part of A)*x */ 1094 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 1095 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 1096 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 1097 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 1098 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 1099 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 1100 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 1101 v += 49; 1102 } 1103 xb +=7; ai++; 1104 } 1105 1106 PetscCall(VecRestoreArrayRead(xx,&x)); 1107 PetscCall(VecRestoreArray(zz,&z)); 1108 1109 PetscCall(PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow))); 1110 PetscFunctionReturn(0); 1111 } 1112 1113 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1114 { 1115 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1116 PetscScalar *z,*z_ptr=NULL,*zb,*work,*workt; 1117 const PetscScalar *x,*x_ptr,*xb; 1118 const MatScalar *v; 1119 PetscInt mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 1120 const PetscInt *idx,*aj,*ii; 1121 PetscInt nonzerorow=0; 1122 1123 PetscFunctionBegin; 1124 PetscCall(VecCopy(yy,zz)); 1125 if (!a->nz) PetscFunctionReturn(0); 1126 PetscCall(VecGetArrayRead(xx,&x)); x_ptr=x; 1127 PetscCall(VecGetArray(zz,&z)); z_ptr=z; 1128 1129 aj = a->j; 1130 v = a->a; 1131 ii = a->i; 1132 1133 if (!a->mult_work) { 1134 PetscCall(PetscMalloc1(A->rmap->n+1,&a->mult_work)); 1135 } 1136 work = a->mult_work; 1137 1138 for (i=0; i<mbs; i++) { 1139 n = ii[1] - ii[0]; ncols = n*bs; 1140 workt = work; idx=aj+ii[0]; 1141 nonzerorow += (n>0); 1142 1143 /* upper triangular part */ 1144 for (j=0; j<n; j++) { 1145 xb = x_ptr + bs*(*idx++); 1146 for (k=0; k<bs; k++) workt[k] = xb[k]; 1147 workt += bs; 1148 } 1149 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1150 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1151 1152 /* strict lower triangular part */ 1153 idx = aj+ii[0]; 1154 if (n && *idx == i) { 1155 ncols -= bs; v += bs2; idx++; n--; 1156 } 1157 if (ncols > 0) { 1158 workt = work; 1159 PetscCall(PetscArrayzero(workt,ncols)); 1160 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 1161 for (j=0; j<n; j++) { 1162 zb = z_ptr + bs*(*idx++); 1163 for (k=0; k<bs; k++) zb[k] += workt[k]; 1164 workt += bs; 1165 } 1166 } 1167 1168 x += bs; v += n*bs2; z += bs; ii++; 1169 } 1170 1171 PetscCall(VecRestoreArrayRead(xx,&x)); 1172 PetscCall(VecRestoreArray(zz,&z)); 1173 1174 PetscCall(PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow))); 1175 PetscFunctionReturn(0); 1176 } 1177 1178 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha) 1179 { 1180 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data; 1181 PetscScalar oalpha = alpha; 1182 PetscBLASInt one = 1,totalnz; 1183 1184 PetscFunctionBegin; 1185 PetscCall(PetscBLASIntCast(a->bs2*a->nz,&totalnz)); 1186 PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one)); 1187 PetscCall(PetscLogFlops(totalnz)); 1188 PetscFunctionReturn(0); 1189 } 1190 1191 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm) 1192 { 1193 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1194 const MatScalar *v = a->a; 1195 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1196 PetscInt i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il; 1197 const PetscInt *aj=a->j,*col; 1198 1199 PetscFunctionBegin; 1200 if (!a->nz) { 1201 *norm = 0.0; 1202 PetscFunctionReturn(0); 1203 } 1204 if (type == NORM_FROBENIUS) { 1205 for (k=0; k<mbs; k++) { 1206 jmin = a->i[k]; 1207 jmax = a->i[k+1]; 1208 col = aj + jmin; 1209 if (jmax-jmin > 0 && *col == k) { /* diagonal block */ 1210 for (i=0; i<bs2; i++) { 1211 sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++; 1212 } 1213 jmin++; 1214 } 1215 for (j=jmin; j<jmax; j++) { /* off-diagonal blocks */ 1216 for (i=0; i<bs2; i++) { 1217 sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++; 1218 } 1219 } 1220 } 1221 *norm = PetscSqrtReal(sum_diag + 2*sum_off); 1222 PetscCall(PetscLogFlops(2.0*bs2*a->nz)); 1223 } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 1224 PetscCall(PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl)); 1225 for (i=0; i<mbs; i++) jl[i] = mbs; 1226 il[0] = 0; 1227 1228 *norm = 0.0; 1229 for (k=0; k<mbs; k++) { /* k_th block row */ 1230 for (j=0; j<bs; j++) sum[j]=0.0; 1231 /*-- col sum --*/ 1232 i = jl[k]; /* first |A(i,k)| to be added */ 1233 /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1234 at step k */ 1235 while (i<mbs) { 1236 nexti = jl[i]; /* next block row to be added */ 1237 ik = il[i]; /* block index of A(i,k) in the array a */ 1238 for (j=0; j<bs; j++) { 1239 v = a->a + ik*bs2 + j*bs; 1240 for (k1=0; k1<bs; k1++) { 1241 sum[j] += PetscAbsScalar(*v); v++; 1242 } 1243 } 1244 /* update il, jl */ 1245 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1246 jmax = a->i[i+1]; 1247 if (jmin < jmax) { 1248 il[i] = jmin; 1249 j = a->j[jmin]; 1250 jl[i] = jl[j]; jl[j]=i; 1251 } 1252 i = nexti; 1253 } 1254 /*-- row sum --*/ 1255 jmin = a->i[k]; 1256 jmax = a->i[k+1]; 1257 for (i=jmin; i<jmax; i++) { 1258 for (j=0; j<bs; j++) { 1259 v = a->a + i*bs2 + j; 1260 for (k1=0; k1<bs; k1++) { 1261 sum[j] += PetscAbsScalar(*v); v += bs; 1262 } 1263 } 1264 } 1265 /* add k_th block row to il, jl */ 1266 col = aj+jmin; 1267 if (jmax - jmin > 0 && *col == k) jmin++; 1268 if (jmin < jmax) { 1269 il[k] = jmin; 1270 j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k; 1271 } 1272 for (j=0; j<bs; j++) { 1273 if (sum[j] > *norm) *norm = sum[j]; 1274 } 1275 } 1276 PetscCall(PetscFree3(sum,il,jl)); 1277 PetscCall(PetscLogFlops(PetscMax(mbs*a->nz-1,0))); 1278 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 1279 PetscFunctionReturn(0); 1280 } 1281 1282 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg) 1283 { 1284 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data; 1285 1286 PetscFunctionBegin; 1287 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1288 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1289 *flg = PETSC_FALSE; 1290 PetscFunctionReturn(0); 1291 } 1292 1293 /* if the a->i are the same */ 1294 PetscCall(PetscArraycmp(a->i,b->i,a->mbs+1,flg)); 1295 if (!*flg) PetscFunctionReturn(0); 1296 1297 /* if a->j are the same */ 1298 PetscCall(PetscArraycmp(a->j,b->j,a->nz,flg)); 1299 if (!*flg) PetscFunctionReturn(0); 1300 1301 /* if a->a are the same */ 1302 PetscCall(PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs),flg)); 1303 PetscFunctionReturn(0); 1304 } 1305 1306 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v) 1307 { 1308 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1309 PetscInt i,j,k,row,bs,ambs,bs2; 1310 const PetscInt *ai,*aj; 1311 PetscScalar *x,zero = 0.0; 1312 const MatScalar *aa,*aa_j; 1313 1314 PetscFunctionBegin; 1315 bs = A->rmap->bs; 1316 PetscCheck(!A->factortype || bs <= 1,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1"); 1317 1318 aa = a->a; 1319 ambs = a->mbs; 1320 1321 if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) { 1322 PetscInt *diag=a->diag; 1323 aa = a->a; 1324 ambs = a->mbs; 1325 PetscCall(VecGetArray(v,&x)); 1326 for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]]; 1327 PetscCall(VecRestoreArray(v,&x)); 1328 PetscFunctionReturn(0); 1329 } 1330 1331 ai = a->i; 1332 aj = a->j; 1333 bs2 = a->bs2; 1334 PetscCall(VecSet(v,zero)); 1335 if (!a->nz) PetscFunctionReturn(0); 1336 PetscCall(VecGetArray(v,&x)); 1337 for (i=0; i<ambs; i++) { 1338 j = ai[i]; 1339 if (aj[j] == i) { /* if this is a diagonal element */ 1340 row = i*bs; 1341 aa_j = aa + j*bs2; 1342 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1343 } 1344 } 1345 PetscCall(VecRestoreArray(v,&x)); 1346 PetscFunctionReturn(0); 1347 } 1348 1349 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr) 1350 { 1351 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1352 PetscScalar x; 1353 const PetscScalar *l,*li,*ri; 1354 MatScalar *aa,*v; 1355 PetscInt i,j,k,lm,M,m,mbs,tmp,bs,bs2; 1356 const PetscInt *ai,*aj; 1357 PetscBool flg; 1358 1359 PetscFunctionBegin; 1360 if (ll != rr) { 1361 PetscCall(VecEqual(ll,rr,&flg)); 1362 PetscCheck(flg,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same"); 1363 } 1364 if (!ll) PetscFunctionReturn(0); 1365 ai = a->i; 1366 aj = a->j; 1367 aa = a->a; 1368 m = A->rmap->N; 1369 bs = A->rmap->bs; 1370 mbs = a->mbs; 1371 bs2 = a->bs2; 1372 1373 PetscCall(VecGetArrayRead(ll,&l)); 1374 PetscCall(VecGetLocalSize(ll,&lm)); 1375 PetscCheck(lm == m,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 1376 for (i=0; i<mbs; i++) { /* for each block row */ 1377 M = ai[i+1] - ai[i]; 1378 li = l + i*bs; 1379 v = aa + bs2*ai[i]; 1380 for (j=0; j<M; j++) { /* for each block */ 1381 ri = l + bs*aj[ai[i]+j]; 1382 for (k=0; k<bs; k++) { 1383 x = ri[k]; 1384 for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x; 1385 } 1386 } 1387 } 1388 PetscCall(VecRestoreArrayRead(ll,&l)); 1389 PetscCall(PetscLogFlops(2.0*a->nz)); 1390 PetscFunctionReturn(0); 1391 } 1392 1393 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info) 1394 { 1395 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1396 1397 PetscFunctionBegin; 1398 info->block_size = a->bs2; 1399 info->nz_allocated = a->bs2*a->maxnz; /*num. of nonzeros in upper triangular part */ 1400 info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */ 1401 info->nz_unneeded = info->nz_allocated - info->nz_used; 1402 info->assemblies = A->num_ass; 1403 info->mallocs = A->info.mallocs; 1404 info->memory = ((PetscObject)A)->mem; 1405 if (A->factortype) { 1406 info->fill_ratio_given = A->info.fill_ratio_given; 1407 info->fill_ratio_needed = A->info.fill_ratio_needed; 1408 info->factor_mallocs = A->info.factor_mallocs; 1409 } else { 1410 info->fill_ratio_given = 0; 1411 info->fill_ratio_needed = 0; 1412 info->factor_mallocs = 0; 1413 } 1414 PetscFunctionReturn(0); 1415 } 1416 1417 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 1418 { 1419 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1420 1421 PetscFunctionBegin; 1422 PetscCall(PetscArrayzero(a->a,a->bs2*a->i[a->mbs])); 1423 PetscFunctionReturn(0); 1424 } 1425 1426 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[]) 1427 { 1428 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1429 PetscInt i,j,n,row,col,bs,mbs; 1430 const PetscInt *ai,*aj; 1431 PetscReal atmp; 1432 const MatScalar *aa; 1433 PetscScalar *x; 1434 PetscInt ncols,brow,bcol,krow,kcol; 1435 1436 PetscFunctionBegin; 1437 PetscCheck(!idx,PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov"); 1438 PetscCheck(!A->factortype,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1439 bs = A->rmap->bs; 1440 aa = a->a; 1441 ai = a->i; 1442 aj = a->j; 1443 mbs = a->mbs; 1444 1445 PetscCall(VecSet(v,0.0)); 1446 PetscCall(VecGetArray(v,&x)); 1447 PetscCall(VecGetLocalSize(v,&n)); 1448 PetscCheck(n == A->rmap->N,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1449 for (i=0; i<mbs; i++) { 1450 ncols = ai[1] - ai[0]; ai++; 1451 brow = bs*i; 1452 for (j=0; j<ncols; j++) { 1453 bcol = bs*(*aj); 1454 for (kcol=0; kcol<bs; kcol++) { 1455 col = bcol + kcol; /* col index */ 1456 for (krow=0; krow<bs; krow++) { 1457 atmp = PetscAbsScalar(*aa); aa++; 1458 row = brow + krow; /* row index */ 1459 if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1460 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 1461 } 1462 } 1463 aj++; 1464 } 1465 } 1466 PetscCall(VecRestoreArray(v,&x)); 1467 PetscFunctionReturn(0); 1468 } 1469 1470 PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C) 1471 { 1472 PetscFunctionBegin; 1473 PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C)); 1474 C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense; 1475 PetscFunctionReturn(0); 1476 } 1477 1478 PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 1479 { 1480 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1481 PetscScalar *z = c; 1482 const PetscScalar *xb; 1483 PetscScalar x1; 1484 const MatScalar *v = a->a,*vv; 1485 PetscInt mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k; 1486 #if defined(PETSC_USE_COMPLEX) 1487 const int aconj = A->hermitian == PETSC_BOOL3_TRUE; 1488 #else 1489 const int aconj = 0; 1490 #endif 1491 1492 PetscFunctionBegin; 1493 for (i=0; i<mbs; i++) { 1494 n = ii[1] - ii[0]; ii++; 1495 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1496 PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1497 jj = idx; 1498 vv = v; 1499 for (k=0; k<cn; k++) { 1500 idx = jj; 1501 v = vv; 1502 for (j=0; j<n; j++) { 1503 xb = b + (*idx); x1 = xb[0+k*bm]; 1504 z[0+k*cm] += v[0]*x1; 1505 if (*idx != i) c[(*idx)+k*cm] += (aconj ? PetscConj(v[0]) : v[0])*b[i+k*bm]; 1506 v += 1; 1507 ++idx; 1508 } 1509 } 1510 z += 1; 1511 } 1512 PetscFunctionReturn(0); 1513 } 1514 1515 PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 1516 { 1517 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1518 PetscScalar *z = c; 1519 const PetscScalar *xb; 1520 PetscScalar x1,x2; 1521 const MatScalar *v = a->a,*vv; 1522 PetscInt mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k; 1523 1524 PetscFunctionBegin; 1525 for (i=0; i<mbs; i++) { 1526 n = ii[1] - ii[0]; ii++; 1527 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1528 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1529 jj = idx; 1530 vv = v; 1531 for (k=0; k<cn; k++) { 1532 idx = jj; 1533 v = vv; 1534 for (j=0; j<n; j++) { 1535 xb = b + 2*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; 1536 z[0+k*cm] += v[0]*x1 + v[2]*x2; 1537 z[1+k*cm] += v[1]*x1 + v[3]*x2; 1538 if (*idx != i) { 1539 c[2*(*idx)+0+k*cm] += v[0]*b[2*i+k*bm] + v[1]*b[2*i+1+k*bm]; 1540 c[2*(*idx)+1+k*cm] += v[2]*b[2*i+k*bm] + v[3]*b[2*i+1+k*bm]; 1541 } 1542 v += 4; 1543 ++idx; 1544 } 1545 } 1546 z += 2; 1547 } 1548 PetscFunctionReturn(0); 1549 } 1550 1551 PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 1552 { 1553 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1554 PetscScalar *z = c; 1555 const PetscScalar *xb; 1556 PetscScalar x1,x2,x3; 1557 const MatScalar *v = a->a,*vv; 1558 PetscInt mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k; 1559 1560 PetscFunctionBegin; 1561 for (i=0; i<mbs; i++) { 1562 n = ii[1] - ii[0]; ii++; 1563 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1564 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1565 jj = idx; 1566 vv = v; 1567 for (k=0; k<cn; k++) { 1568 idx = jj; 1569 v = vv; 1570 for (j=0; j<n; j++) { 1571 xb = b + 3*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; 1572 z[0+k*cm] += v[0]*x1 + v[3]*x2 + v[6]*x3; 1573 z[1+k*cm] += v[1]*x1 + v[4]*x2 + v[7]*x3; 1574 z[2+k*cm] += v[2]*x1 + v[5]*x2 + v[8]*x3; 1575 if (*idx != i) { 1576 c[3*(*idx)+0+k*cm] += v[0]*b[3*i+k*bm] + v[3]*b[3*i+1+k*bm] + v[6]*b[3*i+2+k*bm]; 1577 c[3*(*idx)+1+k*cm] += v[1]*b[3*i+k*bm] + v[4]*b[3*i+1+k*bm] + v[7]*b[3*i+2+k*bm]; 1578 c[3*(*idx)+2+k*cm] += v[2]*b[3*i+k*bm] + v[5]*b[3*i+1+k*bm] + v[8]*b[3*i+2+k*bm]; 1579 } 1580 v += 9; 1581 ++idx; 1582 } 1583 } 1584 z += 3; 1585 } 1586 PetscFunctionReturn(0); 1587 } 1588 1589 PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 1590 { 1591 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1592 PetscScalar *z = c; 1593 const PetscScalar *xb; 1594 PetscScalar x1,x2,x3,x4; 1595 const MatScalar *v = a->a,*vv; 1596 PetscInt mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k; 1597 1598 PetscFunctionBegin; 1599 for (i=0; i<mbs; i++) { 1600 n = ii[1] - ii[0]; ii++; 1601 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1602 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1603 jj = idx; 1604 vv = v; 1605 for (k=0; k<cn; k++) { 1606 idx = jj; 1607 v = vv; 1608 for (j=0; j<n; j++) { 1609 xb = b + 4*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; 1610 z[0+k*cm] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1611 z[1+k*cm] += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1612 z[2+k*cm] += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1613 z[3+k*cm] += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1614 if (*idx != i) { 1615 c[4*(*idx)+0+k*cm] += v[0]*b[4*i+k*bm] + v[4]*b[4*i+1+k*bm] + v[8]*b[4*i+2+k*bm] + v[12]*b[4*i+3+k*bm]; 1616 c[4*(*idx)+1+k*cm] += v[1]*b[4*i+k*bm] + v[5]*b[4*i+1+k*bm] + v[9]*b[4*i+2+k*bm] + v[13]*b[4*i+3+k*bm]; 1617 c[4*(*idx)+2+k*cm] += v[2]*b[4*i+k*bm] + v[6]*b[4*i+1+k*bm] + v[10]*b[4*i+2+k*bm] + v[14]*b[4*i+3+k*bm]; 1618 c[4*(*idx)+3+k*cm] += v[3]*b[4*i+k*bm] + v[7]*b[4*i+1+k*bm] + v[11]*b[4*i+2+k*bm] + v[15]*b[4*i+3+k*bm]; 1619 } 1620 v += 16; 1621 ++idx; 1622 } 1623 } 1624 z += 4; 1625 } 1626 PetscFunctionReturn(0); 1627 } 1628 1629 PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn) 1630 { 1631 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1632 PetscScalar *z = c; 1633 const PetscScalar *xb; 1634 PetscScalar x1,x2,x3,x4,x5; 1635 const MatScalar *v = a->a,*vv; 1636 PetscInt mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k; 1637 1638 PetscFunctionBegin; 1639 for (i=0; i<mbs; i++) { 1640 n = ii[1] - ii[0]; ii++; 1641 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1642 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1643 jj = idx; 1644 vv = v; 1645 for (k=0; k<cn; k++) { 1646 idx = jj; 1647 v = vv; 1648 for (j=0; j<n; j++) { 1649 xb = b + 5*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; x5 = xb[4+k*cm]; 1650 z[0+k*cm] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1651 z[1+k*cm] += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1652 z[2+k*cm] += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1653 z[3+k*cm] += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1654 z[4+k*cm] += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1655 if (*idx != i) { 1656 c[5*(*idx)+0+k*cm] += v[0]*b[5*i+k*bm] + v[5]*b[5*i+1+k*bm] + v[10]*b[5*i+2+k*bm] + v[15]*b[5*i+3+k*bm] + v[20]*b[5*i+4+k*bm]; 1657 c[5*(*idx)+1+k*cm] += v[1]*b[5*i+k*bm] + v[6]*b[5*i+1+k*bm] + v[11]*b[5*i+2+k*bm] + v[16]*b[5*i+3+k*bm] + v[21]*b[5*i+4+k*bm]; 1658 c[5*(*idx)+2+k*cm] += v[2]*b[5*i+k*bm] + v[7]*b[5*i+1+k*bm] + v[12]*b[5*i+2+k*bm] + v[17]*b[5*i+3+k*bm] + v[22]*b[5*i+4+k*bm]; 1659 c[5*(*idx)+3+k*cm] += v[3]*b[5*i+k*bm] + v[8]*b[5*i+1+k*bm] + v[13]*b[5*i+2+k*bm] + v[18]*b[5*i+3+k*bm] + v[23]*b[5*i+4+k*bm]; 1660 c[5*(*idx)+4+k*cm] += v[4]*b[5*i+k*bm] + v[9]*b[5*i+1+k*bm] + v[14]*b[5*i+2+k*bm] + v[19]*b[5*i+3+k*bm] + v[24]*b[5*i+4+k*bm]; 1661 } 1662 v += 25; 1663 ++idx; 1664 } 1665 } 1666 z += 5; 1667 } 1668 PetscFunctionReturn(0); 1669 } 1670 1671 PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A,Mat B,Mat C) 1672 { 1673 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1674 Mat_SeqDense *bd = (Mat_SeqDense*)B->data; 1675 Mat_SeqDense *cd = (Mat_SeqDense*)C->data; 1676 PetscInt cm=cd->lda,cn=B->cmap->n,bm=bd->lda; 1677 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 1678 PetscBLASInt bbs,bcn,bbm,bcm; 1679 PetscScalar *z = NULL; 1680 PetscScalar *c,*b; 1681 const MatScalar *v; 1682 const PetscInt *idx,*ii; 1683 PetscScalar _DOne=1.0; 1684 1685 PetscFunctionBegin; 1686 if (!cm || !cn) PetscFunctionReturn(0); 1687 PetscCheck(B->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT,A->cmap->n,B->rmap->n); 1688 PetscCheck(A->rmap->n == C->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT,C->rmap->n,A->rmap->n); 1689 PetscCheck(B->cmap->n == C->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT,B->cmap->n,C->cmap->n); 1690 b = bd->v; 1691 PetscCall(MatZeroEntries(C)); 1692 PetscCall(MatDenseGetArray(C,&c)); 1693 switch (bs) { 1694 case 1: 1695 PetscCall(MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn)); 1696 break; 1697 case 2: 1698 PetscCall(MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn)); 1699 break; 1700 case 3: 1701 PetscCall(MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn)); 1702 break; 1703 case 4: 1704 PetscCall(MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn)); 1705 break; 1706 case 5: 1707 PetscCall(MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn)); 1708 break; 1709 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 1710 PetscCall(PetscBLASIntCast(bs,&bbs)); 1711 PetscCall(PetscBLASIntCast(cn,&bcn)); 1712 PetscCall(PetscBLASIntCast(bm,&bbm)); 1713 PetscCall(PetscBLASIntCast(cm,&bcm)); 1714 idx = a->j; 1715 v = a->a; 1716 mbs = a->mbs; 1717 ii = a->i; 1718 z = c; 1719 for (i=0; i<mbs; i++) { 1720 n = ii[1] - ii[0]; ii++; 1721 for (j=0; j<n; j++) { 1722 if (*idx != i) PetscStackCallBLAS("BLASgemm",BLASgemm_("T","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*i,&bbm,&_DOne,c+bs*(*idx),&bcm)); 1723 PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm)); 1724 v += bs2; 1725 } 1726 z += bs; 1727 } 1728 } 1729 PetscCall(MatDenseRestoreArray(C,&c)); 1730 PetscCall(PetscLogFlops((2.0*(a->nz*2.0 - a->nonzerorowcnt)*bs2 - a->nonzerorowcnt)*cn)); 1731 PetscFunctionReturn(0); 1732 } 1733