1 2 #include <../src/mat/impls/baij/seq/baij.h> 3 #include <petsc/private/kernels/blockinvert.h> 4 #include <petscbt.h> 5 #include <../src/mat/impls/sbaij/seq/sbaij.h> 6 #include <petscblaslapack.h> 7 8 #undef __FUNCT__ 9 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ" 10 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 11 { 12 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 13 PetscErrorCode ierr; 14 PetscInt brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2; 15 const PetscInt *idx; 16 PetscBT table_out,table_in; 17 18 PetscFunctionBegin; 19 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 20 mbs = a->mbs; 21 ai = a->i; 22 aj = a->j; 23 bs = A->rmap->bs; 24 ierr = PetscBTCreate(mbs,&table_out);CHKERRQ(ierr); 25 ierr = PetscMalloc1(mbs+1,&nidx);CHKERRQ(ierr); 26 ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr); 27 ierr = PetscBTCreate(mbs,&table_in);CHKERRQ(ierr); 28 29 for (i=0; i<is_max; i++) { /* for each is */ 30 isz = 0; 31 ierr = PetscBTMemzero(mbs,table_out);CHKERRQ(ierr); 32 33 /* Extract the indices, assume there can be duplicate entries */ 34 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 35 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 36 37 /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */ 38 bcol_max = 0; 39 for (j=0; j<n; ++j) { 40 brow = idx[j]/bs; /* convert the indices into block indices */ 41 if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 42 if (!PetscBTLookupSet(table_out,brow)) { 43 nidx[isz++] = brow; 44 if (bcol_max < brow) bcol_max = brow; 45 } 46 } 47 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 48 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 49 50 k = 0; 51 for (j=0; j<ov; j++) { /* for each overlap */ 52 /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 53 ierr = PetscBTMemzero(mbs,table_in);CHKERRQ(ierr); 54 for (l=k; l<isz; l++) { ierr = PetscBTSet(table_in,nidx[l]);CHKERRQ(ierr); } 55 56 n = isz; /* length of the updated is[i] */ 57 for (brow=0; brow<mbs; brow++) { 58 start = ai[brow]; end = ai[brow+1]; 59 if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */ 60 for (l = start; l<end; l++) { 61 bcol = aj[l]; 62 if (!PetscBTLookupSet(table_out,bcol)) { 63 nidx[isz++] = bcol; 64 if (bcol_max < bcol) bcol_max = bcol; 65 } 66 } 67 k++; 68 if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 69 } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 70 for (l = start; l<end; l++) { 71 bcol = aj[l]; 72 if (bcol > bcol_max) break; 73 if (PetscBTLookup(table_in,bcol)) { 74 if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow; 75 break; /* for l = start; l<end ; l++) */ 76 } 77 } 78 } 79 } 80 } /* for each overlap */ 81 82 /* expand the Index Set */ 83 for (j=0; j<isz; j++) { 84 for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k; 85 } 86 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 87 } 88 ierr = PetscBTDestroy(&table_out);CHKERRQ(ierr); 89 ierr = PetscFree(nidx);CHKERRQ(ierr); 90 ierr = PetscFree(nidx2);CHKERRQ(ierr); 91 ierr = PetscBTDestroy(&table_in);CHKERRQ(ierr); 92 PetscFunctionReturn(0); 93 } 94 95 /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc. 96 Zero some ops' to avoid invalid usse */ 97 #undef __FUNCT__ 98 #define __FUNCT__ "MatSeqSBAIJZeroOps_Private" 99 PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq) 100 { 101 PetscErrorCode ierr; 102 103 PetscFunctionBegin; 104 ierr = MatSetOption(Bseq,MAT_SYMMETRIC,PETSC_FALSE);CHKERRQ(ierr); 105 Bseq->ops->mult = 0; 106 Bseq->ops->multadd = 0; 107 Bseq->ops->multtranspose = 0; 108 Bseq->ops->multtransposeadd = 0; 109 Bseq->ops->lufactor = 0; 110 Bseq->ops->choleskyfactor = 0; 111 Bseq->ops->lufactorsymbolic = 0; 112 Bseq->ops->choleskyfactorsymbolic = 0; 113 Bseq->ops->getinertia = 0; 114 PetscFunctionReturn(0); 115 } 116 117 /* same as MatGetSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */ 118 #undef __FUNCT__ 119 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private" 120 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 121 { 122 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c; 123 PetscErrorCode ierr; 124 PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 125 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 126 const PetscInt *irow,*icol; 127 PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 128 PetscInt *aj = a->j,*ai = a->i; 129 MatScalar *mat_a; 130 Mat C; 131 PetscBool flag; 132 133 PetscFunctionBegin; 134 135 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 136 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 137 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 138 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 139 140 ierr = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr); 141 ssmap = smap; 142 ierr = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr); 143 for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 144 /* determine lens of each row */ 145 for (i=0; i<nrows; i++) { 146 kstart = ai[irow[i]]; 147 kend = kstart + a->ilen[irow[i]]; 148 lens[i] = 0; 149 for (k=kstart; k<kend; k++) { 150 if (ssmap[aj[k]]) lens[i]++; 151 } 152 } 153 /* Create and fill new matrix */ 154 if (scall == MAT_REUSE_MATRIX) { 155 c = (Mat_SeqSBAIJ*)((*B)->data); 156 157 if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 158 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 159 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 160 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 161 C = *B; 162 } else { 163 ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr); 164 ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 165 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 166 ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr); 167 } 168 c = (Mat_SeqSBAIJ*)(C->data); 169 for (i=0; i<nrows; i++) { 170 row = irow[i]; 171 kstart = ai[row]; 172 kend = kstart + a->ilen[row]; 173 mat_i = c->i[i]; 174 mat_j = c->j + mat_i; 175 mat_a = c->a + mat_i*bs2; 176 mat_ilen = c->ilen + i; 177 for (k=kstart; k<kend; k++) { 178 if ((tcol=ssmap[a->j[k]])) { 179 *mat_j++ = tcol - 1; 180 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 181 mat_a += bs2; 182 (*mat_ilen)++; 183 } 184 } 185 } 186 /* sort */ 187 { 188 MatScalar *work; 189 190 ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr); 191 for (i=0; i<nrows; i++) { 192 PetscInt ilen; 193 mat_i = c->i[i]; 194 mat_j = c->j + mat_i; 195 mat_a = c->a + mat_i*bs2; 196 ilen = c->ilen[i]; 197 ierr = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr); 198 } 199 ierr = PetscFree(work);CHKERRQ(ierr); 200 } 201 202 /* Free work space */ 203 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 204 ierr = PetscFree(smap);CHKERRQ(ierr); 205 ierr = PetscFree(lens);CHKERRQ(ierr); 206 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 207 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 208 209 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 210 *B = C; 211 PetscFunctionReturn(0); 212 } 213 214 #undef __FUNCT__ 215 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ" 216 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 217 { 218 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 219 IS is1,is2; 220 PetscErrorCode ierr; 221 PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs; 222 const PetscInt *irow,*icol; 223 224 PetscFunctionBegin; 225 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 226 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 227 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 228 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 229 230 /* Verify if the indices corespond to each element in a block 231 and form the IS with compressed IS */ 232 maxmnbs = PetscMax(a->mbs,a->nbs); 233 ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr); 234 ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr); 235 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 236 for (i=0; i<a->mbs; i++) { 237 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 238 } 239 count = 0; 240 for (i=0; i<nrows; i++) { 241 PetscInt j = irow[i] / bs; 242 if ((vary[j]--)==bs) iary[count++] = j; 243 } 244 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr); 245 246 ierr = PetscMemzero(vary,(a->nbs)*sizeof(PetscInt));CHKERRQ(ierr); 247 for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 248 for (i=0; i<a->nbs; i++) { 249 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc"); 250 } 251 count = 0; 252 for (i=0; i<ncols; i++) { 253 PetscInt j = icol[i] / bs; 254 if ((vary[j]--)==bs) iary[count++] = j; 255 } 256 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr); 257 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 258 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 259 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 260 261 ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr); 262 ierr = ISDestroy(&is1);CHKERRQ(ierr); 263 ierr = ISDestroy(&is2);CHKERRQ(ierr); 264 265 if (isrow != iscol) { 266 PetscBool isequal; 267 ierr = ISEqual(isrow,iscol,&isequal);CHKERRQ(ierr); 268 if (!isequal) { 269 ierr = MatSeqSBAIJZeroOps_Private(*B);CHKERRQ(ierr); 270 } 271 } 272 PetscFunctionReturn(0); 273 } 274 275 #undef __FUNCT__ 276 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ" 277 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 278 { 279 PetscErrorCode ierr; 280 PetscInt i; 281 282 PetscFunctionBegin; 283 if (scall == MAT_INITIAL_MATRIX) { 284 ierr = PetscMalloc1(n+1,B);CHKERRQ(ierr); 285 } 286 287 for (i=0; i<n; i++) { 288 ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 289 } 290 PetscFunctionReturn(0); 291 } 292 293 /* -------------------------------------------------------*/ 294 /* Should check that shapes of vectors and matrices match */ 295 /* -------------------------------------------------------*/ 296 297 #undef __FUNCT__ 298 #define __FUNCT__ "MatMult_SeqSBAIJ_2" 299 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz) 300 { 301 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 302 PetscScalar *z,x1,x2,zero=0.0; 303 const PetscScalar *x,*xb; 304 const MatScalar *v; 305 PetscErrorCode ierr; 306 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 307 const PetscInt *aj=a->j,*ai=a->i,*ib; 308 PetscInt nonzerorow=0; 309 310 PetscFunctionBegin; 311 ierr = VecSet(zz,zero);CHKERRQ(ierr); 312 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 313 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 314 315 v = a->a; 316 xb = x; 317 318 for (i=0; i<mbs; i++) { 319 n = ai[1] - ai[0]; /* length of i_th block row of A */ 320 x1 = xb[0]; x2 = xb[1]; 321 ib = aj + *ai; 322 jmin = 0; 323 nonzerorow += (n>0); 324 if (*ib == i) { /* (diag of A)*x */ 325 z[2*i] += v[0]*x1 + v[2]*x2; 326 z[2*i+1] += v[2]*x1 + v[3]*x2; 327 v += 4; jmin++; 328 } 329 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 330 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 331 for (j=jmin; j<n; j++) { 332 /* (strict lower triangular part of A)*x */ 333 cval = ib[j]*2; 334 z[cval] += v[0]*x1 + v[1]*x2; 335 z[cval+1] += v[2]*x1 + v[3]*x2; 336 /* (strict upper triangular part of A)*x */ 337 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 338 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 339 v += 4; 340 } 341 xb +=2; ai++; 342 } 343 344 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 345 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 346 ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 347 PetscFunctionReturn(0); 348 } 349 350 #undef __FUNCT__ 351 #define __FUNCT__ "MatMult_SeqSBAIJ_3" 352 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz) 353 { 354 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 355 PetscScalar *z,x1,x2,x3,zero=0.0; 356 const PetscScalar *x,*xb; 357 const MatScalar *v; 358 PetscErrorCode ierr; 359 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 360 const PetscInt *aj = a->j,*ai = a->i,*ib; 361 PetscInt nonzerorow=0; 362 363 PetscFunctionBegin; 364 ierr = VecSet(zz,zero);CHKERRQ(ierr); 365 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 366 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 367 368 v = a->a; 369 xb = x; 370 371 for (i=0; i<mbs; i++) { 372 n = ai[1] - ai[0]; /* length of i_th block row of A */ 373 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 374 ib = aj + *ai; 375 jmin = 0; 376 nonzerorow += (n>0); 377 if (*ib == i) { /* (diag of A)*x */ 378 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 379 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 380 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 381 v += 9; jmin++; 382 } 383 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 384 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 385 for (j=jmin; j<n; j++) { 386 /* (strict lower triangular part of A)*x */ 387 cval = ib[j]*3; 388 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 389 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 390 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 391 /* (strict upper triangular part of A)*x */ 392 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 393 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 394 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 395 v += 9; 396 } 397 xb +=3; ai++; 398 } 399 400 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 401 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 402 ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 403 PetscFunctionReturn(0); 404 } 405 406 #undef __FUNCT__ 407 #define __FUNCT__ "MatMult_SeqSBAIJ_4" 408 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz) 409 { 410 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 411 PetscScalar *z,x1,x2,x3,x4,zero=0.0; 412 const PetscScalar *x,*xb; 413 const MatScalar *v; 414 PetscErrorCode ierr; 415 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 416 const PetscInt *aj = a->j,*ai = a->i,*ib; 417 PetscInt nonzerorow = 0; 418 419 PetscFunctionBegin; 420 ierr = VecSet(zz,zero);CHKERRQ(ierr); 421 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 422 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 423 424 v = a->a; 425 xb = x; 426 427 for (i=0; i<mbs; i++) { 428 n = ai[1] - ai[0]; /* length of i_th block row of A */ 429 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 430 ib = aj + *ai; 431 jmin = 0; 432 nonzerorow += (n>0); 433 if (*ib == i) { /* (diag of A)*x */ 434 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 435 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 436 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 437 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 438 v += 16; jmin++; 439 } 440 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 441 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 442 for (j=jmin; j<n; j++) { 443 /* (strict lower triangular part of A)*x */ 444 cval = ib[j]*4; 445 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 446 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 447 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 448 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 449 /* (strict upper triangular part of A)*x */ 450 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 451 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 452 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 453 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 454 v += 16; 455 } 456 xb +=4; ai++; 457 } 458 459 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 460 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 461 ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 462 PetscFunctionReturn(0); 463 } 464 465 #undef __FUNCT__ 466 #define __FUNCT__ "MatMult_SeqSBAIJ_5" 467 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz) 468 { 469 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 470 PetscScalar *z,x1,x2,x3,x4,x5,zero=0.0; 471 const PetscScalar *x,*xb; 472 const MatScalar *v; 473 PetscErrorCode ierr; 474 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 475 const PetscInt *aj = a->j,*ai = a->i,*ib; 476 PetscInt nonzerorow=0; 477 478 PetscFunctionBegin; 479 ierr = VecSet(zz,zero);CHKERRQ(ierr); 480 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 481 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 482 483 v = a->a; 484 xb = x; 485 486 for (i=0; i<mbs; i++) { 487 n = ai[1] - ai[0]; /* length of i_th block row of A */ 488 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 489 ib = aj + *ai; 490 jmin = 0; 491 nonzerorow += (n>0); 492 if (*ib == i) { /* (diag of A)*x */ 493 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 494 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 495 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 496 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 497 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 498 v += 25; jmin++; 499 } 500 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 501 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 502 for (j=jmin; j<n; j++) { 503 /* (strict lower triangular part of A)*x */ 504 cval = ib[j]*5; 505 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 506 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 507 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 508 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 509 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 510 /* (strict upper triangular part of A)*x */ 511 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 512 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 513 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 514 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 515 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 516 v += 25; 517 } 518 xb +=5; ai++; 519 } 520 521 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 522 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 523 ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 524 PetscFunctionReturn(0); 525 } 526 527 528 #undef __FUNCT__ 529 #define __FUNCT__ "MatMult_SeqSBAIJ_6" 530 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz) 531 { 532 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 533 PetscScalar *z,x1,x2,x3,x4,x5,x6,zero=0.0; 534 const PetscScalar *x,*xb; 535 const MatScalar *v; 536 PetscErrorCode ierr; 537 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 538 const PetscInt *aj=a->j,*ai=a->i,*ib; 539 PetscInt nonzerorow=0; 540 541 PetscFunctionBegin; 542 ierr = VecSet(zz,zero);CHKERRQ(ierr); 543 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 544 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 545 546 v = a->a; 547 xb = x; 548 549 for (i=0; i<mbs; i++) { 550 n = ai[1] - ai[0]; /* length of i_th block row of A */ 551 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 552 ib = aj + *ai; 553 jmin = 0; 554 nonzerorow += (n>0); 555 if (*ib == i) { /* (diag of A)*x */ 556 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 557 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 558 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 559 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 560 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 561 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 562 v += 36; jmin++; 563 } 564 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 565 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 566 for (j=jmin; j<n; j++) { 567 /* (strict lower triangular part of A)*x */ 568 cval = ib[j]*6; 569 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 570 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 571 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 572 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 573 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 574 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 575 /* (strict upper triangular part of A)*x */ 576 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 577 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 578 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 579 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 580 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 581 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 582 v += 36; 583 } 584 xb +=6; ai++; 585 } 586 587 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 588 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 589 ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 590 PetscFunctionReturn(0); 591 } 592 #undef __FUNCT__ 593 #define __FUNCT__ "MatMult_SeqSBAIJ_7" 594 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz) 595 { 596 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 597 PetscScalar *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0; 598 const PetscScalar *x,*xb; 599 const MatScalar *v; 600 PetscErrorCode ierr; 601 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 602 const PetscInt *aj=a->j,*ai=a->i,*ib; 603 PetscInt nonzerorow=0; 604 605 PetscFunctionBegin; 606 ierr = VecSet(zz,zero);CHKERRQ(ierr); 607 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 608 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 609 610 v = a->a; 611 xb = x; 612 613 for (i=0; i<mbs; i++) { 614 n = ai[1] - ai[0]; /* length of i_th block row of A */ 615 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 616 ib = aj + *ai; 617 jmin = 0; 618 nonzerorow += (n>0); 619 if (*ib == i) { /* (diag of A)*x */ 620 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 621 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 622 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 623 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 624 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 625 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 626 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 627 v += 49; jmin++; 628 } 629 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 630 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 631 for (j=jmin; j<n; j++) { 632 /* (strict lower triangular part of A)*x */ 633 cval = ib[j]*7; 634 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 635 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 636 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 637 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 638 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 639 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 640 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 641 /* (strict upper triangular part of A)*x */ 642 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 643 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 644 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 645 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 646 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 647 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 648 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 649 v += 49; 650 } 651 xb +=7; ai++; 652 } 653 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 654 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 655 ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 656 PetscFunctionReturn(0); 657 } 658 659 /* 660 This will not work with MatScalar == float because it calls the BLAS 661 */ 662 #undef __FUNCT__ 663 #define __FUNCT__ "MatMult_SeqSBAIJ_N" 664 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz) 665 { 666 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 667 PetscScalar *z,*z_ptr,*zb,*work,*workt,zero=0.0; 668 const PetscScalar *x,*x_ptr,*xb; 669 const MatScalar *v; 670 PetscErrorCode ierr; 671 PetscInt mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 672 const PetscInt *idx,*aj,*ii; 673 PetscInt nonzerorow=0; 674 675 PetscFunctionBegin; 676 ierr = VecSet(zz,zero);CHKERRQ(ierr); 677 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);x_ptr = x; 678 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 679 680 aj = a->j; 681 v = a->a; 682 ii = a->i; 683 684 if (!a->mult_work) { 685 ierr = PetscMalloc1(A->rmap->N+1,&a->mult_work);CHKERRQ(ierr); 686 } 687 work = a->mult_work; 688 689 for (i=0; i<mbs; i++) { 690 n = ii[1] - ii[0]; ncols = n*bs; 691 workt = work; idx=aj+ii[0]; 692 nonzerorow += (n>0); 693 694 /* upper triangular part */ 695 for (j=0; j<n; j++) { 696 xb = x_ptr + bs*(*idx++); 697 for (k=0; k<bs; k++) workt[k] = xb[k]; 698 workt += bs; 699 } 700 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 701 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 702 703 /* strict lower triangular part */ 704 idx = aj+ii[0]; 705 if (*idx == i) { 706 ncols -= bs; v += bs2; idx++; n--; 707 } 708 709 if (ncols > 0) { 710 workt = work; 711 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 712 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 713 for (j=0; j<n; j++) { 714 zb = z_ptr + bs*(*idx++); 715 for (k=0; k<bs; k++) zb[k] += workt[k]; 716 workt += bs; 717 } 718 } 719 x += bs; v += n*bs2; z += bs; ii++; 720 } 721 722 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 723 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 724 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr); 725 PetscFunctionReturn(0); 726 } 727 728 #undef __FUNCT__ 729 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1" 730 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 731 { 732 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 733 PetscScalar *z,x1; 734 const PetscScalar *x,*xb; 735 const MatScalar *v; 736 PetscErrorCode ierr; 737 PetscInt mbs =a->mbs,i,n,cval,j,jmin; 738 const PetscInt *aj=a->j,*ai=a->i,*ib; 739 PetscInt nonzerorow=0; 740 741 PetscFunctionBegin; 742 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 743 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 744 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 745 v = a->a; 746 xb = x; 747 748 for (i=0; i<mbs; i++) { 749 n = ai[1] - ai[0]; /* length of i_th row of A */ 750 x1 = xb[0]; 751 ib = aj + *ai; 752 jmin = 0; 753 nonzerorow += (n>0); 754 if (*ib == i) { /* (diag of A)*x */ 755 z[i] += *v++ * x[*ib++]; jmin++; 756 } 757 for (j=jmin; j<n; j++) { 758 cval = *ib; 759 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 760 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 761 } 762 xb++; ai++; 763 } 764 765 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 766 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 767 768 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 769 PetscFunctionReturn(0); 770 } 771 772 #undef __FUNCT__ 773 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2" 774 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 775 { 776 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 777 PetscScalar *z,x1,x2; 778 const PetscScalar *x,*xb; 779 const MatScalar *v; 780 PetscErrorCode ierr; 781 PetscInt mbs =a->mbs,i,n,cval,j,jmin; 782 const PetscInt *aj=a->j,*ai=a->i,*ib; 783 PetscInt nonzerorow=0; 784 785 PetscFunctionBegin; 786 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 787 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 788 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 789 790 v = a->a; 791 xb = x; 792 793 for (i=0; i<mbs; i++) { 794 n = ai[1] - ai[0]; /* length of i_th block row of A */ 795 x1 = xb[0]; x2 = xb[1]; 796 ib = aj + *ai; 797 jmin = 0; 798 nonzerorow += (n>0); 799 if (*ib == i) { /* (diag of A)*x */ 800 z[2*i] += v[0]*x1 + v[2]*x2; 801 z[2*i+1] += v[2]*x1 + v[3]*x2; 802 v += 4; jmin++; 803 } 804 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 805 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 806 for (j=jmin; j<n; j++) { 807 /* (strict lower triangular part of A)*x */ 808 cval = ib[j]*2; 809 z[cval] += v[0]*x1 + v[1]*x2; 810 z[cval+1] += v[2]*x1 + v[3]*x2; 811 /* (strict upper triangular part of A)*x */ 812 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 813 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 814 v += 4; 815 } 816 xb +=2; ai++; 817 } 818 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 819 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 820 821 ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 822 PetscFunctionReturn(0); 823 } 824 825 #undef __FUNCT__ 826 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3" 827 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 828 { 829 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 830 PetscScalar *z,x1,x2,x3; 831 const PetscScalar *x,*xb; 832 const MatScalar *v; 833 PetscErrorCode ierr; 834 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 835 const PetscInt *aj=a->j,*ai=a->i,*ib; 836 PetscInt nonzerorow=0; 837 838 PetscFunctionBegin; 839 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 840 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 841 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 842 843 v = a->a; 844 xb = x; 845 846 for (i=0; i<mbs; i++) { 847 n = ai[1] - ai[0]; /* length of i_th block row of A */ 848 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 849 ib = aj + *ai; 850 jmin = 0; 851 nonzerorow += (n>0); 852 if (*ib == i) { /* (diag of A)*x */ 853 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 854 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 855 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 856 v += 9; jmin++; 857 } 858 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 859 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 860 for (j=jmin; j<n; j++) { 861 /* (strict lower triangular part of A)*x */ 862 cval = ib[j]*3; 863 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 864 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 865 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 866 /* (strict upper triangular part of A)*x */ 867 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 868 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 869 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 870 v += 9; 871 } 872 xb +=3; ai++; 873 } 874 875 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 876 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 877 878 ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 879 PetscFunctionReturn(0); 880 } 881 882 #undef __FUNCT__ 883 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4" 884 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 885 { 886 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 887 PetscScalar *z,x1,x2,x3,x4; 888 const PetscScalar *x,*xb; 889 const MatScalar *v; 890 PetscErrorCode ierr; 891 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 892 const PetscInt *aj=a->j,*ai=a->i,*ib; 893 PetscInt nonzerorow=0; 894 895 PetscFunctionBegin; 896 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 897 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 898 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 899 900 v = a->a; 901 xb = x; 902 903 for (i=0; i<mbs; i++) { 904 n = ai[1] - ai[0]; /* length of i_th block row of A */ 905 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 906 ib = aj + *ai; 907 jmin = 0; 908 nonzerorow += (n>0); 909 if (*ib == i) { /* (diag of A)*x */ 910 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 911 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 912 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 913 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 914 v += 16; jmin++; 915 } 916 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 917 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 918 for (j=jmin; j<n; j++) { 919 /* (strict lower triangular part of A)*x */ 920 cval = ib[j]*4; 921 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 922 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 923 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 924 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 925 /* (strict upper triangular part of A)*x */ 926 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 927 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 928 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 929 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 930 v += 16; 931 } 932 xb +=4; ai++; 933 } 934 935 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 936 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 937 938 ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 939 PetscFunctionReturn(0); 940 } 941 942 #undef __FUNCT__ 943 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5" 944 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 945 { 946 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 947 PetscScalar *z,x1,x2,x3,x4,x5; 948 const PetscScalar *x,*xb; 949 const MatScalar *v; 950 PetscErrorCode ierr; 951 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 952 const PetscInt *aj=a->j,*ai=a->i,*ib; 953 PetscInt nonzerorow=0; 954 955 PetscFunctionBegin; 956 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 957 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 958 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 959 960 v = a->a; 961 xb = x; 962 963 for (i=0; i<mbs; i++) { 964 n = ai[1] - ai[0]; /* length of i_th block row of A */ 965 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 966 ib = aj + *ai; 967 jmin = 0; 968 nonzerorow += (n>0); 969 if (*ib == i) { /* (diag of A)*x */ 970 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 971 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 972 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 973 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 974 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 975 v += 25; jmin++; 976 } 977 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 978 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 979 for (j=jmin; j<n; j++) { 980 /* (strict lower triangular part of A)*x */ 981 cval = ib[j]*5; 982 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 983 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 984 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 985 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 986 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 987 /* (strict upper triangular part of A)*x */ 988 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 989 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 990 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 991 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 992 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 993 v += 25; 994 } 995 xb +=5; ai++; 996 } 997 998 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 999 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1000 1001 ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1002 PetscFunctionReturn(0); 1003 } 1004 #undef __FUNCT__ 1005 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6" 1006 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 1007 { 1008 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1009 PetscScalar *z,x1,x2,x3,x4,x5,x6; 1010 const PetscScalar *x,*xb; 1011 const MatScalar *v; 1012 PetscErrorCode ierr; 1013 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 1014 const PetscInt *aj=a->j,*ai=a->i,*ib; 1015 PetscInt nonzerorow=0; 1016 1017 PetscFunctionBegin; 1018 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1019 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1020 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1021 1022 v = a->a; 1023 xb = x; 1024 1025 for (i=0; i<mbs; i++) { 1026 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1027 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 1028 ib = aj + *ai; 1029 jmin = 0; 1030 nonzerorow += (n>0); 1031 if (*ib == i) { /* (diag of A)*x */ 1032 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 1033 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 1034 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 1035 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 1036 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 1037 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 1038 v += 36; jmin++; 1039 } 1040 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1041 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1042 for (j=jmin; j<n; j++) { 1043 /* (strict lower triangular part of A)*x */ 1044 cval = ib[j]*6; 1045 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 1046 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 1047 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 1048 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 1049 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 1050 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 1051 /* (strict upper triangular part of A)*x */ 1052 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 1053 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 1054 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 1055 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 1056 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 1057 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 1058 v += 36; 1059 } 1060 xb +=6; ai++; 1061 } 1062 1063 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1064 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1065 1066 ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1067 PetscFunctionReturn(0); 1068 } 1069 1070 #undef __FUNCT__ 1071 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7" 1072 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1073 { 1074 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1075 PetscScalar *z,x1,x2,x3,x4,x5,x6,x7; 1076 const PetscScalar *x,*xb; 1077 const MatScalar *v; 1078 PetscErrorCode ierr; 1079 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 1080 const PetscInt *aj=a->j,*ai=a->i,*ib; 1081 PetscInt nonzerorow=0; 1082 1083 PetscFunctionBegin; 1084 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1085 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1086 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1087 1088 v = a->a; 1089 xb = x; 1090 1091 for (i=0; i<mbs; i++) { 1092 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1093 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 1094 ib = aj + *ai; 1095 jmin = 0; 1096 nonzerorow += (n>0); 1097 if (*ib == i) { /* (diag of A)*x */ 1098 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 1099 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 1100 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 1101 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 1102 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 1103 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 1104 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1105 v += 49; jmin++; 1106 } 1107 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1108 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1109 for (j=jmin; j<n; j++) { 1110 /* (strict lower triangular part of A)*x */ 1111 cval = ib[j]*7; 1112 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 1113 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 1114 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 1115 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 1116 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 1117 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 1118 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1119 /* (strict upper triangular part of A)*x */ 1120 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 1121 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 1122 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 1123 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 1124 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 1125 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 1126 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 1127 v += 49; 1128 } 1129 xb +=7; ai++; 1130 } 1131 1132 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1133 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1134 1135 ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1136 PetscFunctionReturn(0); 1137 } 1138 1139 #undef __FUNCT__ 1140 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N" 1141 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1142 { 1143 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1144 PetscScalar *z,*z_ptr=0,*zb,*work,*workt; 1145 const PetscScalar *x,*x_ptr,*xb; 1146 const MatScalar *v; 1147 PetscErrorCode ierr; 1148 PetscInt mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 1149 const PetscInt *idx,*aj,*ii; 1150 PetscInt nonzerorow=0; 1151 1152 PetscFunctionBegin; 1153 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1154 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); x_ptr=x; 1155 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 1156 1157 aj = a->j; 1158 v = a->a; 1159 ii = a->i; 1160 1161 if (!a->mult_work) { 1162 ierr = PetscMalloc1(A->rmap->n+1,&a->mult_work);CHKERRQ(ierr); 1163 } 1164 work = a->mult_work; 1165 1166 1167 for (i=0; i<mbs; i++) { 1168 n = ii[1] - ii[0]; ncols = n*bs; 1169 workt = work; idx=aj+ii[0]; 1170 nonzerorow += (n>0); 1171 1172 /* upper triangular part */ 1173 for (j=0; j<n; j++) { 1174 xb = x_ptr + bs*(*idx++); 1175 for (k=0; k<bs; k++) workt[k] = xb[k]; 1176 workt += bs; 1177 } 1178 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1179 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1180 1181 /* strict lower triangular part */ 1182 idx = aj+ii[0]; 1183 if (*idx == i) { 1184 ncols -= bs; v += bs2; idx++; n--; 1185 } 1186 if (ncols > 0) { 1187 workt = work; 1188 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1189 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 1190 for (j=0; j<n; j++) { 1191 zb = z_ptr + bs*(*idx++); 1192 for (k=0; k<bs; k++) zb[k] += workt[k]; 1193 workt += bs; 1194 } 1195 } 1196 1197 x += bs; v += n*bs2; z += bs; ii++; 1198 } 1199 1200 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1201 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1202 1203 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1204 PetscFunctionReturn(0); 1205 } 1206 1207 #undef __FUNCT__ 1208 #define __FUNCT__ "MatScale_SeqSBAIJ" 1209 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha) 1210 { 1211 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data; 1212 PetscScalar oalpha = alpha; 1213 PetscErrorCode ierr; 1214 PetscBLASInt one = 1,totalnz; 1215 1216 PetscFunctionBegin; 1217 ierr = PetscBLASIntCast(a->bs2*a->nz,&totalnz);CHKERRQ(ierr); 1218 PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one)); 1219 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 1220 PetscFunctionReturn(0); 1221 } 1222 1223 #undef __FUNCT__ 1224 #define __FUNCT__ "MatNorm_SeqSBAIJ" 1225 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm) 1226 { 1227 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1228 const MatScalar *v = a->a; 1229 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1230 PetscInt i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il; 1231 PetscErrorCode ierr; 1232 const PetscInt *aj=a->j,*col; 1233 1234 PetscFunctionBegin; 1235 if (type == NORM_FROBENIUS) { 1236 for (k=0; k<mbs; k++) { 1237 jmin = a->i[k]; jmax = a->i[k+1]; 1238 col = aj + jmin; 1239 if (*col == k) { /* diagonal block */ 1240 for (i=0; i<bs2; i++) { 1241 sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++; 1242 } 1243 jmin++; 1244 } 1245 for (j=jmin; j<jmax; j++) { /* off-diagonal blocks */ 1246 for (i=0; i<bs2; i++) { 1247 sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++; 1248 } 1249 } 1250 } 1251 *norm = PetscSqrtReal(sum_diag + 2*sum_off); 1252 ierr = PetscLogFlops(2*bs2*a->nz);CHKERRQ(ierr); 1253 } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 1254 ierr = PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);CHKERRQ(ierr); 1255 for (i=0; i<mbs; i++) jl[i] = mbs; 1256 il[0] = 0; 1257 1258 *norm = 0.0; 1259 for (k=0; k<mbs; k++) { /* k_th block row */ 1260 for (j=0; j<bs; j++) sum[j]=0.0; 1261 /*-- col sum --*/ 1262 i = jl[k]; /* first |A(i,k)| to be added */ 1263 /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1264 at step k */ 1265 while (i<mbs) { 1266 nexti = jl[i]; /* next block row to be added */ 1267 ik = il[i]; /* block index of A(i,k) in the array a */ 1268 for (j=0; j<bs; j++) { 1269 v = a->a + ik*bs2 + j*bs; 1270 for (k1=0; k1<bs; k1++) { 1271 sum[j] += PetscAbsScalar(*v); v++; 1272 } 1273 } 1274 /* update il, jl */ 1275 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1276 jmax = a->i[i+1]; 1277 if (jmin < jmax) { 1278 il[i] = jmin; 1279 j = a->j[jmin]; 1280 jl[i] = jl[j]; jl[j]=i; 1281 } 1282 i = nexti; 1283 } 1284 /*-- row sum --*/ 1285 jmin = a->i[k]; jmax = a->i[k+1]; 1286 for (i=jmin; i<jmax; i++) { 1287 for (j=0; j<bs; j++) { 1288 v = a->a + i*bs2 + j; 1289 for (k1=0; k1<bs; k1++) { 1290 sum[j] += PetscAbsScalar(*v); v += bs; 1291 } 1292 } 1293 } 1294 /* add k_th block row to il, jl */ 1295 col = aj+jmin; 1296 if (*col == k) jmin++; 1297 if (jmin < jmax) { 1298 il[k] = jmin; 1299 j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k; 1300 } 1301 for (j=0; j<bs; j++) { 1302 if (sum[j] > *norm) *norm = sum[j]; 1303 } 1304 } 1305 ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr); 1306 ierr = PetscLogFlops(PetscMax(mbs*a->nz-1,0));CHKERRQ(ierr); 1307 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 1308 PetscFunctionReturn(0); 1309 } 1310 1311 #undef __FUNCT__ 1312 #define __FUNCT__ "MatEqual_SeqSBAIJ" 1313 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg) 1314 { 1315 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data; 1316 PetscErrorCode ierr; 1317 1318 PetscFunctionBegin; 1319 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1320 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1321 *flg = PETSC_FALSE; 1322 PetscFunctionReturn(0); 1323 } 1324 1325 /* if the a->i are the same */ 1326 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1327 if (!*flg) PetscFunctionReturn(0); 1328 1329 /* if a->j are the same */ 1330 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1331 if (!*flg) PetscFunctionReturn(0); 1332 1333 /* if a->a are the same */ 1334 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1335 PetscFunctionReturn(0); 1336 } 1337 1338 #undef __FUNCT__ 1339 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ" 1340 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v) 1341 { 1342 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1343 PetscErrorCode ierr; 1344 PetscInt i,j,k,row,bs,ambs,bs2; 1345 const PetscInt *ai,*aj; 1346 PetscScalar *x,zero = 0.0; 1347 const MatScalar *aa,*aa_j; 1348 1349 PetscFunctionBegin; 1350 bs = A->rmap->bs; 1351 if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1"); 1352 1353 aa = a->a; 1354 ambs = a->mbs; 1355 1356 if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) { 1357 PetscInt *diag=a->diag; 1358 aa = a->a; 1359 ambs = a->mbs; 1360 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1361 for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]]; 1362 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1363 PetscFunctionReturn(0); 1364 } 1365 1366 ai = a->i; 1367 aj = a->j; 1368 bs2 = a->bs2; 1369 ierr = VecSet(v,zero);CHKERRQ(ierr); 1370 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1371 for (i=0; i<ambs; i++) { 1372 j=ai[i]; 1373 if (aj[j] == i) { /* if this is a diagonal element */ 1374 row = i*bs; 1375 aa_j = aa + j*bs2; 1376 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1377 } 1378 } 1379 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1380 PetscFunctionReturn(0); 1381 } 1382 1383 #undef __FUNCT__ 1384 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ" 1385 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr) 1386 { 1387 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1388 PetscScalar x; 1389 const PetscScalar *l,*li,*ri; 1390 MatScalar *aa,*v; 1391 PetscErrorCode ierr; 1392 PetscInt i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2; 1393 PetscBool flg; 1394 1395 PetscFunctionBegin; 1396 if (ll != rr) { 1397 ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr); 1398 if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n"); 1399 } 1400 if (!ll) PetscFunctionReturn(0); 1401 ai = a->i; 1402 aj = a->j; 1403 aa = a->a; 1404 m = A->rmap->N; 1405 bs = A->rmap->bs; 1406 mbs = a->mbs; 1407 bs2 = a->bs2; 1408 1409 ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr); 1410 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1411 if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 1412 for (i=0; i<mbs; i++) { /* for each block row */ 1413 M = ai[i+1] - ai[i]; 1414 li = l + i*bs; 1415 v = aa + bs2*ai[i]; 1416 for (j=0; j<M; j++) { /* for each block */ 1417 ri = l + bs*aj[ai[i]+j]; 1418 for (k=0; k<bs; k++) { 1419 x = ri[k]; 1420 for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x; 1421 } 1422 } 1423 } 1424 ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr); 1425 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1426 PetscFunctionReturn(0); 1427 } 1428 1429 #undef __FUNCT__ 1430 #define __FUNCT__ "MatGetInfo_SeqSBAIJ" 1431 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info) 1432 { 1433 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1434 1435 PetscFunctionBegin; 1436 info->block_size = a->bs2; 1437 info->nz_allocated = a->bs2*a->maxnz; /*num. of nonzeros in upper triangular part */ 1438 info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */ 1439 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 1440 info->assemblies = A->num_ass; 1441 info->mallocs = A->info.mallocs; 1442 info->memory = ((PetscObject)A)->mem; 1443 if (A->factortype) { 1444 info->fill_ratio_given = A->info.fill_ratio_given; 1445 info->fill_ratio_needed = A->info.fill_ratio_needed; 1446 info->factor_mallocs = A->info.factor_mallocs; 1447 } else { 1448 info->fill_ratio_given = 0; 1449 info->fill_ratio_needed = 0; 1450 info->factor_mallocs = 0; 1451 } 1452 PetscFunctionReturn(0); 1453 } 1454 1455 1456 #undef __FUNCT__ 1457 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ" 1458 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 1459 { 1460 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1461 PetscErrorCode ierr; 1462 1463 PetscFunctionBegin; 1464 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 1465 PetscFunctionReturn(0); 1466 } 1467 1468 #undef __FUNCT__ 1469 #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ" 1470 /* 1471 This code does not work since it only checks the upper triangular part of 1472 the matrix. Hence it is not listed in the function table. 1473 */ 1474 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[]) 1475 { 1476 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1477 PetscErrorCode ierr; 1478 PetscInt i,j,n,row,col,bs,mbs; 1479 const PetscInt *ai,*aj; 1480 PetscReal atmp; 1481 const MatScalar *aa; 1482 PetscScalar *x; 1483 PetscInt ncols,brow,bcol,krow,kcol; 1484 1485 PetscFunctionBegin; 1486 if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov"); 1487 if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1488 bs = A->rmap->bs; 1489 aa = a->a; 1490 ai = a->i; 1491 aj = a->j; 1492 mbs = a->mbs; 1493 1494 ierr = VecSet(v,0.0);CHKERRQ(ierr); 1495 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1496 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1497 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1498 for (i=0; i<mbs; i++) { 1499 ncols = ai[1] - ai[0]; ai++; 1500 brow = bs*i; 1501 for (j=0; j<ncols; j++) { 1502 bcol = bs*(*aj); 1503 for (kcol=0; kcol<bs; kcol++) { 1504 col = bcol + kcol; /* col index */ 1505 for (krow=0; krow<bs; krow++) { 1506 atmp = PetscAbsScalar(*aa); aa++; 1507 row = brow + krow; /* row index */ 1508 if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1509 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 1510 } 1511 } 1512 aj++; 1513 } 1514 } 1515 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1516 PetscFunctionReturn(0); 1517 } 1518