1 2 #include <../src/mat/impls/baij/seq/baij.h> 3 #include <petsc/private/kernels/blockinvert.h> 4 #include <petscbt.h> 5 #include <../src/mat/impls/sbaij/seq/sbaij.h> 6 #include <petscblaslapack.h> 7 8 #undef __FUNCT__ 9 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ" 10 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 11 { 12 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 13 PetscErrorCode ierr; 14 PetscInt brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2; 15 const PetscInt *idx; 16 PetscBT table_out,table_in; 17 18 PetscFunctionBegin; 19 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 20 mbs = a->mbs; 21 ai = a->i; 22 aj = a->j; 23 bs = A->rmap->bs; 24 ierr = PetscBTCreate(mbs,&table_out);CHKERRQ(ierr); 25 ierr = PetscMalloc1(mbs+1,&nidx);CHKERRQ(ierr); 26 ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr); 27 ierr = PetscBTCreate(mbs,&table_in);CHKERRQ(ierr); 28 29 for (i=0; i<is_max; i++) { /* for each is */ 30 isz = 0; 31 ierr = PetscBTMemzero(mbs,table_out);CHKERRQ(ierr); 32 33 /* Extract the indices, assume there can be duplicate entries */ 34 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 35 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 36 37 /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */ 38 bcol_max = 0; 39 for (j=0; j<n; ++j) { 40 brow = idx[j]/bs; /* convert the indices into block indices */ 41 if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 42 if (!PetscBTLookupSet(table_out,brow)) { 43 nidx[isz++] = brow; 44 if (bcol_max < brow) bcol_max = brow; 45 } 46 } 47 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 48 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 49 50 k = 0; 51 for (j=0; j<ov; j++) { /* for each overlap */ 52 /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 53 ierr = PetscBTMemzero(mbs,table_in);CHKERRQ(ierr); 54 for (l=k; l<isz; l++) { ierr = PetscBTSet(table_in,nidx[l]);CHKERRQ(ierr); } 55 56 n = isz; /* length of the updated is[i] */ 57 for (brow=0; brow<mbs; brow++) { 58 start = ai[brow]; end = ai[brow+1]; 59 if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */ 60 for (l = start; l<end; l++) { 61 bcol = aj[l]; 62 if (!PetscBTLookupSet(table_out,bcol)) { 63 nidx[isz++] = bcol; 64 if (bcol_max < bcol) bcol_max = bcol; 65 } 66 } 67 k++; 68 if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 69 } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 70 for (l = start; l<end; l++) { 71 bcol = aj[l]; 72 if (bcol > bcol_max) break; 73 if (PetscBTLookup(table_in,bcol)) { 74 if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow; 75 break; /* for l = start; l<end ; l++) */ 76 } 77 } 78 } 79 } 80 } /* for each overlap */ 81 82 /* expand the Index Set */ 83 for (j=0; j<isz; j++) { 84 for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k; 85 } 86 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 87 } 88 ierr = PetscBTDestroy(&table_out);CHKERRQ(ierr); 89 ierr = PetscFree(nidx);CHKERRQ(ierr); 90 ierr = PetscFree(nidx2);CHKERRQ(ierr); 91 ierr = PetscBTDestroy(&table_in);CHKERRQ(ierr); 92 PetscFunctionReturn(0); 93 } 94 95 #undef __FUNCT__ 96 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private" 97 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,MatReuse scall,Mat *B) 98 { 99 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c; 100 PetscErrorCode ierr; 101 PetscInt *smap,i,k,kstart,kend,oldcols = a->mbs,*lens; 102 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 103 PetscInt nrows,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 104 const PetscInt *irow,*aj = a->j,*ai = a->i; 105 MatScalar *mat_a; 106 Mat C; 107 PetscBool flag,sorted; 108 109 PetscFunctionBegin; 110 ierr = ISSorted(isrow,&sorted);CHKERRQ(ierr); 111 if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 112 113 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 114 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 115 116 ierr = PetscMalloc1(oldcols,&smap);CHKERRQ(ierr); 117 ierr = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr); 118 ssmap = smap; 119 ierr = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr); 120 for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */ 121 /* determine lens of each row */ 122 for (i=0; i<nrows; i++) { 123 kstart = ai[irow[i]]; 124 kend = kstart + a->ilen[irow[i]]; 125 lens[i] = 0; 126 for (k=kstart; k<kend; k++) { 127 if (ssmap[aj[k]]) lens[i]++; 128 } 129 } 130 /* Create and fill new matrix */ 131 if (scall == MAT_REUSE_MATRIX) { 132 c = (Mat_SeqSBAIJ*)((*B)->data); 133 134 if (c->mbs!=nrows || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 135 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 136 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 137 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 138 C = *B; 139 } else { 140 ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr); 141 ierr = MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 142 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 143 ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr); 144 } 145 c = (Mat_SeqSBAIJ*)(C->data); 146 for (i=0; i<nrows; i++) { 147 row = irow[i]; 148 kstart = ai[row]; 149 kend = kstart + a->ilen[row]; 150 mat_i = c->i[i]; 151 mat_j = c->j + mat_i; 152 mat_a = c->a + mat_i*bs2; 153 mat_ilen = c->ilen + i; 154 for (k=kstart; k<kend; k++) { 155 if ((tcol=ssmap[a->j[k]])) { 156 *mat_j++ = tcol - 1; 157 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 158 mat_a += bs2; 159 (*mat_ilen)++; 160 } 161 } 162 } 163 164 /* Free work space */ 165 ierr = PetscFree(smap);CHKERRQ(ierr); 166 ierr = PetscFree(lens);CHKERRQ(ierr); 167 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 168 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 169 170 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 171 *B = C; 172 PetscFunctionReturn(0); 173 } 174 175 #undef __FUNCT__ 176 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ" 177 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 178 { 179 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 180 IS is1; 181 PetscErrorCode ierr; 182 PetscInt *vary,*iary,nrows,i,bs=A->rmap->bs,count; 183 const PetscInt *irow; 184 185 PetscFunctionBegin; 186 if (isrow != iscol) { 187 PetscBool isequal; 188 ierr = ISEqual(isrow,iscol,&isequal);CHKERRQ(ierr); 189 if (!isequal) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isrow"); 190 } 191 192 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 193 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 194 195 /* Verify if the indices corespond to each element in a block 196 and form the IS with compressed IS */ 197 ierr = PetscMalloc2(a->mbs,&vary,a->mbs,&iary);CHKERRQ(ierr); 198 ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr); 199 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 200 201 count = 0; 202 for (i=0; i<a->mbs; i++) { 203 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Index set does not match blocks"); 204 if (vary[i]==bs) iary[count++] = i; 205 } 206 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 207 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr); 208 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 209 210 ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,scall,B);CHKERRQ(ierr); 211 ierr = ISDestroy(&is1);CHKERRQ(ierr); 212 PetscFunctionReturn(0); 213 } 214 215 #undef __FUNCT__ 216 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ" 217 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 218 { 219 PetscErrorCode ierr; 220 PetscInt i; 221 PetscBool flg; 222 223 PetscFunctionBegin; 224 ierr = MatGetSubMatrices_SeqBAIJ(A,n,irow,icol,scall,B);CHKERRQ(ierr); 225 for (i=0; i<n; i++) { 226 ierr = ISEqual(irow[i],icol[i],&flg);CHKERRQ(ierr); 227 if (!flg) { 228 Mat Bseq = *B[i]; 229 /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc. 230 Zero some ops' to avoid invalid usse */ 231 ierr = MatSetOption(Bseq,MAT_SYMMETRIC,PETSC_FALSE);CHKERRQ(ierr); 232 ierr = MatSetOption(Bseq,MAT_SYMMETRIC,PETSC_FALSE);CHKERRQ(ierr); 233 Bseq->ops->mult = 0; 234 Bseq->ops->multadd = 0; 235 Bseq->ops->multtranspose = 0; 236 Bseq->ops->multtransposeadd = 0; 237 Bseq->ops->lufactor = 0; 238 Bseq->ops->choleskyfactor = 0; 239 Bseq->ops->lufactorsymbolic = 0; 240 Bseq->ops->choleskyfactorsymbolic = 0; 241 Bseq->ops->getinertia = 0; 242 } 243 } 244 PetscFunctionReturn(0); 245 } 246 247 /* -------------------------------------------------------*/ 248 /* Should check that shapes of vectors and matrices match */ 249 /* -------------------------------------------------------*/ 250 251 #undef __FUNCT__ 252 #define __FUNCT__ "MatMult_SeqSBAIJ_2" 253 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz) 254 { 255 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 256 PetscScalar *z,x1,x2,zero=0.0; 257 const PetscScalar *x,*xb; 258 const MatScalar *v; 259 PetscErrorCode ierr; 260 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 261 const PetscInt *aj=a->j,*ai=a->i,*ib; 262 PetscInt nonzerorow=0; 263 264 PetscFunctionBegin; 265 ierr = VecSet(zz,zero);CHKERRQ(ierr); 266 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 267 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 268 269 v = a->a; 270 xb = x; 271 272 for (i=0; i<mbs; i++) { 273 n = ai[1] - ai[0]; /* length of i_th block row of A */ 274 x1 = xb[0]; x2 = xb[1]; 275 ib = aj + *ai; 276 jmin = 0; 277 nonzerorow += (n>0); 278 if (*ib == i) { /* (diag of A)*x */ 279 z[2*i] += v[0]*x1 + v[2]*x2; 280 z[2*i+1] += v[2]*x1 + v[3]*x2; 281 v += 4; jmin++; 282 } 283 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 284 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 285 for (j=jmin; j<n; j++) { 286 /* (strict lower triangular part of A)*x */ 287 cval = ib[j]*2; 288 z[cval] += v[0]*x1 + v[1]*x2; 289 z[cval+1] += v[2]*x1 + v[3]*x2; 290 /* (strict upper triangular part of A)*x */ 291 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 292 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 293 v += 4; 294 } 295 xb +=2; ai++; 296 } 297 298 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 299 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 300 ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 301 PetscFunctionReturn(0); 302 } 303 304 #undef __FUNCT__ 305 #define __FUNCT__ "MatMult_SeqSBAIJ_3" 306 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz) 307 { 308 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 309 PetscScalar *z,x1,x2,x3,zero=0.0; 310 const PetscScalar *x,*xb; 311 const MatScalar *v; 312 PetscErrorCode ierr; 313 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 314 const PetscInt *aj = a->j,*ai = a->i,*ib; 315 PetscInt nonzerorow=0; 316 317 PetscFunctionBegin; 318 ierr = VecSet(zz,zero);CHKERRQ(ierr); 319 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 320 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 321 322 v = a->a; 323 xb = x; 324 325 for (i=0; i<mbs; i++) { 326 n = ai[1] - ai[0]; /* length of i_th block row of A */ 327 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 328 ib = aj + *ai; 329 jmin = 0; 330 nonzerorow += (n>0); 331 if (*ib == i) { /* (diag of A)*x */ 332 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 333 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 334 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 335 v += 9; jmin++; 336 } 337 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 338 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 339 for (j=jmin; j<n; j++) { 340 /* (strict lower triangular part of A)*x */ 341 cval = ib[j]*3; 342 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 343 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 344 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 345 /* (strict upper triangular part of A)*x */ 346 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 347 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 348 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 349 v += 9; 350 } 351 xb +=3; ai++; 352 } 353 354 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 355 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 356 ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 357 PetscFunctionReturn(0); 358 } 359 360 #undef __FUNCT__ 361 #define __FUNCT__ "MatMult_SeqSBAIJ_4" 362 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz) 363 { 364 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 365 PetscScalar *z,x1,x2,x3,x4,zero=0.0; 366 const PetscScalar *x,*xb; 367 const MatScalar *v; 368 PetscErrorCode ierr; 369 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 370 const PetscInt *aj = a->j,*ai = a->i,*ib; 371 PetscInt nonzerorow = 0; 372 373 PetscFunctionBegin; 374 ierr = VecSet(zz,zero);CHKERRQ(ierr); 375 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 376 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 377 378 v = a->a; 379 xb = x; 380 381 for (i=0; i<mbs; i++) { 382 n = ai[1] - ai[0]; /* length of i_th block row of A */ 383 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 384 ib = aj + *ai; 385 jmin = 0; 386 nonzerorow += (n>0); 387 if (*ib == i) { /* (diag of A)*x */ 388 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 389 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 390 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 391 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 392 v += 16; jmin++; 393 } 394 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 395 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 396 for (j=jmin; j<n; j++) { 397 /* (strict lower triangular part of A)*x */ 398 cval = ib[j]*4; 399 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 400 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 401 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 402 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 403 /* (strict upper triangular part of A)*x */ 404 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 405 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 406 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 407 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 408 v += 16; 409 } 410 xb +=4; ai++; 411 } 412 413 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 414 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 415 ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 416 PetscFunctionReturn(0); 417 } 418 419 #undef __FUNCT__ 420 #define __FUNCT__ "MatMult_SeqSBAIJ_5" 421 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz) 422 { 423 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 424 PetscScalar *z,x1,x2,x3,x4,x5,zero=0.0; 425 const PetscScalar *x,*xb; 426 const MatScalar *v; 427 PetscErrorCode ierr; 428 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 429 const PetscInt *aj = a->j,*ai = a->i,*ib; 430 PetscInt nonzerorow=0; 431 432 PetscFunctionBegin; 433 ierr = VecSet(zz,zero);CHKERRQ(ierr); 434 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 435 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 436 437 v = a->a; 438 xb = x; 439 440 for (i=0; i<mbs; i++) { 441 n = ai[1] - ai[0]; /* length of i_th block row of A */ 442 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 443 ib = aj + *ai; 444 jmin = 0; 445 nonzerorow += (n>0); 446 if (*ib == i) { /* (diag of A)*x */ 447 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 448 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 449 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 450 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 451 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 452 v += 25; jmin++; 453 } 454 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 455 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 456 for (j=jmin; j<n; j++) { 457 /* (strict lower triangular part of A)*x */ 458 cval = ib[j]*5; 459 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 460 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 461 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 462 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 463 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 464 /* (strict upper triangular part of A)*x */ 465 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 466 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 467 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 468 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 469 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 470 v += 25; 471 } 472 xb +=5; ai++; 473 } 474 475 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 476 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 477 ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 478 PetscFunctionReturn(0); 479 } 480 481 482 #undef __FUNCT__ 483 #define __FUNCT__ "MatMult_SeqSBAIJ_6" 484 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz) 485 { 486 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 487 PetscScalar *z,x1,x2,x3,x4,x5,x6,zero=0.0; 488 const PetscScalar *x,*xb; 489 const MatScalar *v; 490 PetscErrorCode ierr; 491 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 492 const PetscInt *aj=a->j,*ai=a->i,*ib; 493 PetscInt nonzerorow=0; 494 495 PetscFunctionBegin; 496 ierr = VecSet(zz,zero);CHKERRQ(ierr); 497 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 498 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 499 500 v = a->a; 501 xb = x; 502 503 for (i=0; i<mbs; i++) { 504 n = ai[1] - ai[0]; /* length of i_th block row of A */ 505 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 506 ib = aj + *ai; 507 jmin = 0; 508 nonzerorow += (n>0); 509 if (*ib == i) { /* (diag of A)*x */ 510 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 511 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 512 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 513 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 514 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 515 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 516 v += 36; jmin++; 517 } 518 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 519 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 520 for (j=jmin; j<n; j++) { 521 /* (strict lower triangular part of A)*x */ 522 cval = ib[j]*6; 523 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 524 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 525 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 526 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 527 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 528 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 529 /* (strict upper triangular part of A)*x */ 530 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 531 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 532 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 533 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 534 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 535 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 536 v += 36; 537 } 538 xb +=6; ai++; 539 } 540 541 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 542 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 543 ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 544 PetscFunctionReturn(0); 545 } 546 #undef __FUNCT__ 547 #define __FUNCT__ "MatMult_SeqSBAIJ_7" 548 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz) 549 { 550 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 551 PetscScalar *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0; 552 const PetscScalar *x,*xb; 553 const MatScalar *v; 554 PetscErrorCode ierr; 555 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 556 const PetscInt *aj=a->j,*ai=a->i,*ib; 557 PetscInt nonzerorow=0; 558 559 PetscFunctionBegin; 560 ierr = VecSet(zz,zero);CHKERRQ(ierr); 561 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 562 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 563 564 v = a->a; 565 xb = x; 566 567 for (i=0; i<mbs; i++) { 568 n = ai[1] - ai[0]; /* length of i_th block row of A */ 569 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 570 ib = aj + *ai; 571 jmin = 0; 572 nonzerorow += (n>0); 573 if (*ib == i) { /* (diag of A)*x */ 574 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 575 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 576 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 577 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 578 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 579 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 580 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 581 v += 49; jmin++; 582 } 583 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 584 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 585 for (j=jmin; j<n; j++) { 586 /* (strict lower triangular part of A)*x */ 587 cval = ib[j]*7; 588 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 589 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 590 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 591 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 592 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 593 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 594 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 595 /* (strict upper triangular part of A)*x */ 596 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 597 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 598 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 599 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 600 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 601 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 602 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 603 v += 49; 604 } 605 xb +=7; ai++; 606 } 607 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 608 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 609 ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 610 PetscFunctionReturn(0); 611 } 612 613 /* 614 This will not work with MatScalar == float because it calls the BLAS 615 */ 616 #undef __FUNCT__ 617 #define __FUNCT__ "MatMult_SeqSBAIJ_N" 618 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz) 619 { 620 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 621 PetscScalar *z,*z_ptr,*zb,*work,*workt,zero=0.0; 622 const PetscScalar *x,*x_ptr,*xb; 623 const MatScalar *v; 624 PetscErrorCode ierr; 625 PetscInt mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 626 const PetscInt *idx,*aj,*ii; 627 PetscInt nonzerorow=0; 628 629 PetscFunctionBegin; 630 ierr = VecSet(zz,zero);CHKERRQ(ierr); 631 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);x_ptr = x; 632 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 633 634 aj = a->j; 635 v = a->a; 636 ii = a->i; 637 638 if (!a->mult_work) { 639 ierr = PetscMalloc1(A->rmap->N+1,&a->mult_work);CHKERRQ(ierr); 640 } 641 work = a->mult_work; 642 643 for (i=0; i<mbs; i++) { 644 n = ii[1] - ii[0]; ncols = n*bs; 645 workt = work; idx=aj+ii[0]; 646 nonzerorow += (n>0); 647 648 /* upper triangular part */ 649 for (j=0; j<n; j++) { 650 xb = x_ptr + bs*(*idx++); 651 for (k=0; k<bs; k++) workt[k] = xb[k]; 652 workt += bs; 653 } 654 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 655 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 656 657 /* strict lower triangular part */ 658 idx = aj+ii[0]; 659 if (*idx == i) { 660 ncols -= bs; v += bs2; idx++; n--; 661 } 662 663 if (ncols > 0) { 664 workt = work; 665 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 666 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 667 for (j=0; j<n; j++) { 668 zb = z_ptr + bs*(*idx++); 669 for (k=0; k<bs; k++) zb[k] += workt[k]; 670 workt += bs; 671 } 672 } 673 x += bs; v += n*bs2; z += bs; ii++; 674 } 675 676 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 677 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 678 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr); 679 PetscFunctionReturn(0); 680 } 681 682 #undef __FUNCT__ 683 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1" 684 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 685 { 686 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 687 PetscScalar *z,x1; 688 const PetscScalar *x,*xb; 689 const MatScalar *v; 690 PetscErrorCode ierr; 691 PetscInt mbs =a->mbs,i,n,cval,j,jmin; 692 const PetscInt *aj=a->j,*ai=a->i,*ib; 693 PetscInt nonzerorow=0; 694 695 PetscFunctionBegin; 696 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 697 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 698 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 699 v = a->a; 700 xb = x; 701 702 for (i=0; i<mbs; i++) { 703 n = ai[1] - ai[0]; /* length of i_th row of A */ 704 x1 = xb[0]; 705 ib = aj + *ai; 706 jmin = 0; 707 nonzerorow += (n>0); 708 if (*ib == i) { /* (diag of A)*x */ 709 z[i] += *v++ * x[*ib++]; jmin++; 710 } 711 for (j=jmin; j<n; j++) { 712 cval = *ib; 713 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 714 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 715 } 716 xb++; ai++; 717 } 718 719 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 720 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 721 722 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 723 PetscFunctionReturn(0); 724 } 725 726 #undef __FUNCT__ 727 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2" 728 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 729 { 730 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 731 PetscScalar *z,x1,x2; 732 const PetscScalar *x,*xb; 733 const MatScalar *v; 734 PetscErrorCode ierr; 735 PetscInt mbs =a->mbs,i,n,cval,j,jmin; 736 const PetscInt *aj=a->j,*ai=a->i,*ib; 737 PetscInt nonzerorow=0; 738 739 PetscFunctionBegin; 740 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 741 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 742 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 743 744 v = a->a; 745 xb = x; 746 747 for (i=0; i<mbs; i++) { 748 n = ai[1] - ai[0]; /* length of i_th block row of A */ 749 x1 = xb[0]; x2 = xb[1]; 750 ib = aj + *ai; 751 jmin = 0; 752 nonzerorow += (n>0); 753 if (*ib == i) { /* (diag of A)*x */ 754 z[2*i] += v[0]*x1 + v[2]*x2; 755 z[2*i+1] += v[2]*x1 + v[3]*x2; 756 v += 4; jmin++; 757 } 758 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 759 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 760 for (j=jmin; j<n; j++) { 761 /* (strict lower triangular part of A)*x */ 762 cval = ib[j]*2; 763 z[cval] += v[0]*x1 + v[1]*x2; 764 z[cval+1] += v[2]*x1 + v[3]*x2; 765 /* (strict upper triangular part of A)*x */ 766 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 767 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 768 v += 4; 769 } 770 xb +=2; ai++; 771 } 772 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 773 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 774 775 ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 776 PetscFunctionReturn(0); 777 } 778 779 #undef __FUNCT__ 780 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3" 781 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 782 { 783 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 784 PetscScalar *z,x1,x2,x3; 785 const PetscScalar *x,*xb; 786 const MatScalar *v; 787 PetscErrorCode ierr; 788 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 789 const PetscInt *aj=a->j,*ai=a->i,*ib; 790 PetscInt nonzerorow=0; 791 792 PetscFunctionBegin; 793 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 794 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 795 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 796 797 v = a->a; 798 xb = x; 799 800 for (i=0; i<mbs; i++) { 801 n = ai[1] - ai[0]; /* length of i_th block row of A */ 802 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 803 ib = aj + *ai; 804 jmin = 0; 805 nonzerorow += (n>0); 806 if (*ib == i) { /* (diag of A)*x */ 807 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 808 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 809 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 810 v += 9; jmin++; 811 } 812 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 813 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 814 for (j=jmin; j<n; j++) { 815 /* (strict lower triangular part of A)*x */ 816 cval = ib[j]*3; 817 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 818 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 819 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 820 /* (strict upper triangular part of A)*x */ 821 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 822 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 823 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 824 v += 9; 825 } 826 xb +=3; ai++; 827 } 828 829 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 830 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 831 832 ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 833 PetscFunctionReturn(0); 834 } 835 836 #undef __FUNCT__ 837 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4" 838 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 839 { 840 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 841 PetscScalar *z,x1,x2,x3,x4; 842 const PetscScalar *x,*xb; 843 const MatScalar *v; 844 PetscErrorCode ierr; 845 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 846 const PetscInt *aj=a->j,*ai=a->i,*ib; 847 PetscInt nonzerorow=0; 848 849 PetscFunctionBegin; 850 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 851 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 852 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 853 854 v = a->a; 855 xb = x; 856 857 for (i=0; i<mbs; i++) { 858 n = ai[1] - ai[0]; /* length of i_th block row of A */ 859 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 860 ib = aj + *ai; 861 jmin = 0; 862 nonzerorow += (n>0); 863 if (*ib == i) { /* (diag of A)*x */ 864 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 865 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 866 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 867 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 868 v += 16; jmin++; 869 } 870 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 871 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 872 for (j=jmin; j<n; j++) { 873 /* (strict lower triangular part of A)*x */ 874 cval = ib[j]*4; 875 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 876 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 877 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 878 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 879 /* (strict upper triangular part of A)*x */ 880 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 881 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 882 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 883 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 884 v += 16; 885 } 886 xb +=4; ai++; 887 } 888 889 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 890 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 891 892 ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 893 PetscFunctionReturn(0); 894 } 895 896 #undef __FUNCT__ 897 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5" 898 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 899 { 900 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 901 PetscScalar *z,x1,x2,x3,x4,x5; 902 const PetscScalar *x,*xb; 903 const MatScalar *v; 904 PetscErrorCode ierr; 905 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 906 const PetscInt *aj=a->j,*ai=a->i,*ib; 907 PetscInt nonzerorow=0; 908 909 PetscFunctionBegin; 910 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 911 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 912 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 913 914 v = a->a; 915 xb = x; 916 917 for (i=0; i<mbs; i++) { 918 n = ai[1] - ai[0]; /* length of i_th block row of A */ 919 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 920 ib = aj + *ai; 921 jmin = 0; 922 nonzerorow += (n>0); 923 if (*ib == i) { /* (diag of A)*x */ 924 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 925 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 926 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 927 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 928 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 929 v += 25; jmin++; 930 } 931 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 932 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 933 for (j=jmin; j<n; j++) { 934 /* (strict lower triangular part of A)*x */ 935 cval = ib[j]*5; 936 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 937 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 938 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 939 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 940 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 941 /* (strict upper triangular part of A)*x */ 942 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 943 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 944 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 945 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 946 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 947 v += 25; 948 } 949 xb +=5; ai++; 950 } 951 952 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 953 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 954 955 ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 956 PetscFunctionReturn(0); 957 } 958 #undef __FUNCT__ 959 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6" 960 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 961 { 962 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 963 PetscScalar *z,x1,x2,x3,x4,x5,x6; 964 const PetscScalar *x,*xb; 965 const MatScalar *v; 966 PetscErrorCode ierr; 967 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 968 const PetscInt *aj=a->j,*ai=a->i,*ib; 969 PetscInt nonzerorow=0; 970 971 PetscFunctionBegin; 972 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 973 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 974 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 975 976 v = a->a; 977 xb = x; 978 979 for (i=0; i<mbs; i++) { 980 n = ai[1] - ai[0]; /* length of i_th block row of A */ 981 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 982 ib = aj + *ai; 983 jmin = 0; 984 nonzerorow += (n>0); 985 if (*ib == i) { /* (diag of A)*x */ 986 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 987 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 988 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 989 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 990 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 991 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 992 v += 36; jmin++; 993 } 994 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 995 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 996 for (j=jmin; j<n; j++) { 997 /* (strict lower triangular part of A)*x */ 998 cval = ib[j]*6; 999 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 1000 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 1001 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 1002 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 1003 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 1004 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 1005 /* (strict upper triangular part of A)*x */ 1006 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 1007 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 1008 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 1009 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 1010 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 1011 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 1012 v += 36; 1013 } 1014 xb +=6; ai++; 1015 } 1016 1017 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1018 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1019 1020 ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1021 PetscFunctionReturn(0); 1022 } 1023 1024 #undef __FUNCT__ 1025 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7" 1026 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1027 { 1028 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1029 PetscScalar *z,x1,x2,x3,x4,x5,x6,x7; 1030 const PetscScalar *x,*xb; 1031 const MatScalar *v; 1032 PetscErrorCode ierr; 1033 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 1034 const PetscInt *aj=a->j,*ai=a->i,*ib; 1035 PetscInt nonzerorow=0; 1036 1037 PetscFunctionBegin; 1038 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1039 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1040 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1041 1042 v = a->a; 1043 xb = x; 1044 1045 for (i=0; i<mbs; i++) { 1046 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1047 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 1048 ib = aj + *ai; 1049 jmin = 0; 1050 nonzerorow += (n>0); 1051 if (*ib == i) { /* (diag of A)*x */ 1052 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 1053 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 1054 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 1055 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 1056 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 1057 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 1058 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1059 v += 49; jmin++; 1060 } 1061 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1062 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1063 for (j=jmin; j<n; j++) { 1064 /* (strict lower triangular part of A)*x */ 1065 cval = ib[j]*7; 1066 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 1067 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 1068 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 1069 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 1070 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 1071 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 1072 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1073 /* (strict upper triangular part of A)*x */ 1074 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 1075 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 1076 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 1077 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 1078 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 1079 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 1080 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 1081 v += 49; 1082 } 1083 xb +=7; ai++; 1084 } 1085 1086 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1087 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1088 1089 ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1090 PetscFunctionReturn(0); 1091 } 1092 1093 #undef __FUNCT__ 1094 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N" 1095 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1096 { 1097 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1098 PetscScalar *z,*z_ptr=0,*zb,*work,*workt; 1099 const PetscScalar *x,*x_ptr,*xb; 1100 const MatScalar *v; 1101 PetscErrorCode ierr; 1102 PetscInt mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 1103 const PetscInt *idx,*aj,*ii; 1104 PetscInt nonzerorow=0; 1105 1106 PetscFunctionBegin; 1107 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1108 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); x_ptr=x; 1109 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 1110 1111 aj = a->j; 1112 v = a->a; 1113 ii = a->i; 1114 1115 if (!a->mult_work) { 1116 ierr = PetscMalloc1(A->rmap->n+1,&a->mult_work);CHKERRQ(ierr); 1117 } 1118 work = a->mult_work; 1119 1120 1121 for (i=0; i<mbs; i++) { 1122 n = ii[1] - ii[0]; ncols = n*bs; 1123 workt = work; idx=aj+ii[0]; 1124 nonzerorow += (n>0); 1125 1126 /* upper triangular part */ 1127 for (j=0; j<n; j++) { 1128 xb = x_ptr + bs*(*idx++); 1129 for (k=0; k<bs; k++) workt[k] = xb[k]; 1130 workt += bs; 1131 } 1132 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1133 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1134 1135 /* strict lower triangular part */ 1136 idx = aj+ii[0]; 1137 if (*idx == i) { 1138 ncols -= bs; v += bs2; idx++; n--; 1139 } 1140 if (ncols > 0) { 1141 workt = work; 1142 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1143 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 1144 for (j=0; j<n; j++) { 1145 zb = z_ptr + bs*(*idx++); 1146 for (k=0; k<bs; k++) zb[k] += workt[k]; 1147 workt += bs; 1148 } 1149 } 1150 1151 x += bs; v += n*bs2; z += bs; ii++; 1152 } 1153 1154 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1155 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1156 1157 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1158 PetscFunctionReturn(0); 1159 } 1160 1161 #undef __FUNCT__ 1162 #define __FUNCT__ "MatScale_SeqSBAIJ" 1163 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha) 1164 { 1165 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data; 1166 PetscScalar oalpha = alpha; 1167 PetscErrorCode ierr; 1168 PetscBLASInt one = 1,totalnz; 1169 1170 PetscFunctionBegin; 1171 ierr = PetscBLASIntCast(a->bs2*a->nz,&totalnz);CHKERRQ(ierr); 1172 PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one)); 1173 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 1174 PetscFunctionReturn(0); 1175 } 1176 1177 #undef __FUNCT__ 1178 #define __FUNCT__ "MatNorm_SeqSBAIJ" 1179 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm) 1180 { 1181 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1182 const MatScalar *v = a->a; 1183 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1184 PetscInt i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il; 1185 PetscErrorCode ierr; 1186 const PetscInt *aj=a->j,*col; 1187 1188 PetscFunctionBegin; 1189 if (type == NORM_FROBENIUS) { 1190 for (k=0; k<mbs; k++) { 1191 jmin = a->i[k]; jmax = a->i[k+1]; 1192 col = aj + jmin; 1193 if (*col == k) { /* diagonal block */ 1194 for (i=0; i<bs2; i++) { 1195 sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++; 1196 } 1197 jmin++; 1198 } 1199 for (j=jmin; j<jmax; j++) { /* off-diagonal blocks */ 1200 for (i=0; i<bs2; i++) { 1201 sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++; 1202 } 1203 } 1204 } 1205 *norm = PetscSqrtReal(sum_diag + 2*sum_off); 1206 } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 1207 ierr = PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);CHKERRQ(ierr); 1208 for (i=0; i<mbs; i++) jl[i] = mbs; 1209 il[0] = 0; 1210 1211 *norm = 0.0; 1212 for (k=0; k<mbs; k++) { /* k_th block row */ 1213 for (j=0; j<bs; j++) sum[j]=0.0; 1214 /*-- col sum --*/ 1215 i = jl[k]; /* first |A(i,k)| to be added */ 1216 /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1217 at step k */ 1218 while (i<mbs) { 1219 nexti = jl[i]; /* next block row to be added */ 1220 ik = il[i]; /* block index of A(i,k) in the array a */ 1221 for (j=0; j<bs; j++) { 1222 v = a->a + ik*bs2 + j*bs; 1223 for (k1=0; k1<bs; k1++) { 1224 sum[j] += PetscAbsScalar(*v); v++; 1225 } 1226 } 1227 /* update il, jl */ 1228 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1229 jmax = a->i[i+1]; 1230 if (jmin < jmax) { 1231 il[i] = jmin; 1232 j = a->j[jmin]; 1233 jl[i] = jl[j]; jl[j]=i; 1234 } 1235 i = nexti; 1236 } 1237 /*-- row sum --*/ 1238 jmin = a->i[k]; jmax = a->i[k+1]; 1239 for (i=jmin; i<jmax; i++) { 1240 for (j=0; j<bs; j++) { 1241 v = a->a + i*bs2 + j; 1242 for (k1=0; k1<bs; k1++) { 1243 sum[j] += PetscAbsScalar(*v); v += bs; 1244 } 1245 } 1246 } 1247 /* add k_th block row to il, jl */ 1248 col = aj+jmin; 1249 if (*col == k) jmin++; 1250 if (jmin < jmax) { 1251 il[k] = jmin; 1252 j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k; 1253 } 1254 for (j=0; j<bs; j++) { 1255 if (sum[j] > *norm) *norm = sum[j]; 1256 } 1257 } 1258 ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr); 1259 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 1260 PetscFunctionReturn(0); 1261 } 1262 1263 #undef __FUNCT__ 1264 #define __FUNCT__ "MatEqual_SeqSBAIJ" 1265 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg) 1266 { 1267 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data; 1268 PetscErrorCode ierr; 1269 1270 PetscFunctionBegin; 1271 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1272 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1273 *flg = PETSC_FALSE; 1274 PetscFunctionReturn(0); 1275 } 1276 1277 /* if the a->i are the same */ 1278 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1279 if (!*flg) PetscFunctionReturn(0); 1280 1281 /* if a->j are the same */ 1282 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1283 if (!*flg) PetscFunctionReturn(0); 1284 1285 /* if a->a are the same */ 1286 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1287 PetscFunctionReturn(0); 1288 } 1289 1290 #undef __FUNCT__ 1291 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ" 1292 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v) 1293 { 1294 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1295 PetscErrorCode ierr; 1296 PetscInt i,j,k,row,bs,ambs,bs2; 1297 const PetscInt *ai,*aj; 1298 PetscScalar *x,zero = 0.0; 1299 const MatScalar *aa,*aa_j; 1300 1301 PetscFunctionBegin; 1302 bs = A->rmap->bs; 1303 if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1"); 1304 1305 aa = a->a; 1306 ambs = a->mbs; 1307 1308 if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) { 1309 PetscInt *diag=a->diag; 1310 aa = a->a; 1311 ambs = a->mbs; 1312 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1313 for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]]; 1314 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1315 PetscFunctionReturn(0); 1316 } 1317 1318 ai = a->i; 1319 aj = a->j; 1320 bs2 = a->bs2; 1321 ierr = VecSet(v,zero);CHKERRQ(ierr); 1322 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1323 for (i=0; i<ambs; i++) { 1324 j=ai[i]; 1325 if (aj[j] == i) { /* if this is a diagonal element */ 1326 row = i*bs; 1327 aa_j = aa + j*bs2; 1328 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1329 } 1330 } 1331 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1332 PetscFunctionReturn(0); 1333 } 1334 1335 #undef __FUNCT__ 1336 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ" 1337 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr) 1338 { 1339 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1340 PetscScalar x; 1341 const PetscScalar *l,*li,*ri; 1342 MatScalar *aa,*v; 1343 PetscErrorCode ierr; 1344 PetscInt i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2; 1345 PetscBool flg; 1346 1347 PetscFunctionBegin; 1348 if (ll != rr) { 1349 ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr); 1350 if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n"); 1351 } 1352 if (!ll) PetscFunctionReturn(0); 1353 ai = a->i; 1354 aj = a->j; 1355 aa = a->a; 1356 m = A->rmap->N; 1357 bs = A->rmap->bs; 1358 mbs = a->mbs; 1359 bs2 = a->bs2; 1360 1361 ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr); 1362 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1363 if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 1364 for (i=0; i<mbs; i++) { /* for each block row */ 1365 M = ai[i+1] - ai[i]; 1366 li = l + i*bs; 1367 v = aa + bs2*ai[i]; 1368 for (j=0; j<M; j++) { /* for each block */ 1369 ri = l + bs*aj[ai[i]+j]; 1370 for (k=0; k<bs; k++) { 1371 x = ri[k]; 1372 for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x; 1373 } 1374 } 1375 } 1376 ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr); 1377 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1378 PetscFunctionReturn(0); 1379 } 1380 1381 #undef __FUNCT__ 1382 #define __FUNCT__ "MatGetInfo_SeqSBAIJ" 1383 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info) 1384 { 1385 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1386 1387 PetscFunctionBegin; 1388 info->block_size = a->bs2; 1389 info->nz_allocated = a->bs2*a->maxnz; /*num. of nonzeros in upper triangular part */ 1390 info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */ 1391 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 1392 info->assemblies = A->num_ass; 1393 info->mallocs = A->info.mallocs; 1394 info->memory = ((PetscObject)A)->mem; 1395 if (A->factortype) { 1396 info->fill_ratio_given = A->info.fill_ratio_given; 1397 info->fill_ratio_needed = A->info.fill_ratio_needed; 1398 info->factor_mallocs = A->info.factor_mallocs; 1399 } else { 1400 info->fill_ratio_given = 0; 1401 info->fill_ratio_needed = 0; 1402 info->factor_mallocs = 0; 1403 } 1404 PetscFunctionReturn(0); 1405 } 1406 1407 1408 #undef __FUNCT__ 1409 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ" 1410 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 1411 { 1412 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1413 PetscErrorCode ierr; 1414 1415 PetscFunctionBegin; 1416 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 1417 PetscFunctionReturn(0); 1418 } 1419 1420 #undef __FUNCT__ 1421 #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ" 1422 /* 1423 This code does not work since it only checks the upper triangular part of 1424 the matrix. Hence it is not listed in the function table. 1425 */ 1426 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[]) 1427 { 1428 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1429 PetscErrorCode ierr; 1430 PetscInt i,j,n,row,col,bs,mbs; 1431 const PetscInt *ai,*aj; 1432 PetscReal atmp; 1433 const MatScalar *aa; 1434 PetscScalar *x; 1435 PetscInt ncols,brow,bcol,krow,kcol; 1436 1437 PetscFunctionBegin; 1438 if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov"); 1439 if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1440 bs = A->rmap->bs; 1441 aa = a->a; 1442 ai = a->i; 1443 aj = a->j; 1444 mbs = a->mbs; 1445 1446 ierr = VecSet(v,0.0);CHKERRQ(ierr); 1447 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1448 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1449 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1450 for (i=0; i<mbs; i++) { 1451 ncols = ai[1] - ai[0]; ai++; 1452 brow = bs*i; 1453 for (j=0; j<ncols; j++) { 1454 bcol = bs*(*aj); 1455 for (kcol=0; kcol<bs; kcol++) { 1456 col = bcol + kcol; /* col index */ 1457 for (krow=0; krow<bs; krow++) { 1458 atmp = PetscAbsScalar(*aa); aa++; 1459 row = brow + krow; /* row index */ 1460 if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1461 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 1462 } 1463 } 1464 aj++; 1465 } 1466 } 1467 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1468 PetscFunctionReturn(0); 1469 } 1470