1 2 #include <../src/mat/impls/baij/seq/baij.h> 3 #include <petsc/private/kernels/blockinvert.h> 4 #include <petscbt.h> 5 #include <../src/mat/impls/sbaij/seq/sbaij.h> 6 #include <petscblaslapack.h> 7 8 #undef __FUNCT__ 9 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ" 10 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 11 { 12 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 13 PetscErrorCode ierr; 14 PetscInt brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2; 15 const PetscInt *idx; 16 PetscBT table_out,table_in; 17 18 PetscFunctionBegin; 19 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 20 mbs = a->mbs; 21 ai = a->i; 22 aj = a->j; 23 bs = A->rmap->bs; 24 ierr = PetscBTCreate(mbs,&table_out);CHKERRQ(ierr); 25 ierr = PetscMalloc1(mbs+1,&nidx);CHKERRQ(ierr); 26 ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr); 27 ierr = PetscBTCreate(mbs,&table_in);CHKERRQ(ierr); 28 29 for (i=0; i<is_max; i++) { /* for each is */ 30 isz = 0; 31 ierr = PetscBTMemzero(mbs,table_out);CHKERRQ(ierr); 32 33 /* Extract the indices, assume there can be duplicate entries */ 34 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 35 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 36 37 /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */ 38 bcol_max = 0; 39 for (j=0; j<n; ++j) { 40 brow = idx[j]/bs; /* convert the indices into block indices */ 41 if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 42 if (!PetscBTLookupSet(table_out,brow)) { 43 nidx[isz++] = brow; 44 if (bcol_max < brow) bcol_max = brow; 45 } 46 } 47 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 48 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 49 50 k = 0; 51 for (j=0; j<ov; j++) { /* for each overlap */ 52 /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 53 ierr = PetscBTMemzero(mbs,table_in);CHKERRQ(ierr); 54 for (l=k; l<isz; l++) { ierr = PetscBTSet(table_in,nidx[l]);CHKERRQ(ierr); } 55 56 n = isz; /* length of the updated is[i] */ 57 for (brow=0; brow<mbs; brow++) { 58 start = ai[brow]; end = ai[brow+1]; 59 if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */ 60 for (l = start; l<end; l++) { 61 bcol = aj[l]; 62 if (!PetscBTLookupSet(table_out,bcol)) { 63 nidx[isz++] = bcol; 64 if (bcol_max < bcol) bcol_max = bcol; 65 } 66 } 67 k++; 68 if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 69 } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 70 for (l = start; l<end; l++) { 71 bcol = aj[l]; 72 if (bcol > bcol_max) break; 73 if (PetscBTLookup(table_in,bcol)) { 74 if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow; 75 break; /* for l = start; l<end ; l++) */ 76 } 77 } 78 } 79 } 80 } /* for each overlap */ 81 82 /* expand the Index Set */ 83 for (j=0; j<isz; j++) { 84 for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k; 85 } 86 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 87 } 88 ierr = PetscBTDestroy(&table_out);CHKERRQ(ierr); 89 ierr = PetscFree(nidx);CHKERRQ(ierr); 90 ierr = PetscFree(nidx2);CHKERRQ(ierr); 91 ierr = PetscBTDestroy(&table_in);CHKERRQ(ierr); 92 PetscFunctionReturn(0); 93 } 94 95 /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc. 96 Zero some ops' to avoid invalid usse */ 97 #undef __FUNCT__ 98 #define __FUNCT__ "MatSeqSBAIJZeroOps_Private" 99 PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq) 100 { 101 PetscErrorCode ierr; 102 103 PetscFunctionBegin; 104 ierr = MatSetOption(Bseq,MAT_SYMMETRIC,PETSC_FALSE);CHKERRQ(ierr); 105 Bseq->ops->mult = 0; 106 Bseq->ops->multadd = 0; 107 Bseq->ops->multtranspose = 0; 108 Bseq->ops->multtransposeadd = 0; 109 Bseq->ops->lufactor = 0; 110 Bseq->ops->choleskyfactor = 0; 111 Bseq->ops->lufactorsymbolic = 0; 112 Bseq->ops->choleskyfactorsymbolic = 0; 113 Bseq->ops->getinertia = 0; 114 PetscFunctionReturn(0); 115 } 116 117 #undef __FUNCT__ 118 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ" 119 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 120 { 121 PetscErrorCode ierr; 122 123 PetscFunctionBegin; 124 ierr = MatGetSubMatrix_SeqBAIJ(A,isrow,iscol,scall,B);CHKERRQ(ierr); 125 126 if (isrow != iscol) { 127 PetscBool isequal; 128 ierr = ISEqual(isrow,iscol,&isequal);CHKERRQ(ierr); 129 if (!isequal) { 130 ierr = MatSeqSBAIJZeroOps_Private(*B);CHKERRQ(ierr); 131 } 132 } 133 PetscFunctionReturn(0); 134 } 135 136 #undef __FUNCT__ 137 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ" 138 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 139 { 140 PetscErrorCode ierr; 141 PetscInt i; 142 PetscBool flg; 143 144 PetscFunctionBegin; 145 ierr = MatGetSubMatrices_SeqBAIJ(A,n,irow,icol,scall,B);CHKERRQ(ierr); 146 for (i=0; i<n; i++) { 147 ierr = ISEqual(irow[i],icol[i],&flg);CHKERRQ(ierr); 148 if (!flg) { 149 ierr = MatSeqSBAIJZeroOps_Private(*B[i]);CHKERRQ(ierr); 150 } 151 } 152 PetscFunctionReturn(0); 153 } 154 155 /* -------------------------------------------------------*/ 156 /* Should check that shapes of vectors and matrices match */ 157 /* -------------------------------------------------------*/ 158 159 #undef __FUNCT__ 160 #define __FUNCT__ "MatMult_SeqSBAIJ_2" 161 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz) 162 { 163 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 164 PetscScalar *z,x1,x2,zero=0.0; 165 const PetscScalar *x,*xb; 166 const MatScalar *v; 167 PetscErrorCode ierr; 168 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 169 const PetscInt *aj=a->j,*ai=a->i,*ib; 170 PetscInt nonzerorow=0; 171 172 PetscFunctionBegin; 173 ierr = VecSet(zz,zero);CHKERRQ(ierr); 174 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 175 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 176 177 v = a->a; 178 xb = x; 179 180 for (i=0; i<mbs; i++) { 181 n = ai[1] - ai[0]; /* length of i_th block row of A */ 182 x1 = xb[0]; x2 = xb[1]; 183 ib = aj + *ai; 184 jmin = 0; 185 nonzerorow += (n>0); 186 if (*ib == i) { /* (diag of A)*x */ 187 z[2*i] += v[0]*x1 + v[2]*x2; 188 z[2*i+1] += v[2]*x1 + v[3]*x2; 189 v += 4; jmin++; 190 } 191 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 192 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 193 for (j=jmin; j<n; j++) { 194 /* (strict lower triangular part of A)*x */ 195 cval = ib[j]*2; 196 z[cval] += v[0]*x1 + v[1]*x2; 197 z[cval+1] += v[2]*x1 + v[3]*x2; 198 /* (strict upper triangular part of A)*x */ 199 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 200 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 201 v += 4; 202 } 203 xb +=2; ai++; 204 } 205 206 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 207 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 208 ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 209 PetscFunctionReturn(0); 210 } 211 212 #undef __FUNCT__ 213 #define __FUNCT__ "MatMult_SeqSBAIJ_3" 214 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz) 215 { 216 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 217 PetscScalar *z,x1,x2,x3,zero=0.0; 218 const PetscScalar *x,*xb; 219 const MatScalar *v; 220 PetscErrorCode ierr; 221 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 222 const PetscInt *aj = a->j,*ai = a->i,*ib; 223 PetscInt nonzerorow=0; 224 225 PetscFunctionBegin; 226 ierr = VecSet(zz,zero);CHKERRQ(ierr); 227 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 228 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 229 230 v = a->a; 231 xb = x; 232 233 for (i=0; i<mbs; i++) { 234 n = ai[1] - ai[0]; /* length of i_th block row of A */ 235 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 236 ib = aj + *ai; 237 jmin = 0; 238 nonzerorow += (n>0); 239 if (*ib == i) { /* (diag of A)*x */ 240 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 241 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 242 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 243 v += 9; jmin++; 244 } 245 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 246 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 247 for (j=jmin; j<n; j++) { 248 /* (strict lower triangular part of A)*x */ 249 cval = ib[j]*3; 250 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 251 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 252 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 253 /* (strict upper triangular part of A)*x */ 254 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 255 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 256 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 257 v += 9; 258 } 259 xb +=3; ai++; 260 } 261 262 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 263 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 264 ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 265 PetscFunctionReturn(0); 266 } 267 268 #undef __FUNCT__ 269 #define __FUNCT__ "MatMult_SeqSBAIJ_4" 270 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz) 271 { 272 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 273 PetscScalar *z,x1,x2,x3,x4,zero=0.0; 274 const PetscScalar *x,*xb; 275 const MatScalar *v; 276 PetscErrorCode ierr; 277 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 278 const PetscInt *aj = a->j,*ai = a->i,*ib; 279 PetscInt nonzerorow = 0; 280 281 PetscFunctionBegin; 282 ierr = VecSet(zz,zero);CHKERRQ(ierr); 283 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 284 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 285 286 v = a->a; 287 xb = x; 288 289 for (i=0; i<mbs; i++) { 290 n = ai[1] - ai[0]; /* length of i_th block row of A */ 291 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 292 ib = aj + *ai; 293 jmin = 0; 294 nonzerorow += (n>0); 295 if (*ib == i) { /* (diag of A)*x */ 296 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 297 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 298 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 299 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 300 v += 16; jmin++; 301 } 302 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 303 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 304 for (j=jmin; j<n; j++) { 305 /* (strict lower triangular part of A)*x */ 306 cval = ib[j]*4; 307 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 308 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 309 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 310 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 311 /* (strict upper triangular part of A)*x */ 312 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 313 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 314 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 315 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 316 v += 16; 317 } 318 xb +=4; ai++; 319 } 320 321 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 322 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 323 ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 324 PetscFunctionReturn(0); 325 } 326 327 #undef __FUNCT__ 328 #define __FUNCT__ "MatMult_SeqSBAIJ_5" 329 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz) 330 { 331 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 332 PetscScalar *z,x1,x2,x3,x4,x5,zero=0.0; 333 const PetscScalar *x,*xb; 334 const MatScalar *v; 335 PetscErrorCode ierr; 336 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 337 const PetscInt *aj = a->j,*ai = a->i,*ib; 338 PetscInt nonzerorow=0; 339 340 PetscFunctionBegin; 341 ierr = VecSet(zz,zero);CHKERRQ(ierr); 342 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 343 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 344 345 v = a->a; 346 xb = x; 347 348 for (i=0; i<mbs; i++) { 349 n = ai[1] - ai[0]; /* length of i_th block row of A */ 350 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 351 ib = aj + *ai; 352 jmin = 0; 353 nonzerorow += (n>0); 354 if (*ib == i) { /* (diag of A)*x */ 355 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 356 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 357 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 358 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 359 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 360 v += 25; jmin++; 361 } 362 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 363 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 364 for (j=jmin; j<n; j++) { 365 /* (strict lower triangular part of A)*x */ 366 cval = ib[j]*5; 367 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 368 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 369 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 370 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 371 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 372 /* (strict upper triangular part of A)*x */ 373 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 374 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 375 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 376 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 377 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 378 v += 25; 379 } 380 xb +=5; ai++; 381 } 382 383 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 384 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 385 ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 386 PetscFunctionReturn(0); 387 } 388 389 390 #undef __FUNCT__ 391 #define __FUNCT__ "MatMult_SeqSBAIJ_6" 392 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz) 393 { 394 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 395 PetscScalar *z,x1,x2,x3,x4,x5,x6,zero=0.0; 396 const PetscScalar *x,*xb; 397 const MatScalar *v; 398 PetscErrorCode ierr; 399 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 400 const PetscInt *aj=a->j,*ai=a->i,*ib; 401 PetscInt nonzerorow=0; 402 403 PetscFunctionBegin; 404 ierr = VecSet(zz,zero);CHKERRQ(ierr); 405 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 406 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 407 408 v = a->a; 409 xb = x; 410 411 for (i=0; i<mbs; i++) { 412 n = ai[1] - ai[0]; /* length of i_th block row of A */ 413 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 414 ib = aj + *ai; 415 jmin = 0; 416 nonzerorow += (n>0); 417 if (*ib == i) { /* (diag of A)*x */ 418 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 419 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 420 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 421 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 422 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 423 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 424 v += 36; jmin++; 425 } 426 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 427 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 428 for (j=jmin; j<n; j++) { 429 /* (strict lower triangular part of A)*x */ 430 cval = ib[j]*6; 431 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 432 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 433 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 434 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 435 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 436 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 437 /* (strict upper triangular part of A)*x */ 438 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 439 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 440 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 441 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 442 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 443 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 444 v += 36; 445 } 446 xb +=6; ai++; 447 } 448 449 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 450 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 451 ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 452 PetscFunctionReturn(0); 453 } 454 #undef __FUNCT__ 455 #define __FUNCT__ "MatMult_SeqSBAIJ_7" 456 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz) 457 { 458 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 459 PetscScalar *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0; 460 const PetscScalar *x,*xb; 461 const MatScalar *v; 462 PetscErrorCode ierr; 463 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 464 const PetscInt *aj=a->j,*ai=a->i,*ib; 465 PetscInt nonzerorow=0; 466 467 PetscFunctionBegin; 468 ierr = VecSet(zz,zero);CHKERRQ(ierr); 469 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 470 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 471 472 v = a->a; 473 xb = x; 474 475 for (i=0; i<mbs; i++) { 476 n = ai[1] - ai[0]; /* length of i_th block row of A */ 477 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 478 ib = aj + *ai; 479 jmin = 0; 480 nonzerorow += (n>0); 481 if (*ib == i) { /* (diag of A)*x */ 482 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 483 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 484 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 485 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 486 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 487 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 488 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 489 v += 49; jmin++; 490 } 491 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 492 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 493 for (j=jmin; j<n; j++) { 494 /* (strict lower triangular part of A)*x */ 495 cval = ib[j]*7; 496 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 497 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 498 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 499 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 500 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 501 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 502 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 503 /* (strict upper triangular part of A)*x */ 504 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 505 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 506 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 507 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 508 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 509 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 510 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 511 v += 49; 512 } 513 xb +=7; ai++; 514 } 515 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 516 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 517 ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 518 PetscFunctionReturn(0); 519 } 520 521 /* 522 This will not work with MatScalar == float because it calls the BLAS 523 */ 524 #undef __FUNCT__ 525 #define __FUNCT__ "MatMult_SeqSBAIJ_N" 526 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz) 527 { 528 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 529 PetscScalar *z,*z_ptr,*zb,*work,*workt,zero=0.0; 530 const PetscScalar *x,*x_ptr,*xb; 531 const MatScalar *v; 532 PetscErrorCode ierr; 533 PetscInt mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 534 const PetscInt *idx,*aj,*ii; 535 PetscInt nonzerorow=0; 536 537 PetscFunctionBegin; 538 ierr = VecSet(zz,zero);CHKERRQ(ierr); 539 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);x_ptr = x; 540 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 541 542 aj = a->j; 543 v = a->a; 544 ii = a->i; 545 546 if (!a->mult_work) { 547 ierr = PetscMalloc1(A->rmap->N+1,&a->mult_work);CHKERRQ(ierr); 548 } 549 work = a->mult_work; 550 551 for (i=0; i<mbs; i++) { 552 n = ii[1] - ii[0]; ncols = n*bs; 553 workt = work; idx=aj+ii[0]; 554 nonzerorow += (n>0); 555 556 /* upper triangular part */ 557 for (j=0; j<n; j++) { 558 xb = x_ptr + bs*(*idx++); 559 for (k=0; k<bs; k++) workt[k] = xb[k]; 560 workt += bs; 561 } 562 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 563 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 564 565 /* strict lower triangular part */ 566 idx = aj+ii[0]; 567 if (*idx == i) { 568 ncols -= bs; v += bs2; idx++; n--; 569 } 570 571 if (ncols > 0) { 572 workt = work; 573 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 574 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 575 for (j=0; j<n; j++) { 576 zb = z_ptr + bs*(*idx++); 577 for (k=0; k<bs; k++) zb[k] += workt[k]; 578 workt += bs; 579 } 580 } 581 x += bs; v += n*bs2; z += bs; ii++; 582 } 583 584 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 585 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 586 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr); 587 PetscFunctionReturn(0); 588 } 589 590 #undef __FUNCT__ 591 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1" 592 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 593 { 594 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 595 PetscScalar *z,x1; 596 const PetscScalar *x,*xb; 597 const MatScalar *v; 598 PetscErrorCode ierr; 599 PetscInt mbs =a->mbs,i,n,cval,j,jmin; 600 const PetscInt *aj=a->j,*ai=a->i,*ib; 601 PetscInt nonzerorow=0; 602 603 PetscFunctionBegin; 604 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 605 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 606 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 607 v = a->a; 608 xb = x; 609 610 for (i=0; i<mbs; i++) { 611 n = ai[1] - ai[0]; /* length of i_th row of A */ 612 x1 = xb[0]; 613 ib = aj + *ai; 614 jmin = 0; 615 nonzerorow += (n>0); 616 if (*ib == i) { /* (diag of A)*x */ 617 z[i] += *v++ * x[*ib++]; jmin++; 618 } 619 for (j=jmin; j<n; j++) { 620 cval = *ib; 621 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 622 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 623 } 624 xb++; ai++; 625 } 626 627 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 628 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 629 630 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 631 PetscFunctionReturn(0); 632 } 633 634 #undef __FUNCT__ 635 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2" 636 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 637 { 638 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 639 PetscScalar *z,x1,x2; 640 const PetscScalar *x,*xb; 641 const MatScalar *v; 642 PetscErrorCode ierr; 643 PetscInt mbs =a->mbs,i,n,cval,j,jmin; 644 const PetscInt *aj=a->j,*ai=a->i,*ib; 645 PetscInt nonzerorow=0; 646 647 PetscFunctionBegin; 648 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 649 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 650 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 651 652 v = a->a; 653 xb = x; 654 655 for (i=0; i<mbs; i++) { 656 n = ai[1] - ai[0]; /* length of i_th block row of A */ 657 x1 = xb[0]; x2 = xb[1]; 658 ib = aj + *ai; 659 jmin = 0; 660 nonzerorow += (n>0); 661 if (*ib == i) { /* (diag of A)*x */ 662 z[2*i] += v[0]*x1 + v[2]*x2; 663 z[2*i+1] += v[2]*x1 + v[3]*x2; 664 v += 4; jmin++; 665 } 666 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 667 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 668 for (j=jmin; j<n; j++) { 669 /* (strict lower triangular part of A)*x */ 670 cval = ib[j]*2; 671 z[cval] += v[0]*x1 + v[1]*x2; 672 z[cval+1] += v[2]*x1 + v[3]*x2; 673 /* (strict upper triangular part of A)*x */ 674 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 675 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 676 v += 4; 677 } 678 xb +=2; ai++; 679 } 680 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 681 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 682 683 ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 684 PetscFunctionReturn(0); 685 } 686 687 #undef __FUNCT__ 688 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3" 689 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 690 { 691 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 692 PetscScalar *z,x1,x2,x3; 693 const PetscScalar *x,*xb; 694 const MatScalar *v; 695 PetscErrorCode ierr; 696 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 697 const PetscInt *aj=a->j,*ai=a->i,*ib; 698 PetscInt nonzerorow=0; 699 700 PetscFunctionBegin; 701 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 702 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 703 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 704 705 v = a->a; 706 xb = x; 707 708 for (i=0; i<mbs; i++) { 709 n = ai[1] - ai[0]; /* length of i_th block row of A */ 710 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 711 ib = aj + *ai; 712 jmin = 0; 713 nonzerorow += (n>0); 714 if (*ib == i) { /* (diag of A)*x */ 715 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 716 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 717 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 718 v += 9; jmin++; 719 } 720 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 721 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 722 for (j=jmin; j<n; j++) { 723 /* (strict lower triangular part of A)*x */ 724 cval = ib[j]*3; 725 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 726 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 727 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 728 /* (strict upper triangular part of A)*x */ 729 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 730 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 731 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 732 v += 9; 733 } 734 xb +=3; ai++; 735 } 736 737 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 738 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 739 740 ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 741 PetscFunctionReturn(0); 742 } 743 744 #undef __FUNCT__ 745 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4" 746 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 747 { 748 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 749 PetscScalar *z,x1,x2,x3,x4; 750 const PetscScalar *x,*xb; 751 const MatScalar *v; 752 PetscErrorCode ierr; 753 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 754 const PetscInt *aj=a->j,*ai=a->i,*ib; 755 PetscInt nonzerorow=0; 756 757 PetscFunctionBegin; 758 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 759 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 760 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 761 762 v = a->a; 763 xb = x; 764 765 for (i=0; i<mbs; i++) { 766 n = ai[1] - ai[0]; /* length of i_th block row of A */ 767 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 768 ib = aj + *ai; 769 jmin = 0; 770 nonzerorow += (n>0); 771 if (*ib == i) { /* (diag of A)*x */ 772 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 773 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 774 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 775 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 776 v += 16; jmin++; 777 } 778 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 779 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 780 for (j=jmin; j<n; j++) { 781 /* (strict lower triangular part of A)*x */ 782 cval = ib[j]*4; 783 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 784 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 785 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 786 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 787 /* (strict upper triangular part of A)*x */ 788 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 789 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 790 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 791 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 792 v += 16; 793 } 794 xb +=4; ai++; 795 } 796 797 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 798 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 799 800 ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 801 PetscFunctionReturn(0); 802 } 803 804 #undef __FUNCT__ 805 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5" 806 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 807 { 808 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 809 PetscScalar *z,x1,x2,x3,x4,x5; 810 const PetscScalar *x,*xb; 811 const MatScalar *v; 812 PetscErrorCode ierr; 813 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 814 const PetscInt *aj=a->j,*ai=a->i,*ib; 815 PetscInt nonzerorow=0; 816 817 PetscFunctionBegin; 818 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 819 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 820 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 821 822 v = a->a; 823 xb = x; 824 825 for (i=0; i<mbs; i++) { 826 n = ai[1] - ai[0]; /* length of i_th block row of A */ 827 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 828 ib = aj + *ai; 829 jmin = 0; 830 nonzerorow += (n>0); 831 if (*ib == i) { /* (diag of A)*x */ 832 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 833 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 834 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 835 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 836 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 837 v += 25; jmin++; 838 } 839 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 840 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 841 for (j=jmin; j<n; j++) { 842 /* (strict lower triangular part of A)*x */ 843 cval = ib[j]*5; 844 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 845 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 846 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 847 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 848 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 849 /* (strict upper triangular part of A)*x */ 850 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 851 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 852 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 853 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 854 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 855 v += 25; 856 } 857 xb +=5; ai++; 858 } 859 860 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 861 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 862 863 ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 864 PetscFunctionReturn(0); 865 } 866 #undef __FUNCT__ 867 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6" 868 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 869 { 870 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 871 PetscScalar *z,x1,x2,x3,x4,x5,x6; 872 const PetscScalar *x,*xb; 873 const MatScalar *v; 874 PetscErrorCode ierr; 875 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 876 const PetscInt *aj=a->j,*ai=a->i,*ib; 877 PetscInt nonzerorow=0; 878 879 PetscFunctionBegin; 880 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 881 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 882 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 883 884 v = a->a; 885 xb = x; 886 887 for (i=0; i<mbs; i++) { 888 n = ai[1] - ai[0]; /* length of i_th block row of A */ 889 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 890 ib = aj + *ai; 891 jmin = 0; 892 nonzerorow += (n>0); 893 if (*ib == i) { /* (diag of A)*x */ 894 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 895 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 896 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 897 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 898 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 899 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 900 v += 36; jmin++; 901 } 902 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 903 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 904 for (j=jmin; j<n; j++) { 905 /* (strict lower triangular part of A)*x */ 906 cval = ib[j]*6; 907 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 908 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 909 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 910 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 911 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 912 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 913 /* (strict upper triangular part of A)*x */ 914 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 915 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 916 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 917 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 918 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 919 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 920 v += 36; 921 } 922 xb +=6; ai++; 923 } 924 925 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 926 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 927 928 ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 929 PetscFunctionReturn(0); 930 } 931 932 #undef __FUNCT__ 933 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7" 934 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 935 { 936 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 937 PetscScalar *z,x1,x2,x3,x4,x5,x6,x7; 938 const PetscScalar *x,*xb; 939 const MatScalar *v; 940 PetscErrorCode ierr; 941 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 942 const PetscInt *aj=a->j,*ai=a->i,*ib; 943 PetscInt nonzerorow=0; 944 945 PetscFunctionBegin; 946 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 947 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 948 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 949 950 v = a->a; 951 xb = x; 952 953 for (i=0; i<mbs; i++) { 954 n = ai[1] - ai[0]; /* length of i_th block row of A */ 955 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 956 ib = aj + *ai; 957 jmin = 0; 958 nonzerorow += (n>0); 959 if (*ib == i) { /* (diag of A)*x */ 960 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 961 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 962 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 963 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 964 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 965 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 966 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 967 v += 49; jmin++; 968 } 969 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 970 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 971 for (j=jmin; j<n; j++) { 972 /* (strict lower triangular part of A)*x */ 973 cval = ib[j]*7; 974 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 975 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 976 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 977 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 978 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 979 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 980 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 981 /* (strict upper triangular part of A)*x */ 982 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 983 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 984 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 985 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 986 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 987 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 988 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 989 v += 49; 990 } 991 xb +=7; ai++; 992 } 993 994 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 995 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 996 997 ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 998 PetscFunctionReturn(0); 999 } 1000 1001 #undef __FUNCT__ 1002 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N" 1003 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1004 { 1005 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1006 PetscScalar *z,*z_ptr=0,*zb,*work,*workt; 1007 const PetscScalar *x,*x_ptr,*xb; 1008 const MatScalar *v; 1009 PetscErrorCode ierr; 1010 PetscInt mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 1011 const PetscInt *idx,*aj,*ii; 1012 PetscInt nonzerorow=0; 1013 1014 PetscFunctionBegin; 1015 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1016 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); x_ptr=x; 1017 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 1018 1019 aj = a->j; 1020 v = a->a; 1021 ii = a->i; 1022 1023 if (!a->mult_work) { 1024 ierr = PetscMalloc1(A->rmap->n+1,&a->mult_work);CHKERRQ(ierr); 1025 } 1026 work = a->mult_work; 1027 1028 1029 for (i=0; i<mbs; i++) { 1030 n = ii[1] - ii[0]; ncols = n*bs; 1031 workt = work; idx=aj+ii[0]; 1032 nonzerorow += (n>0); 1033 1034 /* upper triangular part */ 1035 for (j=0; j<n; j++) { 1036 xb = x_ptr + bs*(*idx++); 1037 for (k=0; k<bs; k++) workt[k] = xb[k]; 1038 workt += bs; 1039 } 1040 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1041 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1042 1043 /* strict lower triangular part */ 1044 idx = aj+ii[0]; 1045 if (*idx == i) { 1046 ncols -= bs; v += bs2; idx++; n--; 1047 } 1048 if (ncols > 0) { 1049 workt = work; 1050 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1051 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 1052 for (j=0; j<n; j++) { 1053 zb = z_ptr + bs*(*idx++); 1054 for (k=0; k<bs; k++) zb[k] += workt[k]; 1055 workt += bs; 1056 } 1057 } 1058 1059 x += bs; v += n*bs2; z += bs; ii++; 1060 } 1061 1062 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1063 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1064 1065 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1066 PetscFunctionReturn(0); 1067 } 1068 1069 #undef __FUNCT__ 1070 #define __FUNCT__ "MatScale_SeqSBAIJ" 1071 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha) 1072 { 1073 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data; 1074 PetscScalar oalpha = alpha; 1075 PetscErrorCode ierr; 1076 PetscBLASInt one = 1,totalnz; 1077 1078 PetscFunctionBegin; 1079 ierr = PetscBLASIntCast(a->bs2*a->nz,&totalnz);CHKERRQ(ierr); 1080 PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one)); 1081 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 1082 PetscFunctionReturn(0); 1083 } 1084 1085 #undef __FUNCT__ 1086 #define __FUNCT__ "MatNorm_SeqSBAIJ" 1087 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm) 1088 { 1089 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1090 const MatScalar *v = a->a; 1091 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1092 PetscInt i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il; 1093 PetscErrorCode ierr; 1094 const PetscInt *aj=a->j,*col; 1095 1096 PetscFunctionBegin; 1097 if (type == NORM_FROBENIUS) { 1098 for (k=0; k<mbs; k++) { 1099 jmin = a->i[k]; jmax = a->i[k+1]; 1100 col = aj + jmin; 1101 if (*col == k) { /* diagonal block */ 1102 for (i=0; i<bs2; i++) { 1103 sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++; 1104 } 1105 jmin++; 1106 } 1107 for (j=jmin; j<jmax; j++) { /* off-diagonal blocks */ 1108 for (i=0; i<bs2; i++) { 1109 sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++; 1110 } 1111 } 1112 } 1113 *norm = PetscSqrtReal(sum_diag + 2*sum_off); 1114 ierr = PetscLogFlops(2*bs2*a->nz);CHKERRQ(ierr); 1115 } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 1116 ierr = PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);CHKERRQ(ierr); 1117 for (i=0; i<mbs; i++) jl[i] = mbs; 1118 il[0] = 0; 1119 1120 *norm = 0.0; 1121 for (k=0; k<mbs; k++) { /* k_th block row */ 1122 for (j=0; j<bs; j++) sum[j]=0.0; 1123 /*-- col sum --*/ 1124 i = jl[k]; /* first |A(i,k)| to be added */ 1125 /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1126 at step k */ 1127 while (i<mbs) { 1128 nexti = jl[i]; /* next block row to be added */ 1129 ik = il[i]; /* block index of A(i,k) in the array a */ 1130 for (j=0; j<bs; j++) { 1131 v = a->a + ik*bs2 + j*bs; 1132 for (k1=0; k1<bs; k1++) { 1133 sum[j] += PetscAbsScalar(*v); v++; 1134 } 1135 } 1136 /* update il, jl */ 1137 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1138 jmax = a->i[i+1]; 1139 if (jmin < jmax) { 1140 il[i] = jmin; 1141 j = a->j[jmin]; 1142 jl[i] = jl[j]; jl[j]=i; 1143 } 1144 i = nexti; 1145 } 1146 /*-- row sum --*/ 1147 jmin = a->i[k]; jmax = a->i[k+1]; 1148 for (i=jmin; i<jmax; i++) { 1149 for (j=0; j<bs; j++) { 1150 v = a->a + i*bs2 + j; 1151 for (k1=0; k1<bs; k1++) { 1152 sum[j] += PetscAbsScalar(*v); v += bs; 1153 } 1154 } 1155 } 1156 /* add k_th block row to il, jl */ 1157 col = aj+jmin; 1158 if (*col == k) jmin++; 1159 if (jmin < jmax) { 1160 il[k] = jmin; 1161 j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k; 1162 } 1163 for (j=0; j<bs; j++) { 1164 if (sum[j] > *norm) *norm = sum[j]; 1165 } 1166 } 1167 ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr); 1168 ierr = PetscLogFlops(PetscMax(mbs*a->nz-1,0));CHKERRQ(ierr); 1169 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 1170 PetscFunctionReturn(0); 1171 } 1172 1173 #undef __FUNCT__ 1174 #define __FUNCT__ "MatEqual_SeqSBAIJ" 1175 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg) 1176 { 1177 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data; 1178 PetscErrorCode ierr; 1179 1180 PetscFunctionBegin; 1181 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1182 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1183 *flg = PETSC_FALSE; 1184 PetscFunctionReturn(0); 1185 } 1186 1187 /* if the a->i are the same */ 1188 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1189 if (!*flg) PetscFunctionReturn(0); 1190 1191 /* if a->j are the same */ 1192 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1193 if (!*flg) PetscFunctionReturn(0); 1194 1195 /* if a->a are the same */ 1196 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1197 PetscFunctionReturn(0); 1198 } 1199 1200 #undef __FUNCT__ 1201 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ" 1202 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v) 1203 { 1204 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1205 PetscErrorCode ierr; 1206 PetscInt i,j,k,row,bs,ambs,bs2; 1207 const PetscInt *ai,*aj; 1208 PetscScalar *x,zero = 0.0; 1209 const MatScalar *aa,*aa_j; 1210 1211 PetscFunctionBegin; 1212 bs = A->rmap->bs; 1213 if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1"); 1214 1215 aa = a->a; 1216 ambs = a->mbs; 1217 1218 if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) { 1219 PetscInt *diag=a->diag; 1220 aa = a->a; 1221 ambs = a->mbs; 1222 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1223 for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]]; 1224 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1225 PetscFunctionReturn(0); 1226 } 1227 1228 ai = a->i; 1229 aj = a->j; 1230 bs2 = a->bs2; 1231 ierr = VecSet(v,zero);CHKERRQ(ierr); 1232 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1233 for (i=0; i<ambs; i++) { 1234 j=ai[i]; 1235 if (aj[j] == i) { /* if this is a diagonal element */ 1236 row = i*bs; 1237 aa_j = aa + j*bs2; 1238 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1239 } 1240 } 1241 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1242 PetscFunctionReturn(0); 1243 } 1244 1245 #undef __FUNCT__ 1246 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ" 1247 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr) 1248 { 1249 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1250 PetscScalar x; 1251 const PetscScalar *l,*li,*ri; 1252 MatScalar *aa,*v; 1253 PetscErrorCode ierr; 1254 PetscInt i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2; 1255 PetscBool flg; 1256 1257 PetscFunctionBegin; 1258 if (ll != rr) { 1259 ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr); 1260 if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n"); 1261 } 1262 if (!ll) PetscFunctionReturn(0); 1263 ai = a->i; 1264 aj = a->j; 1265 aa = a->a; 1266 m = A->rmap->N; 1267 bs = A->rmap->bs; 1268 mbs = a->mbs; 1269 bs2 = a->bs2; 1270 1271 ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr); 1272 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1273 if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 1274 for (i=0; i<mbs; i++) { /* for each block row */ 1275 M = ai[i+1] - ai[i]; 1276 li = l + i*bs; 1277 v = aa + bs2*ai[i]; 1278 for (j=0; j<M; j++) { /* for each block */ 1279 ri = l + bs*aj[ai[i]+j]; 1280 for (k=0; k<bs; k++) { 1281 x = ri[k]; 1282 for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x; 1283 } 1284 } 1285 } 1286 ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr); 1287 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1288 PetscFunctionReturn(0); 1289 } 1290 1291 #undef __FUNCT__ 1292 #define __FUNCT__ "MatGetInfo_SeqSBAIJ" 1293 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info) 1294 { 1295 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1296 1297 PetscFunctionBegin; 1298 info->block_size = a->bs2; 1299 info->nz_allocated = a->bs2*a->maxnz; /*num. of nonzeros in upper triangular part */ 1300 info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */ 1301 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 1302 info->assemblies = A->num_ass; 1303 info->mallocs = A->info.mallocs; 1304 info->memory = ((PetscObject)A)->mem; 1305 if (A->factortype) { 1306 info->fill_ratio_given = A->info.fill_ratio_given; 1307 info->fill_ratio_needed = A->info.fill_ratio_needed; 1308 info->factor_mallocs = A->info.factor_mallocs; 1309 } else { 1310 info->fill_ratio_given = 0; 1311 info->fill_ratio_needed = 0; 1312 info->factor_mallocs = 0; 1313 } 1314 PetscFunctionReturn(0); 1315 } 1316 1317 1318 #undef __FUNCT__ 1319 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ" 1320 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 1321 { 1322 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1323 PetscErrorCode ierr; 1324 1325 PetscFunctionBegin; 1326 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 1327 PetscFunctionReturn(0); 1328 } 1329 1330 #undef __FUNCT__ 1331 #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ" 1332 /* 1333 This code does not work since it only checks the upper triangular part of 1334 the matrix. Hence it is not listed in the function table. 1335 */ 1336 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[]) 1337 { 1338 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1339 PetscErrorCode ierr; 1340 PetscInt i,j,n,row,col,bs,mbs; 1341 const PetscInt *ai,*aj; 1342 PetscReal atmp; 1343 const MatScalar *aa; 1344 PetscScalar *x; 1345 PetscInt ncols,brow,bcol,krow,kcol; 1346 1347 PetscFunctionBegin; 1348 if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov"); 1349 if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1350 bs = A->rmap->bs; 1351 aa = a->a; 1352 ai = a->i; 1353 aj = a->j; 1354 mbs = a->mbs; 1355 1356 ierr = VecSet(v,0.0);CHKERRQ(ierr); 1357 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1358 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1359 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1360 for (i=0; i<mbs; i++) { 1361 ncols = ai[1] - ai[0]; ai++; 1362 brow = bs*i; 1363 for (j=0; j<ncols; j++) { 1364 bcol = bs*(*aj); 1365 for (kcol=0; kcol<bs; kcol++) { 1366 col = bcol + kcol; /* col index */ 1367 for (krow=0; krow<bs; krow++) { 1368 atmp = PetscAbsScalar(*aa); aa++; 1369 row = brow + krow; /* row index */ 1370 if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1371 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 1372 } 1373 } 1374 aj++; 1375 } 1376 } 1377 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1378 PetscFunctionReturn(0); 1379 } 1380