1 2 #include <../src/mat/impls/baij/seq/baij.h> 3 #include <petsc-private/kernels/blockinvert.h> 4 #include <petscbt.h> 5 #include <../src/mat/impls/sbaij/seq/sbaij.h> 6 #include <petscblaslapack.h> 7 8 #undef __FUNCT__ 9 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ" 10 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 11 { 12 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 13 PetscErrorCode ierr; 14 PetscInt brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2; 15 const PetscInt *idx; 16 PetscBT table_out,table_in; 17 18 PetscFunctionBegin; 19 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 20 mbs = a->mbs; 21 ai = a->i; 22 aj = a->j; 23 bs = A->rmap->bs; 24 ierr = PetscBTCreate(mbs,&table_out);CHKERRQ(ierr); 25 ierr = PetscMalloc1(mbs+1,&nidx);CHKERRQ(ierr); 26 ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr); 27 ierr = PetscBTCreate(mbs,&table_in);CHKERRQ(ierr); 28 29 for (i=0; i<is_max; i++) { /* for each is */ 30 isz = 0; 31 ierr = PetscBTMemzero(mbs,table_out);CHKERRQ(ierr); 32 33 /* Extract the indices, assume there can be duplicate entries */ 34 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 35 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 36 37 /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */ 38 bcol_max = 0; 39 for (j=0; j<n; ++j) { 40 brow = idx[j]/bs; /* convert the indices into block indices */ 41 if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 42 if (!PetscBTLookupSet(table_out,brow)) { 43 nidx[isz++] = brow; 44 if (bcol_max < brow) bcol_max = brow; 45 } 46 } 47 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 48 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 49 50 k = 0; 51 for (j=0; j<ov; j++) { /* for each overlap */ 52 /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 53 ierr = PetscBTMemzero(mbs,table_in);CHKERRQ(ierr); 54 for (l=k; l<isz; l++) { ierr = PetscBTSet(table_in,nidx[l]);CHKERRQ(ierr); } 55 56 n = isz; /* length of the updated is[i] */ 57 for (brow=0; brow<mbs; brow++) { 58 start = ai[brow]; end = ai[brow+1]; 59 if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */ 60 for (l = start; l<end; l++) { 61 bcol = aj[l]; 62 if (!PetscBTLookupSet(table_out,bcol)) { 63 nidx[isz++] = bcol; 64 if (bcol_max < bcol) bcol_max = bcol; 65 } 66 } 67 k++; 68 if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 69 } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 70 for (l = start; l<end; l++) { 71 bcol = aj[l]; 72 if (bcol > bcol_max) break; 73 if (PetscBTLookup(table_in,bcol)) { 74 if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow; 75 break; /* for l = start; l<end ; l++) */ 76 } 77 } 78 } 79 } 80 } /* for each overlap */ 81 82 /* expand the Index Set */ 83 for (j=0; j<isz; j++) { 84 for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k; 85 } 86 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 87 } 88 ierr = PetscBTDestroy(&table_out);CHKERRQ(ierr); 89 ierr = PetscFree(nidx);CHKERRQ(ierr); 90 ierr = PetscFree(nidx2);CHKERRQ(ierr); 91 ierr = PetscBTDestroy(&table_in);CHKERRQ(ierr); 92 PetscFunctionReturn(0); 93 } 94 95 #undef __FUNCT__ 96 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private" 97 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 98 { 99 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c; 100 PetscErrorCode ierr; 101 PetscInt *smap,i,k,kstart,kend,oldcols = a->mbs,*lens; 102 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 103 PetscInt nrows,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 104 const PetscInt *irow,*aj = a->j,*ai = a->i; 105 MatScalar *mat_a; 106 Mat C; 107 PetscBool flag,sorted; 108 109 PetscFunctionBegin; 110 if (isrow != iscol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro"); 111 ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr); 112 if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 113 114 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 115 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 116 117 ierr = PetscMalloc1(oldcols,&smap);CHKERRQ(ierr); 118 ierr = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr); 119 ssmap = smap; 120 ierr = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr); 121 for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */ 122 /* determine lens of each row */ 123 for (i=0; i<nrows; i++) { 124 kstart = ai[irow[i]]; 125 kend = kstart + a->ilen[irow[i]]; 126 lens[i] = 0; 127 for (k=kstart; k<kend; k++) { 128 if (ssmap[aj[k]]) lens[i]++; 129 } 130 } 131 /* Create and fill new matrix */ 132 if (scall == MAT_REUSE_MATRIX) { 133 c = (Mat_SeqSBAIJ*)((*B)->data); 134 135 if (c->mbs!=nrows || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 136 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 137 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 138 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 139 C = *B; 140 } else { 141 ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr); 142 ierr = MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 143 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 144 ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr); 145 } 146 c = (Mat_SeqSBAIJ*)(C->data); 147 for (i=0; i<nrows; i++) { 148 row = irow[i]; 149 kstart = ai[row]; 150 kend = kstart + a->ilen[row]; 151 mat_i = c->i[i]; 152 mat_j = c->j + mat_i; 153 mat_a = c->a + mat_i*bs2; 154 mat_ilen = c->ilen + i; 155 for (k=kstart; k<kend; k++) { 156 if ((tcol=ssmap[a->j[k]])) { 157 *mat_j++ = tcol - 1; 158 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 159 mat_a += bs2; 160 (*mat_ilen)++; 161 } 162 } 163 } 164 165 /* Free work space */ 166 ierr = PetscFree(smap);CHKERRQ(ierr); 167 ierr = PetscFree(lens);CHKERRQ(ierr); 168 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 169 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 170 171 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 172 *B = C; 173 PetscFunctionReturn(0); 174 } 175 176 #undef __FUNCT__ 177 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ" 178 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 179 { 180 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 181 IS is1; 182 PetscErrorCode ierr; 183 PetscInt *vary,*iary,nrows,i,bs=A->rmap->bs,count; 184 const PetscInt *irow; 185 186 PetscFunctionBegin; 187 if (isrow != iscol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro"); 188 189 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 190 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 191 192 /* Verify if the indices corespond to each element in a block 193 and form the IS with compressed IS */ 194 ierr = PetscMalloc2(a->mbs,&vary,a->mbs,&iary);CHKERRQ(ierr); 195 ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr); 196 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 197 198 count = 0; 199 for (i=0; i<a->mbs; i++) { 200 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Index set does not match blocks"); 201 if (vary[i]==bs) iary[count++] = i; 202 } 203 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 204 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr); 205 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 206 207 ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,scall,B);CHKERRQ(ierr); 208 ierr = ISDestroy(&is1);CHKERRQ(ierr); 209 PetscFunctionReturn(0); 210 } 211 212 #undef __FUNCT__ 213 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ" 214 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 215 { 216 PetscErrorCode ierr; 217 PetscInt i; 218 219 PetscFunctionBegin; 220 if (scall == MAT_INITIAL_MATRIX) { 221 ierr = PetscMalloc1(n+1,B);CHKERRQ(ierr); 222 } 223 224 for (i=0; i<n; i++) { 225 ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 226 } 227 PetscFunctionReturn(0); 228 } 229 230 /* -------------------------------------------------------*/ 231 /* Should check that shapes of vectors and matrices match */ 232 /* -------------------------------------------------------*/ 233 234 #undef __FUNCT__ 235 #define __FUNCT__ "MatMult_SeqSBAIJ_2" 236 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz) 237 { 238 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 239 PetscScalar *z,x1,x2,zero=0.0; 240 const PetscScalar *x,*xb; 241 const MatScalar *v; 242 PetscErrorCode ierr; 243 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 244 const PetscInt *aj=a->j,*ai=a->i,*ib; 245 PetscInt nonzerorow=0; 246 247 PetscFunctionBegin; 248 ierr = VecSet(zz,zero);CHKERRQ(ierr); 249 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 250 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 251 252 v = a->a; 253 xb = x; 254 255 for (i=0; i<mbs; i++) { 256 n = ai[1] - ai[0]; /* length of i_th block row of A */ 257 x1 = xb[0]; x2 = xb[1]; 258 ib = aj + *ai; 259 jmin = 0; 260 nonzerorow += (n>0); 261 if (*ib == i) { /* (diag of A)*x */ 262 z[2*i] += v[0]*x1 + v[2]*x2; 263 z[2*i+1] += v[2]*x1 + v[3]*x2; 264 v += 4; jmin++; 265 } 266 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 267 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 268 for (j=jmin; j<n; j++) { 269 /* (strict lower triangular part of A)*x */ 270 cval = ib[j]*2; 271 z[cval] += v[0]*x1 + v[1]*x2; 272 z[cval+1] += v[2]*x1 + v[3]*x2; 273 /* (strict upper triangular part of A)*x */ 274 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 275 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 276 v += 4; 277 } 278 xb +=2; ai++; 279 } 280 281 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 282 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 283 ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 284 PetscFunctionReturn(0); 285 } 286 287 #undef __FUNCT__ 288 #define __FUNCT__ "MatMult_SeqSBAIJ_3" 289 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz) 290 { 291 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 292 PetscScalar *z,x1,x2,x3,zero=0.0; 293 const PetscScalar *x,*xb; 294 const MatScalar *v; 295 PetscErrorCode ierr; 296 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 297 const PetscInt *aj = a->j,*ai = a->i,*ib; 298 PetscInt nonzerorow=0; 299 300 PetscFunctionBegin; 301 ierr = VecSet(zz,zero);CHKERRQ(ierr); 302 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 303 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 304 305 v = a->a; 306 xb = x; 307 308 for (i=0; i<mbs; i++) { 309 n = ai[1] - ai[0]; /* length of i_th block row of A */ 310 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 311 ib = aj + *ai; 312 jmin = 0; 313 nonzerorow += (n>0); 314 if (*ib == i) { /* (diag of A)*x */ 315 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 316 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 317 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 318 v += 9; jmin++; 319 } 320 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 321 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 322 for (j=jmin; j<n; j++) { 323 /* (strict lower triangular part of A)*x */ 324 cval = ib[j]*3; 325 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 326 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 327 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 328 /* (strict upper triangular part of A)*x */ 329 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 330 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 331 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 332 v += 9; 333 } 334 xb +=3; ai++; 335 } 336 337 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 338 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 339 ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 340 PetscFunctionReturn(0); 341 } 342 343 #undef __FUNCT__ 344 #define __FUNCT__ "MatMult_SeqSBAIJ_4" 345 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz) 346 { 347 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 348 PetscScalar *z,x1,x2,x3,x4,zero=0.0; 349 const PetscScalar *x,*xb; 350 const MatScalar *v; 351 PetscErrorCode ierr; 352 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 353 const PetscInt *aj = a->j,*ai = a->i,*ib; 354 PetscInt nonzerorow = 0; 355 356 PetscFunctionBegin; 357 ierr = VecSet(zz,zero);CHKERRQ(ierr); 358 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 359 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 360 361 v = a->a; 362 xb = x; 363 364 for (i=0; i<mbs; i++) { 365 n = ai[1] - ai[0]; /* length of i_th block row of A */ 366 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 367 ib = aj + *ai; 368 jmin = 0; 369 nonzerorow += (n>0); 370 if (*ib == i) { /* (diag of A)*x */ 371 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 372 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 373 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 374 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 375 v += 16; jmin++; 376 } 377 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 378 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 379 for (j=jmin; j<n; j++) { 380 /* (strict lower triangular part of A)*x */ 381 cval = ib[j]*4; 382 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 383 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 384 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 385 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 386 /* (strict upper triangular part of A)*x */ 387 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 388 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 389 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 390 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 391 v += 16; 392 } 393 xb +=4; ai++; 394 } 395 396 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 397 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 398 ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 399 PetscFunctionReturn(0); 400 } 401 402 #undef __FUNCT__ 403 #define __FUNCT__ "MatMult_SeqSBAIJ_5" 404 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz) 405 { 406 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 407 PetscScalar *z,x1,x2,x3,x4,x5,zero=0.0; 408 const PetscScalar *x,*xb; 409 const MatScalar *v; 410 PetscErrorCode ierr; 411 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 412 const PetscInt *aj = a->j,*ai = a->i,*ib; 413 PetscInt nonzerorow=0; 414 415 PetscFunctionBegin; 416 ierr = VecSet(zz,zero);CHKERRQ(ierr); 417 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 418 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 419 420 v = a->a; 421 xb = x; 422 423 for (i=0; i<mbs; i++) { 424 n = ai[1] - ai[0]; /* length of i_th block row of A */ 425 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 426 ib = aj + *ai; 427 jmin = 0; 428 nonzerorow += (n>0); 429 if (*ib == i) { /* (diag of A)*x */ 430 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 431 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 432 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 433 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 434 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 435 v += 25; jmin++; 436 } 437 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 438 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 439 for (j=jmin; j<n; j++) { 440 /* (strict lower triangular part of A)*x */ 441 cval = ib[j]*5; 442 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 443 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 444 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 445 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 446 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 447 /* (strict upper triangular part of A)*x */ 448 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 449 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 450 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 451 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 452 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 453 v += 25; 454 } 455 xb +=5; ai++; 456 } 457 458 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 459 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 460 ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 461 PetscFunctionReturn(0); 462 } 463 464 465 #undef __FUNCT__ 466 #define __FUNCT__ "MatMult_SeqSBAIJ_6" 467 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz) 468 { 469 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 470 PetscScalar *z,x1,x2,x3,x4,x5,x6,zero=0.0; 471 const PetscScalar *x,*xb; 472 const MatScalar *v; 473 PetscErrorCode ierr; 474 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 475 const PetscInt *aj=a->j,*ai=a->i,*ib; 476 PetscInt nonzerorow=0; 477 478 PetscFunctionBegin; 479 ierr = VecSet(zz,zero);CHKERRQ(ierr); 480 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 481 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 482 483 v = a->a; 484 xb = x; 485 486 for (i=0; i<mbs; i++) { 487 n = ai[1] - ai[0]; /* length of i_th block row of A */ 488 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 489 ib = aj + *ai; 490 jmin = 0; 491 nonzerorow += (n>0); 492 if (*ib == i) { /* (diag of A)*x */ 493 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 494 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 495 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 496 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 497 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 498 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 499 v += 36; jmin++; 500 } 501 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 502 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 503 for (j=jmin; j<n; j++) { 504 /* (strict lower triangular part of A)*x */ 505 cval = ib[j]*6; 506 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 507 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 508 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 509 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 510 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 511 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 512 /* (strict upper triangular part of A)*x */ 513 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 514 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 515 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 516 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 517 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 518 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 519 v += 36; 520 } 521 xb +=6; ai++; 522 } 523 524 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 525 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 526 ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 527 PetscFunctionReturn(0); 528 } 529 #undef __FUNCT__ 530 #define __FUNCT__ "MatMult_SeqSBAIJ_7" 531 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz) 532 { 533 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 534 PetscScalar *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0; 535 const PetscScalar *x,*xb; 536 const MatScalar *v; 537 PetscErrorCode ierr; 538 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 539 const PetscInt *aj=a->j,*ai=a->i,*ib; 540 PetscInt nonzerorow=0; 541 542 PetscFunctionBegin; 543 ierr = VecSet(zz,zero);CHKERRQ(ierr); 544 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 545 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 546 547 v = a->a; 548 xb = x; 549 550 for (i=0; i<mbs; i++) { 551 n = ai[1] - ai[0]; /* length of i_th block row of A */ 552 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 553 ib = aj + *ai; 554 jmin = 0; 555 nonzerorow += (n>0); 556 if (*ib == i) { /* (diag of A)*x */ 557 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 558 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 559 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 560 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 561 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 562 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 563 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 564 v += 49; jmin++; 565 } 566 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 567 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 568 for (j=jmin; j<n; j++) { 569 /* (strict lower triangular part of A)*x */ 570 cval = ib[j]*7; 571 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 572 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 573 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 574 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 575 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 576 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 577 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 578 /* (strict upper triangular part of A)*x */ 579 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 580 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 581 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 582 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 583 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 584 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 585 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 586 v += 49; 587 } 588 xb +=7; ai++; 589 } 590 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 591 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 592 ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 593 PetscFunctionReturn(0); 594 } 595 596 /* 597 This will not work with MatScalar == float because it calls the BLAS 598 */ 599 #undef __FUNCT__ 600 #define __FUNCT__ "MatMult_SeqSBAIJ_N" 601 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz) 602 { 603 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 604 PetscScalar *z,*z_ptr,*zb,*work,*workt,zero=0.0; 605 const PetscScalar *x,*x_ptr,*xb; 606 const MatScalar *v; 607 PetscErrorCode ierr; 608 PetscInt mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 609 const PetscInt *idx,*aj,*ii; 610 PetscInt nonzerorow=0; 611 612 PetscFunctionBegin; 613 ierr = VecSet(zz,zero);CHKERRQ(ierr); 614 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);x_ptr = x; 615 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 616 617 aj = a->j; 618 v = a->a; 619 ii = a->i; 620 621 if (!a->mult_work) { 622 ierr = PetscMalloc1(A->rmap->N+1,&a->mult_work);CHKERRQ(ierr); 623 } 624 work = a->mult_work; 625 626 for (i=0; i<mbs; i++) { 627 n = ii[1] - ii[0]; ncols = n*bs; 628 workt = work; idx=aj+ii[0]; 629 nonzerorow += (n>0); 630 631 /* upper triangular part */ 632 for (j=0; j<n; j++) { 633 xb = x_ptr + bs*(*idx++); 634 for (k=0; k<bs; k++) workt[k] = xb[k]; 635 workt += bs; 636 } 637 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 638 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 639 640 /* strict lower triangular part */ 641 idx = aj+ii[0]; 642 if (*idx == i) { 643 ncols -= bs; v += bs2; idx++; n--; 644 } 645 646 if (ncols > 0) { 647 workt = work; 648 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 649 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 650 for (j=0; j<n; j++) { 651 zb = z_ptr + bs*(*idx++); 652 for (k=0; k<bs; k++) zb[k] += workt[k]; 653 workt += bs; 654 } 655 } 656 x += bs; v += n*bs2; z += bs; ii++; 657 } 658 659 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 660 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 661 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr); 662 PetscFunctionReturn(0); 663 } 664 665 #undef __FUNCT__ 666 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1" 667 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 668 { 669 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 670 PetscScalar *z,x1; 671 const PetscScalar *x,*xb; 672 const MatScalar *v; 673 PetscErrorCode ierr; 674 PetscInt mbs =a->mbs,i,n,cval,j,jmin; 675 const PetscInt *aj=a->j,*ai=a->i,*ib; 676 PetscInt nonzerorow=0; 677 678 PetscFunctionBegin; 679 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 680 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 681 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 682 v = a->a; 683 xb = x; 684 685 for (i=0; i<mbs; i++) { 686 n = ai[1] - ai[0]; /* length of i_th row of A */ 687 x1 = xb[0]; 688 ib = aj + *ai; 689 jmin = 0; 690 nonzerorow += (n>0); 691 if (*ib == i) { /* (diag of A)*x */ 692 z[i] += *v++ * x[*ib++]; jmin++; 693 } 694 for (j=jmin; j<n; j++) { 695 cval = *ib; 696 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 697 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 698 } 699 xb++; ai++; 700 } 701 702 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 703 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 704 705 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 706 PetscFunctionReturn(0); 707 } 708 709 #undef __FUNCT__ 710 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2" 711 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 712 { 713 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 714 PetscScalar *z,x1,x2; 715 const PetscScalar *x,*xb; 716 const MatScalar *v; 717 PetscErrorCode ierr; 718 PetscInt mbs =a->mbs,i,n,cval,j,jmin; 719 const PetscInt *aj=a->j,*ai=a->i,*ib; 720 PetscInt nonzerorow=0; 721 722 PetscFunctionBegin; 723 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 724 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 725 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 726 727 v = a->a; 728 xb = x; 729 730 for (i=0; i<mbs; i++) { 731 n = ai[1] - ai[0]; /* length of i_th block row of A */ 732 x1 = xb[0]; x2 = xb[1]; 733 ib = aj + *ai; 734 jmin = 0; 735 nonzerorow += (n>0); 736 if (*ib == i) { /* (diag of A)*x */ 737 z[2*i] += v[0]*x1 + v[2]*x2; 738 z[2*i+1] += v[2]*x1 + v[3]*x2; 739 v += 4; jmin++; 740 } 741 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 742 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 743 for (j=jmin; j<n; j++) { 744 /* (strict lower triangular part of A)*x */ 745 cval = ib[j]*2; 746 z[cval] += v[0]*x1 + v[1]*x2; 747 z[cval+1] += v[2]*x1 + v[3]*x2; 748 /* (strict upper triangular part of A)*x */ 749 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 750 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 751 v += 4; 752 } 753 xb +=2; ai++; 754 } 755 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 756 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 757 758 ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 759 PetscFunctionReturn(0); 760 } 761 762 #undef __FUNCT__ 763 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3" 764 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 765 { 766 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 767 PetscScalar *z,x1,x2,x3; 768 const PetscScalar *x,*xb; 769 const MatScalar *v; 770 PetscErrorCode ierr; 771 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 772 const PetscInt *aj=a->j,*ai=a->i,*ib; 773 PetscInt nonzerorow=0; 774 775 PetscFunctionBegin; 776 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 777 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 778 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 779 780 v = a->a; 781 xb = x; 782 783 for (i=0; i<mbs; i++) { 784 n = ai[1] - ai[0]; /* length of i_th block row of A */ 785 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 786 ib = aj + *ai; 787 jmin = 0; 788 nonzerorow += (n>0); 789 if (*ib == i) { /* (diag of A)*x */ 790 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 791 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 792 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 793 v += 9; jmin++; 794 } 795 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 796 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 797 for (j=jmin; j<n; j++) { 798 /* (strict lower triangular part of A)*x */ 799 cval = ib[j]*3; 800 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 801 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 802 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 803 /* (strict upper triangular part of A)*x */ 804 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 805 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 806 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 807 v += 9; 808 } 809 xb +=3; ai++; 810 } 811 812 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 813 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 814 815 ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 816 PetscFunctionReturn(0); 817 } 818 819 #undef __FUNCT__ 820 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4" 821 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 822 { 823 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 824 PetscScalar *z,x1,x2,x3,x4; 825 const PetscScalar *x,*xb; 826 const MatScalar *v; 827 PetscErrorCode ierr; 828 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 829 const PetscInt *aj=a->j,*ai=a->i,*ib; 830 PetscInt nonzerorow=0; 831 832 PetscFunctionBegin; 833 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 834 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 835 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 836 837 v = a->a; 838 xb = x; 839 840 for (i=0; i<mbs; i++) { 841 n = ai[1] - ai[0]; /* length of i_th block row of A */ 842 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 843 ib = aj + *ai; 844 jmin = 0; 845 nonzerorow += (n>0); 846 if (*ib == i) { /* (diag of A)*x */ 847 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 848 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 849 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 850 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 851 v += 16; jmin++; 852 } 853 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 854 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 855 for (j=jmin; j<n; j++) { 856 /* (strict lower triangular part of A)*x */ 857 cval = ib[j]*4; 858 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 859 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 860 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 861 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 862 /* (strict upper triangular part of A)*x */ 863 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 864 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 865 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 866 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 867 v += 16; 868 } 869 xb +=4; ai++; 870 } 871 872 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 873 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 874 875 ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 876 PetscFunctionReturn(0); 877 } 878 879 #undef __FUNCT__ 880 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5" 881 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 882 { 883 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 884 PetscScalar *z,x1,x2,x3,x4,x5; 885 const PetscScalar *x,*xb; 886 const MatScalar *v; 887 PetscErrorCode ierr; 888 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 889 const PetscInt *aj=a->j,*ai=a->i,*ib; 890 PetscInt nonzerorow=0; 891 892 PetscFunctionBegin; 893 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 894 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 895 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 896 897 v = a->a; 898 xb = x; 899 900 for (i=0; i<mbs; i++) { 901 n = ai[1] - ai[0]; /* length of i_th block row of A */ 902 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 903 ib = aj + *ai; 904 jmin = 0; 905 nonzerorow += (n>0); 906 if (*ib == i) { /* (diag of A)*x */ 907 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 908 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 909 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 910 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 911 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 912 v += 25; jmin++; 913 } 914 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 915 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 916 for (j=jmin; j<n; j++) { 917 /* (strict lower triangular part of A)*x */ 918 cval = ib[j]*5; 919 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 920 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 921 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 922 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 923 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 924 /* (strict upper triangular part of A)*x */ 925 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 926 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 927 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 928 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 929 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 930 v += 25; 931 } 932 xb +=5; ai++; 933 } 934 935 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 936 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 937 938 ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 939 PetscFunctionReturn(0); 940 } 941 #undef __FUNCT__ 942 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6" 943 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 944 { 945 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 946 PetscScalar *z,x1,x2,x3,x4,x5,x6; 947 const PetscScalar *x,*xb; 948 const MatScalar *v; 949 PetscErrorCode ierr; 950 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 951 const PetscInt *aj=a->j,*ai=a->i,*ib; 952 PetscInt nonzerorow=0; 953 954 PetscFunctionBegin; 955 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 956 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 957 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 958 959 v = a->a; 960 xb = x; 961 962 for (i=0; i<mbs; i++) { 963 n = ai[1] - ai[0]; /* length of i_th block row of A */ 964 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 965 ib = aj + *ai; 966 jmin = 0; 967 nonzerorow += (n>0); 968 if (*ib == i) { /* (diag of A)*x */ 969 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 970 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 971 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 972 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 973 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 974 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 975 v += 36; jmin++; 976 } 977 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 978 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 979 for (j=jmin; j<n; j++) { 980 /* (strict lower triangular part of A)*x */ 981 cval = ib[j]*6; 982 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 983 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 984 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 985 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 986 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 987 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 988 /* (strict upper triangular part of A)*x */ 989 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 990 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 991 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 992 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 993 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 994 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 995 v += 36; 996 } 997 xb +=6; ai++; 998 } 999 1000 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1001 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1002 1003 ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1004 PetscFunctionReturn(0); 1005 } 1006 1007 #undef __FUNCT__ 1008 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7" 1009 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1010 { 1011 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1012 PetscScalar *z,x1,x2,x3,x4,x5,x6,x7; 1013 const PetscScalar *x,*xb; 1014 const MatScalar *v; 1015 PetscErrorCode ierr; 1016 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 1017 const PetscInt *aj=a->j,*ai=a->i,*ib; 1018 PetscInt nonzerorow=0; 1019 1020 PetscFunctionBegin; 1021 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1022 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1023 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1024 1025 v = a->a; 1026 xb = x; 1027 1028 for (i=0; i<mbs; i++) { 1029 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1030 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 1031 ib = aj + *ai; 1032 jmin = 0; 1033 nonzerorow += (n>0); 1034 if (*ib == i) { /* (diag of A)*x */ 1035 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 1036 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 1037 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 1038 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 1039 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 1040 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 1041 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1042 v += 49; jmin++; 1043 } 1044 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1045 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1046 for (j=jmin; j<n; j++) { 1047 /* (strict lower triangular part of A)*x */ 1048 cval = ib[j]*7; 1049 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 1050 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 1051 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 1052 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 1053 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 1054 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 1055 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1056 /* (strict upper triangular part of A)*x */ 1057 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 1058 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 1059 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 1060 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 1061 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 1062 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 1063 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 1064 v += 49; 1065 } 1066 xb +=7; ai++; 1067 } 1068 1069 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1070 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1071 1072 ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1073 PetscFunctionReturn(0); 1074 } 1075 1076 #undef __FUNCT__ 1077 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N" 1078 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1079 { 1080 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1081 PetscScalar *z,*z_ptr=0,*zb,*work,*workt; 1082 const PetscScalar *x,*x_ptr,*xb; 1083 const MatScalar *v; 1084 PetscErrorCode ierr; 1085 PetscInt mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 1086 const PetscInt *idx,*aj,*ii; 1087 PetscInt nonzerorow=0; 1088 1089 PetscFunctionBegin; 1090 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1091 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); x_ptr=x; 1092 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 1093 1094 aj = a->j; 1095 v = a->a; 1096 ii = a->i; 1097 1098 if (!a->mult_work) { 1099 ierr = PetscMalloc1(A->rmap->n+1,&a->mult_work);CHKERRQ(ierr); 1100 } 1101 work = a->mult_work; 1102 1103 1104 for (i=0; i<mbs; i++) { 1105 n = ii[1] - ii[0]; ncols = n*bs; 1106 workt = work; idx=aj+ii[0]; 1107 nonzerorow += (n>0); 1108 1109 /* upper triangular part */ 1110 for (j=0; j<n; j++) { 1111 xb = x_ptr + bs*(*idx++); 1112 for (k=0; k<bs; k++) workt[k] = xb[k]; 1113 workt += bs; 1114 } 1115 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1116 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1117 1118 /* strict lower triangular part */ 1119 idx = aj+ii[0]; 1120 if (*idx == i) { 1121 ncols -= bs; v += bs2; idx++; n--; 1122 } 1123 if (ncols > 0) { 1124 workt = work; 1125 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1126 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 1127 for (j=0; j<n; j++) { 1128 zb = z_ptr + bs*(*idx++); 1129 for (k=0; k<bs; k++) zb[k] += workt[k]; 1130 workt += bs; 1131 } 1132 } 1133 1134 x += bs; v += n*bs2; z += bs; ii++; 1135 } 1136 1137 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1138 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1139 1140 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1141 PetscFunctionReturn(0); 1142 } 1143 1144 #undef __FUNCT__ 1145 #define __FUNCT__ "MatScale_SeqSBAIJ" 1146 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha) 1147 { 1148 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data; 1149 PetscScalar oalpha = alpha; 1150 PetscErrorCode ierr; 1151 PetscBLASInt one = 1,totalnz; 1152 1153 PetscFunctionBegin; 1154 ierr = PetscBLASIntCast(a->bs2*a->nz,&totalnz);CHKERRQ(ierr); 1155 PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one)); 1156 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 1157 PetscFunctionReturn(0); 1158 } 1159 1160 #undef __FUNCT__ 1161 #define __FUNCT__ "MatNorm_SeqSBAIJ" 1162 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm) 1163 { 1164 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1165 const MatScalar *v = a->a; 1166 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1167 PetscInt i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il; 1168 PetscErrorCode ierr; 1169 const PetscInt *aj=a->j,*col; 1170 1171 PetscFunctionBegin; 1172 if (type == NORM_FROBENIUS) { 1173 for (k=0; k<mbs; k++) { 1174 jmin = a->i[k]; jmax = a->i[k+1]; 1175 col = aj + jmin; 1176 if (*col == k) { /* diagonal block */ 1177 for (i=0; i<bs2; i++) { 1178 sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++; 1179 } 1180 jmin++; 1181 } 1182 for (j=jmin; j<jmax; j++) { /* off-diagonal blocks */ 1183 for (i=0; i<bs2; i++) { 1184 sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++; 1185 } 1186 } 1187 } 1188 *norm = PetscSqrtReal(sum_diag + 2*sum_off); 1189 } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 1190 ierr = PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);CHKERRQ(ierr); 1191 for (i=0; i<mbs; i++) jl[i] = mbs; 1192 il[0] = 0; 1193 1194 *norm = 0.0; 1195 for (k=0; k<mbs; k++) { /* k_th block row */ 1196 for (j=0; j<bs; j++) sum[j]=0.0; 1197 /*-- col sum --*/ 1198 i = jl[k]; /* first |A(i,k)| to be added */ 1199 /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1200 at step k */ 1201 while (i<mbs) { 1202 nexti = jl[i]; /* next block row to be added */ 1203 ik = il[i]; /* block index of A(i,k) in the array a */ 1204 for (j=0; j<bs; j++) { 1205 v = a->a + ik*bs2 + j*bs; 1206 for (k1=0; k1<bs; k1++) { 1207 sum[j] += PetscAbsScalar(*v); v++; 1208 } 1209 } 1210 /* update il, jl */ 1211 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1212 jmax = a->i[i+1]; 1213 if (jmin < jmax) { 1214 il[i] = jmin; 1215 j = a->j[jmin]; 1216 jl[i] = jl[j]; jl[j]=i; 1217 } 1218 i = nexti; 1219 } 1220 /*-- row sum --*/ 1221 jmin = a->i[k]; jmax = a->i[k+1]; 1222 for (i=jmin; i<jmax; i++) { 1223 for (j=0; j<bs; j++) { 1224 v = a->a + i*bs2 + j; 1225 for (k1=0; k1<bs; k1++) { 1226 sum[j] += PetscAbsScalar(*v); v += bs; 1227 } 1228 } 1229 } 1230 /* add k_th block row to il, jl */ 1231 col = aj+jmin; 1232 if (*col == k) jmin++; 1233 if (jmin < jmax) { 1234 il[k] = jmin; 1235 j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k; 1236 } 1237 for (j=0; j<bs; j++) { 1238 if (sum[j] > *norm) *norm = sum[j]; 1239 } 1240 } 1241 ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr); 1242 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 1243 PetscFunctionReturn(0); 1244 } 1245 1246 #undef __FUNCT__ 1247 #define __FUNCT__ "MatEqual_SeqSBAIJ" 1248 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg) 1249 { 1250 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data; 1251 PetscErrorCode ierr; 1252 1253 PetscFunctionBegin; 1254 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1255 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1256 *flg = PETSC_FALSE; 1257 PetscFunctionReturn(0); 1258 } 1259 1260 /* if the a->i are the same */ 1261 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1262 if (!*flg) PetscFunctionReturn(0); 1263 1264 /* if a->j are the same */ 1265 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1266 if (!*flg) PetscFunctionReturn(0); 1267 1268 /* if a->a are the same */ 1269 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1270 PetscFunctionReturn(0); 1271 } 1272 1273 #undef __FUNCT__ 1274 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ" 1275 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v) 1276 { 1277 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1278 PetscErrorCode ierr; 1279 PetscInt i,j,k,row,bs,ambs,bs2; 1280 const PetscInt *ai,*aj; 1281 PetscScalar *x,zero = 0.0; 1282 const MatScalar *aa,*aa_j; 1283 1284 PetscFunctionBegin; 1285 bs = A->rmap->bs; 1286 if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1"); 1287 1288 aa = a->a; 1289 ambs = a->mbs; 1290 1291 if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) { 1292 PetscInt *diag=a->diag; 1293 aa = a->a; 1294 ambs = a->mbs; 1295 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1296 for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]]; 1297 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1298 PetscFunctionReturn(0); 1299 } 1300 1301 ai = a->i; 1302 aj = a->j; 1303 bs2 = a->bs2; 1304 ierr = VecSet(v,zero);CHKERRQ(ierr); 1305 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1306 for (i=0; i<ambs; i++) { 1307 j=ai[i]; 1308 if (aj[j] == i) { /* if this is a diagonal element */ 1309 row = i*bs; 1310 aa_j = aa + j*bs2; 1311 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1312 } 1313 } 1314 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1315 PetscFunctionReturn(0); 1316 } 1317 1318 #undef __FUNCT__ 1319 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ" 1320 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr) 1321 { 1322 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1323 PetscScalar x; 1324 const PetscScalar *l,*li,*ri; 1325 MatScalar *aa,*v; 1326 PetscErrorCode ierr; 1327 PetscInt i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2; 1328 PetscBool flg; 1329 1330 PetscFunctionBegin; 1331 if (ll != rr) { 1332 ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr); 1333 if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n"); 1334 } 1335 if (!ll) PetscFunctionReturn(0); 1336 ai = a->i; 1337 aj = a->j; 1338 aa = a->a; 1339 m = A->rmap->N; 1340 bs = A->rmap->bs; 1341 mbs = a->mbs; 1342 bs2 = a->bs2; 1343 1344 ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr); 1345 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1346 if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 1347 for (i=0; i<mbs; i++) { /* for each block row */ 1348 M = ai[i+1] - ai[i]; 1349 li = l + i*bs; 1350 v = aa + bs2*ai[i]; 1351 for (j=0; j<M; j++) { /* for each block */ 1352 ri = l + bs*aj[ai[i]+j]; 1353 for (k=0; k<bs; k++) { 1354 x = ri[k]; 1355 for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x; 1356 } 1357 } 1358 } 1359 ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr); 1360 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1361 PetscFunctionReturn(0); 1362 } 1363 1364 #undef __FUNCT__ 1365 #define __FUNCT__ "MatGetInfo_SeqSBAIJ" 1366 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info) 1367 { 1368 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1369 1370 PetscFunctionBegin; 1371 info->block_size = a->bs2; 1372 info->nz_allocated = a->bs2*a->maxnz; /*num. of nonzeros in upper triangular part */ 1373 info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */ 1374 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 1375 info->assemblies = A->num_ass; 1376 info->mallocs = A->info.mallocs; 1377 info->memory = ((PetscObject)A)->mem; 1378 if (A->factortype) { 1379 info->fill_ratio_given = A->info.fill_ratio_given; 1380 info->fill_ratio_needed = A->info.fill_ratio_needed; 1381 info->factor_mallocs = A->info.factor_mallocs; 1382 } else { 1383 info->fill_ratio_given = 0; 1384 info->fill_ratio_needed = 0; 1385 info->factor_mallocs = 0; 1386 } 1387 PetscFunctionReturn(0); 1388 } 1389 1390 1391 #undef __FUNCT__ 1392 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ" 1393 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 1394 { 1395 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1396 PetscErrorCode ierr; 1397 1398 PetscFunctionBegin; 1399 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 1400 PetscFunctionReturn(0); 1401 } 1402 1403 #undef __FUNCT__ 1404 #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ" 1405 /* 1406 This code does not work since it only checks the upper triangular part of 1407 the matrix. Hence it is not listed in the function table. 1408 */ 1409 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[]) 1410 { 1411 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1412 PetscErrorCode ierr; 1413 PetscInt i,j,n,row,col,bs,mbs; 1414 const PetscInt *ai,*aj; 1415 PetscReal atmp; 1416 const MatScalar *aa; 1417 PetscScalar *x; 1418 PetscInt ncols,brow,bcol,krow,kcol; 1419 1420 PetscFunctionBegin; 1421 if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov"); 1422 if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1423 bs = A->rmap->bs; 1424 aa = a->a; 1425 ai = a->i; 1426 aj = a->j; 1427 mbs = a->mbs; 1428 1429 ierr = VecSet(v,0.0);CHKERRQ(ierr); 1430 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1431 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1432 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1433 for (i=0; i<mbs; i++) { 1434 ncols = ai[1] - ai[0]; ai++; 1435 brow = bs*i; 1436 for (j=0; j<ncols; j++) { 1437 bcol = bs*(*aj); 1438 for (kcol=0; kcol<bs; kcol++) { 1439 col = bcol + kcol; /* col index */ 1440 for (krow=0; krow<bs; krow++) { 1441 atmp = PetscAbsScalar(*aa); aa++; 1442 row = brow + krow; /* row index */ 1443 if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1444 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 1445 } 1446 } 1447 aj++; 1448 } 1449 } 1450 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1451 PetscFunctionReturn(0); 1452 } 1453