1 2 #include <../src/mat/impls/baij/seq/baij.h> 3 #include <petsc-private/kernels/blockinvert.h> 4 #include <petscbt.h> 5 #include <../src/mat/impls/sbaij/seq/sbaij.h> 6 #include <petscblaslapack.h> 7 8 #undef __FUNCT__ 9 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ" 10 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 11 { 12 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 13 PetscErrorCode ierr; 14 PetscInt brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2; 15 const PetscInt *idx; 16 PetscBT table_out,table_in; 17 18 PetscFunctionBegin; 19 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 20 mbs = a->mbs; 21 ai = a->i; 22 aj = a->j; 23 bs = A->rmap->bs; 24 ierr = PetscBTCreate(mbs,&table_out);CHKERRQ(ierr); 25 ierr = PetscMalloc1(mbs+1,&nidx);CHKERRQ(ierr); 26 ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr); 27 ierr = PetscBTCreate(mbs,&table_in);CHKERRQ(ierr); 28 29 for (i=0; i<is_max; i++) { /* for each is */ 30 isz = 0; 31 ierr = PetscBTMemzero(mbs,table_out);CHKERRQ(ierr); 32 33 /* Extract the indices, assume there can be duplicate entries */ 34 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 35 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 36 37 /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */ 38 bcol_max = 0; 39 for (j=0; j<n; ++j) { 40 brow = idx[j]/bs; /* convert the indices into block indices */ 41 if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 42 if (!PetscBTLookupSet(table_out,brow)) { 43 nidx[isz++] = brow; 44 if (bcol_max < brow) bcol_max = brow; 45 } 46 } 47 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 48 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 49 50 k = 0; 51 for (j=0; j<ov; j++) { /* for each overlap */ 52 /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 53 ierr = PetscBTMemzero(mbs,table_in);CHKERRQ(ierr); 54 for (l=k; l<isz; l++) { ierr = PetscBTSet(table_in,nidx[l]);CHKERRQ(ierr); } 55 56 n = isz; /* length of the updated is[i] */ 57 for (brow=0; brow<mbs; brow++) { 58 start = ai[brow]; end = ai[brow+1]; 59 if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */ 60 for (l = start; l<end; l++) { 61 bcol = aj[l]; 62 if (!PetscBTLookupSet(table_out,bcol)) { 63 nidx[isz++] = bcol; 64 if (bcol_max < bcol) bcol_max = bcol; 65 } 66 } 67 k++; 68 if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 69 } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 70 for (l = start; l<end; l++) { 71 bcol = aj[l]; 72 if (bcol > bcol_max) break; 73 if (PetscBTLookup(table_in,bcol)) { 74 if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow; 75 break; /* for l = start; l<end ; l++) */ 76 } 77 } 78 } 79 } 80 } /* for each overlap */ 81 82 /* expand the Index Set */ 83 for (j=0; j<isz; j++) { 84 for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k; 85 } 86 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 87 } 88 ierr = PetscBTDestroy(&table_out);CHKERRQ(ierr); 89 ierr = PetscFree(nidx);CHKERRQ(ierr); 90 ierr = PetscFree(nidx2);CHKERRQ(ierr); 91 ierr = PetscBTDestroy(&table_in);CHKERRQ(ierr); 92 PetscFunctionReturn(0); 93 } 94 95 #undef __FUNCT__ 96 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private" 97 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) //rm 98 { 99 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c; 100 PetscErrorCode ierr; 101 PetscInt *smap,i,k,kstart,kend,oldcols = a->mbs,*lens; 102 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 103 PetscInt nrows,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 104 const PetscInt *irow,*aj = a->j,*ai = a->i; 105 MatScalar *mat_a; 106 Mat C; 107 PetscBool flag,sorted; 108 109 PetscFunctionBegin; 110 if (isrow != iscol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isrow"); //rm!!! 111 ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr); //rm!!! 112 if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); //rm!!! 113 114 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 115 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 116 117 ierr = PetscMalloc1(oldcols,&smap);CHKERRQ(ierr); 118 ierr = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr); 119 ssmap = smap; 120 ierr = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr); 121 for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */ 122 /* determine lens of each row */ 123 for (i=0; i<nrows; i++) { 124 kstart = ai[irow[i]]; 125 kend = kstart + a->ilen[irow[i]]; 126 lens[i] = 0; 127 for (k=kstart; k<kend; k++) { 128 if (ssmap[aj[k]]) lens[i]++; 129 } 130 } 131 /* Create and fill new matrix */ 132 if (scall == MAT_REUSE_MATRIX) { 133 c = (Mat_SeqSBAIJ*)((*B)->data); 134 135 if (c->mbs!=nrows || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 136 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 137 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 138 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 139 C = *B; 140 } else { 141 ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr); 142 ierr = MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 143 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 144 ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr); 145 } 146 c = (Mat_SeqSBAIJ*)(C->data); 147 for (i=0; i<nrows; i++) { 148 row = irow[i]; 149 kstart = ai[row]; 150 kend = kstart + a->ilen[row]; 151 mat_i = c->i[i]; 152 mat_j = c->j + mat_i; 153 mat_a = c->a + mat_i*bs2; 154 mat_ilen = c->ilen + i; 155 for (k=kstart; k<kend; k++) { 156 if ((tcol=ssmap[a->j[k]])) { 157 *mat_j++ = tcol - 1; 158 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 159 mat_a += bs2; 160 (*mat_ilen)++; 161 } 162 } 163 } 164 165 /* Free work space */ 166 ierr = PetscFree(smap);CHKERRQ(ierr); 167 ierr = PetscFree(lens);CHKERRQ(ierr); 168 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 169 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 170 171 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 172 *B = C; 173 PetscFunctionReturn(0); 174 } 175 176 #undef __FUNCT__ 177 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ" 178 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 179 { 180 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 181 IS is1; 182 PetscErrorCode ierr; 183 PetscInt *vary,*iary,nrows,i,bs=A->rmap->bs,count; 184 const PetscInt *irow; 185 186 PetscFunctionBegin; 187 if (isrow != iscol) { 188 PetscBool isequal; 189 ierr = ISEqual(isrow,iscol,&isequal);CHKERRQ(ierr); 190 if (!isequal) 191 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isrow"); 192 } 193 194 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); //isrow == iscol, so delete below!!! 195 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 196 197 /* Verify if the indices corespond to each element in a block 198 and form the IS with compressed IS -- also sort it!!! */ 199 ierr = PetscMalloc2(a->mbs,&vary,a->mbs,&iary);CHKERRQ(ierr); 200 ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr); 201 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 202 203 count = 0; 204 for (i=0; i<a->mbs; i++) { 205 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Index set does not match blocks"); 206 if (vary[i]==bs) iary[count++] = i; 207 } 208 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 209 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr); 210 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 211 212 ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,scall,B);CHKERRQ(ierr); //remove a is1!!! 213 ierr = ISDestroy(&is1);CHKERRQ(ierr); 214 PetscFunctionReturn(0); 215 } 216 217 #undef __FUNCT__ 218 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ" 219 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 220 { 221 PetscErrorCode ierr; 222 PetscInt i; 223 224 PetscFunctionBegin; 225 if (scall == MAT_INITIAL_MATRIX) { 226 ierr = PetscMalloc1(n+1,B);CHKERRQ(ierr); 227 } 228 229 for (i=0; i<n; i++) { 230 ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 231 } 232 PetscFunctionReturn(0); 233 } 234 235 /* -------------------------------------------------------*/ 236 /* Should check that shapes of vectors and matrices match */ 237 /* -------------------------------------------------------*/ 238 239 #undef __FUNCT__ 240 #define __FUNCT__ "MatMult_SeqSBAIJ_2" 241 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz) 242 { 243 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 244 PetscScalar *z,x1,x2,zero=0.0; 245 const PetscScalar *x,*xb; 246 const MatScalar *v; 247 PetscErrorCode ierr; 248 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 249 const PetscInt *aj=a->j,*ai=a->i,*ib; 250 PetscInt nonzerorow=0; 251 252 PetscFunctionBegin; 253 ierr = VecSet(zz,zero);CHKERRQ(ierr); 254 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 255 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 256 257 v = a->a; 258 xb = x; 259 260 for (i=0; i<mbs; i++) { 261 n = ai[1] - ai[0]; /* length of i_th block row of A */ 262 x1 = xb[0]; x2 = xb[1]; 263 ib = aj + *ai; 264 jmin = 0; 265 nonzerorow += (n>0); 266 if (*ib == i) { /* (diag of A)*x */ 267 z[2*i] += v[0]*x1 + v[2]*x2; 268 z[2*i+1] += v[2]*x1 + v[3]*x2; 269 v += 4; jmin++; 270 } 271 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 272 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 273 for (j=jmin; j<n; j++) { 274 /* (strict lower triangular part of A)*x */ 275 cval = ib[j]*2; 276 z[cval] += v[0]*x1 + v[1]*x2; 277 z[cval+1] += v[2]*x1 + v[3]*x2; 278 /* (strict upper triangular part of A)*x */ 279 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 280 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 281 v += 4; 282 } 283 xb +=2; ai++; 284 } 285 286 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 287 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 288 ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 289 PetscFunctionReturn(0); 290 } 291 292 #undef __FUNCT__ 293 #define __FUNCT__ "MatMult_SeqSBAIJ_3" 294 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz) 295 { 296 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 297 PetscScalar *z,x1,x2,x3,zero=0.0; 298 const PetscScalar *x,*xb; 299 const MatScalar *v; 300 PetscErrorCode ierr; 301 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 302 const PetscInt *aj = a->j,*ai = a->i,*ib; 303 PetscInt nonzerorow=0; 304 305 PetscFunctionBegin; 306 ierr = VecSet(zz,zero);CHKERRQ(ierr); 307 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 308 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 309 310 v = a->a; 311 xb = x; 312 313 for (i=0; i<mbs; i++) { 314 n = ai[1] - ai[0]; /* length of i_th block row of A */ 315 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 316 ib = aj + *ai; 317 jmin = 0; 318 nonzerorow += (n>0); 319 if (*ib == i) { /* (diag of A)*x */ 320 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 321 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 322 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 323 v += 9; jmin++; 324 } 325 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 326 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 327 for (j=jmin; j<n; j++) { 328 /* (strict lower triangular part of A)*x */ 329 cval = ib[j]*3; 330 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 331 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 332 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 333 /* (strict upper triangular part of A)*x */ 334 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 335 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 336 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 337 v += 9; 338 } 339 xb +=3; ai++; 340 } 341 342 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 343 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 344 ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 345 PetscFunctionReturn(0); 346 } 347 348 #undef __FUNCT__ 349 #define __FUNCT__ "MatMult_SeqSBAIJ_4" 350 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz) 351 { 352 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 353 PetscScalar *z,x1,x2,x3,x4,zero=0.0; 354 const PetscScalar *x,*xb; 355 const MatScalar *v; 356 PetscErrorCode ierr; 357 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 358 const PetscInt *aj = a->j,*ai = a->i,*ib; 359 PetscInt nonzerorow = 0; 360 361 PetscFunctionBegin; 362 ierr = VecSet(zz,zero);CHKERRQ(ierr); 363 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 364 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 365 366 v = a->a; 367 xb = x; 368 369 for (i=0; i<mbs; i++) { 370 n = ai[1] - ai[0]; /* length of i_th block row of A */ 371 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 372 ib = aj + *ai; 373 jmin = 0; 374 nonzerorow += (n>0); 375 if (*ib == i) { /* (diag of A)*x */ 376 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 377 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 378 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 379 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 380 v += 16; jmin++; 381 } 382 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 383 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 384 for (j=jmin; j<n; j++) { 385 /* (strict lower triangular part of A)*x */ 386 cval = ib[j]*4; 387 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 388 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 389 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 390 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 391 /* (strict upper triangular part of A)*x */ 392 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 393 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 394 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 395 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 396 v += 16; 397 } 398 xb +=4; ai++; 399 } 400 401 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 402 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 403 ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 404 PetscFunctionReturn(0); 405 } 406 407 #undef __FUNCT__ 408 #define __FUNCT__ "MatMult_SeqSBAIJ_5" 409 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz) 410 { 411 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 412 PetscScalar *z,x1,x2,x3,x4,x5,zero=0.0; 413 const PetscScalar *x,*xb; 414 const MatScalar *v; 415 PetscErrorCode ierr; 416 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 417 const PetscInt *aj = a->j,*ai = a->i,*ib; 418 PetscInt nonzerorow=0; 419 420 PetscFunctionBegin; 421 ierr = VecSet(zz,zero);CHKERRQ(ierr); 422 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 423 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 424 425 v = a->a; 426 xb = x; 427 428 for (i=0; i<mbs; i++) { 429 n = ai[1] - ai[0]; /* length of i_th block row of A */ 430 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 431 ib = aj + *ai; 432 jmin = 0; 433 nonzerorow += (n>0); 434 if (*ib == i) { /* (diag of A)*x */ 435 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 436 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 437 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 438 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 439 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 440 v += 25; jmin++; 441 } 442 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 443 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 444 for (j=jmin; j<n; j++) { 445 /* (strict lower triangular part of A)*x */ 446 cval = ib[j]*5; 447 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 448 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 449 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 450 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 451 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 452 /* (strict upper triangular part of A)*x */ 453 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 454 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 455 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 456 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 457 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 458 v += 25; 459 } 460 xb +=5; ai++; 461 } 462 463 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 464 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 465 ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 466 PetscFunctionReturn(0); 467 } 468 469 470 #undef __FUNCT__ 471 #define __FUNCT__ "MatMult_SeqSBAIJ_6" 472 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz) 473 { 474 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 475 PetscScalar *z,x1,x2,x3,x4,x5,x6,zero=0.0; 476 const PetscScalar *x,*xb; 477 const MatScalar *v; 478 PetscErrorCode ierr; 479 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 480 const PetscInt *aj=a->j,*ai=a->i,*ib; 481 PetscInt nonzerorow=0; 482 483 PetscFunctionBegin; 484 ierr = VecSet(zz,zero);CHKERRQ(ierr); 485 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 486 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 487 488 v = a->a; 489 xb = x; 490 491 for (i=0; i<mbs; i++) { 492 n = ai[1] - ai[0]; /* length of i_th block row of A */ 493 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 494 ib = aj + *ai; 495 jmin = 0; 496 nonzerorow += (n>0); 497 if (*ib == i) { /* (diag of A)*x */ 498 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 499 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 500 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 501 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 502 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 503 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 504 v += 36; jmin++; 505 } 506 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 507 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 508 for (j=jmin; j<n; j++) { 509 /* (strict lower triangular part of A)*x */ 510 cval = ib[j]*6; 511 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 512 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 513 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 514 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 515 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 516 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 517 /* (strict upper triangular part of A)*x */ 518 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 519 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 520 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 521 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 522 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 523 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 524 v += 36; 525 } 526 xb +=6; ai++; 527 } 528 529 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 530 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 531 ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 532 PetscFunctionReturn(0); 533 } 534 #undef __FUNCT__ 535 #define __FUNCT__ "MatMult_SeqSBAIJ_7" 536 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz) 537 { 538 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 539 PetscScalar *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0; 540 const PetscScalar *x,*xb; 541 const MatScalar *v; 542 PetscErrorCode ierr; 543 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 544 const PetscInt *aj=a->j,*ai=a->i,*ib; 545 PetscInt nonzerorow=0; 546 547 PetscFunctionBegin; 548 ierr = VecSet(zz,zero);CHKERRQ(ierr); 549 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 550 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 551 552 v = a->a; 553 xb = x; 554 555 for (i=0; i<mbs; i++) { 556 n = ai[1] - ai[0]; /* length of i_th block row of A */ 557 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 558 ib = aj + *ai; 559 jmin = 0; 560 nonzerorow += (n>0); 561 if (*ib == i) { /* (diag of A)*x */ 562 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 563 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 564 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 565 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 566 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 567 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 568 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 569 v += 49; jmin++; 570 } 571 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 572 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 573 for (j=jmin; j<n; j++) { 574 /* (strict lower triangular part of A)*x */ 575 cval = ib[j]*7; 576 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 577 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 578 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 579 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 580 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 581 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 582 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 583 /* (strict upper triangular part of A)*x */ 584 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 585 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 586 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 587 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 588 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 589 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 590 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 591 v += 49; 592 } 593 xb +=7; ai++; 594 } 595 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 596 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 597 ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 598 PetscFunctionReturn(0); 599 } 600 601 /* 602 This will not work with MatScalar == float because it calls the BLAS 603 */ 604 #undef __FUNCT__ 605 #define __FUNCT__ "MatMult_SeqSBAIJ_N" 606 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz) 607 { 608 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 609 PetscScalar *z,*z_ptr,*zb,*work,*workt,zero=0.0; 610 const PetscScalar *x,*x_ptr,*xb; 611 const MatScalar *v; 612 PetscErrorCode ierr; 613 PetscInt mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 614 const PetscInt *idx,*aj,*ii; 615 PetscInt nonzerorow=0; 616 617 PetscFunctionBegin; 618 ierr = VecSet(zz,zero);CHKERRQ(ierr); 619 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);x_ptr = x; 620 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 621 622 aj = a->j; 623 v = a->a; 624 ii = a->i; 625 626 if (!a->mult_work) { 627 ierr = PetscMalloc1(A->rmap->N+1,&a->mult_work);CHKERRQ(ierr); 628 } 629 work = a->mult_work; 630 631 for (i=0; i<mbs; i++) { 632 n = ii[1] - ii[0]; ncols = n*bs; 633 workt = work; idx=aj+ii[0]; 634 nonzerorow += (n>0); 635 636 /* upper triangular part */ 637 for (j=0; j<n; j++) { 638 xb = x_ptr + bs*(*idx++); 639 for (k=0; k<bs; k++) workt[k] = xb[k]; 640 workt += bs; 641 } 642 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 643 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 644 645 /* strict lower triangular part */ 646 idx = aj+ii[0]; 647 if (*idx == i) { 648 ncols -= bs; v += bs2; idx++; n--; 649 } 650 651 if (ncols > 0) { 652 workt = work; 653 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 654 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 655 for (j=0; j<n; j++) { 656 zb = z_ptr + bs*(*idx++); 657 for (k=0; k<bs; k++) zb[k] += workt[k]; 658 workt += bs; 659 } 660 } 661 x += bs; v += n*bs2; z += bs; ii++; 662 } 663 664 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 665 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 666 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr); 667 PetscFunctionReturn(0); 668 } 669 670 #undef __FUNCT__ 671 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1" 672 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 673 { 674 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 675 PetscScalar *z,x1; 676 const PetscScalar *x,*xb; 677 const MatScalar *v; 678 PetscErrorCode ierr; 679 PetscInt mbs =a->mbs,i,n,cval,j,jmin; 680 const PetscInt *aj=a->j,*ai=a->i,*ib; 681 PetscInt nonzerorow=0; 682 683 PetscFunctionBegin; 684 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 685 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 686 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 687 v = a->a; 688 xb = x; 689 690 for (i=0; i<mbs; i++) { 691 n = ai[1] - ai[0]; /* length of i_th row of A */ 692 x1 = xb[0]; 693 ib = aj + *ai; 694 jmin = 0; 695 nonzerorow += (n>0); 696 if (*ib == i) { /* (diag of A)*x */ 697 z[i] += *v++ * x[*ib++]; jmin++; 698 } 699 for (j=jmin; j<n; j++) { 700 cval = *ib; 701 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 702 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 703 } 704 xb++; ai++; 705 } 706 707 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 708 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 709 710 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 711 PetscFunctionReturn(0); 712 } 713 714 #undef __FUNCT__ 715 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2" 716 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 717 { 718 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 719 PetscScalar *z,x1,x2; 720 const PetscScalar *x,*xb; 721 const MatScalar *v; 722 PetscErrorCode ierr; 723 PetscInt mbs =a->mbs,i,n,cval,j,jmin; 724 const PetscInt *aj=a->j,*ai=a->i,*ib; 725 PetscInt nonzerorow=0; 726 727 PetscFunctionBegin; 728 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 729 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 730 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 731 732 v = a->a; 733 xb = x; 734 735 for (i=0; i<mbs; i++) { 736 n = ai[1] - ai[0]; /* length of i_th block row of A */ 737 x1 = xb[0]; x2 = xb[1]; 738 ib = aj + *ai; 739 jmin = 0; 740 nonzerorow += (n>0); 741 if (*ib == i) { /* (diag of A)*x */ 742 z[2*i] += v[0]*x1 + v[2]*x2; 743 z[2*i+1] += v[2]*x1 + v[3]*x2; 744 v += 4; jmin++; 745 } 746 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 747 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 748 for (j=jmin; j<n; j++) { 749 /* (strict lower triangular part of A)*x */ 750 cval = ib[j]*2; 751 z[cval] += v[0]*x1 + v[1]*x2; 752 z[cval+1] += v[2]*x1 + v[3]*x2; 753 /* (strict upper triangular part of A)*x */ 754 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 755 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 756 v += 4; 757 } 758 xb +=2; ai++; 759 } 760 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 761 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 762 763 ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 764 PetscFunctionReturn(0); 765 } 766 767 #undef __FUNCT__ 768 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3" 769 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 770 { 771 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 772 PetscScalar *z,x1,x2,x3; 773 const PetscScalar *x,*xb; 774 const MatScalar *v; 775 PetscErrorCode ierr; 776 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 777 const PetscInt *aj=a->j,*ai=a->i,*ib; 778 PetscInt nonzerorow=0; 779 780 PetscFunctionBegin; 781 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 782 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 783 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 784 785 v = a->a; 786 xb = x; 787 788 for (i=0; i<mbs; i++) { 789 n = ai[1] - ai[0]; /* length of i_th block row of A */ 790 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 791 ib = aj + *ai; 792 jmin = 0; 793 nonzerorow += (n>0); 794 if (*ib == i) { /* (diag of A)*x */ 795 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 796 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 797 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 798 v += 9; jmin++; 799 } 800 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 801 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 802 for (j=jmin; j<n; j++) { 803 /* (strict lower triangular part of A)*x */ 804 cval = ib[j]*3; 805 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 806 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 807 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 808 /* (strict upper triangular part of A)*x */ 809 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 810 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 811 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 812 v += 9; 813 } 814 xb +=3; ai++; 815 } 816 817 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 818 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 819 820 ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 821 PetscFunctionReturn(0); 822 } 823 824 #undef __FUNCT__ 825 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4" 826 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 827 { 828 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 829 PetscScalar *z,x1,x2,x3,x4; 830 const PetscScalar *x,*xb; 831 const MatScalar *v; 832 PetscErrorCode ierr; 833 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 834 const PetscInt *aj=a->j,*ai=a->i,*ib; 835 PetscInt nonzerorow=0; 836 837 PetscFunctionBegin; 838 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 839 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 840 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 841 842 v = a->a; 843 xb = x; 844 845 for (i=0; i<mbs; i++) { 846 n = ai[1] - ai[0]; /* length of i_th block row of A */ 847 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 848 ib = aj + *ai; 849 jmin = 0; 850 nonzerorow += (n>0); 851 if (*ib == i) { /* (diag of A)*x */ 852 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 853 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 854 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 855 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 856 v += 16; jmin++; 857 } 858 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 859 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 860 for (j=jmin; j<n; j++) { 861 /* (strict lower triangular part of A)*x */ 862 cval = ib[j]*4; 863 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 864 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 865 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 866 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 867 /* (strict upper triangular part of A)*x */ 868 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 869 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 870 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 871 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 872 v += 16; 873 } 874 xb +=4; ai++; 875 } 876 877 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 878 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 879 880 ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 881 PetscFunctionReturn(0); 882 } 883 884 #undef __FUNCT__ 885 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5" 886 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 887 { 888 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 889 PetscScalar *z,x1,x2,x3,x4,x5; 890 const PetscScalar *x,*xb; 891 const MatScalar *v; 892 PetscErrorCode ierr; 893 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 894 const PetscInt *aj=a->j,*ai=a->i,*ib; 895 PetscInt nonzerorow=0; 896 897 PetscFunctionBegin; 898 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 899 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 900 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 901 902 v = a->a; 903 xb = x; 904 905 for (i=0; i<mbs; i++) { 906 n = ai[1] - ai[0]; /* length of i_th block row of A */ 907 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 908 ib = aj + *ai; 909 jmin = 0; 910 nonzerorow += (n>0); 911 if (*ib == i) { /* (diag of A)*x */ 912 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 913 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 914 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 915 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 916 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 917 v += 25; jmin++; 918 } 919 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 920 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 921 for (j=jmin; j<n; j++) { 922 /* (strict lower triangular part of A)*x */ 923 cval = ib[j]*5; 924 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 925 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 926 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 927 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 928 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 929 /* (strict upper triangular part of A)*x */ 930 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 931 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 932 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 933 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 934 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 935 v += 25; 936 } 937 xb +=5; ai++; 938 } 939 940 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 941 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 942 943 ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 944 PetscFunctionReturn(0); 945 } 946 #undef __FUNCT__ 947 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6" 948 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 949 { 950 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 951 PetscScalar *z,x1,x2,x3,x4,x5,x6; 952 const PetscScalar *x,*xb; 953 const MatScalar *v; 954 PetscErrorCode ierr; 955 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 956 const PetscInt *aj=a->j,*ai=a->i,*ib; 957 PetscInt nonzerorow=0; 958 959 PetscFunctionBegin; 960 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 961 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 962 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 963 964 v = a->a; 965 xb = x; 966 967 for (i=0; i<mbs; i++) { 968 n = ai[1] - ai[0]; /* length of i_th block row of A */ 969 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 970 ib = aj + *ai; 971 jmin = 0; 972 nonzerorow += (n>0); 973 if (*ib == i) { /* (diag of A)*x */ 974 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 975 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 976 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 977 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 978 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 979 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 980 v += 36; jmin++; 981 } 982 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 983 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 984 for (j=jmin; j<n; j++) { 985 /* (strict lower triangular part of A)*x */ 986 cval = ib[j]*6; 987 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 988 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 989 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 990 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 991 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 992 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 993 /* (strict upper triangular part of A)*x */ 994 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 995 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 996 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 997 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 998 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 999 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 1000 v += 36; 1001 } 1002 xb +=6; ai++; 1003 } 1004 1005 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1006 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1007 1008 ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1009 PetscFunctionReturn(0); 1010 } 1011 1012 #undef __FUNCT__ 1013 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7" 1014 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1015 { 1016 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1017 PetscScalar *z,x1,x2,x3,x4,x5,x6,x7; 1018 const PetscScalar *x,*xb; 1019 const MatScalar *v; 1020 PetscErrorCode ierr; 1021 PetscInt mbs = a->mbs,i,n,cval,j,jmin; 1022 const PetscInt *aj=a->j,*ai=a->i,*ib; 1023 PetscInt nonzerorow=0; 1024 1025 PetscFunctionBegin; 1026 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1027 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1028 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1029 1030 v = a->a; 1031 xb = x; 1032 1033 for (i=0; i<mbs; i++) { 1034 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1035 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 1036 ib = aj + *ai; 1037 jmin = 0; 1038 nonzerorow += (n>0); 1039 if (*ib == i) { /* (diag of A)*x */ 1040 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 1041 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 1042 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 1043 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 1044 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 1045 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 1046 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1047 v += 49; jmin++; 1048 } 1049 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1050 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1051 for (j=jmin; j<n; j++) { 1052 /* (strict lower triangular part of A)*x */ 1053 cval = ib[j]*7; 1054 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 1055 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 1056 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 1057 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 1058 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 1059 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 1060 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1061 /* (strict upper triangular part of A)*x */ 1062 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 1063 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 1064 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 1065 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 1066 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 1067 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 1068 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 1069 v += 49; 1070 } 1071 xb +=7; ai++; 1072 } 1073 1074 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1075 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1076 1077 ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1078 PetscFunctionReturn(0); 1079 } 1080 1081 #undef __FUNCT__ 1082 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N" 1083 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1084 { 1085 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1086 PetscScalar *z,*z_ptr=0,*zb,*work,*workt; 1087 const PetscScalar *x,*x_ptr,*xb; 1088 const MatScalar *v; 1089 PetscErrorCode ierr; 1090 PetscInt mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 1091 const PetscInt *idx,*aj,*ii; 1092 PetscInt nonzerorow=0; 1093 1094 PetscFunctionBegin; 1095 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1096 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); x_ptr=x; 1097 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 1098 1099 aj = a->j; 1100 v = a->a; 1101 ii = a->i; 1102 1103 if (!a->mult_work) { 1104 ierr = PetscMalloc1(A->rmap->n+1,&a->mult_work);CHKERRQ(ierr); 1105 } 1106 work = a->mult_work; 1107 1108 1109 for (i=0; i<mbs; i++) { 1110 n = ii[1] - ii[0]; ncols = n*bs; 1111 workt = work; idx=aj+ii[0]; 1112 nonzerorow += (n>0); 1113 1114 /* upper triangular part */ 1115 for (j=0; j<n; j++) { 1116 xb = x_ptr + bs*(*idx++); 1117 for (k=0; k<bs; k++) workt[k] = xb[k]; 1118 workt += bs; 1119 } 1120 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1121 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1122 1123 /* strict lower triangular part */ 1124 idx = aj+ii[0]; 1125 if (*idx == i) { 1126 ncols -= bs; v += bs2; idx++; n--; 1127 } 1128 if (ncols > 0) { 1129 workt = work; 1130 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1131 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 1132 for (j=0; j<n; j++) { 1133 zb = z_ptr + bs*(*idx++); 1134 for (k=0; k<bs; k++) zb[k] += workt[k]; 1135 workt += bs; 1136 } 1137 } 1138 1139 x += bs; v += n*bs2; z += bs; ii++; 1140 } 1141 1142 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1143 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1144 1145 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1146 PetscFunctionReturn(0); 1147 } 1148 1149 #undef __FUNCT__ 1150 #define __FUNCT__ "MatScale_SeqSBAIJ" 1151 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha) 1152 { 1153 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data; 1154 PetscScalar oalpha = alpha; 1155 PetscErrorCode ierr; 1156 PetscBLASInt one = 1,totalnz; 1157 1158 PetscFunctionBegin; 1159 ierr = PetscBLASIntCast(a->bs2*a->nz,&totalnz);CHKERRQ(ierr); 1160 PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one)); 1161 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 1162 PetscFunctionReturn(0); 1163 } 1164 1165 #undef __FUNCT__ 1166 #define __FUNCT__ "MatNorm_SeqSBAIJ" 1167 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm) 1168 { 1169 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1170 const MatScalar *v = a->a; 1171 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1172 PetscInt i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il; 1173 PetscErrorCode ierr; 1174 const PetscInt *aj=a->j,*col; 1175 1176 PetscFunctionBegin; 1177 if (type == NORM_FROBENIUS) { 1178 for (k=0; k<mbs; k++) { 1179 jmin = a->i[k]; jmax = a->i[k+1]; 1180 col = aj + jmin; 1181 if (*col == k) { /* diagonal block */ 1182 for (i=0; i<bs2; i++) { 1183 sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++; 1184 } 1185 jmin++; 1186 } 1187 for (j=jmin; j<jmax; j++) { /* off-diagonal blocks */ 1188 for (i=0; i<bs2; i++) { 1189 sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++; 1190 } 1191 } 1192 } 1193 *norm = PetscSqrtReal(sum_diag + 2*sum_off); 1194 } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 1195 ierr = PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);CHKERRQ(ierr); 1196 for (i=0; i<mbs; i++) jl[i] = mbs; 1197 il[0] = 0; 1198 1199 *norm = 0.0; 1200 for (k=0; k<mbs; k++) { /* k_th block row */ 1201 for (j=0; j<bs; j++) sum[j]=0.0; 1202 /*-- col sum --*/ 1203 i = jl[k]; /* first |A(i,k)| to be added */ 1204 /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1205 at step k */ 1206 while (i<mbs) { 1207 nexti = jl[i]; /* next block row to be added */ 1208 ik = il[i]; /* block index of A(i,k) in the array a */ 1209 for (j=0; j<bs; j++) { 1210 v = a->a + ik*bs2 + j*bs; 1211 for (k1=0; k1<bs; k1++) { 1212 sum[j] += PetscAbsScalar(*v); v++; 1213 } 1214 } 1215 /* update il, jl */ 1216 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1217 jmax = a->i[i+1]; 1218 if (jmin < jmax) { 1219 il[i] = jmin; 1220 j = a->j[jmin]; 1221 jl[i] = jl[j]; jl[j]=i; 1222 } 1223 i = nexti; 1224 } 1225 /*-- row sum --*/ 1226 jmin = a->i[k]; jmax = a->i[k+1]; 1227 for (i=jmin; i<jmax; i++) { 1228 for (j=0; j<bs; j++) { 1229 v = a->a + i*bs2 + j; 1230 for (k1=0; k1<bs; k1++) { 1231 sum[j] += PetscAbsScalar(*v); v += bs; 1232 } 1233 } 1234 } 1235 /* add k_th block row to il, jl */ 1236 col = aj+jmin; 1237 if (*col == k) jmin++; 1238 if (jmin < jmax) { 1239 il[k] = jmin; 1240 j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k; 1241 } 1242 for (j=0; j<bs; j++) { 1243 if (sum[j] > *norm) *norm = sum[j]; 1244 } 1245 } 1246 ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr); 1247 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 1248 PetscFunctionReturn(0); 1249 } 1250 1251 #undef __FUNCT__ 1252 #define __FUNCT__ "MatEqual_SeqSBAIJ" 1253 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg) 1254 { 1255 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data; 1256 PetscErrorCode ierr; 1257 1258 PetscFunctionBegin; 1259 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1260 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1261 *flg = PETSC_FALSE; 1262 PetscFunctionReturn(0); 1263 } 1264 1265 /* if the a->i are the same */ 1266 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1267 if (!*flg) PetscFunctionReturn(0); 1268 1269 /* if a->j are the same */ 1270 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1271 if (!*flg) PetscFunctionReturn(0); 1272 1273 /* if a->a are the same */ 1274 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1275 PetscFunctionReturn(0); 1276 } 1277 1278 #undef __FUNCT__ 1279 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ" 1280 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v) 1281 { 1282 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1283 PetscErrorCode ierr; 1284 PetscInt i,j,k,row,bs,ambs,bs2; 1285 const PetscInt *ai,*aj; 1286 PetscScalar *x,zero = 0.0; 1287 const MatScalar *aa,*aa_j; 1288 1289 PetscFunctionBegin; 1290 bs = A->rmap->bs; 1291 if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1"); 1292 1293 aa = a->a; 1294 ambs = a->mbs; 1295 1296 if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) { 1297 PetscInt *diag=a->diag; 1298 aa = a->a; 1299 ambs = a->mbs; 1300 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1301 for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]]; 1302 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1303 PetscFunctionReturn(0); 1304 } 1305 1306 ai = a->i; 1307 aj = a->j; 1308 bs2 = a->bs2; 1309 ierr = VecSet(v,zero);CHKERRQ(ierr); 1310 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1311 for (i=0; i<ambs; i++) { 1312 j=ai[i]; 1313 if (aj[j] == i) { /* if this is a diagonal element */ 1314 row = i*bs; 1315 aa_j = aa + j*bs2; 1316 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1317 } 1318 } 1319 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1320 PetscFunctionReturn(0); 1321 } 1322 1323 #undef __FUNCT__ 1324 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ" 1325 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr) 1326 { 1327 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1328 PetscScalar x; 1329 const PetscScalar *l,*li,*ri; 1330 MatScalar *aa,*v; 1331 PetscErrorCode ierr; 1332 PetscInt i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2; 1333 PetscBool flg; 1334 1335 PetscFunctionBegin; 1336 if (ll != rr) { 1337 ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr); 1338 if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n"); 1339 } 1340 if (!ll) PetscFunctionReturn(0); 1341 ai = a->i; 1342 aj = a->j; 1343 aa = a->a; 1344 m = A->rmap->N; 1345 bs = A->rmap->bs; 1346 mbs = a->mbs; 1347 bs2 = a->bs2; 1348 1349 ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr); 1350 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1351 if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 1352 for (i=0; i<mbs; i++) { /* for each block row */ 1353 M = ai[i+1] - ai[i]; 1354 li = l + i*bs; 1355 v = aa + bs2*ai[i]; 1356 for (j=0; j<M; j++) { /* for each block */ 1357 ri = l + bs*aj[ai[i]+j]; 1358 for (k=0; k<bs; k++) { 1359 x = ri[k]; 1360 for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x; 1361 } 1362 } 1363 } 1364 ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr); 1365 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1366 PetscFunctionReturn(0); 1367 } 1368 1369 #undef __FUNCT__ 1370 #define __FUNCT__ "MatGetInfo_SeqSBAIJ" 1371 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info) 1372 { 1373 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1374 1375 PetscFunctionBegin; 1376 info->block_size = a->bs2; 1377 info->nz_allocated = a->bs2*a->maxnz; /*num. of nonzeros in upper triangular part */ 1378 info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */ 1379 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 1380 info->assemblies = A->num_ass; 1381 info->mallocs = A->info.mallocs; 1382 info->memory = ((PetscObject)A)->mem; 1383 if (A->factortype) { 1384 info->fill_ratio_given = A->info.fill_ratio_given; 1385 info->fill_ratio_needed = A->info.fill_ratio_needed; 1386 info->factor_mallocs = A->info.factor_mallocs; 1387 } else { 1388 info->fill_ratio_given = 0; 1389 info->fill_ratio_needed = 0; 1390 info->factor_mallocs = 0; 1391 } 1392 PetscFunctionReturn(0); 1393 } 1394 1395 1396 #undef __FUNCT__ 1397 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ" 1398 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 1399 { 1400 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1401 PetscErrorCode ierr; 1402 1403 PetscFunctionBegin; 1404 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 1405 PetscFunctionReturn(0); 1406 } 1407 1408 #undef __FUNCT__ 1409 #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ" 1410 /* 1411 This code does not work since it only checks the upper triangular part of 1412 the matrix. Hence it is not listed in the function table. 1413 */ 1414 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[]) 1415 { 1416 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1417 PetscErrorCode ierr; 1418 PetscInt i,j,n,row,col,bs,mbs; 1419 const PetscInt *ai,*aj; 1420 PetscReal atmp; 1421 const MatScalar *aa; 1422 PetscScalar *x; 1423 PetscInt ncols,brow,bcol,krow,kcol; 1424 1425 PetscFunctionBegin; 1426 if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov"); 1427 if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1428 bs = A->rmap->bs; 1429 aa = a->a; 1430 ai = a->i; 1431 aj = a->j; 1432 mbs = a->mbs; 1433 1434 ierr = VecSet(v,0.0);CHKERRQ(ierr); 1435 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1436 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1437 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1438 for (i=0; i<mbs; i++) { 1439 ncols = ai[1] - ai[0]; ai++; 1440 brow = bs*i; 1441 for (j=0; j<ncols; j++) { 1442 bcol = bs*(*aj); 1443 for (kcol=0; kcol<bs; kcol++) { 1444 col = bcol + kcol; /* col index */ 1445 for (krow=0; krow<bs; krow++) { 1446 atmp = PetscAbsScalar(*aa); aa++; 1447 row = brow + krow; /* row index */ 1448 if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1449 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 1450 } 1451 } 1452 aj++; 1453 } 1454 } 1455 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1456 PetscFunctionReturn(0); 1457 } 1458