1 #define PETSCMAT_DLL 2 3 #include "../src/mat/impls/baij/seq/baij.h" 4 #include "../src/mat/blockinvert.h" 5 #include "petscbt.h" 6 #include "../src/mat/impls/sbaij/seq/sbaij.h" 7 #include "petscblaslapack.h" 8 9 #undef __FUNCT__ 10 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ" 11 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 12 { 13 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 14 PetscErrorCode ierr; 15 PetscInt brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2; 16 const PetscInt *idx; 17 PetscBT table,table0; 18 19 PetscFunctionBegin; 20 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 21 mbs = a->mbs; 22 ai = a->i; 23 aj = a->j; 24 bs = A->rmap->bs; 25 ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr); 26 ierr = PetscMalloc((mbs+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr); 27 ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr); 28 ierr = PetscBTCreate(mbs,table0);CHKERRQ(ierr); 29 30 for (i=0; i<is_max; i++) { /* for each is */ 31 isz = 0; 32 ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr); 33 34 /* Extract the indices, assume there can be duplicate entries */ 35 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 36 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 37 38 /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */ 39 bcol_max = 0; 40 for (j=0; j<n ; ++j){ 41 brow = idx[j]/bs; /* convert the indices into block indices */ 42 if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 43 if(!PetscBTLookupSet(table,brow)) { 44 nidx[isz++] = brow; 45 if (bcol_max < brow) bcol_max = brow; 46 } 47 } 48 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 49 ierr = ISDestroy(is[i]);CHKERRQ(ierr); 50 51 k = 0; 52 for (j=0; j<ov; j++){ /* for each overlap */ 53 /* set table0 for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 54 ierr = PetscBTMemzero(mbs,table0);CHKERRQ(ierr); 55 for (l=k; l<isz; l++) { ierr = PetscBTSet(table0,nidx[l]);CHKERRQ(ierr); } 56 57 n = isz; /* length of the updated is[i] */ 58 for (brow=0; brow<mbs; brow++){ 59 start = ai[brow]; end = ai[brow+1]; 60 if (PetscBTLookup(table0,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */ 61 for (l = start; l<end ; l++){ 62 bcol = aj[l]; 63 if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;} 64 } 65 k++; 66 if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 67 } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 68 for (l = start; l<end ; l++){ 69 bcol = aj[l]; 70 if (bcol > bcol_max) break; 71 if (PetscBTLookup(table0,bcol)){ 72 if (!PetscBTLookupSet(table,brow)) {nidx[isz++] = brow;} 73 break; /* for l = start; l<end ; l++) */ 74 } 75 } 76 } 77 } 78 } /* for each overlap */ 79 80 /* expand the Index Set */ 81 for (j=0; j<isz; j++) { 82 for (k=0; k<bs; k++) 83 nidx2[j*bs+k] = nidx[j]*bs+k; 84 } 85 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 86 } 87 ierr = PetscBTDestroy(table);CHKERRQ(ierr); 88 ierr = PetscFree(nidx);CHKERRQ(ierr); 89 ierr = PetscFree(nidx2);CHKERRQ(ierr); 90 ierr = PetscBTDestroy(table0);CHKERRQ(ierr); 91 PetscFunctionReturn(0); 92 } 93 94 #undef __FUNCT__ 95 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private" 96 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 97 { 98 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c; 99 PetscErrorCode ierr; 100 PetscInt *smap,i,k,kstart,kend,oldcols = a->mbs,*lens; 101 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 102 PetscInt nrows,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 103 const PetscInt *irow,*aj = a->j,*ai = a->i; 104 MatScalar *mat_a; 105 Mat C; 106 PetscBool flag,sorted; 107 108 PetscFunctionBegin; 109 if (isrow != iscol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro"); 110 ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr); 111 if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 112 113 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 114 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 115 116 ierr = PetscMalloc(oldcols*sizeof(PetscInt),&smap);CHKERRQ(ierr); 117 ierr = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr); 118 ssmap = smap; 119 ierr = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr); 120 for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */ 121 /* determine lens of each row */ 122 for (i=0; i<nrows; i++) { 123 kstart = ai[irow[i]]; 124 kend = kstart + a->ilen[irow[i]]; 125 lens[i] = 0; 126 for (k=kstart; k<kend; k++) { 127 if (ssmap[aj[k]]) { 128 lens[i]++; 129 } 130 } 131 } 132 /* Create and fill new matrix */ 133 if (scall == MAT_REUSE_MATRIX) { 134 c = (Mat_SeqSBAIJ *)((*B)->data); 135 136 if (c->mbs!=nrows || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 137 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 138 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 139 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 140 C = *B; 141 } else { 142 ierr = MatCreate(((PetscObject)A)->comm,&C);CHKERRQ(ierr); 143 ierr = MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 144 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 145 ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr); 146 } 147 c = (Mat_SeqSBAIJ *)(C->data); 148 for (i=0; i<nrows; i++) { 149 row = irow[i]; 150 kstart = ai[row]; 151 kend = kstart + a->ilen[row]; 152 mat_i = c->i[i]; 153 mat_j = c->j + mat_i; 154 mat_a = c->a + mat_i*bs2; 155 mat_ilen = c->ilen + i; 156 for (k=kstart; k<kend; k++) { 157 if ((tcol=ssmap[a->j[k]])) { 158 *mat_j++ = tcol - 1; 159 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 160 mat_a += bs2; 161 (*mat_ilen)++; 162 } 163 } 164 } 165 166 /* Free work space */ 167 ierr = PetscFree(smap);CHKERRQ(ierr); 168 ierr = PetscFree(lens);CHKERRQ(ierr); 169 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 170 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 171 172 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 173 *B = C; 174 PetscFunctionReturn(0); 175 } 176 177 #undef __FUNCT__ 178 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ" 179 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 180 { 181 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 182 IS is1; 183 PetscErrorCode ierr; 184 PetscInt *vary,*iary,nrows,i,bs=A->rmap->bs,count; 185 const PetscInt *irow; 186 187 PetscFunctionBegin; 188 if (isrow != iscol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro"); 189 190 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 191 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 192 193 /* Verify if the indices corespond to each element in a block 194 and form the IS with compressed IS */ 195 ierr = PetscMalloc2(a->mbs,PetscInt,&vary,a->mbs,PetscInt,&iary);CHKERRQ(ierr); 196 ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr); 197 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 198 199 count = 0; 200 for (i=0; i<a->mbs; i++) { 201 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Index set does not match blocks"); 202 if (vary[i]==bs) iary[count++] = i; 203 } 204 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 205 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr); 206 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 207 208 ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,scall,B);CHKERRQ(ierr); 209 ierr = ISDestroy(is1);CHKERRQ(ierr); 210 PetscFunctionReturn(0); 211 } 212 213 #undef __FUNCT__ 214 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ" 215 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 216 { 217 PetscErrorCode ierr; 218 PetscInt i; 219 220 PetscFunctionBegin; 221 if (scall == MAT_INITIAL_MATRIX) { 222 ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr); 223 } 224 225 for (i=0; i<n; i++) { 226 ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 227 } 228 PetscFunctionReturn(0); 229 } 230 231 /* -------------------------------------------------------*/ 232 /* Should check that shapes of vectors and matrices match */ 233 /* -------------------------------------------------------*/ 234 235 #undef __FUNCT__ 236 #define __FUNCT__ "MatMult_SeqSBAIJ_2" 237 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz) 238 { 239 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 240 PetscScalar *x,*z,*xb,x1,x2,zero=0.0; 241 MatScalar *v; 242 PetscErrorCode ierr; 243 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 244 PetscInt nonzerorow=0; 245 246 PetscFunctionBegin; 247 ierr = VecSet(zz,zero);CHKERRQ(ierr); 248 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 249 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 250 251 v = a->a; 252 xb = x; 253 254 for (i=0; i<mbs; i++) { 255 n = ai[1] - ai[0]; /* length of i_th block row of A */ 256 x1 = xb[0]; x2 = xb[1]; 257 ib = aj + *ai; 258 jmin = 0; 259 nonzerorow += (n>0); 260 if (*ib == i){ /* (diag of A)*x */ 261 z[2*i] += v[0]*x1 + v[2]*x2; 262 z[2*i+1] += v[2]*x1 + v[3]*x2; 263 v += 4; jmin++; 264 } 265 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 266 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 267 for (j=jmin; j<n; j++) { 268 /* (strict lower triangular part of A)*x */ 269 cval = ib[j]*2; 270 z[cval] += v[0]*x1 + v[1]*x2; 271 z[cval+1] += v[2]*x1 + v[3]*x2; 272 /* (strict upper triangular part of A)*x */ 273 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 274 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 275 v += 4; 276 } 277 xb +=2; ai++; 278 } 279 280 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 281 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 282 ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 283 PetscFunctionReturn(0); 284 } 285 286 #undef __FUNCT__ 287 #define __FUNCT__ "MatMult_SeqSBAIJ_3" 288 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz) 289 { 290 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 291 PetscScalar *x,*z,*xb,x1,x2,x3,zero=0.0; 292 MatScalar *v; 293 PetscErrorCode ierr; 294 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 295 PetscInt nonzerorow=0; 296 297 PetscFunctionBegin; 298 ierr = VecSet(zz,zero);CHKERRQ(ierr); 299 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 300 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 301 302 v = a->a; 303 xb = x; 304 305 for (i=0; i<mbs; i++) { 306 n = ai[1] - ai[0]; /* length of i_th block row of A */ 307 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 308 ib = aj + *ai; 309 jmin = 0; 310 nonzerorow += (n>0); 311 if (*ib == i){ /* (diag of A)*x */ 312 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 313 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 314 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 315 v += 9; jmin++; 316 } 317 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 318 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 319 for (j=jmin; j<n; j++) { 320 /* (strict lower triangular part of A)*x */ 321 cval = ib[j]*3; 322 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 323 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 324 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 325 /* (strict upper triangular part of A)*x */ 326 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 327 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 328 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 329 v += 9; 330 } 331 xb +=3; ai++; 332 } 333 334 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 335 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 336 ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 337 PetscFunctionReturn(0); 338 } 339 340 #undef __FUNCT__ 341 #define __FUNCT__ "MatMult_SeqSBAIJ_4" 342 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz) 343 { 344 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 345 PetscScalar *x,*z,*xb,x1,x2,x3,x4,zero=0.0; 346 MatScalar *v; 347 PetscErrorCode ierr; 348 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 349 PetscInt nonzerorow=0; 350 351 PetscFunctionBegin; 352 ierr = VecSet(zz,zero);CHKERRQ(ierr); 353 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 354 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 355 356 v = a->a; 357 xb = x; 358 359 for (i=0; i<mbs; i++) { 360 n = ai[1] - ai[0]; /* length of i_th block row of A */ 361 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 362 ib = aj + *ai; 363 jmin = 0; 364 nonzerorow += (n>0); 365 if (*ib == i){ /* (diag of A)*x */ 366 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 367 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 368 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 369 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 370 v += 16; jmin++; 371 } 372 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 373 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 374 for (j=jmin; j<n; j++) { 375 /* (strict lower triangular part of A)*x */ 376 cval = ib[j]*4; 377 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 378 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 379 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 380 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 381 /* (strict upper triangular part of A)*x */ 382 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 383 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 384 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 385 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 386 v += 16; 387 } 388 xb +=4; ai++; 389 } 390 391 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 392 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 393 ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 394 PetscFunctionReturn(0); 395 } 396 397 #undef __FUNCT__ 398 #define __FUNCT__ "MatMult_SeqSBAIJ_5" 399 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz) 400 { 401 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 402 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0; 403 MatScalar *v; 404 PetscErrorCode ierr; 405 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 406 PetscInt nonzerorow=0; 407 408 PetscFunctionBegin; 409 ierr = VecSet(zz,zero);CHKERRQ(ierr); 410 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 411 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 412 413 v = a->a; 414 xb = x; 415 416 for (i=0; i<mbs; i++) { 417 n = ai[1] - ai[0]; /* length of i_th block row of A */ 418 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 419 ib = aj + *ai; 420 jmin = 0; 421 nonzerorow += (n>0); 422 if (*ib == i){ /* (diag of A)*x */ 423 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 424 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 425 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 426 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 427 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 428 v += 25; jmin++; 429 } 430 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 431 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 432 for (j=jmin; j<n; j++) { 433 /* (strict lower triangular part of A)*x */ 434 cval = ib[j]*5; 435 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 436 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 437 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 438 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 439 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 440 /* (strict upper triangular part of A)*x */ 441 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 442 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 443 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 444 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 445 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 446 v += 25; 447 } 448 xb +=5; ai++; 449 } 450 451 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 452 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 453 ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 454 PetscFunctionReturn(0); 455 } 456 457 458 #undef __FUNCT__ 459 #define __FUNCT__ "MatMult_SeqSBAIJ_6" 460 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz) 461 { 462 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 463 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0; 464 MatScalar *v; 465 PetscErrorCode ierr; 466 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 467 PetscInt nonzerorow=0; 468 469 PetscFunctionBegin; 470 ierr = VecSet(zz,zero);CHKERRQ(ierr); 471 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 472 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 473 474 v = a->a; 475 xb = x; 476 477 for (i=0; i<mbs; i++) { 478 n = ai[1] - ai[0]; /* length of i_th block row of A */ 479 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 480 ib = aj + *ai; 481 jmin = 0; 482 nonzerorow += (n>0); 483 if (*ib == i){ /* (diag of A)*x */ 484 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 485 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 486 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 487 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 488 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 489 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 490 v += 36; jmin++; 491 } 492 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 493 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 494 for (j=jmin; j<n; j++) { 495 /* (strict lower triangular part of A)*x */ 496 cval = ib[j]*6; 497 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 498 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 499 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 500 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 501 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 502 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 503 /* (strict upper triangular part of A)*x */ 504 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 505 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 506 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 507 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 508 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 509 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 510 v += 36; 511 } 512 xb +=6; ai++; 513 } 514 515 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 516 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 517 ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 518 PetscFunctionReturn(0); 519 } 520 #undef __FUNCT__ 521 #define __FUNCT__ "MatMult_SeqSBAIJ_7" 522 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz) 523 { 524 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 525 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0; 526 MatScalar *v; 527 PetscErrorCode ierr; 528 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 529 PetscInt nonzerorow=0; 530 531 PetscFunctionBegin; 532 ierr = VecSet(zz,zero);CHKERRQ(ierr); 533 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 534 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 535 536 v = a->a; 537 xb = x; 538 539 for (i=0; i<mbs; i++) { 540 n = ai[1] - ai[0]; /* length of i_th block row of A */ 541 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 542 ib = aj + *ai; 543 jmin = 0; 544 nonzerorow += (n>0); 545 if (*ib == i){ /* (diag of A)*x */ 546 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 547 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 548 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 549 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 550 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 551 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 552 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 553 v += 49; jmin++; 554 } 555 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 556 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 557 for (j=jmin; j<n; j++) { 558 /* (strict lower triangular part of A)*x */ 559 cval = ib[j]*7; 560 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 561 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 562 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 563 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 564 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 565 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 566 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 567 /* (strict upper triangular part of A)*x */ 568 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 569 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 570 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 571 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 572 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 573 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 574 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 575 v += 49; 576 } 577 xb +=7; ai++; 578 } 579 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 580 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 581 ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 582 PetscFunctionReturn(0); 583 } 584 585 /* 586 This will not work with MatScalar == float because it calls the BLAS 587 */ 588 #undef __FUNCT__ 589 #define __FUNCT__ "MatMult_SeqSBAIJ_N" 590 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz) 591 { 592 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 593 PetscScalar *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0; 594 MatScalar *v; 595 PetscErrorCode ierr; 596 PetscInt mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 597 PetscInt nonzerorow=0; 598 599 PetscFunctionBegin; 600 ierr = VecSet(zz,zero);CHKERRQ(ierr); 601 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 602 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 603 604 aj = a->j; 605 v = a->a; 606 ii = a->i; 607 608 if (!a->mult_work) { 609 ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 610 } 611 work = a->mult_work; 612 613 for (i=0; i<mbs; i++) { 614 n = ii[1] - ii[0]; ncols = n*bs; 615 workt = work; idx=aj+ii[0]; 616 nonzerorow += (n>0); 617 618 /* upper triangular part */ 619 for (j=0; j<n; j++) { 620 xb = x_ptr + bs*(*idx++); 621 for (k=0; k<bs; k++) workt[k] = xb[k]; 622 workt += bs; 623 } 624 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 625 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 626 627 /* strict lower triangular part */ 628 idx = aj+ii[0]; 629 if (*idx == i){ 630 ncols -= bs; v += bs2; idx++; n--; 631 } 632 633 if (ncols > 0){ 634 workt = work; 635 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 636 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 637 for (j=0; j<n; j++) { 638 zb = z_ptr + bs*(*idx++); 639 for (k=0; k<bs; k++) zb[k] += workt[k] ; 640 workt += bs; 641 } 642 } 643 x += bs; v += n*bs2; z += bs; ii++; 644 } 645 646 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 647 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 648 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr); 649 PetscFunctionReturn(0); 650 } 651 652 #undef __FUNCT__ 653 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1" 654 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 655 { 656 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 657 PetscScalar *x,*z,*xb,x1; 658 MatScalar *v; 659 PetscErrorCode ierr; 660 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 661 PetscInt nonzerorow=0; 662 663 PetscFunctionBegin; 664 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 665 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 666 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 667 v = a->a; 668 xb = x; 669 670 for (i=0; i<mbs; i++) { 671 n = ai[1] - ai[0]; /* length of i_th row of A */ 672 x1 = xb[0]; 673 ib = aj + *ai; 674 jmin = 0; 675 nonzerorow += (n>0); 676 if (*ib == i) { /* (diag of A)*x */ 677 z[i] += *v++ * x[*ib++]; jmin++; 678 } 679 for (j=jmin; j<n; j++) { 680 cval = *ib; 681 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 682 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 683 } 684 xb++; ai++; 685 } 686 687 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 688 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 689 690 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 691 PetscFunctionReturn(0); 692 } 693 694 #undef __FUNCT__ 695 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2" 696 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 697 { 698 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 699 PetscScalar *x,*z,*xb,x1,x2; 700 MatScalar *v; 701 PetscErrorCode ierr; 702 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 703 PetscInt nonzerorow=0; 704 705 PetscFunctionBegin; 706 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 707 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 708 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 709 710 v = a->a; 711 xb = x; 712 713 for (i=0; i<mbs; i++) { 714 n = ai[1] - ai[0]; /* length of i_th block row of A */ 715 x1 = xb[0]; x2 = xb[1]; 716 ib = aj + *ai; 717 jmin = 0; 718 nonzerorow += (n>0); 719 if (*ib == i){ /* (diag of A)*x */ 720 z[2*i] += v[0]*x1 + v[2]*x2; 721 z[2*i+1] += v[2]*x1 + v[3]*x2; 722 v += 4; jmin++; 723 } 724 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 725 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 726 for (j=jmin; j<n; j++) { 727 /* (strict lower triangular part of A)*x */ 728 cval = ib[j]*2; 729 z[cval] += v[0]*x1 + v[1]*x2; 730 z[cval+1] += v[2]*x1 + v[3]*x2; 731 /* (strict upper triangular part of A)*x */ 732 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 733 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 734 v += 4; 735 } 736 xb +=2; ai++; 737 } 738 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 739 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 740 741 ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 742 PetscFunctionReturn(0); 743 } 744 745 #undef __FUNCT__ 746 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3" 747 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 748 { 749 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 750 PetscScalar *x,*z,*xb,x1,x2,x3; 751 MatScalar *v; 752 PetscErrorCode ierr; 753 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 754 PetscInt nonzerorow=0; 755 756 PetscFunctionBegin; 757 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 758 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 759 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 760 761 v = a->a; 762 xb = x; 763 764 for (i=0; i<mbs; i++) { 765 n = ai[1] - ai[0]; /* length of i_th block row of A */ 766 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 767 ib = aj + *ai; 768 jmin = 0; 769 nonzerorow += (n>0); 770 if (*ib == i){ /* (diag of A)*x */ 771 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 772 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 773 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 774 v += 9; jmin++; 775 } 776 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 777 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 778 for (j=jmin; j<n; j++) { 779 /* (strict lower triangular part of A)*x */ 780 cval = ib[j]*3; 781 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 782 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 783 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 784 /* (strict upper triangular part of A)*x */ 785 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 786 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 787 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 788 v += 9; 789 } 790 xb +=3; ai++; 791 } 792 793 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 794 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 795 796 ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 797 PetscFunctionReturn(0); 798 } 799 800 #undef __FUNCT__ 801 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4" 802 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 803 { 804 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 805 PetscScalar *x,*z,*xb,x1,x2,x3,x4; 806 MatScalar *v; 807 PetscErrorCode ierr; 808 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 809 PetscInt nonzerorow=0; 810 811 PetscFunctionBegin; 812 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 813 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 814 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 815 816 v = a->a; 817 xb = x; 818 819 for (i=0; i<mbs; i++) { 820 n = ai[1] - ai[0]; /* length of i_th block row of A */ 821 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 822 ib = aj + *ai; 823 jmin = 0; 824 nonzerorow += (n>0); 825 if (*ib == i){ /* (diag of A)*x */ 826 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 827 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 828 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 829 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 830 v += 16; jmin++; 831 } 832 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 833 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 834 for (j=jmin; j<n; j++) { 835 /* (strict lower triangular part of A)*x */ 836 cval = ib[j]*4; 837 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 838 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 839 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 840 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 841 /* (strict upper triangular part of A)*x */ 842 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 843 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 844 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 845 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 846 v += 16; 847 } 848 xb +=4; ai++; 849 } 850 851 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 852 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 853 854 ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 855 PetscFunctionReturn(0); 856 } 857 858 #undef __FUNCT__ 859 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5" 860 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 861 { 862 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 863 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5; 864 MatScalar *v; 865 PetscErrorCode ierr; 866 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 867 PetscInt nonzerorow=0; 868 869 PetscFunctionBegin; 870 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 871 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 872 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 873 874 v = a->a; 875 xb = x; 876 877 for (i=0; i<mbs; i++) { 878 n = ai[1] - ai[0]; /* length of i_th block row of A */ 879 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 880 ib = aj + *ai; 881 jmin = 0; 882 nonzerorow += (n>0); 883 if (*ib == i){ /* (diag of A)*x */ 884 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 885 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 886 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 887 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 888 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 889 v += 25; jmin++; 890 } 891 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 892 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 893 for (j=jmin; j<n; j++) { 894 /* (strict lower triangular part of A)*x */ 895 cval = ib[j]*5; 896 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 897 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 898 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 899 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 900 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 901 /* (strict upper triangular part of A)*x */ 902 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 903 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 904 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 905 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 906 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 907 v += 25; 908 } 909 xb +=5; ai++; 910 } 911 912 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 913 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 914 915 ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 916 PetscFunctionReturn(0); 917 } 918 #undef __FUNCT__ 919 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6" 920 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 921 { 922 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 923 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6; 924 MatScalar *v; 925 PetscErrorCode ierr; 926 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 927 PetscInt nonzerorow=0; 928 929 PetscFunctionBegin; 930 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 931 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 932 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 933 934 v = a->a; 935 xb = x; 936 937 for (i=0; i<mbs; i++) { 938 n = ai[1] - ai[0]; /* length of i_th block row of A */ 939 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 940 ib = aj + *ai; 941 jmin = 0; 942 nonzerorow += (n>0); 943 if (*ib == i){ /* (diag of A)*x */ 944 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 945 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 946 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 947 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 948 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 949 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 950 v += 36; jmin++; 951 } 952 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 953 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 954 for (j=jmin; j<n; j++) { 955 /* (strict lower triangular part of A)*x */ 956 cval = ib[j]*6; 957 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 958 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 959 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 960 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 961 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 962 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 963 /* (strict upper triangular part of A)*x */ 964 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 965 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 966 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 967 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 968 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 969 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 970 v += 36; 971 } 972 xb +=6; ai++; 973 } 974 975 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 976 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 977 978 ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 979 PetscFunctionReturn(0); 980 } 981 982 #undef __FUNCT__ 983 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7" 984 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 985 { 986 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 987 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7; 988 MatScalar *v; 989 PetscErrorCode ierr; 990 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 991 PetscInt nonzerorow=0; 992 993 PetscFunctionBegin; 994 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 995 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 996 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 997 998 v = a->a; 999 xb = x; 1000 1001 for (i=0; i<mbs; i++) { 1002 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1003 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 1004 ib = aj + *ai; 1005 jmin = 0; 1006 nonzerorow += (n>0); 1007 if (*ib == i){ /* (diag of A)*x */ 1008 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 1009 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 1010 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 1011 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 1012 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 1013 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 1014 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1015 v += 49; jmin++; 1016 } 1017 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1018 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1019 for (j=jmin; j<n; j++) { 1020 /* (strict lower triangular part of A)*x */ 1021 cval = ib[j]*7; 1022 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 1023 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 1024 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 1025 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 1026 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 1027 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 1028 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1029 /* (strict upper triangular part of A)*x */ 1030 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 1031 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 1032 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 1033 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 1034 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 1035 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 1036 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 1037 v += 49; 1038 } 1039 xb +=7; ai++; 1040 } 1041 1042 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1043 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1044 1045 ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1046 PetscFunctionReturn(0); 1047 } 1048 1049 #undef __FUNCT__ 1050 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N" 1051 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1052 { 1053 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1054 PetscScalar *x,*x_ptr,*z,*z_ptr=0,*xb,*zb,*work,*workt; 1055 MatScalar *v; 1056 PetscErrorCode ierr; 1057 PetscInt mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 1058 PetscInt nonzerorow=0; 1059 1060 PetscFunctionBegin; 1061 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1062 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 1063 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 1064 1065 aj = a->j; 1066 v = a->a; 1067 ii = a->i; 1068 1069 if (!a->mult_work) { 1070 ierr = PetscMalloc((A->rmap->n+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1071 } 1072 work = a->mult_work; 1073 1074 1075 for (i=0; i<mbs; i++) { 1076 n = ii[1] - ii[0]; ncols = n*bs; 1077 workt = work; idx=aj+ii[0]; 1078 nonzerorow += (n>0); 1079 1080 /* upper triangular part */ 1081 for (j=0; j<n; j++) { 1082 xb = x_ptr + bs*(*idx++); 1083 for (k=0; k<bs; k++) workt[k] = xb[k]; 1084 workt += bs; 1085 } 1086 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1087 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1088 1089 /* strict lower triangular part */ 1090 idx = aj+ii[0]; 1091 if (*idx == i){ 1092 ncols -= bs; v += bs2; idx++; n--; 1093 } 1094 if (ncols > 0){ 1095 workt = work; 1096 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1097 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 1098 for (j=0; j<n; j++) { 1099 zb = z_ptr + bs*(*idx++); 1100 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1101 workt += bs; 1102 } 1103 } 1104 1105 x += bs; v += n*bs2; z += bs; ii++; 1106 } 1107 1108 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1109 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1110 1111 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1112 PetscFunctionReturn(0); 1113 } 1114 1115 #undef __FUNCT__ 1116 #define __FUNCT__ "MatScale_SeqSBAIJ" 1117 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha) 1118 { 1119 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data; 1120 PetscScalar oalpha = alpha; 1121 PetscErrorCode ierr; 1122 PetscBLASInt one = 1,totalnz = PetscBLASIntCast(a->bs2*a->nz); 1123 1124 PetscFunctionBegin; 1125 BLASscal_(&totalnz,&oalpha,a->a,&one); 1126 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 1127 PetscFunctionReturn(0); 1128 } 1129 1130 #undef __FUNCT__ 1131 #define __FUNCT__ "MatNorm_SeqSBAIJ" 1132 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm) 1133 { 1134 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1135 MatScalar *v = a->a; 1136 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1137 PetscInt i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j; 1138 PetscErrorCode ierr; 1139 PetscInt *jl,*il,jmin,jmax,nexti,ik,*col; 1140 1141 PetscFunctionBegin; 1142 if (type == NORM_FROBENIUS) { 1143 for (k=0; k<mbs; k++){ 1144 jmin = a->i[k]; jmax = a->i[k+1]; 1145 col = aj + jmin; 1146 if (*col == k){ /* diagonal block */ 1147 for (i=0; i<bs2; i++){ 1148 #if defined(PETSC_USE_COMPLEX) 1149 sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++; 1150 #else 1151 sum_diag += (*v)*(*v); v++; 1152 #endif 1153 } 1154 jmin++; 1155 } 1156 for (j=jmin; j<jmax; j++){ /* off-diagonal blocks */ 1157 for (i=0; i<bs2; i++){ 1158 #if defined(PETSC_USE_COMPLEX) 1159 sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++; 1160 #else 1161 sum_off += (*v)*(*v); v++; 1162 #endif 1163 } 1164 } 1165 } 1166 *norm = sqrt(sum_diag + 2*sum_off); 1167 } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 1168 ierr = PetscMalloc3(bs,PetscReal,&sum,mbs,PetscInt,&il,mbs,PetscInt,&jl);CHKERRQ(ierr); 1169 for (i=0; i<mbs; i++) jl[i] = mbs; 1170 il[0] = 0; 1171 1172 *norm = 0.0; 1173 for (k=0; k<mbs; k++) { /* k_th block row */ 1174 for (j=0; j<bs; j++) sum[j]=0.0; 1175 /*-- col sum --*/ 1176 i = jl[k]; /* first |A(i,k)| to be added */ 1177 /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1178 at step k */ 1179 while (i<mbs){ 1180 nexti = jl[i]; /* next block row to be added */ 1181 ik = il[i]; /* block index of A(i,k) in the array a */ 1182 for (j=0; j<bs; j++){ 1183 v = a->a + ik*bs2 + j*bs; 1184 for (k1=0; k1<bs; k1++) { 1185 sum[j] += PetscAbsScalar(*v); v++; 1186 } 1187 } 1188 /* update il, jl */ 1189 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1190 jmax = a->i[i+1]; 1191 if (jmin < jmax){ 1192 il[i] = jmin; 1193 j = a->j[jmin]; 1194 jl[i] = jl[j]; jl[j]=i; 1195 } 1196 i = nexti; 1197 } 1198 /*-- row sum --*/ 1199 jmin = a->i[k]; jmax = a->i[k+1]; 1200 for (i=jmin; i<jmax; i++) { 1201 for (j=0; j<bs; j++){ 1202 v = a->a + i*bs2 + j; 1203 for (k1=0; k1<bs; k1++){ 1204 sum[j] += PetscAbsScalar(*v); v += bs; 1205 } 1206 } 1207 } 1208 /* add k_th block row to il, jl */ 1209 col = aj+jmin; 1210 if (*col == k) jmin++; 1211 if (jmin < jmax){ 1212 il[k] = jmin; 1213 j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k; 1214 } 1215 for (j=0; j<bs; j++){ 1216 if (sum[j] > *norm) *norm = sum[j]; 1217 } 1218 } 1219 ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr); 1220 } else { 1221 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 1222 } 1223 PetscFunctionReturn(0); 1224 } 1225 1226 #undef __FUNCT__ 1227 #define __FUNCT__ "MatEqual_SeqSBAIJ" 1228 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg) 1229 { 1230 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data; 1231 PetscErrorCode ierr; 1232 1233 PetscFunctionBegin; 1234 1235 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1236 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1237 *flg = PETSC_FALSE; 1238 PetscFunctionReturn(0); 1239 } 1240 1241 /* if the a->i are the same */ 1242 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1243 if (!*flg) { 1244 PetscFunctionReturn(0); 1245 } 1246 1247 /* if a->j are the same */ 1248 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1249 if (!*flg) { 1250 PetscFunctionReturn(0); 1251 } 1252 /* if a->a are the same */ 1253 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1254 PetscFunctionReturn(0); 1255 } 1256 1257 #undef __FUNCT__ 1258 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ" 1259 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v) 1260 { 1261 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1262 PetscErrorCode ierr; 1263 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 1264 PetscScalar *x,zero = 0.0; 1265 MatScalar *aa,*aa_j; 1266 1267 PetscFunctionBegin; 1268 bs = A->rmap->bs; 1269 if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1"); 1270 1271 aa = a->a; 1272 ai = a->i; 1273 aj = a->j; 1274 ambs = a->mbs; 1275 bs2 = a->bs2; 1276 1277 ierr = VecSet(v,zero);CHKERRQ(ierr); 1278 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1279 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1280 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1281 for (i=0; i<ambs; i++) { 1282 j=ai[i]; 1283 if (aj[j] == i) { /* if this is a diagonal element */ 1284 row = i*bs; 1285 aa_j = aa + j*bs2; 1286 if (A->factortype && bs==1){ 1287 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k]; 1288 } else { 1289 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1290 } 1291 } 1292 } 1293 1294 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1295 PetscFunctionReturn(0); 1296 } 1297 1298 #undef __FUNCT__ 1299 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ" 1300 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr) 1301 { 1302 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1303 PetscScalar *l,x,*li,*ri; 1304 MatScalar *aa,*v; 1305 PetscErrorCode ierr; 1306 PetscInt i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2; 1307 PetscBool flg; 1308 1309 PetscFunctionBegin; 1310 if (ll != rr){ 1311 ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr); 1312 if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n"); 1313 } 1314 if (!ll) PetscFunctionReturn(0); 1315 ai = a->i; 1316 aj = a->j; 1317 aa = a->a; 1318 m = A->rmap->N; 1319 bs = A->rmap->bs; 1320 mbs = a->mbs; 1321 bs2 = a->bs2; 1322 1323 ierr = VecGetArray(ll,&l);CHKERRQ(ierr); 1324 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1325 if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 1326 for (i=0; i<mbs; i++) { /* for each block row */ 1327 M = ai[i+1] - ai[i]; 1328 li = l + i*bs; 1329 v = aa + bs2*ai[i]; 1330 for (j=0; j<M; j++) { /* for each block */ 1331 ri = l + bs*aj[ai[i]+j]; 1332 for (k=0; k<bs; k++) { 1333 x = ri[k]; 1334 for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x; 1335 } 1336 } 1337 } 1338 ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr); 1339 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1340 PetscFunctionReturn(0); 1341 } 1342 1343 #undef __FUNCT__ 1344 #define __FUNCT__ "MatGetInfo_SeqSBAIJ" 1345 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info) 1346 { 1347 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1348 1349 PetscFunctionBegin; 1350 info->block_size = a->bs2; 1351 info->nz_allocated = a->bs2*a->maxnz; /*num. of nonzeros in upper triangular part */ 1352 info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */ 1353 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 1354 info->assemblies = A->num_ass; 1355 info->mallocs = A->info.mallocs; 1356 info->memory = ((PetscObject)A)->mem; 1357 if (A->factortype) { 1358 info->fill_ratio_given = A->info.fill_ratio_given; 1359 info->fill_ratio_needed = A->info.fill_ratio_needed; 1360 info->factor_mallocs = A->info.factor_mallocs; 1361 } else { 1362 info->fill_ratio_given = 0; 1363 info->fill_ratio_needed = 0; 1364 info->factor_mallocs = 0; 1365 } 1366 PetscFunctionReturn(0); 1367 } 1368 1369 1370 #undef __FUNCT__ 1371 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ" 1372 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 1373 { 1374 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1375 PetscErrorCode ierr; 1376 1377 PetscFunctionBegin; 1378 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 1379 PetscFunctionReturn(0); 1380 } 1381 1382 #undef __FUNCT__ 1383 #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ" 1384 /* 1385 This code does not work since it only checks the upper triangular part of 1386 the matrix. Hence it is not listed in the function table. 1387 */ 1388 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[]) 1389 { 1390 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1391 PetscErrorCode ierr; 1392 PetscInt i,j,n,row,col,bs,*ai,*aj,mbs; 1393 PetscReal atmp; 1394 MatScalar *aa; 1395 PetscScalar *x; 1396 PetscInt ncols,brow,bcol,krow,kcol; 1397 1398 PetscFunctionBegin; 1399 if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov"); 1400 if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1401 bs = A->rmap->bs; 1402 aa = a->a; 1403 ai = a->i; 1404 aj = a->j; 1405 mbs = a->mbs; 1406 1407 ierr = VecSet(v,0.0);CHKERRQ(ierr); 1408 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1409 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1410 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1411 for (i=0; i<mbs; i++) { 1412 ncols = ai[1] - ai[0]; ai++; 1413 brow = bs*i; 1414 for (j=0; j<ncols; j++){ 1415 bcol = bs*(*aj); 1416 for (kcol=0; kcol<bs; kcol++){ 1417 col = bcol + kcol; /* col index */ 1418 for (krow=0; krow<bs; krow++){ 1419 atmp = PetscAbsScalar(*aa); aa++; 1420 row = brow + krow; /* row index */ 1421 if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1422 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 1423 } 1424 } 1425 aj++; 1426 } 1427 } 1428 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1429 PetscFunctionReturn(0); 1430 } 1431