1 #define PETSCMAT_DLL 2 3 #include "../src/mat/impls/baij/seq/baij.h" 4 #include "../src/inline/spops.h" 5 #include "../src/inline/ilu.h" 6 #include "petscbt.h" 7 #include "../src/mat/impls/sbaij/seq/sbaij.h" 8 9 #undef __FUNCT__ 10 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ" 11 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 12 { 13 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 14 PetscErrorCode ierr; 15 PetscInt brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2; 16 const PetscInt *idx; 17 PetscBT table,table0; 18 19 PetscFunctionBegin; 20 if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 21 mbs = a->mbs; 22 ai = a->i; 23 aj = a->j; 24 bs = A->rmap->bs; 25 ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr); 26 ierr = PetscMalloc((mbs+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr); 27 ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr); 28 ierr = PetscBTCreate(mbs,table0);CHKERRQ(ierr); 29 30 for (i=0; i<is_max; i++) { /* for each is */ 31 isz = 0; 32 ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr); 33 34 /* Extract the indices, assume there can be duplicate entries */ 35 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 36 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 37 38 /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */ 39 bcol_max = 0; 40 for (j=0; j<n ; ++j){ 41 brow = idx[j]/bs; /* convert the indices into block indices */ 42 if (brow >= mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 43 if(!PetscBTLookupSet(table,brow)) { 44 nidx[isz++] = brow; 45 if (bcol_max < brow) bcol_max = brow; 46 } 47 } 48 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 49 ierr = ISDestroy(is[i]);CHKERRQ(ierr); 50 51 k = 0; 52 for (j=0; j<ov; j++){ /* for each overlap */ 53 /* set table0 for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 54 ierr = PetscBTMemzero(mbs,table0);CHKERRQ(ierr); 55 for (l=k; l<isz; l++) { ierr = PetscBTSet(table0,nidx[l]);CHKERRQ(ierr); } 56 57 n = isz; /* length of the updated is[i] */ 58 for (brow=0; brow<mbs; brow++){ 59 start = ai[brow]; end = ai[brow+1]; 60 if (PetscBTLookup(table0,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */ 61 for (l = start; l<end ; l++){ 62 bcol = aj[l]; 63 if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;} 64 } 65 k++; 66 if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 67 } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 68 for (l = start; l<end ; l++){ 69 bcol = aj[l]; 70 if (bcol > bcol_max) break; 71 if (PetscBTLookup(table0,bcol)){ 72 if (!PetscBTLookupSet(table,brow)) {nidx[isz++] = brow;} 73 break; /* for l = start; l<end ; l++) */ 74 } 75 } 76 } 77 } 78 } /* for each overlap */ 79 80 /* expand the Index Set */ 81 for (j=0; j<isz; j++) { 82 for (k=0; k<bs; k++) 83 nidx2[j*bs+k] = nidx[j]*bs+k; 84 } 85 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr); 86 } 87 ierr = PetscBTDestroy(table);CHKERRQ(ierr); 88 ierr = PetscFree(nidx);CHKERRQ(ierr); 89 ierr = PetscFree(nidx2);CHKERRQ(ierr); 90 ierr = PetscBTDestroy(table0);CHKERRQ(ierr); 91 PetscFunctionReturn(0); 92 } 93 94 #undef __FUNCT__ 95 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private" 96 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 97 { 98 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c; 99 PetscErrorCode ierr; 100 PetscInt *smap,i,k,kstart,kend,oldcols = a->mbs,*lens; 101 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 102 PetscInt nrows,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 103 const PetscInt *irow,*aj = a->j,*ai = a->i; 104 MatScalar *mat_a; 105 Mat C; 106 PetscTruth flag,sorted; 107 108 PetscFunctionBegin; 109 if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro"); 110 ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr); 111 if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 112 113 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 114 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 115 116 ierr = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr); 117 ssmap = smap; 118 ierr = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr); 119 ierr = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr); 120 for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */ 121 /* determine lens of each row */ 122 for (i=0; i<nrows; i++) { 123 kstart = ai[irow[i]]; 124 kend = kstart + a->ilen[irow[i]]; 125 lens[i] = 0; 126 for (k=kstart; k<kend; k++) { 127 if (ssmap[aj[k]]) { 128 lens[i]++; 129 } 130 } 131 } 132 /* Create and fill new matrix */ 133 if (scall == MAT_REUSE_MATRIX) { 134 c = (Mat_SeqSBAIJ *)((*B)->data); 135 136 if (c->mbs!=nrows || (*B)->rmap->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 137 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 138 if (!flag) { 139 SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 140 } 141 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 142 C = *B; 143 } else { 144 ierr = MatCreate(((PetscObject)A)->comm,&C);CHKERRQ(ierr); 145 ierr = MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 146 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 147 ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr); 148 } 149 c = (Mat_SeqSBAIJ *)(C->data); 150 for (i=0; i<nrows; i++) { 151 row = irow[i]; 152 kstart = ai[row]; 153 kend = kstart + a->ilen[row]; 154 mat_i = c->i[i]; 155 mat_j = c->j + mat_i; 156 mat_a = c->a + mat_i*bs2; 157 mat_ilen = c->ilen + i; 158 for (k=kstart; k<kend; k++) { 159 if ((tcol=ssmap[a->j[k]])) { 160 *mat_j++ = tcol - 1; 161 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 162 mat_a += bs2; 163 (*mat_ilen)++; 164 } 165 } 166 } 167 168 /* Free work space */ 169 ierr = PetscFree(smap);CHKERRQ(ierr); 170 ierr = PetscFree(lens);CHKERRQ(ierr); 171 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 172 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 173 174 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 175 *B = C; 176 PetscFunctionReturn(0); 177 } 178 179 #undef __FUNCT__ 180 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ" 181 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 182 { 183 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 184 IS is1; 185 PetscErrorCode ierr; 186 PetscInt *vary,*iary,nrows,i,bs=A->rmap->bs,count; 187 const PetscInt *irow; 188 189 PetscFunctionBegin; 190 if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro"); 191 192 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 193 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 194 195 /* Verify if the indices corespond to each element in a block 196 and form the IS with compressed IS */ 197 ierr = PetscMalloc(2*(a->mbs+1)*sizeof(PetscInt),&vary);CHKERRQ(ierr); 198 iary = vary + a->mbs; 199 ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr); 200 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 201 202 count = 0; 203 for (i=0; i<a->mbs; i++) { 204 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_INCOMP,"Index set does not match blocks"); 205 if (vary[i]==bs) iary[count++] = i; 206 } 207 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr); 208 209 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 210 ierr = PetscFree(vary);CHKERRQ(ierr); 211 212 ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,scall,B);CHKERRQ(ierr); 213 ISDestroy(is1); 214 PetscFunctionReturn(0); 215 } 216 217 #undef __FUNCT__ 218 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ" 219 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 220 { 221 PetscErrorCode ierr; 222 PetscInt i; 223 224 PetscFunctionBegin; 225 if (scall == MAT_INITIAL_MATRIX) { 226 ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr); 227 } 228 229 for (i=0; i<n; i++) { 230 ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 231 } 232 PetscFunctionReturn(0); 233 } 234 235 /* -------------------------------------------------------*/ 236 /* Should check that shapes of vectors and matrices match */ 237 /* -------------------------------------------------------*/ 238 #include "petscblaslapack.h" 239 240 #undef __FUNCT__ 241 #define __FUNCT__ "MatMult_SeqSBAIJ_1" 242 PetscErrorCode MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz) 243 { 244 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 245 const PetscScalar *x; 246 PetscScalar *z,x1,sum; 247 const MatScalar *v; 248 PetscErrorCode ierr; 249 PetscInt mbs=a->mbs,i,j,nz; 250 const PetscInt *aj=a->j,*ai=a->i,*ib; 251 252 PetscFunctionBegin; 253 ierr = VecSet(zz,0.0);CHKERRQ(ierr); 254 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 255 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 256 257 v = a->a; 258 ib = aj; 259 260 for (i=0; i<mbs; i++) { 261 nz = ai[i+1] - ai[i]; /* length of i_th row of A */ 262 x1 = x[i]; 263 sum = v[0]*x1; /* diagonal term */ 264 for (j=1; j<nz; j++) { 265 z[ib[j]] += v[j] * x1; /* (strict lower triangular part of A)*x */ 266 sum += v[j] * x[ib[j]]; /* (strict upper triangular part of A)*x */ 267 } 268 z[i] += sum; 269 v += nz; 270 ib += nz; 271 } 272 273 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 274 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 275 ierr = PetscLogFlops(2.0*(2.0*a->nz - mbs) - mbs);CHKERRQ(ierr); 276 PetscFunctionReturn(0); 277 } 278 279 #undef __FUNCT__ 280 #define __FUNCT__ "MatMult_SeqSBAIJ_2" 281 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz) 282 { 283 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 284 PetscScalar *x,*z,*xb,x1,x2,zero=0.0; 285 MatScalar *v; 286 PetscErrorCode ierr; 287 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 288 PetscInt nonzerorow=0; 289 290 PetscFunctionBegin; 291 ierr = VecSet(zz,zero);CHKERRQ(ierr); 292 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 293 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 294 295 v = a->a; 296 xb = x; 297 298 for (i=0; i<mbs; i++) { 299 n = ai[1] - ai[0]; /* length of i_th block row of A */ 300 x1 = xb[0]; x2 = xb[1]; 301 ib = aj + *ai; 302 jmin = 0; 303 nonzerorow += (n>0); 304 if (*ib == i){ /* (diag of A)*x */ 305 z[2*i] += v[0]*x1 + v[2]*x2; 306 z[2*i+1] += v[2]*x1 + v[3]*x2; 307 v += 4; jmin++; 308 } 309 for (j=jmin; j<n; j++) { 310 /* (strict lower triangular part of A)*x */ 311 cval = ib[j]*2; 312 z[cval] += v[0]*x1 + v[1]*x2; 313 z[cval+1] += v[2]*x1 + v[3]*x2; 314 /* (strict upper triangular part of A)*x */ 315 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 316 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 317 v += 4; 318 } 319 xb +=2; ai++; 320 } 321 322 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 323 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 324 ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 325 PetscFunctionReturn(0); 326 } 327 328 #undef __FUNCT__ 329 #define __FUNCT__ "MatMult_SeqSBAIJ_3" 330 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz) 331 { 332 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 333 PetscScalar *x,*z,*xb,x1,x2,x3,zero=0.0; 334 MatScalar *v; 335 PetscErrorCode ierr; 336 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 337 PetscInt nonzerorow=0; 338 339 PetscFunctionBegin; 340 ierr = VecSet(zz,zero);CHKERRQ(ierr); 341 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 342 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 343 344 v = a->a; 345 xb = x; 346 347 for (i=0; i<mbs; i++) { 348 n = ai[1] - ai[0]; /* length of i_th block row of A */ 349 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 350 ib = aj + *ai; 351 jmin = 0; 352 nonzerorow += (n>0); 353 if (*ib == i){ /* (diag of A)*x */ 354 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 355 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 356 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 357 v += 9; jmin++; 358 } 359 for (j=jmin; j<n; j++) { 360 /* (strict lower triangular part of A)*x */ 361 cval = ib[j]*3; 362 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 363 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 364 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 365 /* (strict upper triangular part of A)*x */ 366 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 367 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 368 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 369 v += 9; 370 } 371 xb +=3; ai++; 372 } 373 374 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 375 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 376 ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 377 PetscFunctionReturn(0); 378 } 379 380 #undef __FUNCT__ 381 #define __FUNCT__ "MatMult_SeqSBAIJ_4" 382 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz) 383 { 384 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 385 PetscScalar *x,*z,*xb,x1,x2,x3,x4,zero=0.0; 386 MatScalar *v; 387 PetscErrorCode ierr; 388 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 389 PetscInt nonzerorow=0; 390 391 PetscFunctionBegin; 392 ierr = VecSet(zz,zero);CHKERRQ(ierr); 393 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 394 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 395 396 v = a->a; 397 xb = x; 398 399 for (i=0; i<mbs; i++) { 400 n = ai[1] - ai[0]; /* length of i_th block row of A */ 401 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 402 ib = aj + *ai; 403 jmin = 0; 404 nonzerorow += (n>0); 405 if (*ib == i){ /* (diag of A)*x */ 406 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 407 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 408 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 409 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 410 v += 16; jmin++; 411 } 412 for (j=jmin; j<n; j++) { 413 /* (strict lower triangular part of A)*x */ 414 cval = ib[j]*4; 415 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 416 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 417 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 418 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 419 /* (strict upper triangular part of A)*x */ 420 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 421 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 422 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 423 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 424 v += 16; 425 } 426 xb +=4; ai++; 427 } 428 429 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 430 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 431 ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 432 PetscFunctionReturn(0); 433 } 434 435 #undef __FUNCT__ 436 #define __FUNCT__ "MatMult_SeqSBAIJ_5" 437 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz) 438 { 439 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 440 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0; 441 MatScalar *v; 442 PetscErrorCode ierr; 443 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 444 PetscInt nonzerorow=0; 445 446 PetscFunctionBegin; 447 ierr = VecSet(zz,zero);CHKERRQ(ierr); 448 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 449 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 450 451 v = a->a; 452 xb = x; 453 454 for (i=0; i<mbs; i++) { 455 n = ai[1] - ai[0]; /* length of i_th block row of A */ 456 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 457 ib = aj + *ai; 458 jmin = 0; 459 nonzerorow += (n>0); 460 if (*ib == i){ /* (diag of A)*x */ 461 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 462 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 463 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 464 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 465 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 466 v += 25; jmin++; 467 } 468 for (j=jmin; j<n; j++) { 469 /* (strict lower triangular part of A)*x */ 470 cval = ib[j]*5; 471 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 472 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 473 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 474 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 475 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 476 /* (strict upper triangular part of A)*x */ 477 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 478 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 479 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 480 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 481 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 482 v += 25; 483 } 484 xb +=5; ai++; 485 } 486 487 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 488 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 489 ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 490 PetscFunctionReturn(0); 491 } 492 493 494 #undef __FUNCT__ 495 #define __FUNCT__ "MatMult_SeqSBAIJ_6" 496 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz) 497 { 498 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 499 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0; 500 MatScalar *v; 501 PetscErrorCode ierr; 502 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 503 PetscInt nonzerorow=0; 504 505 PetscFunctionBegin; 506 ierr = VecSet(zz,zero);CHKERRQ(ierr); 507 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 508 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 509 510 v = a->a; 511 xb = x; 512 513 for (i=0; i<mbs; i++) { 514 n = ai[1] - ai[0]; /* length of i_th block row of A */ 515 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 516 ib = aj + *ai; 517 jmin = 0; 518 nonzerorow += (n>0); 519 if (*ib == i){ /* (diag of A)*x */ 520 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 521 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 522 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 523 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 524 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 525 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 526 v += 36; jmin++; 527 } 528 for (j=jmin; j<n; j++) { 529 /* (strict lower triangular part of A)*x */ 530 cval = ib[j]*6; 531 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 532 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 533 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 534 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 535 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 536 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 537 /* (strict upper triangular part of A)*x */ 538 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 539 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 540 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 541 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 542 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 543 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 544 v += 36; 545 } 546 xb +=6; ai++; 547 } 548 549 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 550 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 551 ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 552 PetscFunctionReturn(0); 553 } 554 #undef __FUNCT__ 555 #define __FUNCT__ "MatMult_SeqSBAIJ_7" 556 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz) 557 { 558 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 559 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0; 560 MatScalar *v; 561 PetscErrorCode ierr; 562 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 563 PetscInt nonzerorow=0; 564 565 PetscFunctionBegin; 566 ierr = VecSet(zz,zero);CHKERRQ(ierr); 567 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 568 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 569 570 v = a->a; 571 xb = x; 572 573 for (i=0; i<mbs; i++) { 574 n = ai[1] - ai[0]; /* length of i_th block row of A */ 575 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 576 ib = aj + *ai; 577 jmin = 0; 578 nonzerorow += (n>0); 579 if (*ib == i){ /* (diag of A)*x */ 580 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 581 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 582 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 583 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 584 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 585 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 586 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 587 v += 49; jmin++; 588 } 589 for (j=jmin; j<n; j++) { 590 /* (strict lower triangular part of A)*x */ 591 cval = ib[j]*7; 592 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 593 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 594 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 595 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 596 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 597 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 598 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 599 /* (strict upper triangular part of A)*x */ 600 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 601 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 602 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 603 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 604 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 605 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 606 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 607 v += 49; 608 } 609 xb +=7; ai++; 610 } 611 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 612 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 613 ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 614 PetscFunctionReturn(0); 615 } 616 617 /* 618 This will not work with MatScalar == float because it calls the BLAS 619 */ 620 #undef __FUNCT__ 621 #define __FUNCT__ "MatMult_SeqSBAIJ_N" 622 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz) 623 { 624 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 625 PetscScalar *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0; 626 MatScalar *v; 627 PetscErrorCode ierr; 628 PetscInt mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 629 PetscInt nonzerorow=0; 630 631 PetscFunctionBegin; 632 ierr = VecSet(zz,zero);CHKERRQ(ierr); 633 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 634 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 635 636 aj = a->j; 637 v = a->a; 638 ii = a->i; 639 640 if (!a->mult_work) { 641 ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 642 } 643 work = a->mult_work; 644 645 for (i=0; i<mbs; i++) { 646 n = ii[1] - ii[0]; ncols = n*bs; 647 workt = work; idx=aj+ii[0]; 648 nonzerorow += (n>0); 649 650 /* upper triangular part */ 651 for (j=0; j<n; j++) { 652 xb = x_ptr + bs*(*idx++); 653 for (k=0; k<bs; k++) workt[k] = xb[k]; 654 workt += bs; 655 } 656 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 657 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 658 659 /* strict lower triangular part */ 660 idx = aj+ii[0]; 661 if (*idx == i){ 662 ncols -= bs; v += bs2; idx++; n--; 663 } 664 665 if (ncols > 0){ 666 workt = work; 667 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 668 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 669 for (j=0; j<n; j++) { 670 zb = z_ptr + bs*(*idx++); 671 for (k=0; k<bs; k++) zb[k] += workt[k] ; 672 workt += bs; 673 } 674 } 675 x += bs; v += n*bs2; z += bs; ii++; 676 } 677 678 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 679 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 680 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr); 681 PetscFunctionReturn(0); 682 } 683 684 extern PetscErrorCode VecCopy_Seq(Vec,Vec); 685 #undef __FUNCT__ 686 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1" 687 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 688 { 689 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 690 PetscScalar *x,*z,*xb,x1; 691 MatScalar *v; 692 PetscErrorCode ierr; 693 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 694 PetscInt nonzerorow=0; 695 696 PetscFunctionBegin; 697 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 698 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 699 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 700 v = a->a; 701 xb = x; 702 703 for (i=0; i<mbs; i++) { 704 n = ai[1] - ai[0]; /* length of i_th row of A */ 705 x1 = xb[0]; 706 ib = aj + *ai; 707 jmin = 0; 708 nonzerorow += (n>0); 709 if (*ib == i) { /* (diag of A)*x */ 710 z[i] += *v++ * x[*ib++]; jmin++; 711 } 712 for (j=jmin; j<n; j++) { 713 cval = *ib; 714 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 715 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 716 } 717 xb++; ai++; 718 } 719 720 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 721 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 722 723 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 724 PetscFunctionReturn(0); 725 } 726 727 #undef __FUNCT__ 728 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2" 729 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 730 { 731 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 732 PetscScalar *x,*z,*xb,x1,x2; 733 MatScalar *v; 734 PetscErrorCode ierr; 735 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 736 PetscInt nonzerorow=0; 737 738 PetscFunctionBegin; 739 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 740 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 741 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 742 743 v = a->a; 744 xb = x; 745 746 for (i=0; i<mbs; i++) { 747 n = ai[1] - ai[0]; /* length of i_th block row of A */ 748 x1 = xb[0]; x2 = xb[1]; 749 ib = aj + *ai; 750 jmin = 0; 751 nonzerorow += (n>0); 752 if (*ib == i){ /* (diag of A)*x */ 753 z[2*i] += v[0]*x1 + v[2]*x2; 754 z[2*i+1] += v[2]*x1 + v[3]*x2; 755 v += 4; jmin++; 756 } 757 for (j=jmin; j<n; j++) { 758 /* (strict lower triangular part of A)*x */ 759 cval = ib[j]*2; 760 z[cval] += v[0]*x1 + v[1]*x2; 761 z[cval+1] += v[2]*x1 + v[3]*x2; 762 /* (strict upper triangular part of A)*x */ 763 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 764 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 765 v += 4; 766 } 767 xb +=2; ai++; 768 } 769 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 770 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 771 772 ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 773 PetscFunctionReturn(0); 774 } 775 776 #undef __FUNCT__ 777 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3" 778 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 779 { 780 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 781 PetscScalar *x,*z,*xb,x1,x2,x3; 782 MatScalar *v; 783 PetscErrorCode ierr; 784 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 785 PetscInt nonzerorow=0; 786 787 PetscFunctionBegin; 788 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 789 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 790 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 791 792 v = a->a; 793 xb = x; 794 795 for (i=0; i<mbs; i++) { 796 n = ai[1] - ai[0]; /* length of i_th block row of A */ 797 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 798 ib = aj + *ai; 799 jmin = 0; 800 nonzerorow += (n>0); 801 if (*ib == i){ /* (diag of A)*x */ 802 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 803 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 804 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 805 v += 9; jmin++; 806 } 807 for (j=jmin; j<n; j++) { 808 /* (strict lower triangular part of A)*x */ 809 cval = ib[j]*3; 810 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 811 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 812 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 813 /* (strict upper triangular part of A)*x */ 814 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 815 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 816 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 817 v += 9; 818 } 819 xb +=3; ai++; 820 } 821 822 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 823 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 824 825 ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 826 PetscFunctionReturn(0); 827 } 828 829 #undef __FUNCT__ 830 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4" 831 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 832 { 833 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 834 PetscScalar *x,*z,*xb,x1,x2,x3,x4; 835 MatScalar *v; 836 PetscErrorCode ierr; 837 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 838 PetscInt nonzerorow=0; 839 840 PetscFunctionBegin; 841 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 842 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 843 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 844 845 v = a->a; 846 xb = x; 847 848 for (i=0; i<mbs; i++) { 849 n = ai[1] - ai[0]; /* length of i_th block row of A */ 850 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 851 ib = aj + *ai; 852 jmin = 0; 853 nonzerorow += (n>0); 854 if (*ib == i){ /* (diag of A)*x */ 855 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 856 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 857 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 858 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 859 v += 16; jmin++; 860 } 861 for (j=jmin; j<n; j++) { 862 /* (strict lower triangular part of A)*x */ 863 cval = ib[j]*4; 864 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 865 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 866 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 867 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 868 /* (strict upper triangular part of A)*x */ 869 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 870 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 871 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 872 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 873 v += 16; 874 } 875 xb +=4; ai++; 876 } 877 878 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 879 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 880 881 ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 882 PetscFunctionReturn(0); 883 } 884 885 #undef __FUNCT__ 886 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5" 887 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 888 { 889 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 890 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5; 891 MatScalar *v; 892 PetscErrorCode ierr; 893 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 894 PetscInt nonzerorow=0; 895 896 PetscFunctionBegin; 897 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 898 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 899 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 900 901 v = a->a; 902 xb = x; 903 904 for (i=0; i<mbs; i++) { 905 n = ai[1] - ai[0]; /* length of i_th block row of A */ 906 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 907 ib = aj + *ai; 908 jmin = 0; 909 nonzerorow += (n>0); 910 if (*ib == i){ /* (diag of A)*x */ 911 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 912 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 913 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 914 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 915 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 916 v += 25; jmin++; 917 } 918 for (j=jmin; j<n; j++) { 919 /* (strict lower triangular part of A)*x */ 920 cval = ib[j]*5; 921 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 922 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 923 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 924 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 925 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 926 /* (strict upper triangular part of A)*x */ 927 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 928 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 929 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 930 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 931 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 932 v += 25; 933 } 934 xb +=5; ai++; 935 } 936 937 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 938 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 939 940 ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 941 PetscFunctionReturn(0); 942 } 943 #undef __FUNCT__ 944 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6" 945 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 946 { 947 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 948 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6; 949 MatScalar *v; 950 PetscErrorCode ierr; 951 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 952 PetscInt nonzerorow=0; 953 954 PetscFunctionBegin; 955 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 956 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 957 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 958 959 v = a->a; 960 xb = x; 961 962 for (i=0; i<mbs; i++) { 963 n = ai[1] - ai[0]; /* length of i_th block row of A */ 964 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 965 ib = aj + *ai; 966 jmin = 0; 967 nonzerorow += (n>0); 968 if (*ib == i){ /* (diag of A)*x */ 969 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 970 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 971 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 972 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 973 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 974 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 975 v += 36; jmin++; 976 } 977 for (j=jmin; j<n; j++) { 978 /* (strict lower triangular part of A)*x */ 979 cval = ib[j]*6; 980 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 981 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 982 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 983 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 984 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 985 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 986 /* (strict upper triangular part of A)*x */ 987 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 988 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 989 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 990 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 991 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 992 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 993 v += 36; 994 } 995 xb +=6; ai++; 996 } 997 998 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 999 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1000 1001 ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1002 PetscFunctionReturn(0); 1003 } 1004 1005 #undef __FUNCT__ 1006 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7" 1007 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1008 { 1009 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1010 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7; 1011 MatScalar *v; 1012 PetscErrorCode ierr; 1013 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 1014 PetscInt nonzerorow=0; 1015 1016 PetscFunctionBegin; 1017 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 1018 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1019 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1020 1021 v = a->a; 1022 xb = x; 1023 1024 for (i=0; i<mbs; i++) { 1025 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1026 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 1027 ib = aj + *ai; 1028 jmin = 0; 1029 nonzerorow += (n>0); 1030 if (*ib == i){ /* (diag of A)*x */ 1031 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 1032 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 1033 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 1034 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 1035 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 1036 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 1037 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1038 v += 49; jmin++; 1039 } 1040 for (j=jmin; j<n; j++) { 1041 /* (strict lower triangular part of A)*x */ 1042 cval = ib[j]*7; 1043 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 1044 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 1045 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 1046 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 1047 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 1048 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 1049 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1050 /* (strict upper triangular part of A)*x */ 1051 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 1052 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 1053 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 1054 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 1055 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 1056 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 1057 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 1058 v += 49; 1059 } 1060 xb +=7; ai++; 1061 } 1062 1063 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1064 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1065 1066 ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1067 PetscFunctionReturn(0); 1068 } 1069 1070 #undef __FUNCT__ 1071 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N" 1072 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1073 { 1074 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1075 PetscScalar *x,*x_ptr,*z,*z_ptr=0,*xb,*zb,*work,*workt; 1076 MatScalar *v; 1077 PetscErrorCode ierr; 1078 PetscInt mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 1079 PetscInt nonzerorow=0; 1080 1081 PetscFunctionBegin; 1082 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 1083 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 1084 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 1085 1086 aj = a->j; 1087 v = a->a; 1088 ii = a->i; 1089 1090 if (!a->mult_work) { 1091 ierr = PetscMalloc((A->rmap->n+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1092 } 1093 work = a->mult_work; 1094 1095 1096 for (i=0; i<mbs; i++) { 1097 n = ii[1] - ii[0]; ncols = n*bs; 1098 workt = work; idx=aj+ii[0]; 1099 nonzerorow += (n>0); 1100 1101 /* upper triangular part */ 1102 for (j=0; j<n; j++) { 1103 xb = x_ptr + bs*(*idx++); 1104 for (k=0; k<bs; k++) workt[k] = xb[k]; 1105 workt += bs; 1106 } 1107 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1108 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1109 1110 /* strict lower triangular part */ 1111 idx = aj+ii[0]; 1112 if (*idx == i){ 1113 ncols -= bs; v += bs2; idx++; n--; 1114 } 1115 if (ncols > 0){ 1116 workt = work; 1117 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1118 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 1119 for (j=0; j<n; j++) { 1120 zb = z_ptr + bs*(*idx++); 1121 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1122 workt += bs; 1123 } 1124 } 1125 1126 x += bs; v += n*bs2; z += bs; ii++; 1127 } 1128 1129 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1130 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1131 1132 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1133 PetscFunctionReturn(0); 1134 } 1135 1136 #undef __FUNCT__ 1137 #define __FUNCT__ "MatScale_SeqSBAIJ" 1138 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha) 1139 { 1140 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data; 1141 PetscScalar oalpha = alpha; 1142 PetscErrorCode ierr; 1143 PetscBLASInt one = 1,totalnz = PetscBLASIntCast(a->bs2*a->nz); 1144 1145 PetscFunctionBegin; 1146 BLASscal_(&totalnz,&oalpha,a->a,&one); 1147 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 1148 PetscFunctionReturn(0); 1149 } 1150 1151 #undef __FUNCT__ 1152 #define __FUNCT__ "MatNorm_SeqSBAIJ" 1153 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm) 1154 { 1155 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1156 MatScalar *v = a->a; 1157 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1158 PetscInt i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j; 1159 PetscErrorCode ierr; 1160 PetscInt *jl,*il,jmin,jmax,nexti,ik,*col; 1161 1162 PetscFunctionBegin; 1163 if (type == NORM_FROBENIUS) { 1164 for (k=0; k<mbs; k++){ 1165 jmin = a->i[k]; jmax = a->i[k+1]; 1166 col = aj + jmin; 1167 if (*col == k){ /* diagonal block */ 1168 for (i=0; i<bs2; i++){ 1169 #if defined(PETSC_USE_COMPLEX) 1170 sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++; 1171 #else 1172 sum_diag += (*v)*(*v); v++; 1173 #endif 1174 } 1175 jmin++; 1176 } 1177 for (j=jmin; j<jmax; j++){ /* off-diagonal blocks */ 1178 for (i=0; i<bs2; i++){ 1179 #if defined(PETSC_USE_COMPLEX) 1180 sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++; 1181 #else 1182 sum_off += (*v)*(*v); v++; 1183 #endif 1184 } 1185 } 1186 } 1187 *norm = sqrt(sum_diag + 2*sum_off); 1188 } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 1189 ierr = PetscMalloc((2*mbs+1)*sizeof(PetscInt)+bs*sizeof(PetscReal),&il);CHKERRQ(ierr); 1190 jl = il + mbs; 1191 sum = (PetscReal*)(jl + mbs); 1192 for (i=0; i<mbs; i++) jl[i] = mbs; 1193 il[0] = 0; 1194 1195 *norm = 0.0; 1196 for (k=0; k<mbs; k++) { /* k_th block row */ 1197 for (j=0; j<bs; j++) sum[j]=0.0; 1198 /*-- col sum --*/ 1199 i = jl[k]; /* first |A(i,k)| to be added */ 1200 /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1201 at step k */ 1202 while (i<mbs){ 1203 nexti = jl[i]; /* next block row to be added */ 1204 ik = il[i]; /* block index of A(i,k) in the array a */ 1205 for (j=0; j<bs; j++){ 1206 v = a->a + ik*bs2 + j*bs; 1207 for (k1=0; k1<bs; k1++) { 1208 sum[j] += PetscAbsScalar(*v); v++; 1209 } 1210 } 1211 /* update il, jl */ 1212 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1213 jmax = a->i[i+1]; 1214 if (jmin < jmax){ 1215 il[i] = jmin; 1216 j = a->j[jmin]; 1217 jl[i] = jl[j]; jl[j]=i; 1218 } 1219 i = nexti; 1220 } 1221 /*-- row sum --*/ 1222 jmin = a->i[k]; jmax = a->i[k+1]; 1223 for (i=jmin; i<jmax; i++) { 1224 for (j=0; j<bs; j++){ 1225 v = a->a + i*bs2 + j; 1226 for (k1=0; k1<bs; k1++){ 1227 sum[j] += PetscAbsScalar(*v); v += bs; 1228 } 1229 } 1230 } 1231 /* add k_th block row to il, jl */ 1232 col = aj+jmin; 1233 if (*col == k) jmin++; 1234 if (jmin < jmax){ 1235 il[k] = jmin; 1236 j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k; 1237 } 1238 for (j=0; j<bs; j++){ 1239 if (sum[j] > *norm) *norm = sum[j]; 1240 } 1241 } 1242 ierr = PetscFree(il);CHKERRQ(ierr); 1243 } else { 1244 SETERRQ(PETSC_ERR_SUP,"No support for this norm yet"); 1245 } 1246 PetscFunctionReturn(0); 1247 } 1248 1249 #undef __FUNCT__ 1250 #define __FUNCT__ "MatEqual_SeqSBAIJ" 1251 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg) 1252 { 1253 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data; 1254 PetscErrorCode ierr; 1255 1256 PetscFunctionBegin; 1257 1258 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1259 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1260 *flg = PETSC_FALSE; 1261 PetscFunctionReturn(0); 1262 } 1263 1264 /* if the a->i are the same */ 1265 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1266 if (!*flg) { 1267 PetscFunctionReturn(0); 1268 } 1269 1270 /* if a->j are the same */ 1271 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1272 if (!*flg) { 1273 PetscFunctionReturn(0); 1274 } 1275 /* if a->a are the same */ 1276 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1277 PetscFunctionReturn(0); 1278 } 1279 1280 #undef __FUNCT__ 1281 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ" 1282 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v) 1283 { 1284 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1285 PetscErrorCode ierr; 1286 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 1287 PetscScalar *x,zero = 0.0; 1288 MatScalar *aa,*aa_j; 1289 1290 PetscFunctionBegin; 1291 bs = A->rmap->bs; 1292 if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1"); 1293 1294 aa = a->a; 1295 ai = a->i; 1296 aj = a->j; 1297 ambs = a->mbs; 1298 bs2 = a->bs2; 1299 1300 ierr = VecSet(v,zero);CHKERRQ(ierr); 1301 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1302 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1303 if (n != A->rmap->N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1304 for (i=0; i<ambs; i++) { 1305 j=ai[i]; 1306 if (aj[j] == i) { /* if this is a diagonal element */ 1307 row = i*bs; 1308 aa_j = aa + j*bs2; 1309 if (A->factor && bs==1){ 1310 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k]; 1311 } else { 1312 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1313 } 1314 } 1315 } 1316 1317 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1318 PetscFunctionReturn(0); 1319 } 1320 1321 #undef __FUNCT__ 1322 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ" 1323 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr) 1324 { 1325 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1326 PetscScalar *l,x,*li,*ri; 1327 MatScalar *aa,*v; 1328 PetscErrorCode ierr; 1329 PetscInt i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2; 1330 PetscTruth flg; 1331 1332 PetscFunctionBegin; 1333 if (ll != rr){ 1334 ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr); 1335 if (!flg) 1336 SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n"); 1337 } 1338 if (!ll) PetscFunctionReturn(0); 1339 ai = a->i; 1340 aj = a->j; 1341 aa = a->a; 1342 m = A->rmap->N; 1343 bs = A->rmap->bs; 1344 mbs = a->mbs; 1345 bs2 = a->bs2; 1346 1347 ierr = VecGetArray(ll,&l);CHKERRQ(ierr); 1348 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1349 if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 1350 for (i=0; i<mbs; i++) { /* for each block row */ 1351 M = ai[i+1] - ai[i]; 1352 li = l + i*bs; 1353 v = aa + bs2*ai[i]; 1354 for (j=0; j<M; j++) { /* for each block */ 1355 ri = l + bs*aj[ai[i]+j]; 1356 for (k=0; k<bs; k++) { 1357 x = ri[k]; 1358 for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x; 1359 } 1360 } 1361 } 1362 ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr); 1363 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1364 PetscFunctionReturn(0); 1365 } 1366 1367 #undef __FUNCT__ 1368 #define __FUNCT__ "MatGetInfo_SeqSBAIJ" 1369 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info) 1370 { 1371 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1372 1373 PetscFunctionBegin; 1374 info->block_size = a->bs2; 1375 info->nz_allocated = a->maxnz; /*num. of nonzeros in upper triangular part */ 1376 info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */ 1377 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 1378 info->assemblies = A->num_ass; 1379 info->mallocs = a->reallocs; 1380 info->memory = ((PetscObject)A)->mem; 1381 if (A->factor) { 1382 info->fill_ratio_given = A->info.fill_ratio_given; 1383 info->fill_ratio_needed = A->info.fill_ratio_needed; 1384 info->factor_mallocs = A->info.factor_mallocs; 1385 } else { 1386 info->fill_ratio_given = 0; 1387 info->fill_ratio_needed = 0; 1388 info->factor_mallocs = 0; 1389 } 1390 PetscFunctionReturn(0); 1391 } 1392 1393 1394 #undef __FUNCT__ 1395 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ" 1396 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 1397 { 1398 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1399 PetscErrorCode ierr; 1400 1401 PetscFunctionBegin; 1402 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 1403 PetscFunctionReturn(0); 1404 } 1405 1406 #undef __FUNCT__ 1407 #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ" 1408 /* 1409 This code does not work since it only checks the upper triangular part of 1410 the matrix. Hence it is not listed in the function table. 1411 */ 1412 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[]) 1413 { 1414 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1415 PetscErrorCode ierr; 1416 PetscInt i,j,n,row,col,bs,*ai,*aj,mbs; 1417 PetscReal atmp; 1418 MatScalar *aa; 1419 PetscScalar *x; 1420 PetscInt ncols,brow,bcol,krow,kcol; 1421 1422 PetscFunctionBegin; 1423 if (idx) SETERRQ(PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov"); 1424 if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1425 bs = A->rmap->bs; 1426 aa = a->a; 1427 ai = a->i; 1428 aj = a->j; 1429 mbs = a->mbs; 1430 1431 ierr = VecSet(v,0.0);CHKERRQ(ierr); 1432 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1433 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1434 if (n != A->rmap->N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1435 for (i=0; i<mbs; i++) { 1436 ncols = ai[1] - ai[0]; ai++; 1437 brow = bs*i; 1438 for (j=0; j<ncols; j++){ 1439 bcol = bs*(*aj); 1440 for (kcol=0; kcol<bs; kcol++){ 1441 col = bcol + kcol; /* col index */ 1442 for (krow=0; krow<bs; krow++){ 1443 atmp = PetscAbsScalar(*aa); aa++; 1444 row = brow + krow; /* row index */ 1445 if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1446 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 1447 } 1448 } 1449 aj++; 1450 } 1451 } 1452 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1453 PetscFunctionReturn(0); 1454 } 1455