1 #define PETSCMAT_DLL 2 3 #include "../src/mat/impls/baij/seq/baij.h" 4 #include "../src/mat/blockinvert.h" 5 #include "petscbt.h" 6 #include "../src/mat/impls/sbaij/seq/sbaij.h" 7 8 #undef __FUNCT__ 9 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ" 10 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 11 { 12 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 13 PetscErrorCode ierr; 14 PetscInt brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2; 15 const PetscInt *idx; 16 PetscBT table,table0; 17 18 PetscFunctionBegin; 19 if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 20 mbs = a->mbs; 21 ai = a->i; 22 aj = a->j; 23 bs = A->rmap->bs; 24 ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr); 25 ierr = PetscMalloc((mbs+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr); 26 ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr); 27 ierr = PetscBTCreate(mbs,table0);CHKERRQ(ierr); 28 29 for (i=0; i<is_max; i++) { /* for each is */ 30 isz = 0; 31 ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr); 32 33 /* Extract the indices, assume there can be duplicate entries */ 34 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 35 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 36 37 /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */ 38 bcol_max = 0; 39 for (j=0; j<n ; ++j){ 40 brow = idx[j]/bs; /* convert the indices into block indices */ 41 if (brow >= mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 42 if(!PetscBTLookupSet(table,brow)) { 43 nidx[isz++] = brow; 44 if (bcol_max < brow) bcol_max = brow; 45 } 46 } 47 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 48 ierr = ISDestroy(is[i]);CHKERRQ(ierr); 49 50 k = 0; 51 for (j=0; j<ov; j++){ /* for each overlap */ 52 /* set table0 for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 53 ierr = PetscBTMemzero(mbs,table0);CHKERRQ(ierr); 54 for (l=k; l<isz; l++) { ierr = PetscBTSet(table0,nidx[l]);CHKERRQ(ierr); } 55 56 n = isz; /* length of the updated is[i] */ 57 for (brow=0; brow<mbs; brow++){ 58 start = ai[brow]; end = ai[brow+1]; 59 if (PetscBTLookup(table0,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */ 60 for (l = start; l<end ; l++){ 61 bcol = aj[l]; 62 if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;} 63 } 64 k++; 65 if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 66 } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 67 for (l = start; l<end ; l++){ 68 bcol = aj[l]; 69 if (bcol > bcol_max) break; 70 if (PetscBTLookup(table0,bcol)){ 71 if (!PetscBTLookupSet(table,brow)) {nidx[isz++] = brow;} 72 break; /* for l = start; l<end ; l++) */ 73 } 74 } 75 } 76 } 77 } /* for each overlap */ 78 79 /* expand the Index Set */ 80 for (j=0; j<isz; j++) { 81 for (k=0; k<bs; k++) 82 nidx2[j*bs+k] = nidx[j]*bs+k; 83 } 84 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr); 85 } 86 ierr = PetscBTDestroy(table);CHKERRQ(ierr); 87 ierr = PetscFree(nidx);CHKERRQ(ierr); 88 ierr = PetscFree(nidx2);CHKERRQ(ierr); 89 ierr = PetscBTDestroy(table0);CHKERRQ(ierr); 90 PetscFunctionReturn(0); 91 } 92 93 #undef __FUNCT__ 94 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private" 95 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 96 { 97 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c; 98 PetscErrorCode ierr; 99 PetscInt *smap,i,k,kstart,kend,oldcols = a->mbs,*lens; 100 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 101 PetscInt nrows,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 102 const PetscInt *irow,*aj = a->j,*ai = a->i; 103 MatScalar *mat_a; 104 Mat C; 105 PetscTruth flag,sorted; 106 107 PetscFunctionBegin; 108 if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro"); 109 ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr); 110 if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 111 112 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 113 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 114 115 ierr = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr); 116 ssmap = smap; 117 ierr = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr); 118 ierr = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr); 119 for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */ 120 /* determine lens of each row */ 121 for (i=0; i<nrows; i++) { 122 kstart = ai[irow[i]]; 123 kend = kstart + a->ilen[irow[i]]; 124 lens[i] = 0; 125 for (k=kstart; k<kend; k++) { 126 if (ssmap[aj[k]]) { 127 lens[i]++; 128 } 129 } 130 } 131 /* Create and fill new matrix */ 132 if (scall == MAT_REUSE_MATRIX) { 133 c = (Mat_SeqSBAIJ *)((*B)->data); 134 135 if (c->mbs!=nrows || (*B)->rmap->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 136 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 137 if (!flag) { 138 SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 139 } 140 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 141 C = *B; 142 } else { 143 ierr = MatCreate(((PetscObject)A)->comm,&C);CHKERRQ(ierr); 144 ierr = MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 145 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 146 ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr); 147 } 148 c = (Mat_SeqSBAIJ *)(C->data); 149 for (i=0; i<nrows; i++) { 150 row = irow[i]; 151 kstart = ai[row]; 152 kend = kstart + a->ilen[row]; 153 mat_i = c->i[i]; 154 mat_j = c->j + mat_i; 155 mat_a = c->a + mat_i*bs2; 156 mat_ilen = c->ilen + i; 157 for (k=kstart; k<kend; k++) { 158 if ((tcol=ssmap[a->j[k]])) { 159 *mat_j++ = tcol - 1; 160 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 161 mat_a += bs2; 162 (*mat_ilen)++; 163 } 164 } 165 } 166 167 /* Free work space */ 168 ierr = PetscFree(smap);CHKERRQ(ierr); 169 ierr = PetscFree(lens);CHKERRQ(ierr); 170 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 171 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 172 173 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 174 *B = C; 175 PetscFunctionReturn(0); 176 } 177 178 #undef __FUNCT__ 179 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ" 180 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 181 { 182 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 183 IS is1; 184 PetscErrorCode ierr; 185 PetscInt *vary,*iary,nrows,i,bs=A->rmap->bs,count; 186 const PetscInt *irow; 187 188 PetscFunctionBegin; 189 if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro"); 190 191 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 192 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 193 194 /* Verify if the indices corespond to each element in a block 195 and form the IS with compressed IS */ 196 ierr = PetscMalloc(2*(a->mbs+1)*sizeof(PetscInt),&vary);CHKERRQ(ierr); 197 iary = vary + a->mbs; 198 ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr); 199 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 200 201 count = 0; 202 for (i=0; i<a->mbs; i++) { 203 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_INCOMP,"Index set does not match blocks"); 204 if (vary[i]==bs) iary[count++] = i; 205 } 206 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr); 207 208 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 209 ierr = PetscFree(vary);CHKERRQ(ierr); 210 211 ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,scall,B);CHKERRQ(ierr); 212 ISDestroy(is1); 213 PetscFunctionReturn(0); 214 } 215 216 #undef __FUNCT__ 217 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ" 218 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 219 { 220 PetscErrorCode ierr; 221 PetscInt i; 222 223 PetscFunctionBegin; 224 if (scall == MAT_INITIAL_MATRIX) { 225 ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr); 226 } 227 228 for (i=0; i<n; i++) { 229 ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 230 } 231 PetscFunctionReturn(0); 232 } 233 234 /* -------------------------------------------------------*/ 235 /* Should check that shapes of vectors and matrices match */ 236 /* -------------------------------------------------------*/ 237 #include "petscblaslapack.h" 238 239 #undef __FUNCT__ 240 #define __FUNCT__ "MatMult_SeqSBAIJ_1" 241 PetscErrorCode MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz) 242 { 243 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 244 const PetscScalar *x; 245 PetscScalar *z,x1,sum; 246 const MatScalar *v; 247 PetscErrorCode ierr; 248 PetscInt mbs=a->mbs,i,j,nz; 249 const PetscInt *aj=a->j,*ai=a->i,*ib; 250 251 PetscFunctionBegin; 252 ierr = VecSet(zz,0.0);CHKERRQ(ierr); 253 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 254 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 255 256 v = a->a; 257 ib = aj; 258 259 for (i=0; i<mbs; i++) { 260 nz = ai[i+1] - ai[i]; /* length of i_th row of A */ 261 x1 = x[i]; 262 sum = v[0]*x1; /* diagonal term */ 263 for (j=1; j<nz; j++) { 264 z[ib[j]] += v[j] * x1; /* (strict lower triangular part of A)*x */ 265 sum += v[j] * x[ib[j]]; /* (strict upper triangular part of A)*x */ 266 } 267 z[i] += sum; 268 v += nz; 269 ib += nz; 270 } 271 272 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 273 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 274 ierr = PetscLogFlops(2.0*(2.0*a->nz - mbs) - mbs);CHKERRQ(ierr); 275 PetscFunctionReturn(0); 276 } 277 278 #undef __FUNCT__ 279 #define __FUNCT__ "MatMult_SeqSBAIJ_2" 280 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz) 281 { 282 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 283 PetscScalar *x,*z,*xb,x1,x2,zero=0.0; 284 MatScalar *v; 285 PetscErrorCode ierr; 286 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 287 PetscInt nonzerorow=0; 288 289 PetscFunctionBegin; 290 ierr = VecSet(zz,zero);CHKERRQ(ierr); 291 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 292 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 293 294 v = a->a; 295 xb = x; 296 297 for (i=0; i<mbs; i++) { 298 n = ai[1] - ai[0]; /* length of i_th block row of A */ 299 x1 = xb[0]; x2 = xb[1]; 300 ib = aj + *ai; 301 jmin = 0; 302 nonzerorow += (n>0); 303 if (*ib == i){ /* (diag of A)*x */ 304 z[2*i] += v[0]*x1 + v[2]*x2; 305 z[2*i+1] += v[2]*x1 + v[3]*x2; 306 v += 4; jmin++; 307 } 308 for (j=jmin; j<n; j++) { 309 /* (strict lower triangular part of A)*x */ 310 cval = ib[j]*2; 311 z[cval] += v[0]*x1 + v[1]*x2; 312 z[cval+1] += v[2]*x1 + v[3]*x2; 313 /* (strict upper triangular part of A)*x */ 314 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 315 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 316 v += 4; 317 } 318 xb +=2; ai++; 319 } 320 321 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 322 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 323 ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 324 PetscFunctionReturn(0); 325 } 326 327 #undef __FUNCT__ 328 #define __FUNCT__ "MatMult_SeqSBAIJ_3" 329 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz) 330 { 331 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 332 PetscScalar *x,*z,*xb,x1,x2,x3,zero=0.0; 333 MatScalar *v; 334 PetscErrorCode ierr; 335 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 336 PetscInt nonzerorow=0; 337 338 PetscFunctionBegin; 339 ierr = VecSet(zz,zero);CHKERRQ(ierr); 340 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 341 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 342 343 v = a->a; 344 xb = x; 345 346 for (i=0; i<mbs; i++) { 347 n = ai[1] - ai[0]; /* length of i_th block row of A */ 348 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 349 ib = aj + *ai; 350 jmin = 0; 351 nonzerorow += (n>0); 352 if (*ib == i){ /* (diag of A)*x */ 353 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 354 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 355 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 356 v += 9; jmin++; 357 } 358 for (j=jmin; j<n; j++) { 359 /* (strict lower triangular part of A)*x */ 360 cval = ib[j]*3; 361 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 362 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 363 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 364 /* (strict upper triangular part of A)*x */ 365 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 366 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 367 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 368 v += 9; 369 } 370 xb +=3; ai++; 371 } 372 373 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 374 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 375 ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 376 PetscFunctionReturn(0); 377 } 378 379 #undef __FUNCT__ 380 #define __FUNCT__ "MatMult_SeqSBAIJ_4" 381 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz) 382 { 383 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 384 PetscScalar *x,*z,*xb,x1,x2,x3,x4,zero=0.0; 385 MatScalar *v; 386 PetscErrorCode ierr; 387 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 388 PetscInt nonzerorow=0; 389 390 PetscFunctionBegin; 391 ierr = VecSet(zz,zero);CHKERRQ(ierr); 392 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 393 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 394 395 v = a->a; 396 xb = x; 397 398 for (i=0; i<mbs; i++) { 399 n = ai[1] - ai[0]; /* length of i_th block row of A */ 400 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 401 ib = aj + *ai; 402 jmin = 0; 403 nonzerorow += (n>0); 404 if (*ib == i){ /* (diag of A)*x */ 405 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 406 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 407 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 408 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 409 v += 16; jmin++; 410 } 411 for (j=jmin; j<n; j++) { 412 /* (strict lower triangular part of A)*x */ 413 cval = ib[j]*4; 414 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 415 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 416 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 417 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 418 /* (strict upper triangular part of A)*x */ 419 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 420 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 421 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 422 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 423 v += 16; 424 } 425 xb +=4; ai++; 426 } 427 428 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 429 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 430 ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 431 PetscFunctionReturn(0); 432 } 433 434 #undef __FUNCT__ 435 #define __FUNCT__ "MatMult_SeqSBAIJ_5" 436 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz) 437 { 438 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 439 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0; 440 MatScalar *v; 441 PetscErrorCode ierr; 442 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 443 PetscInt nonzerorow=0; 444 445 PetscFunctionBegin; 446 ierr = VecSet(zz,zero);CHKERRQ(ierr); 447 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 448 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 449 450 v = a->a; 451 xb = x; 452 453 for (i=0; i<mbs; i++) { 454 n = ai[1] - ai[0]; /* length of i_th block row of A */ 455 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 456 ib = aj + *ai; 457 jmin = 0; 458 nonzerorow += (n>0); 459 if (*ib == i){ /* (diag of A)*x */ 460 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 461 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 462 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 463 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 464 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 465 v += 25; jmin++; 466 } 467 for (j=jmin; j<n; j++) { 468 /* (strict lower triangular part of A)*x */ 469 cval = ib[j]*5; 470 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 471 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 472 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 473 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 474 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 475 /* (strict upper triangular part of A)*x */ 476 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 477 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 478 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 479 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 480 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 481 v += 25; 482 } 483 xb +=5; ai++; 484 } 485 486 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 487 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 488 ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 489 PetscFunctionReturn(0); 490 } 491 492 493 #undef __FUNCT__ 494 #define __FUNCT__ "MatMult_SeqSBAIJ_6" 495 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz) 496 { 497 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 498 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0; 499 MatScalar *v; 500 PetscErrorCode ierr; 501 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 502 PetscInt nonzerorow=0; 503 504 PetscFunctionBegin; 505 ierr = VecSet(zz,zero);CHKERRQ(ierr); 506 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 507 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 508 509 v = a->a; 510 xb = x; 511 512 for (i=0; i<mbs; i++) { 513 n = ai[1] - ai[0]; /* length of i_th block row of A */ 514 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 515 ib = aj + *ai; 516 jmin = 0; 517 nonzerorow += (n>0); 518 if (*ib == i){ /* (diag of A)*x */ 519 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 520 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 521 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 522 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 523 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 524 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 525 v += 36; jmin++; 526 } 527 for (j=jmin; j<n; j++) { 528 /* (strict lower triangular part of A)*x */ 529 cval = ib[j]*6; 530 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 531 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 532 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 533 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 534 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 535 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 536 /* (strict upper triangular part of A)*x */ 537 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 538 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 539 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 540 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 541 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 542 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 543 v += 36; 544 } 545 xb +=6; ai++; 546 } 547 548 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 549 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 550 ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 551 PetscFunctionReturn(0); 552 } 553 #undef __FUNCT__ 554 #define __FUNCT__ "MatMult_SeqSBAIJ_7" 555 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz) 556 { 557 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 558 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0; 559 MatScalar *v; 560 PetscErrorCode ierr; 561 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 562 PetscInt nonzerorow=0; 563 564 PetscFunctionBegin; 565 ierr = VecSet(zz,zero);CHKERRQ(ierr); 566 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 567 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 568 569 v = a->a; 570 xb = x; 571 572 for (i=0; i<mbs; i++) { 573 n = ai[1] - ai[0]; /* length of i_th block row of A */ 574 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 575 ib = aj + *ai; 576 jmin = 0; 577 nonzerorow += (n>0); 578 if (*ib == i){ /* (diag of A)*x */ 579 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 580 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 581 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 582 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 583 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 584 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 585 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 586 v += 49; jmin++; 587 } 588 for (j=jmin; j<n; j++) { 589 /* (strict lower triangular part of A)*x */ 590 cval = ib[j]*7; 591 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 592 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 593 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 594 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 595 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 596 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 597 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 598 /* (strict upper triangular part of A)*x */ 599 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 600 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 601 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 602 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 603 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 604 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 605 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 606 v += 49; 607 } 608 xb +=7; ai++; 609 } 610 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 611 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 612 ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 613 PetscFunctionReturn(0); 614 } 615 616 /* 617 This will not work with MatScalar == float because it calls the BLAS 618 */ 619 #undef __FUNCT__ 620 #define __FUNCT__ "MatMult_SeqSBAIJ_N" 621 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz) 622 { 623 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 624 PetscScalar *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0; 625 MatScalar *v; 626 PetscErrorCode ierr; 627 PetscInt mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 628 PetscInt nonzerorow=0; 629 630 PetscFunctionBegin; 631 ierr = VecSet(zz,zero);CHKERRQ(ierr); 632 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 633 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 634 635 aj = a->j; 636 v = a->a; 637 ii = a->i; 638 639 if (!a->mult_work) { 640 ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 641 } 642 work = a->mult_work; 643 644 for (i=0; i<mbs; i++) { 645 n = ii[1] - ii[0]; ncols = n*bs; 646 workt = work; idx=aj+ii[0]; 647 nonzerorow += (n>0); 648 649 /* upper triangular part */ 650 for (j=0; j<n; j++) { 651 xb = x_ptr + bs*(*idx++); 652 for (k=0; k<bs; k++) workt[k] = xb[k]; 653 workt += bs; 654 } 655 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 656 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 657 658 /* strict lower triangular part */ 659 idx = aj+ii[0]; 660 if (*idx == i){ 661 ncols -= bs; v += bs2; idx++; n--; 662 } 663 664 if (ncols > 0){ 665 workt = work; 666 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 667 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 668 for (j=0; j<n; j++) { 669 zb = z_ptr + bs*(*idx++); 670 for (k=0; k<bs; k++) zb[k] += workt[k] ; 671 workt += bs; 672 } 673 } 674 x += bs; v += n*bs2; z += bs; ii++; 675 } 676 677 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 678 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 679 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr); 680 PetscFunctionReturn(0); 681 } 682 683 extern PetscErrorCode VecCopy_Seq(Vec,Vec); 684 #undef __FUNCT__ 685 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1" 686 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 687 { 688 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 689 PetscScalar *x,*z,*xb,x1; 690 MatScalar *v; 691 PetscErrorCode ierr; 692 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 693 PetscInt nonzerorow=0; 694 695 PetscFunctionBegin; 696 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 697 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 698 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 699 v = a->a; 700 xb = x; 701 702 for (i=0; i<mbs; i++) { 703 n = ai[1] - ai[0]; /* length of i_th row of A */ 704 x1 = xb[0]; 705 ib = aj + *ai; 706 jmin = 0; 707 nonzerorow += (n>0); 708 if (*ib == i) { /* (diag of A)*x */ 709 z[i] += *v++ * x[*ib++]; jmin++; 710 } 711 for (j=jmin; j<n; j++) { 712 cval = *ib; 713 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 714 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 715 } 716 xb++; ai++; 717 } 718 719 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 720 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 721 722 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 723 PetscFunctionReturn(0); 724 } 725 726 #undef __FUNCT__ 727 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2" 728 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 729 { 730 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 731 PetscScalar *x,*z,*xb,x1,x2; 732 MatScalar *v; 733 PetscErrorCode ierr; 734 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 735 PetscInt nonzerorow=0; 736 737 PetscFunctionBegin; 738 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 739 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 740 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 741 742 v = a->a; 743 xb = x; 744 745 for (i=0; i<mbs; i++) { 746 n = ai[1] - ai[0]; /* length of i_th block row of A */ 747 x1 = xb[0]; x2 = xb[1]; 748 ib = aj + *ai; 749 jmin = 0; 750 nonzerorow += (n>0); 751 if (*ib == i){ /* (diag of A)*x */ 752 z[2*i] += v[0]*x1 + v[2]*x2; 753 z[2*i+1] += v[2]*x1 + v[3]*x2; 754 v += 4; jmin++; 755 } 756 for (j=jmin; j<n; j++) { 757 /* (strict lower triangular part of A)*x */ 758 cval = ib[j]*2; 759 z[cval] += v[0]*x1 + v[1]*x2; 760 z[cval+1] += v[2]*x1 + v[3]*x2; 761 /* (strict upper triangular part of A)*x */ 762 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 763 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 764 v += 4; 765 } 766 xb +=2; ai++; 767 } 768 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 769 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 770 771 ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 772 PetscFunctionReturn(0); 773 } 774 775 #undef __FUNCT__ 776 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3" 777 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 778 { 779 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 780 PetscScalar *x,*z,*xb,x1,x2,x3; 781 MatScalar *v; 782 PetscErrorCode ierr; 783 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 784 PetscInt nonzerorow=0; 785 786 PetscFunctionBegin; 787 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 788 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 789 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 790 791 v = a->a; 792 xb = x; 793 794 for (i=0; i<mbs; i++) { 795 n = ai[1] - ai[0]; /* length of i_th block row of A */ 796 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 797 ib = aj + *ai; 798 jmin = 0; 799 nonzerorow += (n>0); 800 if (*ib == i){ /* (diag of A)*x */ 801 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 802 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 803 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 804 v += 9; jmin++; 805 } 806 for (j=jmin; j<n; j++) { 807 /* (strict lower triangular part of A)*x */ 808 cval = ib[j]*3; 809 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 810 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 811 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 812 /* (strict upper triangular part of A)*x */ 813 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 814 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 815 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 816 v += 9; 817 } 818 xb +=3; ai++; 819 } 820 821 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 822 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 823 824 ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 825 PetscFunctionReturn(0); 826 } 827 828 #undef __FUNCT__ 829 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4" 830 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 831 { 832 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 833 PetscScalar *x,*z,*xb,x1,x2,x3,x4; 834 MatScalar *v; 835 PetscErrorCode ierr; 836 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 837 PetscInt nonzerorow=0; 838 839 PetscFunctionBegin; 840 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 841 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 842 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 843 844 v = a->a; 845 xb = x; 846 847 for (i=0; i<mbs; i++) { 848 n = ai[1] - ai[0]; /* length of i_th block row of A */ 849 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 850 ib = aj + *ai; 851 jmin = 0; 852 nonzerorow += (n>0); 853 if (*ib == i){ /* (diag of A)*x */ 854 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 855 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 856 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 857 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 858 v += 16; jmin++; 859 } 860 for (j=jmin; j<n; j++) { 861 /* (strict lower triangular part of A)*x */ 862 cval = ib[j]*4; 863 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 864 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 865 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 866 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 867 /* (strict upper triangular part of A)*x */ 868 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 869 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 870 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 871 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 872 v += 16; 873 } 874 xb +=4; ai++; 875 } 876 877 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 878 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 879 880 ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 881 PetscFunctionReturn(0); 882 } 883 884 #undef __FUNCT__ 885 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5" 886 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 887 { 888 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 889 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5; 890 MatScalar *v; 891 PetscErrorCode ierr; 892 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 893 PetscInt nonzerorow=0; 894 895 PetscFunctionBegin; 896 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 897 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 898 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 899 900 v = a->a; 901 xb = x; 902 903 for (i=0; i<mbs; i++) { 904 n = ai[1] - ai[0]; /* length of i_th block row of A */ 905 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 906 ib = aj + *ai; 907 jmin = 0; 908 nonzerorow += (n>0); 909 if (*ib == i){ /* (diag of A)*x */ 910 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 911 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 912 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 913 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 914 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 915 v += 25; jmin++; 916 } 917 for (j=jmin; j<n; j++) { 918 /* (strict lower triangular part of A)*x */ 919 cval = ib[j]*5; 920 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 921 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 922 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 923 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 924 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 925 /* (strict upper triangular part of A)*x */ 926 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 927 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 928 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 929 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 930 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 931 v += 25; 932 } 933 xb +=5; ai++; 934 } 935 936 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 937 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 938 939 ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 940 PetscFunctionReturn(0); 941 } 942 #undef __FUNCT__ 943 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6" 944 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 945 { 946 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 947 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6; 948 MatScalar *v; 949 PetscErrorCode ierr; 950 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 951 PetscInt nonzerorow=0; 952 953 PetscFunctionBegin; 954 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 955 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 956 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 957 958 v = a->a; 959 xb = x; 960 961 for (i=0; i<mbs; i++) { 962 n = ai[1] - ai[0]; /* length of i_th block row of A */ 963 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 964 ib = aj + *ai; 965 jmin = 0; 966 nonzerorow += (n>0); 967 if (*ib == i){ /* (diag of A)*x */ 968 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 969 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 970 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 971 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 972 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 973 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 974 v += 36; jmin++; 975 } 976 for (j=jmin; j<n; j++) { 977 /* (strict lower triangular part of A)*x */ 978 cval = ib[j]*6; 979 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 980 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 981 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 982 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 983 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 984 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 985 /* (strict upper triangular part of A)*x */ 986 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 987 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 988 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 989 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 990 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 991 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 992 v += 36; 993 } 994 xb +=6; ai++; 995 } 996 997 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 998 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 999 1000 ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1001 PetscFunctionReturn(0); 1002 } 1003 1004 #undef __FUNCT__ 1005 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7" 1006 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1007 { 1008 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1009 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7; 1010 MatScalar *v; 1011 PetscErrorCode ierr; 1012 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 1013 PetscInt nonzerorow=0; 1014 1015 PetscFunctionBegin; 1016 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 1017 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1018 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1019 1020 v = a->a; 1021 xb = x; 1022 1023 for (i=0; i<mbs; i++) { 1024 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1025 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 1026 ib = aj + *ai; 1027 jmin = 0; 1028 nonzerorow += (n>0); 1029 if (*ib == i){ /* (diag of A)*x */ 1030 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 1031 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 1032 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 1033 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 1034 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 1035 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 1036 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1037 v += 49; jmin++; 1038 } 1039 for (j=jmin; j<n; j++) { 1040 /* (strict lower triangular part of A)*x */ 1041 cval = ib[j]*7; 1042 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 1043 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 1044 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 1045 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 1046 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 1047 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 1048 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1049 /* (strict upper triangular part of A)*x */ 1050 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 1051 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 1052 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 1053 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 1054 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 1055 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 1056 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 1057 v += 49; 1058 } 1059 xb +=7; ai++; 1060 } 1061 1062 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1063 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1064 1065 ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1066 PetscFunctionReturn(0); 1067 } 1068 1069 #undef __FUNCT__ 1070 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N" 1071 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1072 { 1073 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1074 PetscScalar *x,*x_ptr,*z,*z_ptr=0,*xb,*zb,*work,*workt; 1075 MatScalar *v; 1076 PetscErrorCode ierr; 1077 PetscInt mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 1078 PetscInt nonzerorow=0; 1079 1080 PetscFunctionBegin; 1081 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 1082 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 1083 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 1084 1085 aj = a->j; 1086 v = a->a; 1087 ii = a->i; 1088 1089 if (!a->mult_work) { 1090 ierr = PetscMalloc((A->rmap->n+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1091 } 1092 work = a->mult_work; 1093 1094 1095 for (i=0; i<mbs; i++) { 1096 n = ii[1] - ii[0]; ncols = n*bs; 1097 workt = work; idx=aj+ii[0]; 1098 nonzerorow += (n>0); 1099 1100 /* upper triangular part */ 1101 for (j=0; j<n; j++) { 1102 xb = x_ptr + bs*(*idx++); 1103 for (k=0; k<bs; k++) workt[k] = xb[k]; 1104 workt += bs; 1105 } 1106 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1107 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1108 1109 /* strict lower triangular part */ 1110 idx = aj+ii[0]; 1111 if (*idx == i){ 1112 ncols -= bs; v += bs2; idx++; n--; 1113 } 1114 if (ncols > 0){ 1115 workt = work; 1116 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1117 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 1118 for (j=0; j<n; j++) { 1119 zb = z_ptr + bs*(*idx++); 1120 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1121 workt += bs; 1122 } 1123 } 1124 1125 x += bs; v += n*bs2; z += bs; ii++; 1126 } 1127 1128 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1129 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1130 1131 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1132 PetscFunctionReturn(0); 1133 } 1134 1135 #undef __FUNCT__ 1136 #define __FUNCT__ "MatScale_SeqSBAIJ" 1137 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha) 1138 { 1139 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data; 1140 PetscScalar oalpha = alpha; 1141 PetscErrorCode ierr; 1142 PetscBLASInt one = 1,totalnz = PetscBLASIntCast(a->bs2*a->nz); 1143 1144 PetscFunctionBegin; 1145 BLASscal_(&totalnz,&oalpha,a->a,&one); 1146 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 1147 PetscFunctionReturn(0); 1148 } 1149 1150 #undef __FUNCT__ 1151 #define __FUNCT__ "MatNorm_SeqSBAIJ" 1152 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm) 1153 { 1154 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1155 MatScalar *v = a->a; 1156 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1157 PetscInt i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j; 1158 PetscErrorCode ierr; 1159 PetscInt *jl,*il,jmin,jmax,nexti,ik,*col; 1160 1161 PetscFunctionBegin; 1162 if (type == NORM_FROBENIUS) { 1163 for (k=0; k<mbs; k++){ 1164 jmin = a->i[k]; jmax = a->i[k+1]; 1165 col = aj + jmin; 1166 if (*col == k){ /* diagonal block */ 1167 for (i=0; i<bs2; i++){ 1168 #if defined(PETSC_USE_COMPLEX) 1169 sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++; 1170 #else 1171 sum_diag += (*v)*(*v); v++; 1172 #endif 1173 } 1174 jmin++; 1175 } 1176 for (j=jmin; j<jmax; j++){ /* off-diagonal blocks */ 1177 for (i=0; i<bs2; i++){ 1178 #if defined(PETSC_USE_COMPLEX) 1179 sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++; 1180 #else 1181 sum_off += (*v)*(*v); v++; 1182 #endif 1183 } 1184 } 1185 } 1186 *norm = sqrt(sum_diag + 2*sum_off); 1187 } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 1188 ierr = PetscMalloc((2*mbs+1)*sizeof(PetscInt)+bs*sizeof(PetscReal),&il);CHKERRQ(ierr); 1189 jl = il + mbs; 1190 sum = (PetscReal*)(jl + mbs); 1191 for (i=0; i<mbs; i++) jl[i] = mbs; 1192 il[0] = 0; 1193 1194 *norm = 0.0; 1195 for (k=0; k<mbs; k++) { /* k_th block row */ 1196 for (j=0; j<bs; j++) sum[j]=0.0; 1197 /*-- col sum --*/ 1198 i = jl[k]; /* first |A(i,k)| to be added */ 1199 /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1200 at step k */ 1201 while (i<mbs){ 1202 nexti = jl[i]; /* next block row to be added */ 1203 ik = il[i]; /* block index of A(i,k) in the array a */ 1204 for (j=0; j<bs; j++){ 1205 v = a->a + ik*bs2 + j*bs; 1206 for (k1=0; k1<bs; k1++) { 1207 sum[j] += PetscAbsScalar(*v); v++; 1208 } 1209 } 1210 /* update il, jl */ 1211 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1212 jmax = a->i[i+1]; 1213 if (jmin < jmax){ 1214 il[i] = jmin; 1215 j = a->j[jmin]; 1216 jl[i] = jl[j]; jl[j]=i; 1217 } 1218 i = nexti; 1219 } 1220 /*-- row sum --*/ 1221 jmin = a->i[k]; jmax = a->i[k+1]; 1222 for (i=jmin; i<jmax; i++) { 1223 for (j=0; j<bs; j++){ 1224 v = a->a + i*bs2 + j; 1225 for (k1=0; k1<bs; k1++){ 1226 sum[j] += PetscAbsScalar(*v); v += bs; 1227 } 1228 } 1229 } 1230 /* add k_th block row to il, jl */ 1231 col = aj+jmin; 1232 if (*col == k) jmin++; 1233 if (jmin < jmax){ 1234 il[k] = jmin; 1235 j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k; 1236 } 1237 for (j=0; j<bs; j++){ 1238 if (sum[j] > *norm) *norm = sum[j]; 1239 } 1240 } 1241 ierr = PetscFree(il);CHKERRQ(ierr); 1242 } else { 1243 SETERRQ(PETSC_ERR_SUP,"No support for this norm yet"); 1244 } 1245 PetscFunctionReturn(0); 1246 } 1247 1248 #undef __FUNCT__ 1249 #define __FUNCT__ "MatEqual_SeqSBAIJ" 1250 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg) 1251 { 1252 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data; 1253 PetscErrorCode ierr; 1254 1255 PetscFunctionBegin; 1256 1257 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1258 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1259 *flg = PETSC_FALSE; 1260 PetscFunctionReturn(0); 1261 } 1262 1263 /* if the a->i are the same */ 1264 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1265 if (!*flg) { 1266 PetscFunctionReturn(0); 1267 } 1268 1269 /* if a->j are the same */ 1270 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1271 if (!*flg) { 1272 PetscFunctionReturn(0); 1273 } 1274 /* if a->a are the same */ 1275 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1276 PetscFunctionReturn(0); 1277 } 1278 1279 #undef __FUNCT__ 1280 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ" 1281 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v) 1282 { 1283 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1284 PetscErrorCode ierr; 1285 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 1286 PetscScalar *x,zero = 0.0; 1287 MatScalar *aa,*aa_j; 1288 1289 PetscFunctionBegin; 1290 bs = A->rmap->bs; 1291 if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1"); 1292 1293 aa = a->a; 1294 ai = a->i; 1295 aj = a->j; 1296 ambs = a->mbs; 1297 bs2 = a->bs2; 1298 1299 ierr = VecSet(v,zero);CHKERRQ(ierr); 1300 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1301 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1302 if (n != A->rmap->N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1303 for (i=0; i<ambs; i++) { 1304 j=ai[i]; 1305 if (aj[j] == i) { /* if this is a diagonal element */ 1306 row = i*bs; 1307 aa_j = aa + j*bs2; 1308 if (A->factor && bs==1){ 1309 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k]; 1310 } else { 1311 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1312 } 1313 } 1314 } 1315 1316 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1317 PetscFunctionReturn(0); 1318 } 1319 1320 #undef __FUNCT__ 1321 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ" 1322 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr) 1323 { 1324 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1325 PetscScalar *l,x,*li,*ri; 1326 MatScalar *aa,*v; 1327 PetscErrorCode ierr; 1328 PetscInt i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2; 1329 PetscTruth flg; 1330 1331 PetscFunctionBegin; 1332 if (ll != rr){ 1333 ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr); 1334 if (!flg) 1335 SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n"); 1336 } 1337 if (!ll) PetscFunctionReturn(0); 1338 ai = a->i; 1339 aj = a->j; 1340 aa = a->a; 1341 m = A->rmap->N; 1342 bs = A->rmap->bs; 1343 mbs = a->mbs; 1344 bs2 = a->bs2; 1345 1346 ierr = VecGetArray(ll,&l);CHKERRQ(ierr); 1347 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1348 if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 1349 for (i=0; i<mbs; i++) { /* for each block row */ 1350 M = ai[i+1] - ai[i]; 1351 li = l + i*bs; 1352 v = aa + bs2*ai[i]; 1353 for (j=0; j<M; j++) { /* for each block */ 1354 ri = l + bs*aj[ai[i]+j]; 1355 for (k=0; k<bs; k++) { 1356 x = ri[k]; 1357 for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x; 1358 } 1359 } 1360 } 1361 ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr); 1362 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1363 PetscFunctionReturn(0); 1364 } 1365 1366 #undef __FUNCT__ 1367 #define __FUNCT__ "MatGetInfo_SeqSBAIJ" 1368 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info) 1369 { 1370 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1371 1372 PetscFunctionBegin; 1373 info->block_size = a->bs2; 1374 info->nz_allocated = a->maxnz; /*num. of nonzeros in upper triangular part */ 1375 info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */ 1376 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 1377 info->assemblies = A->num_ass; 1378 info->mallocs = a->reallocs; 1379 info->memory = ((PetscObject)A)->mem; 1380 if (A->factor) { 1381 info->fill_ratio_given = A->info.fill_ratio_given; 1382 info->fill_ratio_needed = A->info.fill_ratio_needed; 1383 info->factor_mallocs = A->info.factor_mallocs; 1384 } else { 1385 info->fill_ratio_given = 0; 1386 info->fill_ratio_needed = 0; 1387 info->factor_mallocs = 0; 1388 } 1389 PetscFunctionReturn(0); 1390 } 1391 1392 1393 #undef __FUNCT__ 1394 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ" 1395 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 1396 { 1397 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1398 PetscErrorCode ierr; 1399 1400 PetscFunctionBegin; 1401 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 1402 PetscFunctionReturn(0); 1403 } 1404 1405 #undef __FUNCT__ 1406 #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ" 1407 /* 1408 This code does not work since it only checks the upper triangular part of 1409 the matrix. Hence it is not listed in the function table. 1410 */ 1411 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[]) 1412 { 1413 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1414 PetscErrorCode ierr; 1415 PetscInt i,j,n,row,col,bs,*ai,*aj,mbs; 1416 PetscReal atmp; 1417 MatScalar *aa; 1418 PetscScalar *x; 1419 PetscInt ncols,brow,bcol,krow,kcol; 1420 1421 PetscFunctionBegin; 1422 if (idx) SETERRQ(PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov"); 1423 if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1424 bs = A->rmap->bs; 1425 aa = a->a; 1426 ai = a->i; 1427 aj = a->j; 1428 mbs = a->mbs; 1429 1430 ierr = VecSet(v,0.0);CHKERRQ(ierr); 1431 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1432 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1433 if (n != A->rmap->N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1434 for (i=0; i<mbs; i++) { 1435 ncols = ai[1] - ai[0]; ai++; 1436 brow = bs*i; 1437 for (j=0; j<ncols; j++){ 1438 bcol = bs*(*aj); 1439 for (kcol=0; kcol<bs; kcol++){ 1440 col = bcol + kcol; /* col index */ 1441 for (krow=0; krow<bs; krow++){ 1442 atmp = PetscAbsScalar(*aa); aa++; 1443 row = brow + krow; /* row index */ 1444 if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1445 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 1446 } 1447 } 1448 aj++; 1449 } 1450 } 1451 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1452 PetscFunctionReturn(0); 1453 } 1454