1 #define PETSCMAT_DLL 2 3 #include "../src/mat/impls/baij/seq/baij.h" 4 #include "../src/mat/blockinvert.h" 5 #include "petscbt.h" 6 #include "../src/mat/impls/sbaij/seq/sbaij.h" 7 8 #undef __FUNCT__ 9 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ" 10 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 11 { 12 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 13 PetscErrorCode ierr; 14 PetscInt brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2; 15 const PetscInt *idx; 16 PetscBT table,table0; 17 18 PetscFunctionBegin; 19 if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 20 mbs = a->mbs; 21 ai = a->i; 22 aj = a->j; 23 bs = A->rmap->bs; 24 ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr); 25 ierr = PetscMalloc((mbs+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr); 26 ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr); 27 ierr = PetscBTCreate(mbs,table0);CHKERRQ(ierr); 28 29 for (i=0; i<is_max; i++) { /* for each is */ 30 isz = 0; 31 ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr); 32 33 /* Extract the indices, assume there can be duplicate entries */ 34 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 35 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 36 37 /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */ 38 bcol_max = 0; 39 for (j=0; j<n ; ++j){ 40 brow = idx[j]/bs; /* convert the indices into block indices */ 41 if (brow >= mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 42 if(!PetscBTLookupSet(table,brow)) { 43 nidx[isz++] = brow; 44 if (bcol_max < brow) bcol_max = brow; 45 } 46 } 47 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 48 ierr = ISDestroy(is[i]);CHKERRQ(ierr); 49 50 k = 0; 51 for (j=0; j<ov; j++){ /* for each overlap */ 52 /* set table0 for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 53 ierr = PetscBTMemzero(mbs,table0);CHKERRQ(ierr); 54 for (l=k; l<isz; l++) { ierr = PetscBTSet(table0,nidx[l]);CHKERRQ(ierr); } 55 56 n = isz; /* length of the updated is[i] */ 57 for (brow=0; brow<mbs; brow++){ 58 start = ai[brow]; end = ai[brow+1]; 59 if (PetscBTLookup(table0,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */ 60 for (l = start; l<end ; l++){ 61 bcol = aj[l]; 62 if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;} 63 } 64 k++; 65 if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 66 } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 67 for (l = start; l<end ; l++){ 68 bcol = aj[l]; 69 if (bcol > bcol_max) break; 70 if (PetscBTLookup(table0,bcol)){ 71 if (!PetscBTLookupSet(table,brow)) {nidx[isz++] = brow;} 72 break; /* for l = start; l<end ; l++) */ 73 } 74 } 75 } 76 } 77 } /* for each overlap */ 78 79 /* expand the Index Set */ 80 for (j=0; j<isz; j++) { 81 for (k=0; k<bs; k++) 82 nidx2[j*bs+k] = nidx[j]*bs+k; 83 } 84 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr); 85 } 86 ierr = PetscBTDestroy(table);CHKERRQ(ierr); 87 ierr = PetscFree(nidx);CHKERRQ(ierr); 88 ierr = PetscFree(nidx2);CHKERRQ(ierr); 89 ierr = PetscBTDestroy(table0);CHKERRQ(ierr); 90 PetscFunctionReturn(0); 91 } 92 93 #undef __FUNCT__ 94 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private" 95 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 96 { 97 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c; 98 PetscErrorCode ierr; 99 PetscInt *smap,i,k,kstart,kend,oldcols = a->mbs,*lens; 100 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 101 PetscInt nrows,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 102 const PetscInt *irow,*aj = a->j,*ai = a->i; 103 MatScalar *mat_a; 104 Mat C; 105 PetscTruth flag,sorted; 106 107 PetscFunctionBegin; 108 if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro"); 109 ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr); 110 if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 111 112 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 113 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 114 115 ierr = PetscMalloc(oldcols*sizeof(PetscInt),&smap);CHKERRQ(ierr); 116 ierr = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr); 117 ssmap = smap; 118 ierr = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr); 119 for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */ 120 /* determine lens of each row */ 121 for (i=0; i<nrows; i++) { 122 kstart = ai[irow[i]]; 123 kend = kstart + a->ilen[irow[i]]; 124 lens[i] = 0; 125 for (k=kstart; k<kend; k++) { 126 if (ssmap[aj[k]]) { 127 lens[i]++; 128 } 129 } 130 } 131 /* Create and fill new matrix */ 132 if (scall == MAT_REUSE_MATRIX) { 133 c = (Mat_SeqSBAIJ *)((*B)->data); 134 135 if (c->mbs!=nrows || (*B)->rmap->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 136 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 137 if (!flag) { 138 SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 139 } 140 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 141 C = *B; 142 } else { 143 ierr = MatCreate(((PetscObject)A)->comm,&C);CHKERRQ(ierr); 144 ierr = MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 145 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 146 ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr); 147 } 148 c = (Mat_SeqSBAIJ *)(C->data); 149 for (i=0; i<nrows; i++) { 150 row = irow[i]; 151 kstart = ai[row]; 152 kend = kstart + a->ilen[row]; 153 mat_i = c->i[i]; 154 mat_j = c->j + mat_i; 155 mat_a = c->a + mat_i*bs2; 156 mat_ilen = c->ilen + i; 157 for (k=kstart; k<kend; k++) { 158 if ((tcol=ssmap[a->j[k]])) { 159 *mat_j++ = tcol - 1; 160 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 161 mat_a += bs2; 162 (*mat_ilen)++; 163 } 164 } 165 } 166 167 /* Free work space */ 168 ierr = PetscFree(smap);CHKERRQ(ierr); 169 ierr = PetscFree(lens);CHKERRQ(ierr); 170 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 171 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 172 173 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 174 *B = C; 175 PetscFunctionReturn(0); 176 } 177 178 #undef __FUNCT__ 179 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ" 180 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 181 { 182 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 183 IS is1; 184 PetscErrorCode ierr; 185 PetscInt *vary,*iary,nrows,i,bs=A->rmap->bs,count; 186 const PetscInt *irow; 187 188 PetscFunctionBegin; 189 if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro"); 190 191 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 192 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 193 194 /* Verify if the indices corespond to each element in a block 195 and form the IS with compressed IS */ 196 ierr = PetscMalloc2(a->mbs,PetscInt,&vary,a->mbs,PetscInt,&iary);CHKERRQ(ierr); 197 ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr); 198 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 199 200 count = 0; 201 for (i=0; i<a->mbs; i++) { 202 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_INCOMP,"Index set does not match blocks"); 203 if (vary[i]==bs) iary[count++] = i; 204 } 205 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 206 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr); 207 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 208 209 ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,scall,B);CHKERRQ(ierr); 210 ierr = ISDestroy(is1);CHKERRQ(ierr); 211 PetscFunctionReturn(0); 212 } 213 214 #undef __FUNCT__ 215 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ" 216 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 217 { 218 PetscErrorCode ierr; 219 PetscInt i; 220 221 PetscFunctionBegin; 222 if (scall == MAT_INITIAL_MATRIX) { 223 ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr); 224 } 225 226 for (i=0; i<n; i++) { 227 ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 228 } 229 PetscFunctionReturn(0); 230 } 231 232 /* -------------------------------------------------------*/ 233 /* Should check that shapes of vectors and matrices match */ 234 /* -------------------------------------------------------*/ 235 236 #undef __FUNCT__ 237 #define __FUNCT__ "MatMult_SeqSBAIJ_2" 238 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz) 239 { 240 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 241 PetscScalar *x,*z,*xb,x1,x2,zero=0.0; 242 MatScalar *v; 243 PetscErrorCode ierr; 244 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 245 PetscInt nonzerorow=0; 246 247 PetscFunctionBegin; 248 ierr = VecSet(zz,zero);CHKERRQ(ierr); 249 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 250 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 251 252 v = a->a; 253 xb = x; 254 255 for (i=0; i<mbs; i++) { 256 n = ai[1] - ai[0]; /* length of i_th block row of A */ 257 x1 = xb[0]; x2 = xb[1]; 258 ib = aj + *ai; 259 jmin = 0; 260 nonzerorow += (n>0); 261 if (*ib == i){ /* (diag of A)*x */ 262 z[2*i] += v[0]*x1 + v[2]*x2; 263 z[2*i+1] += v[2]*x1 + v[3]*x2; 264 v += 4; jmin++; 265 } 266 for (j=jmin; j<n; j++) { 267 /* (strict lower triangular part of A)*x */ 268 cval = ib[j]*2; 269 z[cval] += v[0]*x1 + v[1]*x2; 270 z[cval+1] += v[2]*x1 + v[3]*x2; 271 /* (strict upper triangular part of A)*x */ 272 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 273 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 274 v += 4; 275 } 276 xb +=2; ai++; 277 } 278 279 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 280 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 281 ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 282 PetscFunctionReturn(0); 283 } 284 285 #undef __FUNCT__ 286 #define __FUNCT__ "MatMult_SeqSBAIJ_3" 287 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz) 288 { 289 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 290 PetscScalar *x,*z,*xb,x1,x2,x3,zero=0.0; 291 MatScalar *v; 292 PetscErrorCode ierr; 293 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 294 PetscInt nonzerorow=0; 295 296 PetscFunctionBegin; 297 ierr = VecSet(zz,zero);CHKERRQ(ierr); 298 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 299 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 300 301 v = a->a; 302 xb = x; 303 304 for (i=0; i<mbs; i++) { 305 n = ai[1] - ai[0]; /* length of i_th block row of A */ 306 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 307 ib = aj + *ai; 308 jmin = 0; 309 nonzerorow += (n>0); 310 if (*ib == i){ /* (diag of A)*x */ 311 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 312 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 313 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 314 v += 9; jmin++; 315 } 316 for (j=jmin; j<n; j++) { 317 /* (strict lower triangular part of A)*x */ 318 cval = ib[j]*3; 319 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 320 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 321 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 322 /* (strict upper triangular part of A)*x */ 323 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 324 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 325 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 326 v += 9; 327 } 328 xb +=3; ai++; 329 } 330 331 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 332 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 333 ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 334 PetscFunctionReturn(0); 335 } 336 337 #undef __FUNCT__ 338 #define __FUNCT__ "MatMult_SeqSBAIJ_4" 339 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz) 340 { 341 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 342 PetscScalar *x,*z,*xb,x1,x2,x3,x4,zero=0.0; 343 MatScalar *v; 344 PetscErrorCode ierr; 345 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 346 PetscInt nonzerorow=0; 347 348 PetscFunctionBegin; 349 ierr = VecSet(zz,zero);CHKERRQ(ierr); 350 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 351 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 352 353 v = a->a; 354 xb = x; 355 356 for (i=0; i<mbs; i++) { 357 n = ai[1] - ai[0]; /* length of i_th block row of A */ 358 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 359 ib = aj + *ai; 360 jmin = 0; 361 nonzerorow += (n>0); 362 if (*ib == i){ /* (diag of A)*x */ 363 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 364 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 365 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 366 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 367 v += 16; jmin++; 368 } 369 for (j=jmin; j<n; j++) { 370 /* (strict lower triangular part of A)*x */ 371 cval = ib[j]*4; 372 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 373 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 374 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 375 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 376 /* (strict upper triangular part of A)*x */ 377 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 378 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 379 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 380 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 381 v += 16; 382 } 383 xb +=4; ai++; 384 } 385 386 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 387 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 388 ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 389 PetscFunctionReturn(0); 390 } 391 392 #undef __FUNCT__ 393 #define __FUNCT__ "MatMult_SeqSBAIJ_5" 394 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz) 395 { 396 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 397 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0; 398 MatScalar *v; 399 PetscErrorCode ierr; 400 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 401 PetscInt nonzerorow=0; 402 403 PetscFunctionBegin; 404 ierr = VecSet(zz,zero);CHKERRQ(ierr); 405 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 406 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 407 408 v = a->a; 409 xb = x; 410 411 for (i=0; i<mbs; i++) { 412 n = ai[1] - ai[0]; /* length of i_th block row of A */ 413 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 414 ib = aj + *ai; 415 jmin = 0; 416 nonzerorow += (n>0); 417 if (*ib == i){ /* (diag of A)*x */ 418 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 419 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 420 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 421 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 422 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 423 v += 25; jmin++; 424 } 425 for (j=jmin; j<n; j++) { 426 /* (strict lower triangular part of A)*x */ 427 cval = ib[j]*5; 428 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 429 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 430 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 431 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 432 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 433 /* (strict upper triangular part of A)*x */ 434 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 435 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 436 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 437 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 438 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 439 v += 25; 440 } 441 xb +=5; ai++; 442 } 443 444 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 445 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 446 ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 447 PetscFunctionReturn(0); 448 } 449 450 451 #undef __FUNCT__ 452 #define __FUNCT__ "MatMult_SeqSBAIJ_6" 453 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz) 454 { 455 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 456 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0; 457 MatScalar *v; 458 PetscErrorCode ierr; 459 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 460 PetscInt nonzerorow=0; 461 462 PetscFunctionBegin; 463 ierr = VecSet(zz,zero);CHKERRQ(ierr); 464 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 465 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 466 467 v = a->a; 468 xb = x; 469 470 for (i=0; i<mbs; i++) { 471 n = ai[1] - ai[0]; /* length of i_th block row of A */ 472 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 473 ib = aj + *ai; 474 jmin = 0; 475 nonzerorow += (n>0); 476 if (*ib == i){ /* (diag of A)*x */ 477 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 478 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 479 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 480 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 481 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 482 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 483 v += 36; jmin++; 484 } 485 for (j=jmin; j<n; j++) { 486 /* (strict lower triangular part of A)*x */ 487 cval = ib[j]*6; 488 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 489 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 490 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 491 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 492 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 493 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 494 /* (strict upper triangular part of A)*x */ 495 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 496 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 497 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 498 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 499 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 500 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 501 v += 36; 502 } 503 xb +=6; ai++; 504 } 505 506 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 507 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 508 ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 509 PetscFunctionReturn(0); 510 } 511 #undef __FUNCT__ 512 #define __FUNCT__ "MatMult_SeqSBAIJ_7" 513 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz) 514 { 515 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 516 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0; 517 MatScalar *v; 518 PetscErrorCode ierr; 519 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 520 PetscInt nonzerorow=0; 521 522 PetscFunctionBegin; 523 ierr = VecSet(zz,zero);CHKERRQ(ierr); 524 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 525 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 526 527 v = a->a; 528 xb = x; 529 530 for (i=0; i<mbs; i++) { 531 n = ai[1] - ai[0]; /* length of i_th block row of A */ 532 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 533 ib = aj + *ai; 534 jmin = 0; 535 nonzerorow += (n>0); 536 if (*ib == i){ /* (diag of A)*x */ 537 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 538 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 539 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 540 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 541 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 542 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 543 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 544 v += 49; jmin++; 545 } 546 for (j=jmin; j<n; j++) { 547 /* (strict lower triangular part of A)*x */ 548 cval = ib[j]*7; 549 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 550 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 551 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 552 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 553 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 554 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 555 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 556 /* (strict upper triangular part of A)*x */ 557 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 558 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 559 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 560 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 561 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 562 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 563 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 564 v += 49; 565 } 566 xb +=7; ai++; 567 } 568 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 569 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 570 ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 571 PetscFunctionReturn(0); 572 } 573 574 /* 575 This will not work with MatScalar == float because it calls the BLAS 576 */ 577 #undef __FUNCT__ 578 #define __FUNCT__ "MatMult_SeqSBAIJ_N" 579 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz) 580 { 581 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 582 PetscScalar *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0; 583 MatScalar *v; 584 PetscErrorCode ierr; 585 PetscInt mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 586 PetscInt nonzerorow=0; 587 588 PetscFunctionBegin; 589 ierr = VecSet(zz,zero);CHKERRQ(ierr); 590 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 591 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 592 593 aj = a->j; 594 v = a->a; 595 ii = a->i; 596 597 if (!a->mult_work) { 598 ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 599 } 600 work = a->mult_work; 601 602 for (i=0; i<mbs; i++) { 603 n = ii[1] - ii[0]; ncols = n*bs; 604 workt = work; idx=aj+ii[0]; 605 nonzerorow += (n>0); 606 607 /* upper triangular part */ 608 for (j=0; j<n; j++) { 609 xb = x_ptr + bs*(*idx++); 610 for (k=0; k<bs; k++) workt[k] = xb[k]; 611 workt += bs; 612 } 613 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 614 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 615 616 /* strict lower triangular part */ 617 idx = aj+ii[0]; 618 if (*idx == i){ 619 ncols -= bs; v += bs2; idx++; n--; 620 } 621 622 if (ncols > 0){ 623 workt = work; 624 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 625 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 626 for (j=0; j<n; j++) { 627 zb = z_ptr + bs*(*idx++); 628 for (k=0; k<bs; k++) zb[k] += workt[k] ; 629 workt += bs; 630 } 631 } 632 x += bs; v += n*bs2; z += bs; ii++; 633 } 634 635 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 636 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 637 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr); 638 PetscFunctionReturn(0); 639 } 640 641 extern PetscErrorCode VecCopy_Seq(Vec,Vec); 642 #undef __FUNCT__ 643 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1" 644 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 645 { 646 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 647 PetscScalar *x,*z,*xb,x1; 648 MatScalar *v; 649 PetscErrorCode ierr; 650 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 651 PetscInt nonzerorow=0; 652 653 PetscFunctionBegin; 654 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 655 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 656 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 657 v = a->a; 658 xb = x; 659 660 for (i=0; i<mbs; i++) { 661 n = ai[1] - ai[0]; /* length of i_th row of A */ 662 x1 = xb[0]; 663 ib = aj + *ai; 664 jmin = 0; 665 nonzerorow += (n>0); 666 if (*ib == i) { /* (diag of A)*x */ 667 z[i] += *v++ * x[*ib++]; jmin++; 668 } 669 for (j=jmin; j<n; j++) { 670 cval = *ib; 671 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 672 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 673 } 674 xb++; ai++; 675 } 676 677 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 678 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 679 680 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 681 PetscFunctionReturn(0); 682 } 683 684 #undef __FUNCT__ 685 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2" 686 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 687 { 688 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 689 PetscScalar *x,*z,*xb,x1,x2; 690 MatScalar *v; 691 PetscErrorCode ierr; 692 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 693 PetscInt nonzerorow=0; 694 695 PetscFunctionBegin; 696 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 697 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 698 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 699 700 v = a->a; 701 xb = x; 702 703 for (i=0; i<mbs; i++) { 704 n = ai[1] - ai[0]; /* length of i_th block row of A */ 705 x1 = xb[0]; x2 = xb[1]; 706 ib = aj + *ai; 707 jmin = 0; 708 nonzerorow += (n>0); 709 if (*ib == i){ /* (diag of A)*x */ 710 z[2*i] += v[0]*x1 + v[2]*x2; 711 z[2*i+1] += v[2]*x1 + v[3]*x2; 712 v += 4; jmin++; 713 } 714 for (j=jmin; j<n; j++) { 715 /* (strict lower triangular part of A)*x */ 716 cval = ib[j]*2; 717 z[cval] += v[0]*x1 + v[1]*x2; 718 z[cval+1] += v[2]*x1 + v[3]*x2; 719 /* (strict upper triangular part of A)*x */ 720 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 721 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 722 v += 4; 723 } 724 xb +=2; ai++; 725 } 726 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 727 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 728 729 ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 730 PetscFunctionReturn(0); 731 } 732 733 #undef __FUNCT__ 734 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3" 735 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 736 { 737 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 738 PetscScalar *x,*z,*xb,x1,x2,x3; 739 MatScalar *v; 740 PetscErrorCode ierr; 741 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 742 PetscInt nonzerorow=0; 743 744 PetscFunctionBegin; 745 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 746 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 747 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 748 749 v = a->a; 750 xb = x; 751 752 for (i=0; i<mbs; i++) { 753 n = ai[1] - ai[0]; /* length of i_th block row of A */ 754 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 755 ib = aj + *ai; 756 jmin = 0; 757 nonzerorow += (n>0); 758 if (*ib == i){ /* (diag of A)*x */ 759 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 760 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 761 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 762 v += 9; jmin++; 763 } 764 for (j=jmin; j<n; j++) { 765 /* (strict lower triangular part of A)*x */ 766 cval = ib[j]*3; 767 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 768 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 769 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 770 /* (strict upper triangular part of A)*x */ 771 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 772 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 773 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 774 v += 9; 775 } 776 xb +=3; ai++; 777 } 778 779 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 780 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 781 782 ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 783 PetscFunctionReturn(0); 784 } 785 786 #undef __FUNCT__ 787 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4" 788 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 789 { 790 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 791 PetscScalar *x,*z,*xb,x1,x2,x3,x4; 792 MatScalar *v; 793 PetscErrorCode ierr; 794 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 795 PetscInt nonzerorow=0; 796 797 PetscFunctionBegin; 798 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 799 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 800 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 801 802 v = a->a; 803 xb = x; 804 805 for (i=0; i<mbs; i++) { 806 n = ai[1] - ai[0]; /* length of i_th block row of A */ 807 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 808 ib = aj + *ai; 809 jmin = 0; 810 nonzerorow += (n>0); 811 if (*ib == i){ /* (diag of A)*x */ 812 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 813 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 814 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 815 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 816 v += 16; jmin++; 817 } 818 for (j=jmin; j<n; j++) { 819 /* (strict lower triangular part of A)*x */ 820 cval = ib[j]*4; 821 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 822 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 823 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 824 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 825 /* (strict upper triangular part of A)*x */ 826 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 827 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 828 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 829 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 830 v += 16; 831 } 832 xb +=4; ai++; 833 } 834 835 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 836 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 837 838 ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 839 PetscFunctionReturn(0); 840 } 841 842 #undef __FUNCT__ 843 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5" 844 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 845 { 846 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 847 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5; 848 MatScalar *v; 849 PetscErrorCode ierr; 850 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 851 PetscInt nonzerorow=0; 852 853 PetscFunctionBegin; 854 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 855 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 856 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 857 858 v = a->a; 859 xb = x; 860 861 for (i=0; i<mbs; i++) { 862 n = ai[1] - ai[0]; /* length of i_th block row of A */ 863 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 864 ib = aj + *ai; 865 jmin = 0; 866 nonzerorow += (n>0); 867 if (*ib == i){ /* (diag of A)*x */ 868 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 869 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 870 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 871 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 872 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 873 v += 25; jmin++; 874 } 875 for (j=jmin; j<n; j++) { 876 /* (strict lower triangular part of A)*x */ 877 cval = ib[j]*5; 878 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 879 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 880 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 881 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 882 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 883 /* (strict upper triangular part of A)*x */ 884 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 885 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 886 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 887 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 888 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 889 v += 25; 890 } 891 xb +=5; ai++; 892 } 893 894 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 895 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 896 897 ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 898 PetscFunctionReturn(0); 899 } 900 #undef __FUNCT__ 901 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6" 902 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 903 { 904 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 905 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6; 906 MatScalar *v; 907 PetscErrorCode ierr; 908 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 909 PetscInt nonzerorow=0; 910 911 PetscFunctionBegin; 912 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 913 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 914 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 915 916 v = a->a; 917 xb = x; 918 919 for (i=0; i<mbs; i++) { 920 n = ai[1] - ai[0]; /* length of i_th block row of A */ 921 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 922 ib = aj + *ai; 923 jmin = 0; 924 nonzerorow += (n>0); 925 if (*ib == i){ /* (diag of A)*x */ 926 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 927 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 928 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 929 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 930 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 931 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 932 v += 36; jmin++; 933 } 934 for (j=jmin; j<n; j++) { 935 /* (strict lower triangular part of A)*x */ 936 cval = ib[j]*6; 937 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 938 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 939 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 940 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 941 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 942 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 943 /* (strict upper triangular part of A)*x */ 944 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 945 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 946 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 947 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 948 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 949 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 950 v += 36; 951 } 952 xb +=6; ai++; 953 } 954 955 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 956 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 957 958 ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 959 PetscFunctionReturn(0); 960 } 961 962 #undef __FUNCT__ 963 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7" 964 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 965 { 966 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 967 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7; 968 MatScalar *v; 969 PetscErrorCode ierr; 970 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 971 PetscInt nonzerorow=0; 972 973 PetscFunctionBegin; 974 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 975 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 976 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 977 978 v = a->a; 979 xb = x; 980 981 for (i=0; i<mbs; i++) { 982 n = ai[1] - ai[0]; /* length of i_th block row of A */ 983 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 984 ib = aj + *ai; 985 jmin = 0; 986 nonzerorow += (n>0); 987 if (*ib == i){ /* (diag of A)*x */ 988 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 989 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 990 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 991 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 992 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 993 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 994 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 995 v += 49; jmin++; 996 } 997 for (j=jmin; j<n; j++) { 998 /* (strict lower triangular part of A)*x */ 999 cval = ib[j]*7; 1000 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 1001 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 1002 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 1003 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 1004 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 1005 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 1006 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1007 /* (strict upper triangular part of A)*x */ 1008 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 1009 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 1010 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 1011 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 1012 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 1013 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 1014 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 1015 v += 49; 1016 } 1017 xb +=7; ai++; 1018 } 1019 1020 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1021 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1022 1023 ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1024 PetscFunctionReturn(0); 1025 } 1026 1027 #undef __FUNCT__ 1028 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N" 1029 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1030 { 1031 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1032 PetscScalar *x,*x_ptr,*z,*z_ptr=0,*xb,*zb,*work,*workt; 1033 MatScalar *v; 1034 PetscErrorCode ierr; 1035 PetscInt mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 1036 PetscInt nonzerorow=0; 1037 1038 PetscFunctionBegin; 1039 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 1040 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 1041 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 1042 1043 aj = a->j; 1044 v = a->a; 1045 ii = a->i; 1046 1047 if (!a->mult_work) { 1048 ierr = PetscMalloc((A->rmap->n+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1049 } 1050 work = a->mult_work; 1051 1052 1053 for (i=0; i<mbs; i++) { 1054 n = ii[1] - ii[0]; ncols = n*bs; 1055 workt = work; idx=aj+ii[0]; 1056 nonzerorow += (n>0); 1057 1058 /* upper triangular part */ 1059 for (j=0; j<n; j++) { 1060 xb = x_ptr + bs*(*idx++); 1061 for (k=0; k<bs; k++) workt[k] = xb[k]; 1062 workt += bs; 1063 } 1064 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1065 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1066 1067 /* strict lower triangular part */ 1068 idx = aj+ii[0]; 1069 if (*idx == i){ 1070 ncols -= bs; v += bs2; idx++; n--; 1071 } 1072 if (ncols > 0){ 1073 workt = work; 1074 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1075 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 1076 for (j=0; j<n; j++) { 1077 zb = z_ptr + bs*(*idx++); 1078 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1079 workt += bs; 1080 } 1081 } 1082 1083 x += bs; v += n*bs2; z += bs; ii++; 1084 } 1085 1086 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1087 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1088 1089 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1090 PetscFunctionReturn(0); 1091 } 1092 1093 #undef __FUNCT__ 1094 #define __FUNCT__ "MatScale_SeqSBAIJ" 1095 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha) 1096 { 1097 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data; 1098 PetscScalar oalpha = alpha; 1099 PetscErrorCode ierr; 1100 PetscBLASInt one = 1,totalnz = PetscBLASIntCast(a->bs2*a->nz); 1101 1102 PetscFunctionBegin; 1103 BLASscal_(&totalnz,&oalpha,a->a,&one); 1104 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 1105 PetscFunctionReturn(0); 1106 } 1107 1108 #undef __FUNCT__ 1109 #define __FUNCT__ "MatNorm_SeqSBAIJ" 1110 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm) 1111 { 1112 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1113 MatScalar *v = a->a; 1114 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1115 PetscInt i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j; 1116 PetscErrorCode ierr; 1117 PetscInt *jl,*il,jmin,jmax,nexti,ik,*col; 1118 1119 PetscFunctionBegin; 1120 if (type == NORM_FROBENIUS) { 1121 for (k=0; k<mbs; k++){ 1122 jmin = a->i[k]; jmax = a->i[k+1]; 1123 col = aj + jmin; 1124 if (*col == k){ /* diagonal block */ 1125 for (i=0; i<bs2; i++){ 1126 #if defined(PETSC_USE_COMPLEX) 1127 sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++; 1128 #else 1129 sum_diag += (*v)*(*v); v++; 1130 #endif 1131 } 1132 jmin++; 1133 } 1134 for (j=jmin; j<jmax; j++){ /* off-diagonal blocks */ 1135 for (i=0; i<bs2; i++){ 1136 #if defined(PETSC_USE_COMPLEX) 1137 sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++; 1138 #else 1139 sum_off += (*v)*(*v); v++; 1140 #endif 1141 } 1142 } 1143 } 1144 *norm = sqrt(sum_diag + 2*sum_off); 1145 } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 1146 ierr = PetscMalloc3(bs,PetscReal,&sum,mbs,PetscInt,&il,mbs,PetscInt,&jl);CHKERRQ(ierr); 1147 for (i=0; i<mbs; i++) jl[i] = mbs; 1148 il[0] = 0; 1149 1150 *norm = 0.0; 1151 for (k=0; k<mbs; k++) { /* k_th block row */ 1152 for (j=0; j<bs; j++) sum[j]=0.0; 1153 /*-- col sum --*/ 1154 i = jl[k]; /* first |A(i,k)| to be added */ 1155 /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1156 at step k */ 1157 while (i<mbs){ 1158 nexti = jl[i]; /* next block row to be added */ 1159 ik = il[i]; /* block index of A(i,k) in the array a */ 1160 for (j=0; j<bs; j++){ 1161 v = a->a + ik*bs2 + j*bs; 1162 for (k1=0; k1<bs; k1++) { 1163 sum[j] += PetscAbsScalar(*v); v++; 1164 } 1165 } 1166 /* update il, jl */ 1167 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1168 jmax = a->i[i+1]; 1169 if (jmin < jmax){ 1170 il[i] = jmin; 1171 j = a->j[jmin]; 1172 jl[i] = jl[j]; jl[j]=i; 1173 } 1174 i = nexti; 1175 } 1176 /*-- row sum --*/ 1177 jmin = a->i[k]; jmax = a->i[k+1]; 1178 for (i=jmin; i<jmax; i++) { 1179 for (j=0; j<bs; j++){ 1180 v = a->a + i*bs2 + j; 1181 for (k1=0; k1<bs; k1++){ 1182 sum[j] += PetscAbsScalar(*v); v += bs; 1183 } 1184 } 1185 } 1186 /* add k_th block row to il, jl */ 1187 col = aj+jmin; 1188 if (*col == k) jmin++; 1189 if (jmin < jmax){ 1190 il[k] = jmin; 1191 j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k; 1192 } 1193 for (j=0; j<bs; j++){ 1194 if (sum[j] > *norm) *norm = sum[j]; 1195 } 1196 } 1197 ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr); 1198 } else { 1199 SETERRQ(PETSC_ERR_SUP,"No support for this norm yet"); 1200 } 1201 PetscFunctionReturn(0); 1202 } 1203 1204 #undef __FUNCT__ 1205 #define __FUNCT__ "MatEqual_SeqSBAIJ" 1206 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg) 1207 { 1208 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data; 1209 PetscErrorCode ierr; 1210 1211 PetscFunctionBegin; 1212 1213 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1214 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1215 *flg = PETSC_FALSE; 1216 PetscFunctionReturn(0); 1217 } 1218 1219 /* if the a->i are the same */ 1220 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1221 if (!*flg) { 1222 PetscFunctionReturn(0); 1223 } 1224 1225 /* if a->j are the same */ 1226 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1227 if (!*flg) { 1228 PetscFunctionReturn(0); 1229 } 1230 /* if a->a are the same */ 1231 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1232 PetscFunctionReturn(0); 1233 } 1234 1235 #undef __FUNCT__ 1236 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ" 1237 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v) 1238 { 1239 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1240 PetscErrorCode ierr; 1241 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 1242 PetscScalar *x,zero = 0.0; 1243 MatScalar *aa,*aa_j; 1244 1245 PetscFunctionBegin; 1246 bs = A->rmap->bs; 1247 if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1"); 1248 1249 aa = a->a; 1250 ai = a->i; 1251 aj = a->j; 1252 ambs = a->mbs; 1253 bs2 = a->bs2; 1254 1255 ierr = VecSet(v,zero);CHKERRQ(ierr); 1256 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1257 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1258 if (n != A->rmap->N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1259 for (i=0; i<ambs; i++) { 1260 j=ai[i]; 1261 if (aj[j] == i) { /* if this is a diagonal element */ 1262 row = i*bs; 1263 aa_j = aa + j*bs2; 1264 if (A->factor && bs==1){ 1265 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k]; 1266 } else { 1267 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1268 } 1269 } 1270 } 1271 1272 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1273 PetscFunctionReturn(0); 1274 } 1275 1276 #undef __FUNCT__ 1277 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ" 1278 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr) 1279 { 1280 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1281 PetscScalar *l,x,*li,*ri; 1282 MatScalar *aa,*v; 1283 PetscErrorCode ierr; 1284 PetscInt i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2; 1285 PetscTruth flg; 1286 1287 PetscFunctionBegin; 1288 if (ll != rr){ 1289 ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr); 1290 if (!flg) 1291 SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n"); 1292 } 1293 if (!ll) PetscFunctionReturn(0); 1294 ai = a->i; 1295 aj = a->j; 1296 aa = a->a; 1297 m = A->rmap->N; 1298 bs = A->rmap->bs; 1299 mbs = a->mbs; 1300 bs2 = a->bs2; 1301 1302 ierr = VecGetArray(ll,&l);CHKERRQ(ierr); 1303 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1304 if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 1305 for (i=0; i<mbs; i++) { /* for each block row */ 1306 M = ai[i+1] - ai[i]; 1307 li = l + i*bs; 1308 v = aa + bs2*ai[i]; 1309 for (j=0; j<M; j++) { /* for each block */ 1310 ri = l + bs*aj[ai[i]+j]; 1311 for (k=0; k<bs; k++) { 1312 x = ri[k]; 1313 for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x; 1314 } 1315 } 1316 } 1317 ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr); 1318 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1319 PetscFunctionReturn(0); 1320 } 1321 1322 #undef __FUNCT__ 1323 #define __FUNCT__ "MatGetInfo_SeqSBAIJ" 1324 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info) 1325 { 1326 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1327 1328 PetscFunctionBegin; 1329 info->block_size = a->bs2; 1330 info->nz_allocated = a->maxnz; /*num. of nonzeros in upper triangular part */ 1331 info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */ 1332 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 1333 info->assemblies = A->num_ass; 1334 info->mallocs = a->reallocs; 1335 info->memory = ((PetscObject)A)->mem; 1336 if (A->factor) { 1337 info->fill_ratio_given = A->info.fill_ratio_given; 1338 info->fill_ratio_needed = A->info.fill_ratio_needed; 1339 info->factor_mallocs = A->info.factor_mallocs; 1340 } else { 1341 info->fill_ratio_given = 0; 1342 info->fill_ratio_needed = 0; 1343 info->factor_mallocs = 0; 1344 } 1345 PetscFunctionReturn(0); 1346 } 1347 1348 1349 #undef __FUNCT__ 1350 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ" 1351 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 1352 { 1353 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1354 PetscErrorCode ierr; 1355 1356 PetscFunctionBegin; 1357 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 1358 PetscFunctionReturn(0); 1359 } 1360 1361 #undef __FUNCT__ 1362 #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ" 1363 /* 1364 This code does not work since it only checks the upper triangular part of 1365 the matrix. Hence it is not listed in the function table. 1366 */ 1367 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[]) 1368 { 1369 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1370 PetscErrorCode ierr; 1371 PetscInt i,j,n,row,col,bs,*ai,*aj,mbs; 1372 PetscReal atmp; 1373 MatScalar *aa; 1374 PetscScalar *x; 1375 PetscInt ncols,brow,bcol,krow,kcol; 1376 1377 PetscFunctionBegin; 1378 if (idx) SETERRQ(PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov"); 1379 if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1380 bs = A->rmap->bs; 1381 aa = a->a; 1382 ai = a->i; 1383 aj = a->j; 1384 mbs = a->mbs; 1385 1386 ierr = VecSet(v,0.0);CHKERRQ(ierr); 1387 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1388 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1389 if (n != A->rmap->N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1390 for (i=0; i<mbs; i++) { 1391 ncols = ai[1] - ai[0]; ai++; 1392 brow = bs*i; 1393 for (j=0; j<ncols; j++){ 1394 bcol = bs*(*aj); 1395 for (kcol=0; kcol<bs; kcol++){ 1396 col = bcol + kcol; /* col index */ 1397 for (krow=0; krow<bs; krow++){ 1398 atmp = PetscAbsScalar(*aa); aa++; 1399 row = brow + krow; /* row index */ 1400 if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1401 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 1402 } 1403 } 1404 aj++; 1405 } 1406 } 1407 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1408 PetscFunctionReturn(0); 1409 } 1410