1 #define PETSCMAT_DLL 2 3 #include "../src/mat/impls/baij/seq/baij.h" 4 #include "../src/mat/blockinvert.h" 5 #include "petscbt.h" 6 #include "../src/mat/impls/sbaij/seq/sbaij.h" 7 8 #undef __FUNCT__ 9 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ" 10 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 11 { 12 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 13 PetscErrorCode ierr; 14 PetscInt brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2; 15 const PetscInt *idx; 16 PetscBT table,table0; 17 18 PetscFunctionBegin; 19 if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 20 mbs = a->mbs; 21 ai = a->i; 22 aj = a->j; 23 bs = A->rmap->bs; 24 ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr); 25 ierr = PetscMalloc((mbs+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr); 26 ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr); 27 ierr = PetscBTCreate(mbs,table0);CHKERRQ(ierr); 28 29 for (i=0; i<is_max; i++) { /* for each is */ 30 isz = 0; 31 ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr); 32 33 /* Extract the indices, assume there can be duplicate entries */ 34 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 35 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 36 37 /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */ 38 bcol_max = 0; 39 for (j=0; j<n ; ++j){ 40 brow = idx[j]/bs; /* convert the indices into block indices */ 41 if (brow >= mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 42 if(!PetscBTLookupSet(table,brow)) { 43 nidx[isz++] = brow; 44 if (bcol_max < brow) bcol_max = brow; 45 } 46 } 47 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 48 ierr = ISDestroy(is[i]);CHKERRQ(ierr); 49 50 k = 0; 51 for (j=0; j<ov; j++){ /* for each overlap */ 52 /* set table0 for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 53 ierr = PetscBTMemzero(mbs,table0);CHKERRQ(ierr); 54 for (l=k; l<isz; l++) { ierr = PetscBTSet(table0,nidx[l]);CHKERRQ(ierr); } 55 56 n = isz; /* length of the updated is[i] */ 57 for (brow=0; brow<mbs; brow++){ 58 start = ai[brow]; end = ai[brow+1]; 59 if (PetscBTLookup(table0,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */ 60 for (l = start; l<end ; l++){ 61 bcol = aj[l]; 62 if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;} 63 } 64 k++; 65 if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 66 } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 67 for (l = start; l<end ; l++){ 68 bcol = aj[l]; 69 if (bcol > bcol_max) break; 70 if (PetscBTLookup(table0,bcol)){ 71 if (!PetscBTLookupSet(table,brow)) {nidx[isz++] = brow;} 72 break; /* for l = start; l<end ; l++) */ 73 } 74 } 75 } 76 } 77 } /* for each overlap */ 78 79 /* expand the Index Set */ 80 for (j=0; j<isz; j++) { 81 for (k=0; k<bs; k++) 82 nidx2[j*bs+k] = nidx[j]*bs+k; 83 } 84 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr); 85 } 86 ierr = PetscBTDestroy(table);CHKERRQ(ierr); 87 ierr = PetscFree(nidx);CHKERRQ(ierr); 88 ierr = PetscFree(nidx2);CHKERRQ(ierr); 89 ierr = PetscBTDestroy(table0);CHKERRQ(ierr); 90 PetscFunctionReturn(0); 91 } 92 93 #undef __FUNCT__ 94 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private" 95 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 96 { 97 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c; 98 PetscErrorCode ierr; 99 PetscInt *smap,i,k,kstart,kend,oldcols = a->mbs,*lens; 100 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 101 PetscInt nrows,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 102 const PetscInt *irow,*aj = a->j,*ai = a->i; 103 MatScalar *mat_a; 104 Mat C; 105 PetscTruth flag,sorted; 106 107 PetscFunctionBegin; 108 if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro"); 109 ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr); 110 if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 111 112 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 113 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 114 115 ierr = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr); 116 ssmap = smap; 117 ierr = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr); 118 ierr = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr); 119 for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */ 120 /* determine lens of each row */ 121 for (i=0; i<nrows; i++) { 122 kstart = ai[irow[i]]; 123 kend = kstart + a->ilen[irow[i]]; 124 lens[i] = 0; 125 for (k=kstart; k<kend; k++) { 126 if (ssmap[aj[k]]) { 127 lens[i]++; 128 } 129 } 130 } 131 /* Create and fill new matrix */ 132 if (scall == MAT_REUSE_MATRIX) { 133 c = (Mat_SeqSBAIJ *)((*B)->data); 134 135 if (c->mbs!=nrows || (*B)->rmap->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 136 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 137 if (!flag) { 138 SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 139 } 140 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 141 C = *B; 142 } else { 143 ierr = MatCreate(((PetscObject)A)->comm,&C);CHKERRQ(ierr); 144 ierr = MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 145 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 146 ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr); 147 } 148 c = (Mat_SeqSBAIJ *)(C->data); 149 for (i=0; i<nrows; i++) { 150 row = irow[i]; 151 kstart = ai[row]; 152 kend = kstart + a->ilen[row]; 153 mat_i = c->i[i]; 154 mat_j = c->j + mat_i; 155 mat_a = c->a + mat_i*bs2; 156 mat_ilen = c->ilen + i; 157 for (k=kstart; k<kend; k++) { 158 if ((tcol=ssmap[a->j[k]])) { 159 *mat_j++ = tcol - 1; 160 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 161 mat_a += bs2; 162 (*mat_ilen)++; 163 } 164 } 165 } 166 167 /* Free work space */ 168 ierr = PetscFree(smap);CHKERRQ(ierr); 169 ierr = PetscFree(lens);CHKERRQ(ierr); 170 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 171 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 172 173 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 174 *B = C; 175 PetscFunctionReturn(0); 176 } 177 178 #undef __FUNCT__ 179 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ" 180 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 181 { 182 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 183 IS is1; 184 PetscErrorCode ierr; 185 PetscInt *vary,*iary,nrows,i,bs=A->rmap->bs,count; 186 const PetscInt *irow; 187 188 PetscFunctionBegin; 189 if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro"); 190 191 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 192 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 193 194 /* Verify if the indices corespond to each element in a block 195 and form the IS with compressed IS */ 196 ierr = PetscMalloc(2*(a->mbs+1)*sizeof(PetscInt),&vary);CHKERRQ(ierr); 197 iary = vary + a->mbs; 198 ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr); 199 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 200 201 count = 0; 202 for (i=0; i<a->mbs; i++) { 203 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_INCOMP,"Index set does not match blocks"); 204 if (vary[i]==bs) iary[count++] = i; 205 } 206 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr); 207 208 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 209 ierr = PetscFree(vary);CHKERRQ(ierr); 210 211 ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,scall,B);CHKERRQ(ierr); 212 ISDestroy(is1); 213 PetscFunctionReturn(0); 214 } 215 216 #undef __FUNCT__ 217 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ" 218 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 219 { 220 PetscErrorCode ierr; 221 PetscInt i; 222 223 PetscFunctionBegin; 224 if (scall == MAT_INITIAL_MATRIX) { 225 ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr); 226 } 227 228 for (i=0; i<n; i++) { 229 ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 230 } 231 PetscFunctionReturn(0); 232 } 233 234 /* -------------------------------------------------------*/ 235 /* Should check that shapes of vectors and matrices match */ 236 /* -------------------------------------------------------*/ 237 #include "petscblaslapack.h" 238 239 240 #undef __FUNCT__ 241 #define __FUNCT__ "MatMult_SeqSBAIJ_2" 242 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz) 243 { 244 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 245 PetscScalar *x,*z,*xb,x1,x2,zero=0.0; 246 MatScalar *v; 247 PetscErrorCode ierr; 248 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 249 PetscInt nonzerorow=0; 250 251 PetscFunctionBegin; 252 ierr = VecSet(zz,zero);CHKERRQ(ierr); 253 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 254 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 255 256 v = a->a; 257 xb = x; 258 259 for (i=0; i<mbs; i++) { 260 n = ai[1] - ai[0]; /* length of i_th block row of A */ 261 x1 = xb[0]; x2 = xb[1]; 262 ib = aj + *ai; 263 jmin = 0; 264 nonzerorow += (n>0); 265 if (*ib == i){ /* (diag of A)*x */ 266 z[2*i] += v[0]*x1 + v[2]*x2; 267 z[2*i+1] += v[2]*x1 + v[3]*x2; 268 v += 4; jmin++; 269 } 270 for (j=jmin; j<n; j++) { 271 /* (strict lower triangular part of A)*x */ 272 cval = ib[j]*2; 273 z[cval] += v[0]*x1 + v[1]*x2; 274 z[cval+1] += v[2]*x1 + v[3]*x2; 275 /* (strict upper triangular part of A)*x */ 276 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 277 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 278 v += 4; 279 } 280 xb +=2; ai++; 281 } 282 283 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 284 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 285 ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 286 PetscFunctionReturn(0); 287 } 288 289 #undef __FUNCT__ 290 #define __FUNCT__ "MatMult_SeqSBAIJ_3" 291 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz) 292 { 293 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 294 PetscScalar *x,*z,*xb,x1,x2,x3,zero=0.0; 295 MatScalar *v; 296 PetscErrorCode ierr; 297 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 298 PetscInt nonzerorow=0; 299 300 PetscFunctionBegin; 301 ierr = VecSet(zz,zero);CHKERRQ(ierr); 302 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 303 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 304 305 v = a->a; 306 xb = x; 307 308 for (i=0; i<mbs; i++) { 309 n = ai[1] - ai[0]; /* length of i_th block row of A */ 310 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 311 ib = aj + *ai; 312 jmin = 0; 313 nonzerorow += (n>0); 314 if (*ib == i){ /* (diag of A)*x */ 315 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 316 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 317 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 318 v += 9; jmin++; 319 } 320 for (j=jmin; j<n; j++) { 321 /* (strict lower triangular part of A)*x */ 322 cval = ib[j]*3; 323 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 324 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 325 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 326 /* (strict upper triangular part of A)*x */ 327 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 328 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 329 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 330 v += 9; 331 } 332 xb +=3; ai++; 333 } 334 335 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 336 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 337 ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 338 PetscFunctionReturn(0); 339 } 340 341 #undef __FUNCT__ 342 #define __FUNCT__ "MatMult_SeqSBAIJ_4" 343 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz) 344 { 345 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 346 PetscScalar *x,*z,*xb,x1,x2,x3,x4,zero=0.0; 347 MatScalar *v; 348 PetscErrorCode ierr; 349 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 350 PetscInt nonzerorow=0; 351 352 PetscFunctionBegin; 353 ierr = VecSet(zz,zero);CHKERRQ(ierr); 354 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 355 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 356 357 v = a->a; 358 xb = x; 359 360 for (i=0; i<mbs; i++) { 361 n = ai[1] - ai[0]; /* length of i_th block row of A */ 362 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 363 ib = aj + *ai; 364 jmin = 0; 365 nonzerorow += (n>0); 366 if (*ib == i){ /* (diag of A)*x */ 367 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 368 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 369 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 370 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 371 v += 16; jmin++; 372 } 373 for (j=jmin; j<n; j++) { 374 /* (strict lower triangular part of A)*x */ 375 cval = ib[j]*4; 376 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 377 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 378 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 379 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 380 /* (strict upper triangular part of A)*x */ 381 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 382 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 383 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 384 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 385 v += 16; 386 } 387 xb +=4; ai++; 388 } 389 390 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 391 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 392 ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 393 PetscFunctionReturn(0); 394 } 395 396 #undef __FUNCT__ 397 #define __FUNCT__ "MatMult_SeqSBAIJ_5" 398 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz) 399 { 400 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 401 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0; 402 MatScalar *v; 403 PetscErrorCode ierr; 404 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 405 PetscInt nonzerorow=0; 406 407 PetscFunctionBegin; 408 ierr = VecSet(zz,zero);CHKERRQ(ierr); 409 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 410 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 411 412 v = a->a; 413 xb = x; 414 415 for (i=0; i<mbs; i++) { 416 n = ai[1] - ai[0]; /* length of i_th block row of A */ 417 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 418 ib = aj + *ai; 419 jmin = 0; 420 nonzerorow += (n>0); 421 if (*ib == i){ /* (diag of A)*x */ 422 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 423 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 424 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 425 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 426 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 427 v += 25; jmin++; 428 } 429 for (j=jmin; j<n; j++) { 430 /* (strict lower triangular part of A)*x */ 431 cval = ib[j]*5; 432 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 433 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 434 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 435 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 436 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 437 /* (strict upper triangular part of A)*x */ 438 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 439 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 440 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 441 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 442 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 443 v += 25; 444 } 445 xb +=5; ai++; 446 } 447 448 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 449 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 450 ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 451 PetscFunctionReturn(0); 452 } 453 454 455 #undef __FUNCT__ 456 #define __FUNCT__ "MatMult_SeqSBAIJ_6" 457 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz) 458 { 459 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 460 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0; 461 MatScalar *v; 462 PetscErrorCode ierr; 463 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 464 PetscInt nonzerorow=0; 465 466 PetscFunctionBegin; 467 ierr = VecSet(zz,zero);CHKERRQ(ierr); 468 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 469 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 470 471 v = a->a; 472 xb = x; 473 474 for (i=0; i<mbs; i++) { 475 n = ai[1] - ai[0]; /* length of i_th block row of A */ 476 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 477 ib = aj + *ai; 478 jmin = 0; 479 nonzerorow += (n>0); 480 if (*ib == i){ /* (diag of A)*x */ 481 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 482 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 483 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 484 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 485 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 486 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 487 v += 36; jmin++; 488 } 489 for (j=jmin; j<n; j++) { 490 /* (strict lower triangular part of A)*x */ 491 cval = ib[j]*6; 492 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 493 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 494 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 495 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 496 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 497 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 498 /* (strict upper triangular part of A)*x */ 499 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 500 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 501 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 502 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 503 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 504 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 505 v += 36; 506 } 507 xb +=6; ai++; 508 } 509 510 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 511 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 512 ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 513 PetscFunctionReturn(0); 514 } 515 #undef __FUNCT__ 516 #define __FUNCT__ "MatMult_SeqSBAIJ_7" 517 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz) 518 { 519 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 520 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0; 521 MatScalar *v; 522 PetscErrorCode ierr; 523 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 524 PetscInt nonzerorow=0; 525 526 PetscFunctionBegin; 527 ierr = VecSet(zz,zero);CHKERRQ(ierr); 528 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 529 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 530 531 v = a->a; 532 xb = x; 533 534 for (i=0; i<mbs; i++) { 535 n = ai[1] - ai[0]; /* length of i_th block row of A */ 536 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 537 ib = aj + *ai; 538 jmin = 0; 539 nonzerorow += (n>0); 540 if (*ib == i){ /* (diag of A)*x */ 541 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 542 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 543 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 544 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 545 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 546 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 547 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 548 v += 49; jmin++; 549 } 550 for (j=jmin; j<n; j++) { 551 /* (strict lower triangular part of A)*x */ 552 cval = ib[j]*7; 553 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 554 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 555 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 556 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 557 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 558 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 559 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 560 /* (strict upper triangular part of A)*x */ 561 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 562 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 563 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 564 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 565 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 566 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 567 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 568 v += 49; 569 } 570 xb +=7; ai++; 571 } 572 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 573 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 574 ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 575 PetscFunctionReturn(0); 576 } 577 578 /* 579 This will not work with MatScalar == float because it calls the BLAS 580 */ 581 #undef __FUNCT__ 582 #define __FUNCT__ "MatMult_SeqSBAIJ_N" 583 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz) 584 { 585 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 586 PetscScalar *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0; 587 MatScalar *v; 588 PetscErrorCode ierr; 589 PetscInt mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 590 PetscInt nonzerorow=0; 591 592 PetscFunctionBegin; 593 ierr = VecSet(zz,zero);CHKERRQ(ierr); 594 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 595 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 596 597 aj = a->j; 598 v = a->a; 599 ii = a->i; 600 601 if (!a->mult_work) { 602 ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 603 } 604 work = a->mult_work; 605 606 for (i=0; i<mbs; i++) { 607 n = ii[1] - ii[0]; ncols = n*bs; 608 workt = work; idx=aj+ii[0]; 609 nonzerorow += (n>0); 610 611 /* upper triangular part */ 612 for (j=0; j<n; j++) { 613 xb = x_ptr + bs*(*idx++); 614 for (k=0; k<bs; k++) workt[k] = xb[k]; 615 workt += bs; 616 } 617 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 618 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 619 620 /* strict lower triangular part */ 621 idx = aj+ii[0]; 622 if (*idx == i){ 623 ncols -= bs; v += bs2; idx++; n--; 624 } 625 626 if (ncols > 0){ 627 workt = work; 628 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 629 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 630 for (j=0; j<n; j++) { 631 zb = z_ptr + bs*(*idx++); 632 for (k=0; k<bs; k++) zb[k] += workt[k] ; 633 workt += bs; 634 } 635 } 636 x += bs; v += n*bs2; z += bs; ii++; 637 } 638 639 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 640 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 641 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr); 642 PetscFunctionReturn(0); 643 } 644 645 extern PetscErrorCode VecCopy_Seq(Vec,Vec); 646 #undef __FUNCT__ 647 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1" 648 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 649 { 650 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 651 PetscScalar *x,*z,*xb,x1; 652 MatScalar *v; 653 PetscErrorCode ierr; 654 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 655 PetscInt nonzerorow=0; 656 657 PetscFunctionBegin; 658 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 659 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 660 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 661 v = a->a; 662 xb = x; 663 664 for (i=0; i<mbs; i++) { 665 n = ai[1] - ai[0]; /* length of i_th row of A */ 666 x1 = xb[0]; 667 ib = aj + *ai; 668 jmin = 0; 669 nonzerorow += (n>0); 670 if (*ib == i) { /* (diag of A)*x */ 671 z[i] += *v++ * x[*ib++]; jmin++; 672 } 673 for (j=jmin; j<n; j++) { 674 cval = *ib; 675 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 676 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 677 } 678 xb++; ai++; 679 } 680 681 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 682 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 683 684 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 685 PetscFunctionReturn(0); 686 } 687 688 #undef __FUNCT__ 689 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2" 690 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 691 { 692 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 693 PetscScalar *x,*z,*xb,x1,x2; 694 MatScalar *v; 695 PetscErrorCode ierr; 696 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 697 PetscInt nonzerorow=0; 698 699 PetscFunctionBegin; 700 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 701 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 702 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 703 704 v = a->a; 705 xb = x; 706 707 for (i=0; i<mbs; i++) { 708 n = ai[1] - ai[0]; /* length of i_th block row of A */ 709 x1 = xb[0]; x2 = xb[1]; 710 ib = aj + *ai; 711 jmin = 0; 712 nonzerorow += (n>0); 713 if (*ib == i){ /* (diag of A)*x */ 714 z[2*i] += v[0]*x1 + v[2]*x2; 715 z[2*i+1] += v[2]*x1 + v[3]*x2; 716 v += 4; jmin++; 717 } 718 for (j=jmin; j<n; j++) { 719 /* (strict lower triangular part of A)*x */ 720 cval = ib[j]*2; 721 z[cval] += v[0]*x1 + v[1]*x2; 722 z[cval+1] += v[2]*x1 + v[3]*x2; 723 /* (strict upper triangular part of A)*x */ 724 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 725 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 726 v += 4; 727 } 728 xb +=2; ai++; 729 } 730 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 731 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 732 733 ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 734 PetscFunctionReturn(0); 735 } 736 737 #undef __FUNCT__ 738 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3" 739 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 740 { 741 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 742 PetscScalar *x,*z,*xb,x1,x2,x3; 743 MatScalar *v; 744 PetscErrorCode ierr; 745 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 746 PetscInt nonzerorow=0; 747 748 PetscFunctionBegin; 749 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 750 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 751 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 752 753 v = a->a; 754 xb = x; 755 756 for (i=0; i<mbs; i++) { 757 n = ai[1] - ai[0]; /* length of i_th block row of A */ 758 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 759 ib = aj + *ai; 760 jmin = 0; 761 nonzerorow += (n>0); 762 if (*ib == i){ /* (diag of A)*x */ 763 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 764 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 765 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 766 v += 9; jmin++; 767 } 768 for (j=jmin; j<n; j++) { 769 /* (strict lower triangular part of A)*x */ 770 cval = ib[j]*3; 771 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 772 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 773 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 774 /* (strict upper triangular part of A)*x */ 775 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 776 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 777 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 778 v += 9; 779 } 780 xb +=3; ai++; 781 } 782 783 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 784 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 785 786 ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 787 PetscFunctionReturn(0); 788 } 789 790 #undef __FUNCT__ 791 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4" 792 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 793 { 794 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 795 PetscScalar *x,*z,*xb,x1,x2,x3,x4; 796 MatScalar *v; 797 PetscErrorCode ierr; 798 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 799 PetscInt nonzerorow=0; 800 801 PetscFunctionBegin; 802 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 803 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 804 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 805 806 v = a->a; 807 xb = x; 808 809 for (i=0; i<mbs; i++) { 810 n = ai[1] - ai[0]; /* length of i_th block row of A */ 811 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 812 ib = aj + *ai; 813 jmin = 0; 814 nonzerorow += (n>0); 815 if (*ib == i){ /* (diag of A)*x */ 816 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 817 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 818 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 819 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 820 v += 16; jmin++; 821 } 822 for (j=jmin; j<n; j++) { 823 /* (strict lower triangular part of A)*x */ 824 cval = ib[j]*4; 825 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 826 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 827 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 828 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 829 /* (strict upper triangular part of A)*x */ 830 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 831 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 832 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 833 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 834 v += 16; 835 } 836 xb +=4; ai++; 837 } 838 839 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 840 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 841 842 ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 843 PetscFunctionReturn(0); 844 } 845 846 #undef __FUNCT__ 847 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5" 848 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 849 { 850 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 851 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5; 852 MatScalar *v; 853 PetscErrorCode ierr; 854 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 855 PetscInt nonzerorow=0; 856 857 PetscFunctionBegin; 858 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 859 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 860 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 861 862 v = a->a; 863 xb = x; 864 865 for (i=0; i<mbs; i++) { 866 n = ai[1] - ai[0]; /* length of i_th block row of A */ 867 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 868 ib = aj + *ai; 869 jmin = 0; 870 nonzerorow += (n>0); 871 if (*ib == i){ /* (diag of A)*x */ 872 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 873 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 874 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 875 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 876 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 877 v += 25; jmin++; 878 } 879 for (j=jmin; j<n; j++) { 880 /* (strict lower triangular part of A)*x */ 881 cval = ib[j]*5; 882 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 883 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 884 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 885 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 886 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 887 /* (strict upper triangular part of A)*x */ 888 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 889 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 890 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 891 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 892 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 893 v += 25; 894 } 895 xb +=5; ai++; 896 } 897 898 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 899 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 900 901 ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 902 PetscFunctionReturn(0); 903 } 904 #undef __FUNCT__ 905 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6" 906 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 907 { 908 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 909 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6; 910 MatScalar *v; 911 PetscErrorCode ierr; 912 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 913 PetscInt nonzerorow=0; 914 915 PetscFunctionBegin; 916 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 917 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 918 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 919 920 v = a->a; 921 xb = x; 922 923 for (i=0; i<mbs; i++) { 924 n = ai[1] - ai[0]; /* length of i_th block row of A */ 925 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 926 ib = aj + *ai; 927 jmin = 0; 928 nonzerorow += (n>0); 929 if (*ib == i){ /* (diag of A)*x */ 930 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 931 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 932 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 933 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 934 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 935 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 936 v += 36; jmin++; 937 } 938 for (j=jmin; j<n; j++) { 939 /* (strict lower triangular part of A)*x */ 940 cval = ib[j]*6; 941 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 942 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 943 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 944 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 945 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 946 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 947 /* (strict upper triangular part of A)*x */ 948 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 949 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 950 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 951 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 952 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 953 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 954 v += 36; 955 } 956 xb +=6; ai++; 957 } 958 959 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 960 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 961 962 ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 963 PetscFunctionReturn(0); 964 } 965 966 #undef __FUNCT__ 967 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7" 968 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 969 { 970 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 971 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7; 972 MatScalar *v; 973 PetscErrorCode ierr; 974 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 975 PetscInt nonzerorow=0; 976 977 PetscFunctionBegin; 978 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 979 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 980 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 981 982 v = a->a; 983 xb = x; 984 985 for (i=0; i<mbs; i++) { 986 n = ai[1] - ai[0]; /* length of i_th block row of A */ 987 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 988 ib = aj + *ai; 989 jmin = 0; 990 nonzerorow += (n>0); 991 if (*ib == i){ /* (diag of A)*x */ 992 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 993 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 994 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 995 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 996 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 997 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 998 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 999 v += 49; jmin++; 1000 } 1001 for (j=jmin; j<n; j++) { 1002 /* (strict lower triangular part of A)*x */ 1003 cval = ib[j]*7; 1004 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 1005 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 1006 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 1007 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 1008 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 1009 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 1010 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1011 /* (strict upper triangular part of A)*x */ 1012 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 1013 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 1014 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 1015 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 1016 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 1017 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 1018 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 1019 v += 49; 1020 } 1021 xb +=7; ai++; 1022 } 1023 1024 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1025 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1026 1027 ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1028 PetscFunctionReturn(0); 1029 } 1030 1031 #undef __FUNCT__ 1032 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N" 1033 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1034 { 1035 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1036 PetscScalar *x,*x_ptr,*z,*z_ptr=0,*xb,*zb,*work,*workt; 1037 MatScalar *v; 1038 PetscErrorCode ierr; 1039 PetscInt mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 1040 PetscInt nonzerorow=0; 1041 1042 PetscFunctionBegin; 1043 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 1044 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 1045 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 1046 1047 aj = a->j; 1048 v = a->a; 1049 ii = a->i; 1050 1051 if (!a->mult_work) { 1052 ierr = PetscMalloc((A->rmap->n+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1053 } 1054 work = a->mult_work; 1055 1056 1057 for (i=0; i<mbs; i++) { 1058 n = ii[1] - ii[0]; ncols = n*bs; 1059 workt = work; idx=aj+ii[0]; 1060 nonzerorow += (n>0); 1061 1062 /* upper triangular part */ 1063 for (j=0; j<n; j++) { 1064 xb = x_ptr + bs*(*idx++); 1065 for (k=0; k<bs; k++) workt[k] = xb[k]; 1066 workt += bs; 1067 } 1068 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1069 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1070 1071 /* strict lower triangular part */ 1072 idx = aj+ii[0]; 1073 if (*idx == i){ 1074 ncols -= bs; v += bs2; idx++; n--; 1075 } 1076 if (ncols > 0){ 1077 workt = work; 1078 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1079 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 1080 for (j=0; j<n; j++) { 1081 zb = z_ptr + bs*(*idx++); 1082 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1083 workt += bs; 1084 } 1085 } 1086 1087 x += bs; v += n*bs2; z += bs; ii++; 1088 } 1089 1090 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1091 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1092 1093 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1094 PetscFunctionReturn(0); 1095 } 1096 1097 #undef __FUNCT__ 1098 #define __FUNCT__ "MatScale_SeqSBAIJ" 1099 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha) 1100 { 1101 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data; 1102 PetscScalar oalpha = alpha; 1103 PetscErrorCode ierr; 1104 PetscBLASInt one = 1,totalnz = PetscBLASIntCast(a->bs2*a->nz); 1105 1106 PetscFunctionBegin; 1107 BLASscal_(&totalnz,&oalpha,a->a,&one); 1108 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 1109 PetscFunctionReturn(0); 1110 } 1111 1112 #undef __FUNCT__ 1113 #define __FUNCT__ "MatNorm_SeqSBAIJ" 1114 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm) 1115 { 1116 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1117 MatScalar *v = a->a; 1118 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1119 PetscInt i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j; 1120 PetscErrorCode ierr; 1121 PetscInt *jl,*il,jmin,jmax,nexti,ik,*col; 1122 1123 PetscFunctionBegin; 1124 if (type == NORM_FROBENIUS) { 1125 for (k=0; k<mbs; k++){ 1126 jmin = a->i[k]; jmax = a->i[k+1]; 1127 col = aj + jmin; 1128 if (*col == k){ /* diagonal block */ 1129 for (i=0; i<bs2; i++){ 1130 #if defined(PETSC_USE_COMPLEX) 1131 sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++; 1132 #else 1133 sum_diag += (*v)*(*v); v++; 1134 #endif 1135 } 1136 jmin++; 1137 } 1138 for (j=jmin; j<jmax; j++){ /* off-diagonal blocks */ 1139 for (i=0; i<bs2; i++){ 1140 #if defined(PETSC_USE_COMPLEX) 1141 sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++; 1142 #else 1143 sum_off += (*v)*(*v); v++; 1144 #endif 1145 } 1146 } 1147 } 1148 *norm = sqrt(sum_diag + 2*sum_off); 1149 } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 1150 ierr = PetscMalloc((2*mbs+1)*sizeof(PetscInt)+bs*sizeof(PetscReal),&il);CHKERRQ(ierr); 1151 jl = il + mbs; 1152 sum = (PetscReal*)(jl + mbs); 1153 for (i=0; i<mbs; i++) jl[i] = mbs; 1154 il[0] = 0; 1155 1156 *norm = 0.0; 1157 for (k=0; k<mbs; k++) { /* k_th block row */ 1158 for (j=0; j<bs; j++) sum[j]=0.0; 1159 /*-- col sum --*/ 1160 i = jl[k]; /* first |A(i,k)| to be added */ 1161 /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1162 at step k */ 1163 while (i<mbs){ 1164 nexti = jl[i]; /* next block row to be added */ 1165 ik = il[i]; /* block index of A(i,k) in the array a */ 1166 for (j=0; j<bs; j++){ 1167 v = a->a + ik*bs2 + j*bs; 1168 for (k1=0; k1<bs; k1++) { 1169 sum[j] += PetscAbsScalar(*v); v++; 1170 } 1171 } 1172 /* update il, jl */ 1173 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1174 jmax = a->i[i+1]; 1175 if (jmin < jmax){ 1176 il[i] = jmin; 1177 j = a->j[jmin]; 1178 jl[i] = jl[j]; jl[j]=i; 1179 } 1180 i = nexti; 1181 } 1182 /*-- row sum --*/ 1183 jmin = a->i[k]; jmax = a->i[k+1]; 1184 for (i=jmin; i<jmax; i++) { 1185 for (j=0; j<bs; j++){ 1186 v = a->a + i*bs2 + j; 1187 for (k1=0; k1<bs; k1++){ 1188 sum[j] += PetscAbsScalar(*v); v += bs; 1189 } 1190 } 1191 } 1192 /* add k_th block row to il, jl */ 1193 col = aj+jmin; 1194 if (*col == k) jmin++; 1195 if (jmin < jmax){ 1196 il[k] = jmin; 1197 j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k; 1198 } 1199 for (j=0; j<bs; j++){ 1200 if (sum[j] > *norm) *norm = sum[j]; 1201 } 1202 } 1203 ierr = PetscFree(il);CHKERRQ(ierr); 1204 } else { 1205 SETERRQ(PETSC_ERR_SUP,"No support for this norm yet"); 1206 } 1207 PetscFunctionReturn(0); 1208 } 1209 1210 #undef __FUNCT__ 1211 #define __FUNCT__ "MatEqual_SeqSBAIJ" 1212 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg) 1213 { 1214 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data; 1215 PetscErrorCode ierr; 1216 1217 PetscFunctionBegin; 1218 1219 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1220 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1221 *flg = PETSC_FALSE; 1222 PetscFunctionReturn(0); 1223 } 1224 1225 /* if the a->i are the same */ 1226 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1227 if (!*flg) { 1228 PetscFunctionReturn(0); 1229 } 1230 1231 /* if a->j are the same */ 1232 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1233 if (!*flg) { 1234 PetscFunctionReturn(0); 1235 } 1236 /* if a->a are the same */ 1237 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1238 PetscFunctionReturn(0); 1239 } 1240 1241 #undef __FUNCT__ 1242 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ" 1243 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v) 1244 { 1245 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1246 PetscErrorCode ierr; 1247 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 1248 PetscScalar *x,zero = 0.0; 1249 MatScalar *aa,*aa_j; 1250 1251 PetscFunctionBegin; 1252 bs = A->rmap->bs; 1253 if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1"); 1254 1255 aa = a->a; 1256 ai = a->i; 1257 aj = a->j; 1258 ambs = a->mbs; 1259 bs2 = a->bs2; 1260 1261 ierr = VecSet(v,zero);CHKERRQ(ierr); 1262 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1263 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1264 if (n != A->rmap->N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1265 for (i=0; i<ambs; i++) { 1266 j=ai[i]; 1267 if (aj[j] == i) { /* if this is a diagonal element */ 1268 row = i*bs; 1269 aa_j = aa + j*bs2; 1270 if (A->factor && bs==1){ 1271 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k]; 1272 } else { 1273 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1274 } 1275 } 1276 } 1277 1278 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1279 PetscFunctionReturn(0); 1280 } 1281 1282 #undef __FUNCT__ 1283 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ" 1284 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr) 1285 { 1286 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1287 PetscScalar *l,x,*li,*ri; 1288 MatScalar *aa,*v; 1289 PetscErrorCode ierr; 1290 PetscInt i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2; 1291 PetscTruth flg; 1292 1293 PetscFunctionBegin; 1294 if (ll != rr){ 1295 ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr); 1296 if (!flg) 1297 SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n"); 1298 } 1299 if (!ll) PetscFunctionReturn(0); 1300 ai = a->i; 1301 aj = a->j; 1302 aa = a->a; 1303 m = A->rmap->N; 1304 bs = A->rmap->bs; 1305 mbs = a->mbs; 1306 bs2 = a->bs2; 1307 1308 ierr = VecGetArray(ll,&l);CHKERRQ(ierr); 1309 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1310 if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 1311 for (i=0; i<mbs; i++) { /* for each block row */ 1312 M = ai[i+1] - ai[i]; 1313 li = l + i*bs; 1314 v = aa + bs2*ai[i]; 1315 for (j=0; j<M; j++) { /* for each block */ 1316 ri = l + bs*aj[ai[i]+j]; 1317 for (k=0; k<bs; k++) { 1318 x = ri[k]; 1319 for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x; 1320 } 1321 } 1322 } 1323 ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr); 1324 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1325 PetscFunctionReturn(0); 1326 } 1327 1328 #undef __FUNCT__ 1329 #define __FUNCT__ "MatGetInfo_SeqSBAIJ" 1330 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info) 1331 { 1332 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1333 1334 PetscFunctionBegin; 1335 info->block_size = a->bs2; 1336 info->nz_allocated = a->maxnz; /*num. of nonzeros in upper triangular part */ 1337 info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */ 1338 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 1339 info->assemblies = A->num_ass; 1340 info->mallocs = a->reallocs; 1341 info->memory = ((PetscObject)A)->mem; 1342 if (A->factor) { 1343 info->fill_ratio_given = A->info.fill_ratio_given; 1344 info->fill_ratio_needed = A->info.fill_ratio_needed; 1345 info->factor_mallocs = A->info.factor_mallocs; 1346 } else { 1347 info->fill_ratio_given = 0; 1348 info->fill_ratio_needed = 0; 1349 info->factor_mallocs = 0; 1350 } 1351 PetscFunctionReturn(0); 1352 } 1353 1354 1355 #undef __FUNCT__ 1356 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ" 1357 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 1358 { 1359 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1360 PetscErrorCode ierr; 1361 1362 PetscFunctionBegin; 1363 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 1364 PetscFunctionReturn(0); 1365 } 1366 1367 #undef __FUNCT__ 1368 #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ" 1369 /* 1370 This code does not work since it only checks the upper triangular part of 1371 the matrix. Hence it is not listed in the function table. 1372 */ 1373 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[]) 1374 { 1375 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1376 PetscErrorCode ierr; 1377 PetscInt i,j,n,row,col,bs,*ai,*aj,mbs; 1378 PetscReal atmp; 1379 MatScalar *aa; 1380 PetscScalar *x; 1381 PetscInt ncols,brow,bcol,krow,kcol; 1382 1383 PetscFunctionBegin; 1384 if (idx) SETERRQ(PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov"); 1385 if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1386 bs = A->rmap->bs; 1387 aa = a->a; 1388 ai = a->i; 1389 aj = a->j; 1390 mbs = a->mbs; 1391 1392 ierr = VecSet(v,0.0);CHKERRQ(ierr); 1393 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1394 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1395 if (n != A->rmap->N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1396 for (i=0; i<mbs; i++) { 1397 ncols = ai[1] - ai[0]; ai++; 1398 brow = bs*i; 1399 for (j=0; j<ncols; j++){ 1400 bcol = bs*(*aj); 1401 for (kcol=0; kcol<bs; kcol++){ 1402 col = bcol + kcol; /* col index */ 1403 for (krow=0; krow<bs; krow++){ 1404 atmp = PetscAbsScalar(*aa); aa++; 1405 row = brow + krow; /* row index */ 1406 if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1407 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 1408 } 1409 } 1410 aj++; 1411 } 1412 } 1413 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1414 PetscFunctionReturn(0); 1415 } 1416