1 #define PETSCMAT_DLL 2 3 #include "../src/mat/impls/baij/seq/baij.h" 4 #include "../src/mat/blockinvert.h" 5 #include "petscbt.h" 6 #include "../src/mat/impls/sbaij/seq/sbaij.h" 7 #include "petscblaslapack.h" 8 9 #undef __FUNCT__ 10 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ" 11 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 12 { 13 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 14 PetscErrorCode ierr; 15 PetscInt brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2; 16 const PetscInt *idx; 17 PetscBT table,table0; 18 19 PetscFunctionBegin; 20 if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 21 mbs = a->mbs; 22 ai = a->i; 23 aj = a->j; 24 bs = A->rmap->bs; 25 ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr); 26 ierr = PetscMalloc((mbs+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr); 27 ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr); 28 ierr = PetscBTCreate(mbs,table0);CHKERRQ(ierr); 29 30 for (i=0; i<is_max; i++) { /* for each is */ 31 isz = 0; 32 ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr); 33 34 /* Extract the indices, assume there can be duplicate entries */ 35 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 36 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 37 38 /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */ 39 bcol_max = 0; 40 for (j=0; j<n ; ++j){ 41 brow = idx[j]/bs; /* convert the indices into block indices */ 42 if (brow >= mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 43 if(!PetscBTLookupSet(table,brow)) { 44 nidx[isz++] = brow; 45 if (bcol_max < brow) bcol_max = brow; 46 } 47 } 48 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 49 ierr = ISDestroy(is[i]);CHKERRQ(ierr); 50 51 k = 0; 52 for (j=0; j<ov; j++){ /* for each overlap */ 53 /* set table0 for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 54 ierr = PetscBTMemzero(mbs,table0);CHKERRQ(ierr); 55 for (l=k; l<isz; l++) { ierr = PetscBTSet(table0,nidx[l]);CHKERRQ(ierr); } 56 57 n = isz; /* length of the updated is[i] */ 58 for (brow=0; brow<mbs; brow++){ 59 start = ai[brow]; end = ai[brow+1]; 60 if (PetscBTLookup(table0,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */ 61 for (l = start; l<end ; l++){ 62 bcol = aj[l]; 63 if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;} 64 } 65 k++; 66 if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 67 } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 68 for (l = start; l<end ; l++){ 69 bcol = aj[l]; 70 if (bcol > bcol_max) break; 71 if (PetscBTLookup(table0,bcol)){ 72 if (!PetscBTLookupSet(table,brow)) {nidx[isz++] = brow;} 73 break; /* for l = start; l<end ; l++) */ 74 } 75 } 76 } 77 } 78 } /* for each overlap */ 79 80 /* expand the Index Set */ 81 for (j=0; j<isz; j++) { 82 for (k=0; k<bs; k++) 83 nidx2[j*bs+k] = nidx[j]*bs+k; 84 } 85 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr); 86 } 87 ierr = PetscBTDestroy(table);CHKERRQ(ierr); 88 ierr = PetscFree(nidx);CHKERRQ(ierr); 89 ierr = PetscFree(nidx2);CHKERRQ(ierr); 90 ierr = PetscBTDestroy(table0);CHKERRQ(ierr); 91 PetscFunctionReturn(0); 92 } 93 94 #undef __FUNCT__ 95 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private" 96 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 97 { 98 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c; 99 PetscErrorCode ierr; 100 PetscInt *smap,i,k,kstart,kend,oldcols = a->mbs,*lens; 101 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 102 PetscInt nrows,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 103 const PetscInt *irow,*aj = a->j,*ai = a->i; 104 MatScalar *mat_a; 105 Mat C; 106 PetscTruth flag,sorted; 107 108 PetscFunctionBegin; 109 if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro"); 110 ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr); 111 if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 112 113 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 114 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 115 116 ierr = PetscMalloc(oldcols*sizeof(PetscInt),&smap);CHKERRQ(ierr); 117 ierr = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr); 118 ssmap = smap; 119 ierr = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr); 120 for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */ 121 /* determine lens of each row */ 122 for (i=0; i<nrows; i++) { 123 kstart = ai[irow[i]]; 124 kend = kstart + a->ilen[irow[i]]; 125 lens[i] = 0; 126 for (k=kstart; k<kend; k++) { 127 if (ssmap[aj[k]]) { 128 lens[i]++; 129 } 130 } 131 } 132 /* Create and fill new matrix */ 133 if (scall == MAT_REUSE_MATRIX) { 134 c = (Mat_SeqSBAIJ *)((*B)->data); 135 136 if (c->mbs!=nrows || (*B)->rmap->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 137 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 138 if (!flag) { 139 SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 140 } 141 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 142 C = *B; 143 } else { 144 ierr = MatCreate(((PetscObject)A)->comm,&C);CHKERRQ(ierr); 145 ierr = MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 146 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 147 ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr); 148 } 149 c = (Mat_SeqSBAIJ *)(C->data); 150 for (i=0; i<nrows; i++) { 151 row = irow[i]; 152 kstart = ai[row]; 153 kend = kstart + a->ilen[row]; 154 mat_i = c->i[i]; 155 mat_j = c->j + mat_i; 156 mat_a = c->a + mat_i*bs2; 157 mat_ilen = c->ilen + i; 158 for (k=kstart; k<kend; k++) { 159 if ((tcol=ssmap[a->j[k]])) { 160 *mat_j++ = tcol - 1; 161 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 162 mat_a += bs2; 163 (*mat_ilen)++; 164 } 165 } 166 } 167 168 /* Free work space */ 169 ierr = PetscFree(smap);CHKERRQ(ierr); 170 ierr = PetscFree(lens);CHKERRQ(ierr); 171 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 172 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 173 174 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 175 *B = C; 176 PetscFunctionReturn(0); 177 } 178 179 #undef __FUNCT__ 180 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ" 181 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 182 { 183 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 184 IS is1; 185 PetscErrorCode ierr; 186 PetscInt *vary,*iary,nrows,i,bs=A->rmap->bs,count; 187 const PetscInt *irow; 188 189 PetscFunctionBegin; 190 if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro"); 191 192 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 193 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 194 195 /* Verify if the indices corespond to each element in a block 196 and form the IS with compressed IS */ 197 ierr = PetscMalloc2(a->mbs,PetscInt,&vary,a->mbs,PetscInt,&iary);CHKERRQ(ierr); 198 ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr); 199 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 200 201 count = 0; 202 for (i=0; i<a->mbs; i++) { 203 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_INCOMP,"Index set does not match blocks"); 204 if (vary[i]==bs) iary[count++] = i; 205 } 206 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 207 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr); 208 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 209 210 ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,scall,B);CHKERRQ(ierr); 211 ierr = ISDestroy(is1);CHKERRQ(ierr); 212 PetscFunctionReturn(0); 213 } 214 215 #undef __FUNCT__ 216 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ" 217 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 218 { 219 PetscErrorCode ierr; 220 PetscInt i; 221 222 PetscFunctionBegin; 223 if (scall == MAT_INITIAL_MATRIX) { 224 ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr); 225 } 226 227 for (i=0; i<n; i++) { 228 ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 229 } 230 PetscFunctionReturn(0); 231 } 232 233 /* -------------------------------------------------------*/ 234 /* Should check that shapes of vectors and matrices match */ 235 /* -------------------------------------------------------*/ 236 237 #undef __FUNCT__ 238 #define __FUNCT__ "MatMult_SeqSBAIJ_2" 239 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz) 240 { 241 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 242 PetscScalar *x,*z,*xb,x1,x2,zero=0.0; 243 MatScalar *v; 244 PetscErrorCode ierr; 245 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 246 PetscInt nonzerorow=0; 247 248 PetscFunctionBegin; 249 ierr = VecSet(zz,zero);CHKERRQ(ierr); 250 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 251 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 252 253 v = a->a; 254 xb = x; 255 256 for (i=0; i<mbs; i++) { 257 n = ai[1] - ai[0]; /* length of i_th block row of A */ 258 x1 = xb[0]; x2 = xb[1]; 259 ib = aj + *ai; 260 jmin = 0; 261 nonzerorow += (n>0); 262 if (*ib == i){ /* (diag of A)*x */ 263 z[2*i] += v[0]*x1 + v[2]*x2; 264 z[2*i+1] += v[2]*x1 + v[3]*x2; 265 v += 4; jmin++; 266 } 267 for (j=jmin; j<n; j++) { 268 /* (strict lower triangular part of A)*x */ 269 cval = ib[j]*2; 270 z[cval] += v[0]*x1 + v[1]*x2; 271 z[cval+1] += v[2]*x1 + v[3]*x2; 272 /* (strict upper triangular part of A)*x */ 273 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 274 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 275 v += 4; 276 } 277 xb +=2; ai++; 278 } 279 280 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 281 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 282 ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 283 PetscFunctionReturn(0); 284 } 285 286 #undef __FUNCT__ 287 #define __FUNCT__ "MatMult_SeqSBAIJ_3" 288 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz) 289 { 290 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 291 PetscScalar *x,*z,*xb,x1,x2,x3,zero=0.0; 292 MatScalar *v; 293 PetscErrorCode ierr; 294 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 295 PetscInt nonzerorow=0; 296 297 PetscFunctionBegin; 298 ierr = VecSet(zz,zero);CHKERRQ(ierr); 299 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 300 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 301 302 v = a->a; 303 xb = x; 304 305 for (i=0; i<mbs; i++) { 306 n = ai[1] - ai[0]; /* length of i_th block row of A */ 307 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 308 ib = aj + *ai; 309 jmin = 0; 310 nonzerorow += (n>0); 311 if (*ib == i){ /* (diag of A)*x */ 312 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 313 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 314 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 315 v += 9; jmin++; 316 } 317 for (j=jmin; j<n; j++) { 318 /* (strict lower triangular part of A)*x */ 319 cval = ib[j]*3; 320 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 321 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 322 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 323 /* (strict upper triangular part of A)*x */ 324 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 325 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 326 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 327 v += 9; 328 } 329 xb +=3; ai++; 330 } 331 332 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 333 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 334 ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 335 PetscFunctionReturn(0); 336 } 337 338 #undef __FUNCT__ 339 #define __FUNCT__ "MatMult_SeqSBAIJ_4" 340 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz) 341 { 342 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 343 PetscScalar *x,*z,*xb,x1,x2,x3,x4,zero=0.0; 344 MatScalar *v; 345 PetscErrorCode ierr; 346 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 347 PetscInt nonzerorow=0; 348 349 PetscFunctionBegin; 350 ierr = VecSet(zz,zero);CHKERRQ(ierr); 351 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 352 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 353 354 v = a->a; 355 xb = x; 356 357 for (i=0; i<mbs; i++) { 358 n = ai[1] - ai[0]; /* length of i_th block row of A */ 359 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 360 ib = aj + *ai; 361 jmin = 0; 362 nonzerorow += (n>0); 363 if (*ib == i){ /* (diag of A)*x */ 364 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 365 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 366 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 367 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 368 v += 16; jmin++; 369 } 370 for (j=jmin; j<n; j++) { 371 /* (strict lower triangular part of A)*x */ 372 cval = ib[j]*4; 373 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 374 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 375 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 376 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 377 /* (strict upper triangular part of A)*x */ 378 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 379 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 380 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 381 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 382 v += 16; 383 } 384 xb +=4; ai++; 385 } 386 387 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 388 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 389 ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 390 PetscFunctionReturn(0); 391 } 392 393 #undef __FUNCT__ 394 #define __FUNCT__ "MatMult_SeqSBAIJ_5" 395 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz) 396 { 397 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 398 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0; 399 MatScalar *v; 400 PetscErrorCode ierr; 401 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 402 PetscInt nonzerorow=0; 403 404 PetscFunctionBegin; 405 ierr = VecSet(zz,zero);CHKERRQ(ierr); 406 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 407 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 408 409 v = a->a; 410 xb = x; 411 412 for (i=0; i<mbs; i++) { 413 n = ai[1] - ai[0]; /* length of i_th block row of A */ 414 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 415 ib = aj + *ai; 416 jmin = 0; 417 nonzerorow += (n>0); 418 if (*ib == i){ /* (diag of A)*x */ 419 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 420 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 421 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 422 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 423 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 424 v += 25; jmin++; 425 } 426 for (j=jmin; j<n; j++) { 427 /* (strict lower triangular part of A)*x */ 428 cval = ib[j]*5; 429 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 430 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 431 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 432 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 433 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 434 /* (strict upper triangular part of A)*x */ 435 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 436 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 437 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 438 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 439 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 440 v += 25; 441 } 442 xb +=5; ai++; 443 } 444 445 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 446 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 447 ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 448 PetscFunctionReturn(0); 449 } 450 451 452 #undef __FUNCT__ 453 #define __FUNCT__ "MatMult_SeqSBAIJ_6" 454 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz) 455 { 456 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 457 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0; 458 MatScalar *v; 459 PetscErrorCode ierr; 460 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 461 PetscInt nonzerorow=0; 462 463 PetscFunctionBegin; 464 ierr = VecSet(zz,zero);CHKERRQ(ierr); 465 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 466 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 467 468 v = a->a; 469 xb = x; 470 471 for (i=0; i<mbs; i++) { 472 n = ai[1] - ai[0]; /* length of i_th block row of A */ 473 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 474 ib = aj + *ai; 475 jmin = 0; 476 nonzerorow += (n>0); 477 if (*ib == i){ /* (diag of A)*x */ 478 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 479 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 480 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 481 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 482 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 483 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 484 v += 36; jmin++; 485 } 486 for (j=jmin; j<n; j++) { 487 /* (strict lower triangular part of A)*x */ 488 cval = ib[j]*6; 489 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 490 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 491 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 492 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 493 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 494 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 495 /* (strict upper triangular part of A)*x */ 496 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 497 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 498 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 499 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 500 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 501 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 502 v += 36; 503 } 504 xb +=6; ai++; 505 } 506 507 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 508 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 509 ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 510 PetscFunctionReturn(0); 511 } 512 #undef __FUNCT__ 513 #define __FUNCT__ "MatMult_SeqSBAIJ_7" 514 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz) 515 { 516 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 517 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0; 518 MatScalar *v; 519 PetscErrorCode ierr; 520 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 521 PetscInt nonzerorow=0; 522 523 PetscFunctionBegin; 524 ierr = VecSet(zz,zero);CHKERRQ(ierr); 525 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 526 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 527 528 v = a->a; 529 xb = x; 530 531 for (i=0; i<mbs; i++) { 532 n = ai[1] - ai[0]; /* length of i_th block row of A */ 533 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 534 ib = aj + *ai; 535 jmin = 0; 536 nonzerorow += (n>0); 537 if (*ib == i){ /* (diag of A)*x */ 538 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 539 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 540 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 541 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 542 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 543 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 544 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 545 v += 49; jmin++; 546 } 547 for (j=jmin; j<n; j++) { 548 /* (strict lower triangular part of A)*x */ 549 cval = ib[j]*7; 550 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 551 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 552 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 553 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 554 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 555 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 556 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 557 /* (strict upper triangular part of A)*x */ 558 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 559 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 560 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 561 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 562 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 563 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 564 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 565 v += 49; 566 } 567 xb +=7; ai++; 568 } 569 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 570 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 571 ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 572 PetscFunctionReturn(0); 573 } 574 575 /* 576 This will not work with MatScalar == float because it calls the BLAS 577 */ 578 #undef __FUNCT__ 579 #define __FUNCT__ "MatMult_SeqSBAIJ_N" 580 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz) 581 { 582 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 583 PetscScalar *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0; 584 MatScalar *v; 585 PetscErrorCode ierr; 586 PetscInt mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 587 PetscInt nonzerorow=0; 588 589 PetscFunctionBegin; 590 ierr = VecSet(zz,zero);CHKERRQ(ierr); 591 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 592 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 593 594 aj = a->j; 595 v = a->a; 596 ii = a->i; 597 598 if (!a->mult_work) { 599 ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 600 } 601 work = a->mult_work; 602 603 for (i=0; i<mbs; i++) { 604 n = ii[1] - ii[0]; ncols = n*bs; 605 workt = work; idx=aj+ii[0]; 606 nonzerorow += (n>0); 607 608 /* upper triangular part */ 609 for (j=0; j<n; j++) { 610 xb = x_ptr + bs*(*idx++); 611 for (k=0; k<bs; k++) workt[k] = xb[k]; 612 workt += bs; 613 } 614 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 615 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 616 617 /* strict lower triangular part */ 618 idx = aj+ii[0]; 619 if (*idx == i){ 620 ncols -= bs; v += bs2; idx++; n--; 621 } 622 623 if (ncols > 0){ 624 workt = work; 625 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 626 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 627 for (j=0; j<n; j++) { 628 zb = z_ptr + bs*(*idx++); 629 for (k=0; k<bs; k++) zb[k] += workt[k] ; 630 workt += bs; 631 } 632 } 633 x += bs; v += n*bs2; z += bs; ii++; 634 } 635 636 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 637 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 638 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr); 639 PetscFunctionReturn(0); 640 } 641 642 extern PetscErrorCode VecCopy_Seq(Vec,Vec); 643 #undef __FUNCT__ 644 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1" 645 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 646 { 647 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 648 PetscScalar *x,*z,*xb,x1; 649 MatScalar *v; 650 PetscErrorCode ierr; 651 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 652 PetscInt nonzerorow=0; 653 654 PetscFunctionBegin; 655 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 656 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 657 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 658 v = a->a; 659 xb = x; 660 661 for (i=0; i<mbs; i++) { 662 n = ai[1] - ai[0]; /* length of i_th row of A */ 663 x1 = xb[0]; 664 ib = aj + *ai; 665 jmin = 0; 666 nonzerorow += (n>0); 667 if (*ib == i) { /* (diag of A)*x */ 668 z[i] += *v++ * x[*ib++]; jmin++; 669 } 670 for (j=jmin; j<n; j++) { 671 cval = *ib; 672 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 673 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 674 } 675 xb++; ai++; 676 } 677 678 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 679 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 680 681 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 682 PetscFunctionReturn(0); 683 } 684 685 #undef __FUNCT__ 686 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2" 687 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 688 { 689 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 690 PetscScalar *x,*z,*xb,x1,x2; 691 MatScalar *v; 692 PetscErrorCode ierr; 693 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 694 PetscInt nonzerorow=0; 695 696 PetscFunctionBegin; 697 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 698 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 699 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 700 701 v = a->a; 702 xb = x; 703 704 for (i=0; i<mbs; i++) { 705 n = ai[1] - ai[0]; /* length of i_th block row of A */ 706 x1 = xb[0]; x2 = xb[1]; 707 ib = aj + *ai; 708 jmin = 0; 709 nonzerorow += (n>0); 710 if (*ib == i){ /* (diag of A)*x */ 711 z[2*i] += v[0]*x1 + v[2]*x2; 712 z[2*i+1] += v[2]*x1 + v[3]*x2; 713 v += 4; jmin++; 714 } 715 for (j=jmin; j<n; j++) { 716 /* (strict lower triangular part of A)*x */ 717 cval = ib[j]*2; 718 z[cval] += v[0]*x1 + v[1]*x2; 719 z[cval+1] += v[2]*x1 + v[3]*x2; 720 /* (strict upper triangular part of A)*x */ 721 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 722 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 723 v += 4; 724 } 725 xb +=2; ai++; 726 } 727 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 728 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 729 730 ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 731 PetscFunctionReturn(0); 732 } 733 734 #undef __FUNCT__ 735 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3" 736 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 737 { 738 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 739 PetscScalar *x,*z,*xb,x1,x2,x3; 740 MatScalar *v; 741 PetscErrorCode ierr; 742 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 743 PetscInt nonzerorow=0; 744 745 PetscFunctionBegin; 746 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 747 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 748 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 749 750 v = a->a; 751 xb = x; 752 753 for (i=0; i<mbs; i++) { 754 n = ai[1] - ai[0]; /* length of i_th block row of A */ 755 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 756 ib = aj + *ai; 757 jmin = 0; 758 nonzerorow += (n>0); 759 if (*ib == i){ /* (diag of A)*x */ 760 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 761 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 762 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 763 v += 9; jmin++; 764 } 765 for (j=jmin; j<n; j++) { 766 /* (strict lower triangular part of A)*x */ 767 cval = ib[j]*3; 768 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 769 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 770 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 771 /* (strict upper triangular part of A)*x */ 772 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 773 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 774 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 775 v += 9; 776 } 777 xb +=3; ai++; 778 } 779 780 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 781 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 782 783 ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 784 PetscFunctionReturn(0); 785 } 786 787 #undef __FUNCT__ 788 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4" 789 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 790 { 791 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 792 PetscScalar *x,*z,*xb,x1,x2,x3,x4; 793 MatScalar *v; 794 PetscErrorCode ierr; 795 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 796 PetscInt nonzerorow=0; 797 798 PetscFunctionBegin; 799 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 800 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 801 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 802 803 v = a->a; 804 xb = x; 805 806 for (i=0; i<mbs; i++) { 807 n = ai[1] - ai[0]; /* length of i_th block row of A */ 808 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 809 ib = aj + *ai; 810 jmin = 0; 811 nonzerorow += (n>0); 812 if (*ib == i){ /* (diag of A)*x */ 813 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 814 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 815 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 816 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 817 v += 16; jmin++; 818 } 819 for (j=jmin; j<n; j++) { 820 /* (strict lower triangular part of A)*x */ 821 cval = ib[j]*4; 822 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 823 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 824 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 825 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 826 /* (strict upper triangular part of A)*x */ 827 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 828 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 829 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 830 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 831 v += 16; 832 } 833 xb +=4; ai++; 834 } 835 836 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 837 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 838 839 ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 840 PetscFunctionReturn(0); 841 } 842 843 #undef __FUNCT__ 844 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5" 845 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 846 { 847 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 848 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5; 849 MatScalar *v; 850 PetscErrorCode ierr; 851 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 852 PetscInt nonzerorow=0; 853 854 PetscFunctionBegin; 855 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 856 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 857 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 858 859 v = a->a; 860 xb = x; 861 862 for (i=0; i<mbs; i++) { 863 n = ai[1] - ai[0]; /* length of i_th block row of A */ 864 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 865 ib = aj + *ai; 866 jmin = 0; 867 nonzerorow += (n>0); 868 if (*ib == i){ /* (diag of A)*x */ 869 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 870 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 871 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 872 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 873 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 874 v += 25; jmin++; 875 } 876 for (j=jmin; j<n; j++) { 877 /* (strict lower triangular part of A)*x */ 878 cval = ib[j]*5; 879 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 880 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 881 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 882 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 883 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 884 /* (strict upper triangular part of A)*x */ 885 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 886 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 887 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 888 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 889 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 890 v += 25; 891 } 892 xb +=5; ai++; 893 } 894 895 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 896 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 897 898 ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 899 PetscFunctionReturn(0); 900 } 901 #undef __FUNCT__ 902 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6" 903 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 904 { 905 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 906 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6; 907 MatScalar *v; 908 PetscErrorCode ierr; 909 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 910 PetscInt nonzerorow=0; 911 912 PetscFunctionBegin; 913 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 914 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 915 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 916 917 v = a->a; 918 xb = x; 919 920 for (i=0; i<mbs; i++) { 921 n = ai[1] - ai[0]; /* length of i_th block row of A */ 922 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 923 ib = aj + *ai; 924 jmin = 0; 925 nonzerorow += (n>0); 926 if (*ib == i){ /* (diag of A)*x */ 927 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 928 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 929 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 930 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 931 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 932 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 933 v += 36; jmin++; 934 } 935 for (j=jmin; j<n; j++) { 936 /* (strict lower triangular part of A)*x */ 937 cval = ib[j]*6; 938 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 939 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 940 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 941 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 942 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 943 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 944 /* (strict upper triangular part of A)*x */ 945 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 946 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 947 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 948 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 949 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 950 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 951 v += 36; 952 } 953 xb +=6; ai++; 954 } 955 956 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 957 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 958 959 ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 960 PetscFunctionReturn(0); 961 } 962 963 #undef __FUNCT__ 964 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7" 965 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 966 { 967 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 968 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7; 969 MatScalar *v; 970 PetscErrorCode ierr; 971 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 972 PetscInt nonzerorow=0; 973 974 PetscFunctionBegin; 975 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 976 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 977 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 978 979 v = a->a; 980 xb = x; 981 982 for (i=0; i<mbs; i++) { 983 n = ai[1] - ai[0]; /* length of i_th block row of A */ 984 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 985 ib = aj + *ai; 986 jmin = 0; 987 nonzerorow += (n>0); 988 if (*ib == i){ /* (diag of A)*x */ 989 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 990 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 991 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 992 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 993 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 994 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 995 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 996 v += 49; jmin++; 997 } 998 for (j=jmin; j<n; j++) { 999 /* (strict lower triangular part of A)*x */ 1000 cval = ib[j]*7; 1001 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 1002 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 1003 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 1004 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 1005 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 1006 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 1007 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1008 /* (strict upper triangular part of A)*x */ 1009 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 1010 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 1011 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 1012 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 1013 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 1014 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 1015 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 1016 v += 49; 1017 } 1018 xb +=7; ai++; 1019 } 1020 1021 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1022 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1023 1024 ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1025 PetscFunctionReturn(0); 1026 } 1027 1028 #undef __FUNCT__ 1029 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N" 1030 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1031 { 1032 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1033 PetscScalar *x,*x_ptr,*z,*z_ptr=0,*xb,*zb,*work,*workt; 1034 MatScalar *v; 1035 PetscErrorCode ierr; 1036 PetscInt mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 1037 PetscInt nonzerorow=0; 1038 1039 PetscFunctionBegin; 1040 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 1041 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 1042 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 1043 1044 aj = a->j; 1045 v = a->a; 1046 ii = a->i; 1047 1048 if (!a->mult_work) { 1049 ierr = PetscMalloc((A->rmap->n+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1050 } 1051 work = a->mult_work; 1052 1053 1054 for (i=0; i<mbs; i++) { 1055 n = ii[1] - ii[0]; ncols = n*bs; 1056 workt = work; idx=aj+ii[0]; 1057 nonzerorow += (n>0); 1058 1059 /* upper triangular part */ 1060 for (j=0; j<n; j++) { 1061 xb = x_ptr + bs*(*idx++); 1062 for (k=0; k<bs; k++) workt[k] = xb[k]; 1063 workt += bs; 1064 } 1065 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1066 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1067 1068 /* strict lower triangular part */ 1069 idx = aj+ii[0]; 1070 if (*idx == i){ 1071 ncols -= bs; v += bs2; idx++; n--; 1072 } 1073 if (ncols > 0){ 1074 workt = work; 1075 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1076 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 1077 for (j=0; j<n; j++) { 1078 zb = z_ptr + bs*(*idx++); 1079 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1080 workt += bs; 1081 } 1082 } 1083 1084 x += bs; v += n*bs2; z += bs; ii++; 1085 } 1086 1087 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1088 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1089 1090 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1091 PetscFunctionReturn(0); 1092 } 1093 1094 #undef __FUNCT__ 1095 #define __FUNCT__ "MatScale_SeqSBAIJ" 1096 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha) 1097 { 1098 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data; 1099 PetscScalar oalpha = alpha; 1100 PetscErrorCode ierr; 1101 PetscBLASInt one = 1,totalnz = PetscBLASIntCast(a->bs2*a->nz); 1102 1103 PetscFunctionBegin; 1104 BLASscal_(&totalnz,&oalpha,a->a,&one); 1105 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 1106 PetscFunctionReturn(0); 1107 } 1108 1109 #undef __FUNCT__ 1110 #define __FUNCT__ "MatNorm_SeqSBAIJ" 1111 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm) 1112 { 1113 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1114 MatScalar *v = a->a; 1115 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1116 PetscInt i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j; 1117 PetscErrorCode ierr; 1118 PetscInt *jl,*il,jmin,jmax,nexti,ik,*col; 1119 1120 PetscFunctionBegin; 1121 if (type == NORM_FROBENIUS) { 1122 for (k=0; k<mbs; k++){ 1123 jmin = a->i[k]; jmax = a->i[k+1]; 1124 col = aj + jmin; 1125 if (*col == k){ /* diagonal block */ 1126 for (i=0; i<bs2; i++){ 1127 #if defined(PETSC_USE_COMPLEX) 1128 sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++; 1129 #else 1130 sum_diag += (*v)*(*v); v++; 1131 #endif 1132 } 1133 jmin++; 1134 } 1135 for (j=jmin; j<jmax; j++){ /* off-diagonal blocks */ 1136 for (i=0; i<bs2; i++){ 1137 #if defined(PETSC_USE_COMPLEX) 1138 sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++; 1139 #else 1140 sum_off += (*v)*(*v); v++; 1141 #endif 1142 } 1143 } 1144 } 1145 *norm = sqrt(sum_diag + 2*sum_off); 1146 } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 1147 ierr = PetscMalloc3(bs,PetscReal,&sum,mbs,PetscInt,&il,mbs,PetscInt,&jl);CHKERRQ(ierr); 1148 for (i=0; i<mbs; i++) jl[i] = mbs; 1149 il[0] = 0; 1150 1151 *norm = 0.0; 1152 for (k=0; k<mbs; k++) { /* k_th block row */ 1153 for (j=0; j<bs; j++) sum[j]=0.0; 1154 /*-- col sum --*/ 1155 i = jl[k]; /* first |A(i,k)| to be added */ 1156 /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1157 at step k */ 1158 while (i<mbs){ 1159 nexti = jl[i]; /* next block row to be added */ 1160 ik = il[i]; /* block index of A(i,k) in the array a */ 1161 for (j=0; j<bs; j++){ 1162 v = a->a + ik*bs2 + j*bs; 1163 for (k1=0; k1<bs; k1++) { 1164 sum[j] += PetscAbsScalar(*v); v++; 1165 } 1166 } 1167 /* update il, jl */ 1168 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1169 jmax = a->i[i+1]; 1170 if (jmin < jmax){ 1171 il[i] = jmin; 1172 j = a->j[jmin]; 1173 jl[i] = jl[j]; jl[j]=i; 1174 } 1175 i = nexti; 1176 } 1177 /*-- row sum --*/ 1178 jmin = a->i[k]; jmax = a->i[k+1]; 1179 for (i=jmin; i<jmax; i++) { 1180 for (j=0; j<bs; j++){ 1181 v = a->a + i*bs2 + j; 1182 for (k1=0; k1<bs; k1++){ 1183 sum[j] += PetscAbsScalar(*v); v += bs; 1184 } 1185 } 1186 } 1187 /* add k_th block row to il, jl */ 1188 col = aj+jmin; 1189 if (*col == k) jmin++; 1190 if (jmin < jmax){ 1191 il[k] = jmin; 1192 j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k; 1193 } 1194 for (j=0; j<bs; j++){ 1195 if (sum[j] > *norm) *norm = sum[j]; 1196 } 1197 } 1198 ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr); 1199 } else { 1200 SETERRQ(PETSC_ERR_SUP,"No support for this norm yet"); 1201 } 1202 PetscFunctionReturn(0); 1203 } 1204 1205 #undef __FUNCT__ 1206 #define __FUNCT__ "MatEqual_SeqSBAIJ" 1207 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg) 1208 { 1209 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data; 1210 PetscErrorCode ierr; 1211 1212 PetscFunctionBegin; 1213 1214 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1215 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1216 *flg = PETSC_FALSE; 1217 PetscFunctionReturn(0); 1218 } 1219 1220 /* if the a->i are the same */ 1221 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1222 if (!*flg) { 1223 PetscFunctionReturn(0); 1224 } 1225 1226 /* if a->j are the same */ 1227 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1228 if (!*flg) { 1229 PetscFunctionReturn(0); 1230 } 1231 /* if a->a are the same */ 1232 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1233 PetscFunctionReturn(0); 1234 } 1235 1236 #undef __FUNCT__ 1237 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ" 1238 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v) 1239 { 1240 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1241 PetscErrorCode ierr; 1242 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 1243 PetscScalar *x,zero = 0.0; 1244 MatScalar *aa,*aa_j; 1245 1246 PetscFunctionBegin; 1247 bs = A->rmap->bs; 1248 if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1"); 1249 1250 aa = a->a; 1251 ai = a->i; 1252 aj = a->j; 1253 ambs = a->mbs; 1254 bs2 = a->bs2; 1255 1256 ierr = VecSet(v,zero);CHKERRQ(ierr); 1257 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1258 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1259 if (n != A->rmap->N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1260 for (i=0; i<ambs; i++) { 1261 j=ai[i]; 1262 if (aj[j] == i) { /* if this is a diagonal element */ 1263 row = i*bs; 1264 aa_j = aa + j*bs2; 1265 if (A->factor && bs==1){ 1266 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k]; 1267 } else { 1268 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1269 } 1270 } 1271 } 1272 1273 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1274 PetscFunctionReturn(0); 1275 } 1276 1277 #undef __FUNCT__ 1278 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ" 1279 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr) 1280 { 1281 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1282 PetscScalar *l,x,*li,*ri; 1283 MatScalar *aa,*v; 1284 PetscErrorCode ierr; 1285 PetscInt i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2; 1286 PetscTruth flg; 1287 1288 PetscFunctionBegin; 1289 if (ll != rr){ 1290 ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr); 1291 if (!flg) 1292 SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n"); 1293 } 1294 if (!ll) PetscFunctionReturn(0); 1295 ai = a->i; 1296 aj = a->j; 1297 aa = a->a; 1298 m = A->rmap->N; 1299 bs = A->rmap->bs; 1300 mbs = a->mbs; 1301 bs2 = a->bs2; 1302 1303 ierr = VecGetArray(ll,&l);CHKERRQ(ierr); 1304 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1305 if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 1306 for (i=0; i<mbs; i++) { /* for each block row */ 1307 M = ai[i+1] - ai[i]; 1308 li = l + i*bs; 1309 v = aa + bs2*ai[i]; 1310 for (j=0; j<M; j++) { /* for each block */ 1311 ri = l + bs*aj[ai[i]+j]; 1312 for (k=0; k<bs; k++) { 1313 x = ri[k]; 1314 for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x; 1315 } 1316 } 1317 } 1318 ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr); 1319 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1320 PetscFunctionReturn(0); 1321 } 1322 1323 #undef __FUNCT__ 1324 #define __FUNCT__ "MatGetInfo_SeqSBAIJ" 1325 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info) 1326 { 1327 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1328 1329 PetscFunctionBegin; 1330 info->block_size = a->bs2; 1331 info->nz_allocated = a->maxnz; /*num. of nonzeros in upper triangular part */ 1332 info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */ 1333 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 1334 info->assemblies = A->num_ass; 1335 info->mallocs = a->reallocs; 1336 info->memory = ((PetscObject)A)->mem; 1337 if (A->factor) { 1338 info->fill_ratio_given = A->info.fill_ratio_given; 1339 info->fill_ratio_needed = A->info.fill_ratio_needed; 1340 info->factor_mallocs = A->info.factor_mallocs; 1341 } else { 1342 info->fill_ratio_given = 0; 1343 info->fill_ratio_needed = 0; 1344 info->factor_mallocs = 0; 1345 } 1346 PetscFunctionReturn(0); 1347 } 1348 1349 1350 #undef __FUNCT__ 1351 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ" 1352 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 1353 { 1354 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1355 PetscErrorCode ierr; 1356 1357 PetscFunctionBegin; 1358 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 1359 PetscFunctionReturn(0); 1360 } 1361 1362 #undef __FUNCT__ 1363 #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ" 1364 /* 1365 This code does not work since it only checks the upper triangular part of 1366 the matrix. Hence it is not listed in the function table. 1367 */ 1368 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[]) 1369 { 1370 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1371 PetscErrorCode ierr; 1372 PetscInt i,j,n,row,col,bs,*ai,*aj,mbs; 1373 PetscReal atmp; 1374 MatScalar *aa; 1375 PetscScalar *x; 1376 PetscInt ncols,brow,bcol,krow,kcol; 1377 1378 PetscFunctionBegin; 1379 if (idx) SETERRQ(PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov"); 1380 if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1381 bs = A->rmap->bs; 1382 aa = a->a; 1383 ai = a->i; 1384 aj = a->j; 1385 mbs = a->mbs; 1386 1387 ierr = VecSet(v,0.0);CHKERRQ(ierr); 1388 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1389 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1390 if (n != A->rmap->N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1391 for (i=0; i<mbs; i++) { 1392 ncols = ai[1] - ai[0]; ai++; 1393 brow = bs*i; 1394 for (j=0; j<ncols; j++){ 1395 bcol = bs*(*aj); 1396 for (kcol=0; kcol<bs; kcol++){ 1397 col = bcol + kcol; /* col index */ 1398 for (krow=0; krow<bs; krow++){ 1399 atmp = PetscAbsScalar(*aa); aa++; 1400 row = brow + krow; /* row index */ 1401 if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1402 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 1403 } 1404 } 1405 aj++; 1406 } 1407 } 1408 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1409 PetscFunctionReturn(0); 1410 } 1411