1 #define PETSCMAT_DLL 2 3 #include "../src/mat/impls/baij/seq/baij.h" 4 #include "../src/mat/blockinvert.h" 5 #include "petscbt.h" 6 #include "../src/mat/impls/sbaij/seq/sbaij.h" 7 8 #undef __FUNCT__ 9 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ" 10 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 11 { 12 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 13 PetscErrorCode ierr; 14 PetscInt brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2; 15 const PetscInt *idx; 16 PetscBT table,table0; 17 18 PetscFunctionBegin; 19 if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 20 mbs = a->mbs; 21 ai = a->i; 22 aj = a->j; 23 bs = A->rmap->bs; 24 ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr); 25 ierr = PetscMalloc((mbs+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr); 26 ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr); 27 ierr = PetscBTCreate(mbs,table0);CHKERRQ(ierr); 28 29 for (i=0; i<is_max; i++) { /* for each is */ 30 isz = 0; 31 ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr); 32 33 /* Extract the indices, assume there can be duplicate entries */ 34 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 35 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 36 37 /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */ 38 bcol_max = 0; 39 for (j=0; j<n ; ++j){ 40 brow = idx[j]/bs; /* convert the indices into block indices */ 41 if (brow >= mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 42 if(!PetscBTLookupSet(table,brow)) { 43 nidx[isz++] = brow; 44 if (bcol_max < brow) bcol_max = brow; 45 } 46 } 47 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 48 ierr = ISDestroy(is[i]);CHKERRQ(ierr); 49 50 k = 0; 51 for (j=0; j<ov; j++){ /* for each overlap */ 52 /* set table0 for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 53 ierr = PetscBTMemzero(mbs,table0);CHKERRQ(ierr); 54 for (l=k; l<isz; l++) { ierr = PetscBTSet(table0,nidx[l]);CHKERRQ(ierr); } 55 56 n = isz; /* length of the updated is[i] */ 57 for (brow=0; brow<mbs; brow++){ 58 start = ai[brow]; end = ai[brow+1]; 59 if (PetscBTLookup(table0,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */ 60 for (l = start; l<end ; l++){ 61 bcol = aj[l]; 62 if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;} 63 } 64 k++; 65 if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 66 } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 67 for (l = start; l<end ; l++){ 68 bcol = aj[l]; 69 if (bcol > bcol_max) break; 70 if (PetscBTLookup(table0,bcol)){ 71 if (!PetscBTLookupSet(table,brow)) {nidx[isz++] = brow;} 72 break; /* for l = start; l<end ; l++) */ 73 } 74 } 75 } 76 } 77 } /* for each overlap */ 78 79 /* expand the Index Set */ 80 for (j=0; j<isz; j++) { 81 for (k=0; k<bs; k++) 82 nidx2[j*bs+k] = nidx[j]*bs+k; 83 } 84 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr); 85 } 86 ierr = PetscBTDestroy(table);CHKERRQ(ierr); 87 ierr = PetscFree(nidx);CHKERRQ(ierr); 88 ierr = PetscFree(nidx2);CHKERRQ(ierr); 89 ierr = PetscBTDestroy(table0);CHKERRQ(ierr); 90 PetscFunctionReturn(0); 91 } 92 93 #undef __FUNCT__ 94 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private" 95 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 96 { 97 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c; 98 PetscErrorCode ierr; 99 PetscInt *smap,i,k,kstart,kend,oldcols = a->mbs,*lens; 100 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 101 PetscInt nrows,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 102 const PetscInt *irow,*aj = a->j,*ai = a->i; 103 MatScalar *mat_a; 104 Mat C; 105 PetscTruth flag,sorted; 106 107 PetscFunctionBegin; 108 if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro"); 109 ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr); 110 if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 111 112 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 113 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 114 115 ierr = PetscMalloc(oldcols*sizeof(PetscInt),&smap);CHKERRQ(ierr); 116 ierr = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr); 117 ssmap = smap; 118 ierr = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr); 119 for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */ 120 /* determine lens of each row */ 121 for (i=0; i<nrows; i++) { 122 kstart = ai[irow[i]]; 123 kend = kstart + a->ilen[irow[i]]; 124 lens[i] = 0; 125 for (k=kstart; k<kend; k++) { 126 if (ssmap[aj[k]]) { 127 lens[i]++; 128 } 129 } 130 } 131 /* Create and fill new matrix */ 132 if (scall == MAT_REUSE_MATRIX) { 133 c = (Mat_SeqSBAIJ *)((*B)->data); 134 135 if (c->mbs!=nrows || (*B)->rmap->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 136 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 137 if (!flag) { 138 SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 139 } 140 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 141 C = *B; 142 } else { 143 ierr = MatCreate(((PetscObject)A)->comm,&C);CHKERRQ(ierr); 144 ierr = MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 145 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 146 ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr); 147 } 148 c = (Mat_SeqSBAIJ *)(C->data); 149 for (i=0; i<nrows; i++) { 150 row = irow[i]; 151 kstart = ai[row]; 152 kend = kstart + a->ilen[row]; 153 mat_i = c->i[i]; 154 mat_j = c->j + mat_i; 155 mat_a = c->a + mat_i*bs2; 156 mat_ilen = c->ilen + i; 157 for (k=kstart; k<kend; k++) { 158 if ((tcol=ssmap[a->j[k]])) { 159 *mat_j++ = tcol - 1; 160 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 161 mat_a += bs2; 162 (*mat_ilen)++; 163 } 164 } 165 } 166 167 /* Free work space */ 168 ierr = PetscFree(smap);CHKERRQ(ierr); 169 ierr = PetscFree(lens);CHKERRQ(ierr); 170 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 171 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 172 173 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 174 *B = C; 175 PetscFunctionReturn(0); 176 } 177 178 #undef __FUNCT__ 179 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ" 180 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 181 { 182 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 183 IS is1; 184 PetscErrorCode ierr; 185 PetscInt *vary,*iary,nrows,i,bs=A->rmap->bs,count; 186 const PetscInt *irow; 187 188 PetscFunctionBegin; 189 if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro"); 190 191 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 192 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 193 194 /* Verify if the indices corespond to each element in a block 195 and form the IS with compressed IS */ 196 ierr = PetscMalloc2(a->mbs,PetscInt,&vary,a->mbs,PetscInt,&iary);CHKERRQ(ierr); 197 ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr); 198 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 199 200 count = 0; 201 for (i=0; i<a->mbs; i++) { 202 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_INCOMP,"Index set does not match blocks"); 203 if (vary[i]==bs) iary[count++] = i; 204 } 205 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 206 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr); 207 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 208 209 ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,scall,B);CHKERRQ(ierr); 210 ierr = ISDestroy(is1);CHKERRQ(ierr); 211 PetscFunctionReturn(0); 212 } 213 214 #undef __FUNCT__ 215 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ" 216 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 217 { 218 PetscErrorCode ierr; 219 PetscInt i; 220 221 PetscFunctionBegin; 222 if (scall == MAT_INITIAL_MATRIX) { 223 ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr); 224 } 225 226 for (i=0; i<n; i++) { 227 ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 228 } 229 PetscFunctionReturn(0); 230 } 231 232 /* -------------------------------------------------------*/ 233 /* Should check that shapes of vectors and matrices match */ 234 /* -------------------------------------------------------*/ 235 #include "petscblaslapack.h" 236 237 238 #undef __FUNCT__ 239 #define __FUNCT__ "MatMult_SeqSBAIJ_2" 240 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz) 241 { 242 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 243 PetscScalar *x,*z,*xb,x1,x2,zero=0.0; 244 MatScalar *v; 245 PetscErrorCode ierr; 246 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 247 PetscInt nonzerorow=0; 248 249 PetscFunctionBegin; 250 ierr = VecSet(zz,zero);CHKERRQ(ierr); 251 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 252 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 253 254 v = a->a; 255 xb = x; 256 257 for (i=0; i<mbs; i++) { 258 n = ai[1] - ai[0]; /* length of i_th block row of A */ 259 x1 = xb[0]; x2 = xb[1]; 260 ib = aj + *ai; 261 jmin = 0; 262 nonzerorow += (n>0); 263 if (*ib == i){ /* (diag of A)*x */ 264 z[2*i] += v[0]*x1 + v[2]*x2; 265 z[2*i+1] += v[2]*x1 + v[3]*x2; 266 v += 4; jmin++; 267 } 268 for (j=jmin; j<n; j++) { 269 /* (strict lower triangular part of A)*x */ 270 cval = ib[j]*2; 271 z[cval] += v[0]*x1 + v[1]*x2; 272 z[cval+1] += v[2]*x1 + v[3]*x2; 273 /* (strict upper triangular part of A)*x */ 274 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 275 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 276 v += 4; 277 } 278 xb +=2; ai++; 279 } 280 281 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 282 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 283 ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 284 PetscFunctionReturn(0); 285 } 286 287 #undef __FUNCT__ 288 #define __FUNCT__ "MatMult_SeqSBAIJ_3" 289 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz) 290 { 291 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 292 PetscScalar *x,*z,*xb,x1,x2,x3,zero=0.0; 293 MatScalar *v; 294 PetscErrorCode ierr; 295 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 296 PetscInt nonzerorow=0; 297 298 PetscFunctionBegin; 299 ierr = VecSet(zz,zero);CHKERRQ(ierr); 300 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 301 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 302 303 v = a->a; 304 xb = x; 305 306 for (i=0; i<mbs; i++) { 307 n = ai[1] - ai[0]; /* length of i_th block row of A */ 308 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 309 ib = aj + *ai; 310 jmin = 0; 311 nonzerorow += (n>0); 312 if (*ib == i){ /* (diag of A)*x */ 313 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 314 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 315 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 316 v += 9; jmin++; 317 } 318 for (j=jmin; j<n; j++) { 319 /* (strict lower triangular part of A)*x */ 320 cval = ib[j]*3; 321 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 322 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 323 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 324 /* (strict upper triangular part of A)*x */ 325 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 326 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 327 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 328 v += 9; 329 } 330 xb +=3; ai++; 331 } 332 333 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 334 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 335 ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 336 PetscFunctionReturn(0); 337 } 338 339 #undef __FUNCT__ 340 #define __FUNCT__ "MatMult_SeqSBAIJ_4" 341 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz) 342 { 343 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 344 PetscScalar *x,*z,*xb,x1,x2,x3,x4,zero=0.0; 345 MatScalar *v; 346 PetscErrorCode ierr; 347 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 348 PetscInt nonzerorow=0; 349 350 PetscFunctionBegin; 351 ierr = VecSet(zz,zero);CHKERRQ(ierr); 352 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 353 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 354 355 v = a->a; 356 xb = x; 357 358 for (i=0; i<mbs; i++) { 359 n = ai[1] - ai[0]; /* length of i_th block row of A */ 360 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 361 ib = aj + *ai; 362 jmin = 0; 363 nonzerorow += (n>0); 364 if (*ib == i){ /* (diag of A)*x */ 365 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 366 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 367 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 368 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 369 v += 16; jmin++; 370 } 371 for (j=jmin; j<n; j++) { 372 /* (strict lower triangular part of A)*x */ 373 cval = ib[j]*4; 374 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 375 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 376 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 377 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 378 /* (strict upper triangular part of A)*x */ 379 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 380 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 381 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 382 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 383 v += 16; 384 } 385 xb +=4; ai++; 386 } 387 388 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 389 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 390 ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 391 PetscFunctionReturn(0); 392 } 393 394 #undef __FUNCT__ 395 #define __FUNCT__ "MatMult_SeqSBAIJ_5" 396 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz) 397 { 398 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 399 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0; 400 MatScalar *v; 401 PetscErrorCode ierr; 402 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 403 PetscInt nonzerorow=0; 404 405 PetscFunctionBegin; 406 ierr = VecSet(zz,zero);CHKERRQ(ierr); 407 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 408 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 409 410 v = a->a; 411 xb = x; 412 413 for (i=0; i<mbs; i++) { 414 n = ai[1] - ai[0]; /* length of i_th block row of A */ 415 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 416 ib = aj + *ai; 417 jmin = 0; 418 nonzerorow += (n>0); 419 if (*ib == i){ /* (diag of A)*x */ 420 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 421 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 422 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 423 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 424 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 425 v += 25; jmin++; 426 } 427 for (j=jmin; j<n; j++) { 428 /* (strict lower triangular part of A)*x */ 429 cval = ib[j]*5; 430 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 431 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 432 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 433 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 434 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 435 /* (strict upper triangular part of A)*x */ 436 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 437 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 438 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 439 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 440 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 441 v += 25; 442 } 443 xb +=5; ai++; 444 } 445 446 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 447 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 448 ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 449 PetscFunctionReturn(0); 450 } 451 452 453 #undef __FUNCT__ 454 #define __FUNCT__ "MatMult_SeqSBAIJ_6" 455 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz) 456 { 457 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 458 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0; 459 MatScalar *v; 460 PetscErrorCode ierr; 461 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 462 PetscInt nonzerorow=0; 463 464 PetscFunctionBegin; 465 ierr = VecSet(zz,zero);CHKERRQ(ierr); 466 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 467 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 468 469 v = a->a; 470 xb = x; 471 472 for (i=0; i<mbs; i++) { 473 n = ai[1] - ai[0]; /* length of i_th block row of A */ 474 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 475 ib = aj + *ai; 476 jmin = 0; 477 nonzerorow += (n>0); 478 if (*ib == i){ /* (diag of A)*x */ 479 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 480 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 481 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 482 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 483 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 484 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 485 v += 36; jmin++; 486 } 487 for (j=jmin; j<n; j++) { 488 /* (strict lower triangular part of A)*x */ 489 cval = ib[j]*6; 490 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 491 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 492 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 493 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 494 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 495 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 496 /* (strict upper triangular part of A)*x */ 497 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 498 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 499 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 500 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 501 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 502 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 503 v += 36; 504 } 505 xb +=6; ai++; 506 } 507 508 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 509 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 510 ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 511 PetscFunctionReturn(0); 512 } 513 #undef __FUNCT__ 514 #define __FUNCT__ "MatMult_SeqSBAIJ_7" 515 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz) 516 { 517 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 518 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0; 519 MatScalar *v; 520 PetscErrorCode ierr; 521 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 522 PetscInt nonzerorow=0; 523 524 PetscFunctionBegin; 525 ierr = VecSet(zz,zero);CHKERRQ(ierr); 526 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 527 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 528 529 v = a->a; 530 xb = x; 531 532 for (i=0; i<mbs; i++) { 533 n = ai[1] - ai[0]; /* length of i_th block row of A */ 534 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 535 ib = aj + *ai; 536 jmin = 0; 537 nonzerorow += (n>0); 538 if (*ib == i){ /* (diag of A)*x */ 539 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 540 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 541 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 542 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 543 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 544 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 545 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 546 v += 49; jmin++; 547 } 548 for (j=jmin; j<n; j++) { 549 /* (strict lower triangular part of A)*x */ 550 cval = ib[j]*7; 551 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 552 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 553 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 554 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 555 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 556 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 557 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 558 /* (strict upper triangular part of A)*x */ 559 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 560 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 561 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 562 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 563 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 564 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 565 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 566 v += 49; 567 } 568 xb +=7; ai++; 569 } 570 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 571 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 572 ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 573 PetscFunctionReturn(0); 574 } 575 576 /* 577 This will not work with MatScalar == float because it calls the BLAS 578 */ 579 #undef __FUNCT__ 580 #define __FUNCT__ "MatMult_SeqSBAIJ_N" 581 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz) 582 { 583 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 584 PetscScalar *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0; 585 MatScalar *v; 586 PetscErrorCode ierr; 587 PetscInt mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 588 PetscInt nonzerorow=0; 589 590 PetscFunctionBegin; 591 ierr = VecSet(zz,zero);CHKERRQ(ierr); 592 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 593 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 594 595 aj = a->j; 596 v = a->a; 597 ii = a->i; 598 599 if (!a->mult_work) { 600 ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 601 } 602 work = a->mult_work; 603 604 for (i=0; i<mbs; i++) { 605 n = ii[1] - ii[0]; ncols = n*bs; 606 workt = work; idx=aj+ii[0]; 607 nonzerorow += (n>0); 608 609 /* upper triangular part */ 610 for (j=0; j<n; j++) { 611 xb = x_ptr + bs*(*idx++); 612 for (k=0; k<bs; k++) workt[k] = xb[k]; 613 workt += bs; 614 } 615 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 616 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 617 618 /* strict lower triangular part */ 619 idx = aj+ii[0]; 620 if (*idx == i){ 621 ncols -= bs; v += bs2; idx++; n--; 622 } 623 624 if (ncols > 0){ 625 workt = work; 626 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 627 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 628 for (j=0; j<n; j++) { 629 zb = z_ptr + bs*(*idx++); 630 for (k=0; k<bs; k++) zb[k] += workt[k] ; 631 workt += bs; 632 } 633 } 634 x += bs; v += n*bs2; z += bs; ii++; 635 } 636 637 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 638 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 639 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr); 640 PetscFunctionReturn(0); 641 } 642 643 extern PetscErrorCode VecCopy_Seq(Vec,Vec); 644 #undef __FUNCT__ 645 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1" 646 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 647 { 648 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 649 PetscScalar *x,*z,*xb,x1; 650 MatScalar *v; 651 PetscErrorCode ierr; 652 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 653 PetscInt nonzerorow=0; 654 655 PetscFunctionBegin; 656 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 657 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 658 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 659 v = a->a; 660 xb = x; 661 662 for (i=0; i<mbs; i++) { 663 n = ai[1] - ai[0]; /* length of i_th row of A */ 664 x1 = xb[0]; 665 ib = aj + *ai; 666 jmin = 0; 667 nonzerorow += (n>0); 668 if (*ib == i) { /* (diag of A)*x */ 669 z[i] += *v++ * x[*ib++]; jmin++; 670 } 671 for (j=jmin; j<n; j++) { 672 cval = *ib; 673 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 674 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 675 } 676 xb++; ai++; 677 } 678 679 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 680 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 681 682 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 683 PetscFunctionReturn(0); 684 } 685 686 #undef __FUNCT__ 687 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2" 688 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 689 { 690 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 691 PetscScalar *x,*z,*xb,x1,x2; 692 MatScalar *v; 693 PetscErrorCode ierr; 694 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 695 PetscInt nonzerorow=0; 696 697 PetscFunctionBegin; 698 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 699 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 700 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 701 702 v = a->a; 703 xb = x; 704 705 for (i=0; i<mbs; i++) { 706 n = ai[1] - ai[0]; /* length of i_th block row of A */ 707 x1 = xb[0]; x2 = xb[1]; 708 ib = aj + *ai; 709 jmin = 0; 710 nonzerorow += (n>0); 711 if (*ib == i){ /* (diag of A)*x */ 712 z[2*i] += v[0]*x1 + v[2]*x2; 713 z[2*i+1] += v[2]*x1 + v[3]*x2; 714 v += 4; jmin++; 715 } 716 for (j=jmin; j<n; j++) { 717 /* (strict lower triangular part of A)*x */ 718 cval = ib[j]*2; 719 z[cval] += v[0]*x1 + v[1]*x2; 720 z[cval+1] += v[2]*x1 + v[3]*x2; 721 /* (strict upper triangular part of A)*x */ 722 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 723 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 724 v += 4; 725 } 726 xb +=2; ai++; 727 } 728 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 729 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 730 731 ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 732 PetscFunctionReturn(0); 733 } 734 735 #undef __FUNCT__ 736 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3" 737 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 738 { 739 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 740 PetscScalar *x,*z,*xb,x1,x2,x3; 741 MatScalar *v; 742 PetscErrorCode ierr; 743 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 744 PetscInt nonzerorow=0; 745 746 PetscFunctionBegin; 747 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 748 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 749 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 750 751 v = a->a; 752 xb = x; 753 754 for (i=0; i<mbs; i++) { 755 n = ai[1] - ai[0]; /* length of i_th block row of A */ 756 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 757 ib = aj + *ai; 758 jmin = 0; 759 nonzerorow += (n>0); 760 if (*ib == i){ /* (diag of A)*x */ 761 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 762 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 763 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 764 v += 9; jmin++; 765 } 766 for (j=jmin; j<n; j++) { 767 /* (strict lower triangular part of A)*x */ 768 cval = ib[j]*3; 769 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 770 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 771 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 772 /* (strict upper triangular part of A)*x */ 773 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 774 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 775 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 776 v += 9; 777 } 778 xb +=3; ai++; 779 } 780 781 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 782 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 783 784 ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 785 PetscFunctionReturn(0); 786 } 787 788 #undef __FUNCT__ 789 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4" 790 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 791 { 792 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 793 PetscScalar *x,*z,*xb,x1,x2,x3,x4; 794 MatScalar *v; 795 PetscErrorCode ierr; 796 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 797 PetscInt nonzerorow=0; 798 799 PetscFunctionBegin; 800 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 801 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 802 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 803 804 v = a->a; 805 xb = x; 806 807 for (i=0; i<mbs; i++) { 808 n = ai[1] - ai[0]; /* length of i_th block row of A */ 809 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 810 ib = aj + *ai; 811 jmin = 0; 812 nonzerorow += (n>0); 813 if (*ib == i){ /* (diag of A)*x */ 814 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 815 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 816 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 817 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 818 v += 16; jmin++; 819 } 820 for (j=jmin; j<n; j++) { 821 /* (strict lower triangular part of A)*x */ 822 cval = ib[j]*4; 823 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 824 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 825 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 826 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 827 /* (strict upper triangular part of A)*x */ 828 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 829 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 830 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 831 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 832 v += 16; 833 } 834 xb +=4; ai++; 835 } 836 837 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 838 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 839 840 ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 841 PetscFunctionReturn(0); 842 } 843 844 #undef __FUNCT__ 845 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5" 846 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 847 { 848 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 849 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5; 850 MatScalar *v; 851 PetscErrorCode ierr; 852 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 853 PetscInt nonzerorow=0; 854 855 PetscFunctionBegin; 856 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 857 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 858 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 859 860 v = a->a; 861 xb = x; 862 863 for (i=0; i<mbs; i++) { 864 n = ai[1] - ai[0]; /* length of i_th block row of A */ 865 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 866 ib = aj + *ai; 867 jmin = 0; 868 nonzerorow += (n>0); 869 if (*ib == i){ /* (diag of A)*x */ 870 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 871 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 872 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 873 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 874 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 875 v += 25; jmin++; 876 } 877 for (j=jmin; j<n; j++) { 878 /* (strict lower triangular part of A)*x */ 879 cval = ib[j]*5; 880 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 881 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 882 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 883 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 884 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 885 /* (strict upper triangular part of A)*x */ 886 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 887 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 888 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 889 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 890 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 891 v += 25; 892 } 893 xb +=5; ai++; 894 } 895 896 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 897 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 898 899 ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 900 PetscFunctionReturn(0); 901 } 902 #undef __FUNCT__ 903 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6" 904 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 905 { 906 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 907 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6; 908 MatScalar *v; 909 PetscErrorCode ierr; 910 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 911 PetscInt nonzerorow=0; 912 913 PetscFunctionBegin; 914 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 915 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 916 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 917 918 v = a->a; 919 xb = x; 920 921 for (i=0; i<mbs; i++) { 922 n = ai[1] - ai[0]; /* length of i_th block row of A */ 923 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 924 ib = aj + *ai; 925 jmin = 0; 926 nonzerorow += (n>0); 927 if (*ib == i){ /* (diag of A)*x */ 928 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 929 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 930 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 931 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 932 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 933 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 934 v += 36; jmin++; 935 } 936 for (j=jmin; j<n; j++) { 937 /* (strict lower triangular part of A)*x */ 938 cval = ib[j]*6; 939 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 940 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 941 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 942 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 943 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 944 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 945 /* (strict upper triangular part of A)*x */ 946 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 947 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 948 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 949 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 950 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 951 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 952 v += 36; 953 } 954 xb +=6; ai++; 955 } 956 957 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 958 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 959 960 ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 961 PetscFunctionReturn(0); 962 } 963 964 #undef __FUNCT__ 965 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7" 966 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 967 { 968 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 969 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7; 970 MatScalar *v; 971 PetscErrorCode ierr; 972 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 973 PetscInt nonzerorow=0; 974 975 PetscFunctionBegin; 976 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 977 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 978 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 979 980 v = a->a; 981 xb = x; 982 983 for (i=0; i<mbs; i++) { 984 n = ai[1] - ai[0]; /* length of i_th block row of A */ 985 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 986 ib = aj + *ai; 987 jmin = 0; 988 nonzerorow += (n>0); 989 if (*ib == i){ /* (diag of A)*x */ 990 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 991 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 992 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 993 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 994 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 995 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 996 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 997 v += 49; jmin++; 998 } 999 for (j=jmin; j<n; j++) { 1000 /* (strict lower triangular part of A)*x */ 1001 cval = ib[j]*7; 1002 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 1003 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 1004 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 1005 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 1006 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 1007 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 1008 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1009 /* (strict upper triangular part of A)*x */ 1010 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 1011 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 1012 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 1013 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 1014 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 1015 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 1016 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 1017 v += 49; 1018 } 1019 xb +=7; ai++; 1020 } 1021 1022 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1023 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1024 1025 ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1026 PetscFunctionReturn(0); 1027 } 1028 1029 #undef __FUNCT__ 1030 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N" 1031 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1032 { 1033 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1034 PetscScalar *x,*x_ptr,*z,*z_ptr=0,*xb,*zb,*work,*workt; 1035 MatScalar *v; 1036 PetscErrorCode ierr; 1037 PetscInt mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 1038 PetscInt nonzerorow=0; 1039 1040 PetscFunctionBegin; 1041 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 1042 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 1043 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 1044 1045 aj = a->j; 1046 v = a->a; 1047 ii = a->i; 1048 1049 if (!a->mult_work) { 1050 ierr = PetscMalloc((A->rmap->n+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1051 } 1052 work = a->mult_work; 1053 1054 1055 for (i=0; i<mbs; i++) { 1056 n = ii[1] - ii[0]; ncols = n*bs; 1057 workt = work; idx=aj+ii[0]; 1058 nonzerorow += (n>0); 1059 1060 /* upper triangular part */ 1061 for (j=0; j<n; j++) { 1062 xb = x_ptr + bs*(*idx++); 1063 for (k=0; k<bs; k++) workt[k] = xb[k]; 1064 workt += bs; 1065 } 1066 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1067 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1068 1069 /* strict lower triangular part */ 1070 idx = aj+ii[0]; 1071 if (*idx == i){ 1072 ncols -= bs; v += bs2; idx++; n--; 1073 } 1074 if (ncols > 0){ 1075 workt = work; 1076 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1077 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 1078 for (j=0; j<n; j++) { 1079 zb = z_ptr + bs*(*idx++); 1080 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1081 workt += bs; 1082 } 1083 } 1084 1085 x += bs; v += n*bs2; z += bs; ii++; 1086 } 1087 1088 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1089 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1090 1091 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1092 PetscFunctionReturn(0); 1093 } 1094 1095 #undef __FUNCT__ 1096 #define __FUNCT__ "MatScale_SeqSBAIJ" 1097 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha) 1098 { 1099 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data; 1100 PetscScalar oalpha = alpha; 1101 PetscErrorCode ierr; 1102 PetscBLASInt one = 1,totalnz = PetscBLASIntCast(a->bs2*a->nz); 1103 1104 PetscFunctionBegin; 1105 BLASscal_(&totalnz,&oalpha,a->a,&one); 1106 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 1107 PetscFunctionReturn(0); 1108 } 1109 1110 #undef __FUNCT__ 1111 #define __FUNCT__ "MatNorm_SeqSBAIJ" 1112 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm) 1113 { 1114 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1115 MatScalar *v = a->a; 1116 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1117 PetscInt i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j; 1118 PetscErrorCode ierr; 1119 PetscInt *jl,*il,jmin,jmax,nexti,ik,*col; 1120 1121 PetscFunctionBegin; 1122 if (type == NORM_FROBENIUS) { 1123 for (k=0; k<mbs; k++){ 1124 jmin = a->i[k]; jmax = a->i[k+1]; 1125 col = aj + jmin; 1126 if (*col == k){ /* diagonal block */ 1127 for (i=0; i<bs2; i++){ 1128 #if defined(PETSC_USE_COMPLEX) 1129 sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++; 1130 #else 1131 sum_diag += (*v)*(*v); v++; 1132 #endif 1133 } 1134 jmin++; 1135 } 1136 for (j=jmin; j<jmax; j++){ /* off-diagonal blocks */ 1137 for (i=0; i<bs2; i++){ 1138 #if defined(PETSC_USE_COMPLEX) 1139 sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++; 1140 #else 1141 sum_off += (*v)*(*v); v++; 1142 #endif 1143 } 1144 } 1145 } 1146 *norm = sqrt(sum_diag + 2*sum_off); 1147 } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 1148 ierr = PetscMalloc3(bs,PetscReal,&sum,mbs,PetscInt,&il,mbs,PetscInt,&jl);CHKERRQ(ierr); 1149 for (i=0; i<mbs; i++) jl[i] = mbs; 1150 il[0] = 0; 1151 1152 *norm = 0.0; 1153 for (k=0; k<mbs; k++) { /* k_th block row */ 1154 for (j=0; j<bs; j++) sum[j]=0.0; 1155 /*-- col sum --*/ 1156 i = jl[k]; /* first |A(i,k)| to be added */ 1157 /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1158 at step k */ 1159 while (i<mbs){ 1160 nexti = jl[i]; /* next block row to be added */ 1161 ik = il[i]; /* block index of A(i,k) in the array a */ 1162 for (j=0; j<bs; j++){ 1163 v = a->a + ik*bs2 + j*bs; 1164 for (k1=0; k1<bs; k1++) { 1165 sum[j] += PetscAbsScalar(*v); v++; 1166 } 1167 } 1168 /* update il, jl */ 1169 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1170 jmax = a->i[i+1]; 1171 if (jmin < jmax){ 1172 il[i] = jmin; 1173 j = a->j[jmin]; 1174 jl[i] = jl[j]; jl[j]=i; 1175 } 1176 i = nexti; 1177 } 1178 /*-- row sum --*/ 1179 jmin = a->i[k]; jmax = a->i[k+1]; 1180 for (i=jmin; i<jmax; i++) { 1181 for (j=0; j<bs; j++){ 1182 v = a->a + i*bs2 + j; 1183 for (k1=0; k1<bs; k1++){ 1184 sum[j] += PetscAbsScalar(*v); v += bs; 1185 } 1186 } 1187 } 1188 /* add k_th block row to il, jl */ 1189 col = aj+jmin; 1190 if (*col == k) jmin++; 1191 if (jmin < jmax){ 1192 il[k] = jmin; 1193 j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k; 1194 } 1195 for (j=0; j<bs; j++){ 1196 if (sum[j] > *norm) *norm = sum[j]; 1197 } 1198 } 1199 ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr); 1200 } else { 1201 SETERRQ(PETSC_ERR_SUP,"No support for this norm yet"); 1202 } 1203 PetscFunctionReturn(0); 1204 } 1205 1206 #undef __FUNCT__ 1207 #define __FUNCT__ "MatEqual_SeqSBAIJ" 1208 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg) 1209 { 1210 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data; 1211 PetscErrorCode ierr; 1212 1213 PetscFunctionBegin; 1214 1215 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1216 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1217 *flg = PETSC_FALSE; 1218 PetscFunctionReturn(0); 1219 } 1220 1221 /* if the a->i are the same */ 1222 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1223 if (!*flg) { 1224 PetscFunctionReturn(0); 1225 } 1226 1227 /* if a->j are the same */ 1228 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1229 if (!*flg) { 1230 PetscFunctionReturn(0); 1231 } 1232 /* if a->a are the same */ 1233 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1234 PetscFunctionReturn(0); 1235 } 1236 1237 #undef __FUNCT__ 1238 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ" 1239 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v) 1240 { 1241 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1242 PetscErrorCode ierr; 1243 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 1244 PetscScalar *x,zero = 0.0; 1245 MatScalar *aa,*aa_j; 1246 1247 PetscFunctionBegin; 1248 bs = A->rmap->bs; 1249 if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1"); 1250 1251 aa = a->a; 1252 ai = a->i; 1253 aj = a->j; 1254 ambs = a->mbs; 1255 bs2 = a->bs2; 1256 1257 ierr = VecSet(v,zero);CHKERRQ(ierr); 1258 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1259 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1260 if (n != A->rmap->N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1261 for (i=0; i<ambs; i++) { 1262 j=ai[i]; 1263 if (aj[j] == i) { /* if this is a diagonal element */ 1264 row = i*bs; 1265 aa_j = aa + j*bs2; 1266 if (A->factor && bs==1){ 1267 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k]; 1268 } else { 1269 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1270 } 1271 } 1272 } 1273 1274 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1275 PetscFunctionReturn(0); 1276 } 1277 1278 #undef __FUNCT__ 1279 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ" 1280 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr) 1281 { 1282 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1283 PetscScalar *l,x,*li,*ri; 1284 MatScalar *aa,*v; 1285 PetscErrorCode ierr; 1286 PetscInt i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2; 1287 PetscTruth flg; 1288 1289 PetscFunctionBegin; 1290 if (ll != rr){ 1291 ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr); 1292 if (!flg) 1293 SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n"); 1294 } 1295 if (!ll) PetscFunctionReturn(0); 1296 ai = a->i; 1297 aj = a->j; 1298 aa = a->a; 1299 m = A->rmap->N; 1300 bs = A->rmap->bs; 1301 mbs = a->mbs; 1302 bs2 = a->bs2; 1303 1304 ierr = VecGetArray(ll,&l);CHKERRQ(ierr); 1305 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1306 if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 1307 for (i=0; i<mbs; i++) { /* for each block row */ 1308 M = ai[i+1] - ai[i]; 1309 li = l + i*bs; 1310 v = aa + bs2*ai[i]; 1311 for (j=0; j<M; j++) { /* for each block */ 1312 ri = l + bs*aj[ai[i]+j]; 1313 for (k=0; k<bs; k++) { 1314 x = ri[k]; 1315 for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x; 1316 } 1317 } 1318 } 1319 ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr); 1320 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1321 PetscFunctionReturn(0); 1322 } 1323 1324 #undef __FUNCT__ 1325 #define __FUNCT__ "MatGetInfo_SeqSBAIJ" 1326 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info) 1327 { 1328 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1329 1330 PetscFunctionBegin; 1331 info->block_size = a->bs2; 1332 info->nz_allocated = a->maxnz; /*num. of nonzeros in upper triangular part */ 1333 info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */ 1334 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 1335 info->assemblies = A->num_ass; 1336 info->mallocs = a->reallocs; 1337 info->memory = ((PetscObject)A)->mem; 1338 if (A->factor) { 1339 info->fill_ratio_given = A->info.fill_ratio_given; 1340 info->fill_ratio_needed = A->info.fill_ratio_needed; 1341 info->factor_mallocs = A->info.factor_mallocs; 1342 } else { 1343 info->fill_ratio_given = 0; 1344 info->fill_ratio_needed = 0; 1345 info->factor_mallocs = 0; 1346 } 1347 PetscFunctionReturn(0); 1348 } 1349 1350 1351 #undef __FUNCT__ 1352 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ" 1353 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 1354 { 1355 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1356 PetscErrorCode ierr; 1357 1358 PetscFunctionBegin; 1359 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 1360 PetscFunctionReturn(0); 1361 } 1362 1363 #undef __FUNCT__ 1364 #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ" 1365 /* 1366 This code does not work since it only checks the upper triangular part of 1367 the matrix. Hence it is not listed in the function table. 1368 */ 1369 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[]) 1370 { 1371 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1372 PetscErrorCode ierr; 1373 PetscInt i,j,n,row,col,bs,*ai,*aj,mbs; 1374 PetscReal atmp; 1375 MatScalar *aa; 1376 PetscScalar *x; 1377 PetscInt ncols,brow,bcol,krow,kcol; 1378 1379 PetscFunctionBegin; 1380 if (idx) SETERRQ(PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov"); 1381 if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1382 bs = A->rmap->bs; 1383 aa = a->a; 1384 ai = a->i; 1385 aj = a->j; 1386 mbs = a->mbs; 1387 1388 ierr = VecSet(v,0.0);CHKERRQ(ierr); 1389 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1390 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1391 if (n != A->rmap->N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1392 for (i=0; i<mbs; i++) { 1393 ncols = ai[1] - ai[0]; ai++; 1394 brow = bs*i; 1395 for (j=0; j<ncols; j++){ 1396 bcol = bs*(*aj); 1397 for (kcol=0; kcol<bs; kcol++){ 1398 col = bcol + kcol; /* col index */ 1399 for (krow=0; krow<bs; krow++){ 1400 atmp = PetscAbsScalar(*aa); aa++; 1401 row = brow + krow; /* row index */ 1402 if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1403 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 1404 } 1405 } 1406 aj++; 1407 } 1408 } 1409 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1410 PetscFunctionReturn(0); 1411 } 1412