1 #define PETSCMAT_DLL 2 3 #include "../src/mat/impls/baij/seq/baij.h" 4 #include "../src/mat/blockinvert.h" 5 #include "petscbt.h" 6 #include "../src/mat/impls/sbaij/seq/sbaij.h" 7 #include "petscblaslapack.h" 8 9 #undef __FUNCT__ 10 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ" 11 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 12 { 13 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 14 PetscErrorCode ierr; 15 PetscInt brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2; 16 const PetscInt *idx; 17 PetscBT table,table0; 18 19 PetscFunctionBegin; 20 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 21 mbs = a->mbs; 22 ai = a->i; 23 aj = a->j; 24 bs = A->rmap->bs; 25 ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr); 26 ierr = PetscMalloc((mbs+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr); 27 ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr); 28 ierr = PetscBTCreate(mbs,table0);CHKERRQ(ierr); 29 30 for (i=0; i<is_max; i++) { /* for each is */ 31 isz = 0; 32 ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr); 33 34 /* Extract the indices, assume there can be duplicate entries */ 35 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 36 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 37 38 /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */ 39 bcol_max = 0; 40 for (j=0; j<n ; ++j){ 41 brow = idx[j]/bs; /* convert the indices into block indices */ 42 if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 43 if(!PetscBTLookupSet(table,brow)) { 44 nidx[isz++] = brow; 45 if (bcol_max < brow) bcol_max = brow; 46 } 47 } 48 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 49 ierr = ISDestroy(is[i]);CHKERRQ(ierr); 50 51 k = 0; 52 for (j=0; j<ov; j++){ /* for each overlap */ 53 /* set table0 for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 54 ierr = PetscBTMemzero(mbs,table0);CHKERRQ(ierr); 55 for (l=k; l<isz; l++) { ierr = PetscBTSet(table0,nidx[l]);CHKERRQ(ierr); } 56 57 n = isz; /* length of the updated is[i] */ 58 for (brow=0; brow<mbs; brow++){ 59 start = ai[brow]; end = ai[brow+1]; 60 if (PetscBTLookup(table0,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */ 61 for (l = start; l<end ; l++){ 62 bcol = aj[l]; 63 if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;} 64 } 65 k++; 66 if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 67 } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 68 for (l = start; l<end ; l++){ 69 bcol = aj[l]; 70 if (bcol > bcol_max) break; 71 if (PetscBTLookup(table0,bcol)){ 72 if (!PetscBTLookupSet(table,brow)) {nidx[isz++] = brow;} 73 break; /* for l = start; l<end ; l++) */ 74 } 75 } 76 } 77 } 78 } /* for each overlap */ 79 80 /* expand the Index Set */ 81 for (j=0; j<isz; j++) { 82 for (k=0; k<bs; k++) 83 nidx2[j*bs+k] = nidx[j]*bs+k; 84 } 85 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr); 86 } 87 ierr = PetscBTDestroy(table);CHKERRQ(ierr); 88 ierr = PetscFree(nidx);CHKERRQ(ierr); 89 ierr = PetscFree(nidx2);CHKERRQ(ierr); 90 ierr = PetscBTDestroy(table0);CHKERRQ(ierr); 91 PetscFunctionReturn(0); 92 } 93 94 #undef __FUNCT__ 95 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private" 96 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 97 { 98 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c; 99 PetscErrorCode ierr; 100 PetscInt *smap,i,k,kstart,kend,oldcols = a->mbs,*lens; 101 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 102 PetscInt nrows,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 103 const PetscInt *irow,*aj = a->j,*ai = a->i; 104 MatScalar *mat_a; 105 Mat C; 106 PetscTruth flag,sorted; 107 108 PetscFunctionBegin; 109 if (isrow != iscol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro"); 110 ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr); 111 if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 112 113 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 114 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 115 116 ierr = PetscMalloc(oldcols*sizeof(PetscInt),&smap);CHKERRQ(ierr); 117 ierr = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr); 118 ssmap = smap; 119 ierr = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr); 120 for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */ 121 /* determine lens of each row */ 122 for (i=0; i<nrows; i++) { 123 kstart = ai[irow[i]]; 124 kend = kstart + a->ilen[irow[i]]; 125 lens[i] = 0; 126 for (k=kstart; k<kend; k++) { 127 if (ssmap[aj[k]]) { 128 lens[i]++; 129 } 130 } 131 } 132 /* Create and fill new matrix */ 133 if (scall == MAT_REUSE_MATRIX) { 134 c = (Mat_SeqSBAIJ *)((*B)->data); 135 136 if (c->mbs!=nrows || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 137 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 138 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 139 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 140 C = *B; 141 } else { 142 ierr = MatCreate(((PetscObject)A)->comm,&C);CHKERRQ(ierr); 143 ierr = MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 144 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 145 ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr); 146 } 147 c = (Mat_SeqSBAIJ *)(C->data); 148 for (i=0; i<nrows; i++) { 149 row = irow[i]; 150 kstart = ai[row]; 151 kend = kstart + a->ilen[row]; 152 mat_i = c->i[i]; 153 mat_j = c->j + mat_i; 154 mat_a = c->a + mat_i*bs2; 155 mat_ilen = c->ilen + i; 156 for (k=kstart; k<kend; k++) { 157 if ((tcol=ssmap[a->j[k]])) { 158 *mat_j++ = tcol - 1; 159 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 160 mat_a += bs2; 161 (*mat_ilen)++; 162 } 163 } 164 } 165 166 /* Free work space */ 167 ierr = PetscFree(smap);CHKERRQ(ierr); 168 ierr = PetscFree(lens);CHKERRQ(ierr); 169 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 170 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 171 172 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 173 *B = C; 174 PetscFunctionReturn(0); 175 } 176 177 #undef __FUNCT__ 178 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ" 179 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 180 { 181 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 182 IS is1; 183 PetscErrorCode ierr; 184 PetscInt *vary,*iary,nrows,i,bs=A->rmap->bs,count; 185 const PetscInt *irow; 186 187 PetscFunctionBegin; 188 if (isrow != iscol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro"); 189 190 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 191 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 192 193 /* Verify if the indices corespond to each element in a block 194 and form the IS with compressed IS */ 195 ierr = PetscMalloc2(a->mbs,PetscInt,&vary,a->mbs,PetscInt,&iary);CHKERRQ(ierr); 196 ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr); 197 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 198 199 count = 0; 200 for (i=0; i<a->mbs; i++) { 201 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Index set does not match blocks"); 202 if (vary[i]==bs) iary[count++] = i; 203 } 204 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 205 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr); 206 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 207 208 ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,scall,B);CHKERRQ(ierr); 209 ierr = ISDestroy(is1);CHKERRQ(ierr); 210 PetscFunctionReturn(0); 211 } 212 213 #undef __FUNCT__ 214 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ" 215 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 216 { 217 PetscErrorCode ierr; 218 PetscInt i; 219 220 PetscFunctionBegin; 221 if (scall == MAT_INITIAL_MATRIX) { 222 ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr); 223 } 224 225 for (i=0; i<n; i++) { 226 ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 227 } 228 PetscFunctionReturn(0); 229 } 230 231 /* -------------------------------------------------------*/ 232 /* Should check that shapes of vectors and matrices match */ 233 /* -------------------------------------------------------*/ 234 235 #undef __FUNCT__ 236 #define __FUNCT__ "MatMult_SeqSBAIJ_2" 237 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz) 238 { 239 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 240 PetscScalar *x,*z,*xb,x1,x2,zero=0.0; 241 MatScalar *v; 242 PetscErrorCode ierr; 243 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 244 PetscInt nonzerorow=0; 245 246 PetscFunctionBegin; 247 ierr = VecSet(zz,zero);CHKERRQ(ierr); 248 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 249 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 250 251 v = a->a; 252 xb = x; 253 254 for (i=0; i<mbs; i++) { 255 n = ai[1] - ai[0]; /* length of i_th block row of A */ 256 x1 = xb[0]; x2 = xb[1]; 257 ib = aj + *ai; 258 jmin = 0; 259 nonzerorow += (n>0); 260 if (*ib == i){ /* (diag of A)*x */ 261 z[2*i] += v[0]*x1 + v[2]*x2; 262 z[2*i+1] += v[2]*x1 + v[3]*x2; 263 v += 4; jmin++; 264 } 265 for (j=jmin; j<n; j++) { 266 /* (strict lower triangular part of A)*x */ 267 cval = ib[j]*2; 268 z[cval] += v[0]*x1 + v[1]*x2; 269 z[cval+1] += v[2]*x1 + v[3]*x2; 270 /* (strict upper triangular part of A)*x */ 271 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 272 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 273 v += 4; 274 } 275 xb +=2; ai++; 276 } 277 278 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 279 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 280 ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 281 PetscFunctionReturn(0); 282 } 283 284 #undef __FUNCT__ 285 #define __FUNCT__ "MatMult_SeqSBAIJ_3" 286 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz) 287 { 288 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 289 PetscScalar *x,*z,*xb,x1,x2,x3,zero=0.0; 290 MatScalar *v; 291 PetscErrorCode ierr; 292 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 293 PetscInt nonzerorow=0; 294 295 PetscFunctionBegin; 296 ierr = VecSet(zz,zero);CHKERRQ(ierr); 297 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 298 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 299 300 v = a->a; 301 xb = x; 302 303 for (i=0; i<mbs; i++) { 304 n = ai[1] - ai[0]; /* length of i_th block row of A */ 305 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 306 ib = aj + *ai; 307 jmin = 0; 308 nonzerorow += (n>0); 309 if (*ib == i){ /* (diag of A)*x */ 310 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 311 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 312 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 313 v += 9; jmin++; 314 } 315 for (j=jmin; j<n; j++) { 316 /* (strict lower triangular part of A)*x */ 317 cval = ib[j]*3; 318 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 319 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 320 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 321 /* (strict upper triangular part of A)*x */ 322 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 323 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 324 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 325 v += 9; 326 } 327 xb +=3; ai++; 328 } 329 330 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 331 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 332 ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 333 PetscFunctionReturn(0); 334 } 335 336 #undef __FUNCT__ 337 #define __FUNCT__ "MatMult_SeqSBAIJ_4" 338 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz) 339 { 340 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 341 PetscScalar *x,*z,*xb,x1,x2,x3,x4,zero=0.0; 342 MatScalar *v; 343 PetscErrorCode ierr; 344 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 345 PetscInt nonzerorow=0; 346 347 PetscFunctionBegin; 348 ierr = VecSet(zz,zero);CHKERRQ(ierr); 349 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 350 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 351 352 v = a->a; 353 xb = x; 354 355 for (i=0; i<mbs; i++) { 356 n = ai[1] - ai[0]; /* length of i_th block row of A */ 357 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 358 ib = aj + *ai; 359 jmin = 0; 360 nonzerorow += (n>0); 361 if (*ib == i){ /* (diag of A)*x */ 362 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 363 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 364 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 365 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 366 v += 16; jmin++; 367 } 368 for (j=jmin; j<n; j++) { 369 /* (strict lower triangular part of A)*x */ 370 cval = ib[j]*4; 371 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 372 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 373 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 374 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 375 /* (strict upper triangular part of A)*x */ 376 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 377 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 378 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 379 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 380 v += 16; 381 } 382 xb +=4; ai++; 383 } 384 385 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 386 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 387 ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 388 PetscFunctionReturn(0); 389 } 390 391 #undef __FUNCT__ 392 #define __FUNCT__ "MatMult_SeqSBAIJ_5" 393 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz) 394 { 395 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 396 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0; 397 MatScalar *v; 398 PetscErrorCode ierr; 399 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 400 PetscInt nonzerorow=0; 401 402 PetscFunctionBegin; 403 ierr = VecSet(zz,zero);CHKERRQ(ierr); 404 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 405 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 406 407 v = a->a; 408 xb = x; 409 410 for (i=0; i<mbs; i++) { 411 n = ai[1] - ai[0]; /* length of i_th block row of A */ 412 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 413 ib = aj + *ai; 414 jmin = 0; 415 nonzerorow += (n>0); 416 if (*ib == i){ /* (diag of A)*x */ 417 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 418 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 419 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 420 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 421 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 422 v += 25; jmin++; 423 } 424 for (j=jmin; j<n; j++) { 425 /* (strict lower triangular part of A)*x */ 426 cval = ib[j]*5; 427 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 428 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 429 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 430 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 431 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 432 /* (strict upper triangular part of A)*x */ 433 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 434 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 435 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 436 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 437 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 438 v += 25; 439 } 440 xb +=5; ai++; 441 } 442 443 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 444 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 445 ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 446 PetscFunctionReturn(0); 447 } 448 449 450 #undef __FUNCT__ 451 #define __FUNCT__ "MatMult_SeqSBAIJ_6" 452 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz) 453 { 454 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 455 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0; 456 MatScalar *v; 457 PetscErrorCode ierr; 458 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 459 PetscInt nonzerorow=0; 460 461 PetscFunctionBegin; 462 ierr = VecSet(zz,zero);CHKERRQ(ierr); 463 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 464 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 465 466 v = a->a; 467 xb = x; 468 469 for (i=0; i<mbs; i++) { 470 n = ai[1] - ai[0]; /* length of i_th block row of A */ 471 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 472 ib = aj + *ai; 473 jmin = 0; 474 nonzerorow += (n>0); 475 if (*ib == i){ /* (diag of A)*x */ 476 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 477 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 478 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 479 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 480 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 481 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 482 v += 36; jmin++; 483 } 484 for (j=jmin; j<n; j++) { 485 /* (strict lower triangular part of A)*x */ 486 cval = ib[j]*6; 487 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 488 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 489 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 490 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 491 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 492 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 493 /* (strict upper triangular part of A)*x */ 494 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 495 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 496 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 497 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 498 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 499 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 500 v += 36; 501 } 502 xb +=6; ai++; 503 } 504 505 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 506 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 507 ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 508 PetscFunctionReturn(0); 509 } 510 #undef __FUNCT__ 511 #define __FUNCT__ "MatMult_SeqSBAIJ_7" 512 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz) 513 { 514 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 515 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0; 516 MatScalar *v; 517 PetscErrorCode ierr; 518 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 519 PetscInt nonzerorow=0; 520 521 PetscFunctionBegin; 522 ierr = VecSet(zz,zero);CHKERRQ(ierr); 523 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 524 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 525 526 v = a->a; 527 xb = x; 528 529 for (i=0; i<mbs; i++) { 530 n = ai[1] - ai[0]; /* length of i_th block row of A */ 531 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 532 ib = aj + *ai; 533 jmin = 0; 534 nonzerorow += (n>0); 535 if (*ib == i){ /* (diag of A)*x */ 536 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 537 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 538 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 539 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 540 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 541 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 542 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 543 v += 49; jmin++; 544 } 545 for (j=jmin; j<n; j++) { 546 /* (strict lower triangular part of A)*x */ 547 cval = ib[j]*7; 548 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 549 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 550 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 551 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 552 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 553 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 554 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 555 /* (strict upper triangular part of A)*x */ 556 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 557 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 558 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 559 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 560 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 561 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 562 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 563 v += 49; 564 } 565 xb +=7; ai++; 566 } 567 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 568 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 569 ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 570 PetscFunctionReturn(0); 571 } 572 573 /* 574 This will not work with MatScalar == float because it calls the BLAS 575 */ 576 #undef __FUNCT__ 577 #define __FUNCT__ "MatMult_SeqSBAIJ_N" 578 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz) 579 { 580 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 581 PetscScalar *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0; 582 MatScalar *v; 583 PetscErrorCode ierr; 584 PetscInt mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 585 PetscInt nonzerorow=0; 586 587 PetscFunctionBegin; 588 ierr = VecSet(zz,zero);CHKERRQ(ierr); 589 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 590 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 591 592 aj = a->j; 593 v = a->a; 594 ii = a->i; 595 596 if (!a->mult_work) { 597 ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 598 } 599 work = a->mult_work; 600 601 for (i=0; i<mbs; i++) { 602 n = ii[1] - ii[0]; ncols = n*bs; 603 workt = work; idx=aj+ii[0]; 604 nonzerorow += (n>0); 605 606 /* upper triangular part */ 607 for (j=0; j<n; j++) { 608 xb = x_ptr + bs*(*idx++); 609 for (k=0; k<bs; k++) workt[k] = xb[k]; 610 workt += bs; 611 } 612 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 613 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 614 615 /* strict lower triangular part */ 616 idx = aj+ii[0]; 617 if (*idx == i){ 618 ncols -= bs; v += bs2; idx++; n--; 619 } 620 621 if (ncols > 0){ 622 workt = work; 623 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 624 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 625 for (j=0; j<n; j++) { 626 zb = z_ptr + bs*(*idx++); 627 for (k=0; k<bs; k++) zb[k] += workt[k] ; 628 workt += bs; 629 } 630 } 631 x += bs; v += n*bs2; z += bs; ii++; 632 } 633 634 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 635 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 636 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr); 637 PetscFunctionReturn(0); 638 } 639 640 extern PetscErrorCode VecCopy_Seq(Vec,Vec); 641 #undef __FUNCT__ 642 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1" 643 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 644 { 645 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 646 PetscScalar *x,*z,*xb,x1; 647 MatScalar *v; 648 PetscErrorCode ierr; 649 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 650 PetscInt nonzerorow=0; 651 652 PetscFunctionBegin; 653 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 654 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 655 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 656 v = a->a; 657 xb = x; 658 659 for (i=0; i<mbs; i++) { 660 n = ai[1] - ai[0]; /* length of i_th row of A */ 661 x1 = xb[0]; 662 ib = aj + *ai; 663 jmin = 0; 664 nonzerorow += (n>0); 665 if (*ib == i) { /* (diag of A)*x */ 666 z[i] += *v++ * x[*ib++]; jmin++; 667 } 668 for (j=jmin; j<n; j++) { 669 cval = *ib; 670 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 671 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 672 } 673 xb++; ai++; 674 } 675 676 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 677 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 678 679 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 680 PetscFunctionReturn(0); 681 } 682 683 #undef __FUNCT__ 684 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2" 685 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 686 { 687 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 688 PetscScalar *x,*z,*xb,x1,x2; 689 MatScalar *v; 690 PetscErrorCode ierr; 691 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 692 PetscInt nonzerorow=0; 693 694 PetscFunctionBegin; 695 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 696 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 697 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 698 699 v = a->a; 700 xb = x; 701 702 for (i=0; i<mbs; i++) { 703 n = ai[1] - ai[0]; /* length of i_th block row of A */ 704 x1 = xb[0]; x2 = xb[1]; 705 ib = aj + *ai; 706 jmin = 0; 707 nonzerorow += (n>0); 708 if (*ib == i){ /* (diag of A)*x */ 709 z[2*i] += v[0]*x1 + v[2]*x2; 710 z[2*i+1] += v[2]*x1 + v[3]*x2; 711 v += 4; jmin++; 712 } 713 for (j=jmin; j<n; j++) { 714 /* (strict lower triangular part of A)*x */ 715 cval = ib[j]*2; 716 z[cval] += v[0]*x1 + v[1]*x2; 717 z[cval+1] += v[2]*x1 + v[3]*x2; 718 /* (strict upper triangular part of A)*x */ 719 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 720 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 721 v += 4; 722 } 723 xb +=2; ai++; 724 } 725 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 726 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 727 728 ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 729 PetscFunctionReturn(0); 730 } 731 732 #undef __FUNCT__ 733 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3" 734 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 735 { 736 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 737 PetscScalar *x,*z,*xb,x1,x2,x3; 738 MatScalar *v; 739 PetscErrorCode ierr; 740 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 741 PetscInt nonzerorow=0; 742 743 PetscFunctionBegin; 744 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 745 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 746 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 747 748 v = a->a; 749 xb = x; 750 751 for (i=0; i<mbs; i++) { 752 n = ai[1] - ai[0]; /* length of i_th block row of A */ 753 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 754 ib = aj + *ai; 755 jmin = 0; 756 nonzerorow += (n>0); 757 if (*ib == i){ /* (diag of A)*x */ 758 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 759 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 760 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 761 v += 9; jmin++; 762 } 763 for (j=jmin; j<n; j++) { 764 /* (strict lower triangular part of A)*x */ 765 cval = ib[j]*3; 766 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 767 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 768 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 769 /* (strict upper triangular part of A)*x */ 770 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 771 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 772 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 773 v += 9; 774 } 775 xb +=3; ai++; 776 } 777 778 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 779 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 780 781 ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 782 PetscFunctionReturn(0); 783 } 784 785 #undef __FUNCT__ 786 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4" 787 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 788 { 789 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 790 PetscScalar *x,*z,*xb,x1,x2,x3,x4; 791 MatScalar *v; 792 PetscErrorCode ierr; 793 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 794 PetscInt nonzerorow=0; 795 796 PetscFunctionBegin; 797 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 798 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 799 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 800 801 v = a->a; 802 xb = x; 803 804 for (i=0; i<mbs; i++) { 805 n = ai[1] - ai[0]; /* length of i_th block row of A */ 806 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 807 ib = aj + *ai; 808 jmin = 0; 809 nonzerorow += (n>0); 810 if (*ib == i){ /* (diag of A)*x */ 811 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 812 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 813 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 814 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 815 v += 16; jmin++; 816 } 817 for (j=jmin; j<n; j++) { 818 /* (strict lower triangular part of A)*x */ 819 cval = ib[j]*4; 820 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 821 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 822 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 823 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 824 /* (strict upper triangular part of A)*x */ 825 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 826 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 827 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 828 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 829 v += 16; 830 } 831 xb +=4; ai++; 832 } 833 834 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 835 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 836 837 ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 838 PetscFunctionReturn(0); 839 } 840 841 #undef __FUNCT__ 842 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5" 843 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 844 { 845 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 846 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5; 847 MatScalar *v; 848 PetscErrorCode ierr; 849 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 850 PetscInt nonzerorow=0; 851 852 PetscFunctionBegin; 853 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 854 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 855 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 856 857 v = a->a; 858 xb = x; 859 860 for (i=0; i<mbs; i++) { 861 n = ai[1] - ai[0]; /* length of i_th block row of A */ 862 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 863 ib = aj + *ai; 864 jmin = 0; 865 nonzerorow += (n>0); 866 if (*ib == i){ /* (diag of A)*x */ 867 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 868 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 869 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 870 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 871 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 872 v += 25; jmin++; 873 } 874 for (j=jmin; j<n; j++) { 875 /* (strict lower triangular part of A)*x */ 876 cval = ib[j]*5; 877 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 878 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 879 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 880 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 881 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 882 /* (strict upper triangular part of A)*x */ 883 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 884 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 885 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 886 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 887 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 888 v += 25; 889 } 890 xb +=5; ai++; 891 } 892 893 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 894 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 895 896 ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 897 PetscFunctionReturn(0); 898 } 899 #undef __FUNCT__ 900 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6" 901 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 902 { 903 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 904 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6; 905 MatScalar *v; 906 PetscErrorCode ierr; 907 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 908 PetscInt nonzerorow=0; 909 910 PetscFunctionBegin; 911 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 912 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 913 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 914 915 v = a->a; 916 xb = x; 917 918 for (i=0; i<mbs; i++) { 919 n = ai[1] - ai[0]; /* length of i_th block row of A */ 920 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 921 ib = aj + *ai; 922 jmin = 0; 923 nonzerorow += (n>0); 924 if (*ib == i){ /* (diag of A)*x */ 925 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 926 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 927 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 928 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 929 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 930 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 931 v += 36; jmin++; 932 } 933 for (j=jmin; j<n; j++) { 934 /* (strict lower triangular part of A)*x */ 935 cval = ib[j]*6; 936 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 937 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 938 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 939 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 940 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 941 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 942 /* (strict upper triangular part of A)*x */ 943 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 944 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 945 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 946 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 947 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 948 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 949 v += 36; 950 } 951 xb +=6; ai++; 952 } 953 954 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 955 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 956 957 ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 958 PetscFunctionReturn(0); 959 } 960 961 #undef __FUNCT__ 962 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7" 963 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 964 { 965 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 966 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7; 967 MatScalar *v; 968 PetscErrorCode ierr; 969 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 970 PetscInt nonzerorow=0; 971 972 PetscFunctionBegin; 973 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 974 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 975 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 976 977 v = a->a; 978 xb = x; 979 980 for (i=0; i<mbs; i++) { 981 n = ai[1] - ai[0]; /* length of i_th block row of A */ 982 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 983 ib = aj + *ai; 984 jmin = 0; 985 nonzerorow += (n>0); 986 if (*ib == i){ /* (diag of A)*x */ 987 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 988 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 989 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 990 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 991 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 992 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 993 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 994 v += 49; jmin++; 995 } 996 for (j=jmin; j<n; j++) { 997 /* (strict lower triangular part of A)*x */ 998 cval = ib[j]*7; 999 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 1000 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 1001 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 1002 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 1003 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 1004 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 1005 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1006 /* (strict upper triangular part of A)*x */ 1007 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 1008 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 1009 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 1010 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 1011 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 1012 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 1013 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 1014 v += 49; 1015 } 1016 xb +=7; ai++; 1017 } 1018 1019 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1020 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1021 1022 ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1023 PetscFunctionReturn(0); 1024 } 1025 1026 #undef __FUNCT__ 1027 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N" 1028 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1029 { 1030 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1031 PetscScalar *x,*x_ptr,*z,*z_ptr=0,*xb,*zb,*work,*workt; 1032 MatScalar *v; 1033 PetscErrorCode ierr; 1034 PetscInt mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 1035 PetscInt nonzerorow=0; 1036 1037 PetscFunctionBegin; 1038 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 1039 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 1040 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 1041 1042 aj = a->j; 1043 v = a->a; 1044 ii = a->i; 1045 1046 if (!a->mult_work) { 1047 ierr = PetscMalloc((A->rmap->n+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1048 } 1049 work = a->mult_work; 1050 1051 1052 for (i=0; i<mbs; i++) { 1053 n = ii[1] - ii[0]; ncols = n*bs; 1054 workt = work; idx=aj+ii[0]; 1055 nonzerorow += (n>0); 1056 1057 /* upper triangular part */ 1058 for (j=0; j<n; j++) { 1059 xb = x_ptr + bs*(*idx++); 1060 for (k=0; k<bs; k++) workt[k] = xb[k]; 1061 workt += bs; 1062 } 1063 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1064 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1065 1066 /* strict lower triangular part */ 1067 idx = aj+ii[0]; 1068 if (*idx == i){ 1069 ncols -= bs; v += bs2; idx++; n--; 1070 } 1071 if (ncols > 0){ 1072 workt = work; 1073 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1074 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 1075 for (j=0; j<n; j++) { 1076 zb = z_ptr + bs*(*idx++); 1077 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1078 workt += bs; 1079 } 1080 } 1081 1082 x += bs; v += n*bs2; z += bs; ii++; 1083 } 1084 1085 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1086 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1087 1088 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1089 PetscFunctionReturn(0); 1090 } 1091 1092 #undef __FUNCT__ 1093 #define __FUNCT__ "MatScale_SeqSBAIJ" 1094 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha) 1095 { 1096 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data; 1097 PetscScalar oalpha = alpha; 1098 PetscErrorCode ierr; 1099 PetscBLASInt one = 1,totalnz = PetscBLASIntCast(a->bs2*a->nz); 1100 1101 PetscFunctionBegin; 1102 BLASscal_(&totalnz,&oalpha,a->a,&one); 1103 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 1104 PetscFunctionReturn(0); 1105 } 1106 1107 #undef __FUNCT__ 1108 #define __FUNCT__ "MatNorm_SeqSBAIJ" 1109 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm) 1110 { 1111 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1112 MatScalar *v = a->a; 1113 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1114 PetscInt i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j; 1115 PetscErrorCode ierr; 1116 PetscInt *jl,*il,jmin,jmax,nexti,ik,*col; 1117 1118 PetscFunctionBegin; 1119 if (type == NORM_FROBENIUS) { 1120 for (k=0; k<mbs; k++){ 1121 jmin = a->i[k]; jmax = a->i[k+1]; 1122 col = aj + jmin; 1123 if (*col == k){ /* diagonal block */ 1124 for (i=0; i<bs2; i++){ 1125 #if defined(PETSC_USE_COMPLEX) 1126 sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++; 1127 #else 1128 sum_diag += (*v)*(*v); v++; 1129 #endif 1130 } 1131 jmin++; 1132 } 1133 for (j=jmin; j<jmax; j++){ /* off-diagonal blocks */ 1134 for (i=0; i<bs2; i++){ 1135 #if defined(PETSC_USE_COMPLEX) 1136 sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++; 1137 #else 1138 sum_off += (*v)*(*v); v++; 1139 #endif 1140 } 1141 } 1142 } 1143 *norm = sqrt(sum_diag + 2*sum_off); 1144 } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 1145 ierr = PetscMalloc3(bs,PetscReal,&sum,mbs,PetscInt,&il,mbs,PetscInt,&jl);CHKERRQ(ierr); 1146 for (i=0; i<mbs; i++) jl[i] = mbs; 1147 il[0] = 0; 1148 1149 *norm = 0.0; 1150 for (k=0; k<mbs; k++) { /* k_th block row */ 1151 for (j=0; j<bs; j++) sum[j]=0.0; 1152 /*-- col sum --*/ 1153 i = jl[k]; /* first |A(i,k)| to be added */ 1154 /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1155 at step k */ 1156 while (i<mbs){ 1157 nexti = jl[i]; /* next block row to be added */ 1158 ik = il[i]; /* block index of A(i,k) in the array a */ 1159 for (j=0; j<bs; j++){ 1160 v = a->a + ik*bs2 + j*bs; 1161 for (k1=0; k1<bs; k1++) { 1162 sum[j] += PetscAbsScalar(*v); v++; 1163 } 1164 } 1165 /* update il, jl */ 1166 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1167 jmax = a->i[i+1]; 1168 if (jmin < jmax){ 1169 il[i] = jmin; 1170 j = a->j[jmin]; 1171 jl[i] = jl[j]; jl[j]=i; 1172 } 1173 i = nexti; 1174 } 1175 /*-- row sum --*/ 1176 jmin = a->i[k]; jmax = a->i[k+1]; 1177 for (i=jmin; i<jmax; i++) { 1178 for (j=0; j<bs; j++){ 1179 v = a->a + i*bs2 + j; 1180 for (k1=0; k1<bs; k1++){ 1181 sum[j] += PetscAbsScalar(*v); v += bs; 1182 } 1183 } 1184 } 1185 /* add k_th block row to il, jl */ 1186 col = aj+jmin; 1187 if (*col == k) jmin++; 1188 if (jmin < jmax){ 1189 il[k] = jmin; 1190 j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k; 1191 } 1192 for (j=0; j<bs; j++){ 1193 if (sum[j] > *norm) *norm = sum[j]; 1194 } 1195 } 1196 ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr); 1197 } else { 1198 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 1199 } 1200 PetscFunctionReturn(0); 1201 } 1202 1203 #undef __FUNCT__ 1204 #define __FUNCT__ "MatEqual_SeqSBAIJ" 1205 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg) 1206 { 1207 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data; 1208 PetscErrorCode ierr; 1209 1210 PetscFunctionBegin; 1211 1212 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1213 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1214 *flg = PETSC_FALSE; 1215 PetscFunctionReturn(0); 1216 } 1217 1218 /* if the a->i are the same */ 1219 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1220 if (!*flg) { 1221 PetscFunctionReturn(0); 1222 } 1223 1224 /* if a->j are the same */ 1225 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1226 if (!*flg) { 1227 PetscFunctionReturn(0); 1228 } 1229 /* if a->a are the same */ 1230 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1231 PetscFunctionReturn(0); 1232 } 1233 1234 #undef __FUNCT__ 1235 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ" 1236 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v) 1237 { 1238 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1239 PetscErrorCode ierr; 1240 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 1241 PetscScalar *x,zero = 0.0; 1242 MatScalar *aa,*aa_j; 1243 1244 PetscFunctionBegin; 1245 bs = A->rmap->bs; 1246 if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1"); 1247 1248 aa = a->a; 1249 ai = a->i; 1250 aj = a->j; 1251 ambs = a->mbs; 1252 bs2 = a->bs2; 1253 1254 ierr = VecSet(v,zero);CHKERRQ(ierr); 1255 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1256 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1257 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1258 for (i=0; i<ambs; i++) { 1259 j=ai[i]; 1260 if (aj[j] == i) { /* if this is a diagonal element */ 1261 row = i*bs; 1262 aa_j = aa + j*bs2; 1263 if (A->factortype && bs==1){ 1264 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k]; 1265 } else { 1266 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1267 } 1268 } 1269 } 1270 1271 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1272 PetscFunctionReturn(0); 1273 } 1274 1275 #undef __FUNCT__ 1276 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ" 1277 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr) 1278 { 1279 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1280 PetscScalar *l,x,*li,*ri; 1281 MatScalar *aa,*v; 1282 PetscErrorCode ierr; 1283 PetscInt i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2; 1284 PetscTruth flg; 1285 1286 PetscFunctionBegin; 1287 if (ll != rr){ 1288 ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr); 1289 if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n"); 1290 } 1291 if (!ll) PetscFunctionReturn(0); 1292 ai = a->i; 1293 aj = a->j; 1294 aa = a->a; 1295 m = A->rmap->N; 1296 bs = A->rmap->bs; 1297 mbs = a->mbs; 1298 bs2 = a->bs2; 1299 1300 ierr = VecGetArray(ll,&l);CHKERRQ(ierr); 1301 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1302 if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 1303 for (i=0; i<mbs; i++) { /* for each block row */ 1304 M = ai[i+1] - ai[i]; 1305 li = l + i*bs; 1306 v = aa + bs2*ai[i]; 1307 for (j=0; j<M; j++) { /* for each block */ 1308 ri = l + bs*aj[ai[i]+j]; 1309 for (k=0; k<bs; k++) { 1310 x = ri[k]; 1311 for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x; 1312 } 1313 } 1314 } 1315 ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr); 1316 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1317 PetscFunctionReturn(0); 1318 } 1319 1320 #undef __FUNCT__ 1321 #define __FUNCT__ "MatGetInfo_SeqSBAIJ" 1322 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info) 1323 { 1324 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1325 1326 PetscFunctionBegin; 1327 info->block_size = a->bs2; 1328 info->nz_allocated = a->bs2*a->maxnz; /*num. of nonzeros in upper triangular part */ 1329 info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */ 1330 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 1331 info->assemblies = A->num_ass; 1332 info->mallocs = A->info.mallocs; 1333 info->memory = ((PetscObject)A)->mem; 1334 if (A->factortype) { 1335 info->fill_ratio_given = A->info.fill_ratio_given; 1336 info->fill_ratio_needed = A->info.fill_ratio_needed; 1337 info->factor_mallocs = A->info.factor_mallocs; 1338 } else { 1339 info->fill_ratio_given = 0; 1340 info->fill_ratio_needed = 0; 1341 info->factor_mallocs = 0; 1342 } 1343 PetscFunctionReturn(0); 1344 } 1345 1346 1347 #undef __FUNCT__ 1348 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ" 1349 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 1350 { 1351 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1352 PetscErrorCode ierr; 1353 1354 PetscFunctionBegin; 1355 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 1356 PetscFunctionReturn(0); 1357 } 1358 1359 #undef __FUNCT__ 1360 #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ" 1361 /* 1362 This code does not work since it only checks the upper triangular part of 1363 the matrix. Hence it is not listed in the function table. 1364 */ 1365 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[]) 1366 { 1367 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1368 PetscErrorCode ierr; 1369 PetscInt i,j,n,row,col,bs,*ai,*aj,mbs; 1370 PetscReal atmp; 1371 MatScalar *aa; 1372 PetscScalar *x; 1373 PetscInt ncols,brow,bcol,krow,kcol; 1374 1375 PetscFunctionBegin; 1376 if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov"); 1377 if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1378 bs = A->rmap->bs; 1379 aa = a->a; 1380 ai = a->i; 1381 aj = a->j; 1382 mbs = a->mbs; 1383 1384 ierr = VecSet(v,0.0);CHKERRQ(ierr); 1385 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1386 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1387 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1388 for (i=0; i<mbs; i++) { 1389 ncols = ai[1] - ai[0]; ai++; 1390 brow = bs*i; 1391 for (j=0; j<ncols; j++){ 1392 bcol = bs*(*aj); 1393 for (kcol=0; kcol<bs; kcol++){ 1394 col = bcol + kcol; /* col index */ 1395 for (krow=0; krow<bs; krow++){ 1396 atmp = PetscAbsScalar(*aa); aa++; 1397 row = brow + krow; /* row index */ 1398 if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1399 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 1400 } 1401 } 1402 aj++; 1403 } 1404 } 1405 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1406 PetscFunctionReturn(0); 1407 } 1408