1 2 #include "src/mat/impls/baij/seq/baij.h" 3 #include "src/inline/spops.h" 4 #include "src/inline/ilu.h" 5 #include "petscbt.h" 6 #include "src/mat/impls/sbaij/seq/sbaij.h" 7 8 #undef __FUNCT__ 9 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ" 10 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,int is_max,IS is[],int ov) 11 { 12 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 13 int brow,i,j,k,l,mbs,n,*idx,ierr,*nidx,isz,bcol,bcol_max, 14 start,end,*ai,*aj,bs,*nidx2; 15 PetscBT table; 16 PetscBT table0; 17 18 PetscFunctionBegin; 19 mbs = a->mbs; 20 ai = a->i; 21 aj = a->j; 22 bs = a->bs; 23 24 if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 25 26 ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr); 27 ierr = PetscMalloc((mbs+1)*sizeof(int),&nidx);CHKERRQ(ierr); 28 ierr = PetscMalloc((A->m+1)*sizeof(int),&nidx2);CHKERRQ(ierr); 29 ierr = PetscBTCreate(mbs,table0);CHKERRQ(ierr); 30 31 for (i=0; i<is_max; i++) { /* for each is */ 32 isz = 0; 33 ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr); 34 35 /* Extract the indices, assume there can be duplicate entries */ 36 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 37 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 38 39 /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */ 40 bcol_max = 0; 41 for (j=0; j<n ; ++j){ 42 brow = idx[j]/bs; /* convert the indices into block indices */ 43 if (brow >= mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 44 if(!PetscBTLookupSet(table,brow)) { 45 nidx[isz++] = brow; 46 if (bcol_max < brow) bcol_max = brow; 47 } 48 } 49 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 50 ierr = ISDestroy(is[i]);CHKERRQ(ierr); 51 52 k = 0; 53 for (j=0; j<ov; j++){ /* for each overlap */ 54 /* set table0 for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 55 ierr = PetscBTMemzero(mbs,table0);CHKERRQ(ierr); 56 for (l=k; l<isz; l++) PetscBTSet(table0,nidx[l]); 57 58 n = isz; /* length of the updated is[i] */ 59 for (brow=0; brow<mbs; brow++){ 60 start = ai[brow]; end = ai[brow+1]; 61 if (PetscBTLookup(table0,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */ 62 for (l = start; l<end ; l++){ 63 bcol = aj[l]; 64 if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;} 65 } 66 k++; 67 if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 68 } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 69 for (l = start; l<end ; l++){ 70 bcol = aj[l]; 71 if (bcol > bcol_max) break; 72 if (PetscBTLookup(table0,bcol)){ 73 if (!PetscBTLookupSet(table,brow)) {nidx[isz++] = brow;} 74 break; /* for l = start; l<end ; l++) */ 75 } 76 } 77 } 78 } 79 } /* for each overlap */ 80 81 /* expand the Index Set */ 82 for (j=0; j<isz; j++) { 83 for (k=0; k<bs; k++) 84 nidx2[j*bs+k] = nidx[j]*bs+k; 85 } 86 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr); 87 } 88 ierr = PetscBTDestroy(table);CHKERRQ(ierr); 89 ierr = PetscFree(nidx);CHKERRQ(ierr); 90 ierr = PetscFree(nidx2);CHKERRQ(ierr); 91 ierr = PetscBTDestroy(table0);CHKERRQ(ierr); 92 PetscFunctionReturn(0); 93 } 94 95 #undef __FUNCT__ 96 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private" 97 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B) 98 { 99 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c; 100 int *smap,i,k,kstart,kend,ierr,oldcols = a->mbs,*lens; 101 int row,mat_i,*mat_j,tcol,*mat_ilen; 102 int *irow,nrows,*ssmap,bs=a->bs,bs2=a->bs2; 103 int *aj = a->j,*ai = a->i; 104 MatScalar *mat_a; 105 Mat C; 106 PetscTruth flag; 107 108 PetscFunctionBegin; 109 110 if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro"); 111 ierr = ISSorted(iscol,(PetscTruth*)&i);CHKERRQ(ierr); 112 if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 113 114 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 115 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 116 117 ierr = PetscMalloc((1+oldcols)*sizeof(int),&smap);CHKERRQ(ierr); 118 ssmap = smap; 119 ierr = PetscMalloc((1+nrows)*sizeof(int),&lens);CHKERRQ(ierr); 120 ierr = PetscMemzero(smap,oldcols*sizeof(int));CHKERRQ(ierr); 121 for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */ 122 /* determine lens of each row */ 123 for (i=0; i<nrows; i++) { 124 kstart = ai[irow[i]]; 125 kend = kstart + a->ilen[irow[i]]; 126 lens[i] = 0; 127 for (k=kstart; k<kend; k++) { 128 if (ssmap[aj[k]]) { 129 lens[i]++; 130 } 131 } 132 } 133 /* Create and fill new matrix */ 134 if (scall == MAT_REUSE_MATRIX) { 135 c = (Mat_SeqSBAIJ *)((*B)->data); 136 137 if (c->mbs!=nrows || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 138 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(int),&flag);CHKERRQ(ierr); 139 if (flag == PETSC_FALSE) { 140 SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 141 } 142 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(int));CHKERRQ(ierr); 143 C = *B; 144 } else { 145 ierr = MatCreate(A->comm,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE,&C);CHKERRQ(ierr); 146 ierr = MatSetType(C,A->type_name);CHKERRQ(ierr); 147 ierr = MatSeqSBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr); 148 } 149 c = (Mat_SeqSBAIJ *)(C->data); 150 for (i=0; i<nrows; i++) { 151 row = irow[i]; 152 kstart = ai[row]; 153 kend = kstart + a->ilen[row]; 154 mat_i = c->i[i]; 155 mat_j = c->j + mat_i; 156 mat_a = c->a + mat_i*bs2; 157 mat_ilen = c->ilen + i; 158 for (k=kstart; k<kend; k++) { 159 if ((tcol=ssmap[a->j[k]])) { 160 *mat_j++ = tcol - 1; 161 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 162 mat_a += bs2; 163 (*mat_ilen)++; 164 } 165 } 166 } 167 168 /* Free work space */ 169 ierr = PetscFree(smap);CHKERRQ(ierr); 170 ierr = PetscFree(lens);CHKERRQ(ierr); 171 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 172 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 173 174 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 175 *B = C; 176 PetscFunctionReturn(0); 177 } 178 179 #undef __FUNCT__ 180 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ" 181 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B) 182 { 183 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 184 IS is1; 185 int *vary,*iary,*irow,nrows,i,ierr,bs=a->bs,count; 186 187 PetscFunctionBegin; 188 if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro"); 189 190 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 191 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 192 193 /* Verify if the indices corespond to each element in a block 194 and form the IS with compressed IS */ 195 ierr = PetscMalloc(2*(a->mbs+1)*sizeof(int),&vary);CHKERRQ(ierr); 196 iary = vary + a->mbs; 197 ierr = PetscMemzero(vary,(a->mbs)*sizeof(int));CHKERRQ(ierr); 198 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 199 200 count = 0; 201 for (i=0; i<a->mbs; i++) { 202 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(1,"Index set does not match blocks"); 203 if (vary[i]==bs) iary[count++] = i; 204 } 205 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr); 206 207 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 208 ierr = PetscFree(vary);CHKERRQ(ierr); 209 210 ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);CHKERRQ(ierr); 211 ISDestroy(is1); 212 PetscFunctionReturn(0); 213 } 214 215 #undef __FUNCT__ 216 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ" 217 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,int n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 218 { 219 PetscErrorCode ierr,i; 220 221 PetscFunctionBegin; 222 if (scall == MAT_INITIAL_MATRIX) { 223 ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr); 224 } 225 226 for (i=0; i<n; i++) { 227 ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr); 228 } 229 PetscFunctionReturn(0); 230 } 231 232 /* -------------------------------------------------------*/ 233 /* Should check that shapes of vectors and matrices match */ 234 /* -------------------------------------------------------*/ 235 #include "petscblaslapack.h" 236 237 #undef __FUNCT__ 238 #define __FUNCT__ "MatMult_SeqSBAIJ_1" 239 PetscErrorCode MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz) 240 { 241 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 242 PetscScalar *x,*z,*xb,x1,zero=0.0; 243 MatScalar *v; 244 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 245 246 PetscFunctionBegin; 247 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 248 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 249 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 250 251 v = a->a; 252 xb = x; 253 254 for (i=0; i<mbs; i++) { 255 n = ai[1] - ai[0]; /* length of i_th row of A */ 256 x1 = xb[0]; 257 ib = aj + *ai; 258 jmin = 0; 259 if (*ib == i) { /* (diag of A)*x */ 260 z[i] += *v++ * x[*ib++]; 261 jmin++; 262 } 263 for (j=jmin; j<n; j++) { 264 cval = *ib; 265 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 266 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 267 } 268 xb++; ai++; 269 } 270 271 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 272 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 273 PetscLogFlops(2*(a->nz*2 - A->m) - A->m); /* nz = (nz+m)/2 */ 274 PetscFunctionReturn(0); 275 } 276 277 #undef __FUNCT__ 278 #define __FUNCT__ "MatMult_SeqSBAIJ_2" 279 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz) 280 { 281 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 282 PetscScalar *x,*z,*xb,x1,x2,zero=0.0; 283 MatScalar *v; 284 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 285 286 287 PetscFunctionBegin; 288 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 289 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 290 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 291 292 v = a->a; 293 xb = x; 294 295 for (i=0; i<mbs; i++) { 296 n = ai[1] - ai[0]; /* length of i_th block row of A */ 297 x1 = xb[0]; x2 = xb[1]; 298 ib = aj + *ai; 299 jmin = 0; 300 if (*ib == i){ /* (diag of A)*x */ 301 z[2*i] += v[0]*x1 + v[2]*x2; 302 z[2*i+1] += v[2]*x1 + v[3]*x2; 303 v += 4; jmin++; 304 } 305 for (j=jmin; j<n; j++) { 306 /* (strict lower triangular part of A)*x */ 307 cval = ib[j]*2; 308 z[cval] += v[0]*x1 + v[1]*x2; 309 z[cval+1] += v[2]*x1 + v[3]*x2; 310 /* (strict upper triangular part of A)*x */ 311 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 312 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 313 v += 4; 314 } 315 xb +=2; ai++; 316 } 317 318 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 319 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 320 PetscLogFlops(8*(a->nz*2 - A->m) - A->m); 321 PetscFunctionReturn(0); 322 } 323 324 #undef __FUNCT__ 325 #define __FUNCT__ "MatMult_SeqSBAIJ_3" 326 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz) 327 { 328 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 329 PetscScalar *x,*z,*xb,x1,x2,x3,zero=0.0; 330 MatScalar *v; 331 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 332 333 334 PetscFunctionBegin; 335 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 336 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 337 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 338 339 v = a->a; 340 xb = x; 341 342 for (i=0; i<mbs; i++) { 343 n = ai[1] - ai[0]; /* length of i_th block row of A */ 344 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 345 ib = aj + *ai; 346 jmin = 0; 347 if (*ib == i){ /* (diag of A)*x */ 348 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 349 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 350 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 351 v += 9; jmin++; 352 } 353 for (j=jmin; j<n; j++) { 354 /* (strict lower triangular part of A)*x */ 355 cval = ib[j]*3; 356 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 357 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 358 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 359 /* (strict upper triangular part of A)*x */ 360 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 361 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 362 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 363 v += 9; 364 } 365 xb +=3; ai++; 366 } 367 368 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 369 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 370 PetscLogFlops(18*(a->nz*2 - A->m) - A->m); 371 PetscFunctionReturn(0); 372 } 373 374 #undef __FUNCT__ 375 #define __FUNCT__ "MatMult_SeqSBAIJ_4" 376 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz) 377 { 378 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 379 PetscScalar *x,*z,*xb,x1,x2,x3,x4,zero=0.0; 380 MatScalar *v; 381 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 382 383 PetscFunctionBegin; 384 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 385 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 386 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 387 388 v = a->a; 389 xb = x; 390 391 for (i=0; i<mbs; i++) { 392 n = ai[1] - ai[0]; /* length of i_th block row of A */ 393 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 394 ib = aj + *ai; 395 jmin = 0; 396 if (*ib == i){ /* (diag of A)*x */ 397 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 398 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 399 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 400 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 401 v += 16; jmin++; 402 } 403 for (j=jmin; j<n; j++) { 404 /* (strict lower triangular part of A)*x */ 405 cval = ib[j]*4; 406 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 407 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 408 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 409 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 410 /* (strict upper triangular part of A)*x */ 411 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 412 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 413 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 414 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 415 v += 16; 416 } 417 xb +=4; ai++; 418 } 419 420 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 421 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 422 PetscLogFlops(32*(a->nz*2 - A->m) - A->m); 423 PetscFunctionReturn(0); 424 } 425 426 #undef __FUNCT__ 427 #define __FUNCT__ "MatMult_SeqSBAIJ_5" 428 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz) 429 { 430 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 431 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0; 432 MatScalar *v; 433 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 434 435 PetscFunctionBegin; 436 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 437 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 438 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 439 440 v = a->a; 441 xb = x; 442 443 for (i=0; i<mbs; i++) { 444 n = ai[1] - ai[0]; /* length of i_th block row of A */ 445 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 446 ib = aj + *ai; 447 jmin = 0; 448 if (*ib == i){ /* (diag of A)*x */ 449 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 450 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 451 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 452 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 453 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 454 v += 25; jmin++; 455 } 456 for (j=jmin; j<n; j++) { 457 /* (strict lower triangular part of A)*x */ 458 cval = ib[j]*5; 459 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 460 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 461 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 462 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 463 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 464 /* (strict upper triangular part of A)*x */ 465 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 466 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 467 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 468 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 469 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 470 v += 25; 471 } 472 xb +=5; ai++; 473 } 474 475 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 476 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 477 PetscLogFlops(50*(a->nz*2 - A->m) - A->m); 478 PetscFunctionReturn(0); 479 } 480 481 482 #undef __FUNCT__ 483 #define __FUNCT__ "MatMult_SeqSBAIJ_6" 484 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz) 485 { 486 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 487 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0; 488 MatScalar *v; 489 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 490 491 PetscFunctionBegin; 492 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 493 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 494 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 495 496 v = a->a; 497 xb = x; 498 499 for (i=0; i<mbs; i++) { 500 n = ai[1] - ai[0]; /* length of i_th block row of A */ 501 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 502 ib = aj + *ai; 503 jmin = 0; 504 if (*ib == i){ /* (diag of A)*x */ 505 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 506 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 507 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 508 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 509 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 510 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 511 v += 36; jmin++; 512 } 513 for (j=jmin; j<n; j++) { 514 /* (strict lower triangular part of A)*x */ 515 cval = ib[j]*6; 516 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 517 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 518 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 519 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 520 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 521 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 522 /* (strict upper triangular part of A)*x */ 523 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 524 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 525 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 526 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 527 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 528 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 529 v += 36; 530 } 531 xb +=6; ai++; 532 } 533 534 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 535 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 536 PetscLogFlops(72*(a->nz*2 - A->m) - A->m); 537 PetscFunctionReturn(0); 538 } 539 #undef __FUNCT__ 540 #define __FUNCT__ "MatMult_SeqSBAIJ_7" 541 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz) 542 { 543 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 544 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0; 545 MatScalar *v; 546 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 547 548 PetscFunctionBegin; 549 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 550 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 551 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 552 553 v = a->a; 554 xb = x; 555 556 for (i=0; i<mbs; i++) { 557 n = ai[1] - ai[0]; /* length of i_th block row of A */ 558 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 559 ib = aj + *ai; 560 jmin = 0; 561 if (*ib == i){ /* (diag of A)*x */ 562 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 563 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 564 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 565 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 566 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 567 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 568 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 569 v += 49; jmin++; 570 } 571 for (j=jmin; j<n; j++) { 572 /* (strict lower triangular part of A)*x */ 573 cval = ib[j]*7; 574 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 575 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 576 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 577 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 578 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 579 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 580 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 581 /* (strict upper triangular part of A)*x */ 582 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 583 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 584 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 585 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 586 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 587 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 588 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 589 v += 49; 590 } 591 xb +=7; ai++; 592 } 593 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 594 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 595 PetscLogFlops(98*(a->nz*2 - A->m) - A->m); 596 PetscFunctionReturn(0); 597 } 598 599 /* 600 This will not work with MatScalar == float because it calls the BLAS 601 */ 602 #undef __FUNCT__ 603 #define __FUNCT__ "MatMult_SeqSBAIJ_N" 604 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz) 605 { 606 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 607 PetscScalar *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0; 608 MatScalar *v; 609 PetscErrorCode ierr; 610 int mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2; 611 int ncols,k; 612 613 PetscFunctionBegin; 614 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 615 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 616 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 617 618 aj = a->j; 619 v = a->a; 620 ii = a->i; 621 622 if (!a->mult_work) { 623 ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 624 } 625 work = a->mult_work; 626 627 for (i=0; i<mbs; i++) { 628 n = ii[1] - ii[0]; ncols = n*bs; 629 workt = work; idx=aj+ii[0]; 630 631 /* upper triangular part */ 632 for (j=0; j<n; j++) { 633 xb = x_ptr + bs*(*idx++); 634 for (k=0; k<bs; k++) workt[k] = xb[k]; 635 workt += bs; 636 } 637 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 638 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 639 640 /* strict lower triangular part */ 641 idx = aj+ii[0]; 642 if (*idx == i){ 643 ncols -= bs; v += bs2; idx++; n--; 644 } 645 646 if (ncols > 0){ 647 workt = work; 648 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 649 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 650 for (j=0; j<n; j++) { 651 zb = z_ptr + bs*(*idx++); 652 for (k=0; k<bs; k++) zb[k] += workt[k] ; 653 workt += bs; 654 } 655 } 656 x += bs; v += n*bs2; z += bs; ii++; 657 } 658 659 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 660 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 661 PetscLogFlops(2*(a->nz*2 - A->m)*bs2 - A->m); 662 PetscFunctionReturn(0); 663 } 664 665 #undef __FUNCT__ 666 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1" 667 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 668 { 669 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 670 PetscScalar *x,*y,*z,*xb,x1; 671 MatScalar *v; 672 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 673 674 PetscFunctionBegin; 675 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 676 if (yy != xx) { 677 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 678 } else { 679 y = x; 680 } 681 if (zz != yy) { 682 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 683 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 684 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 685 } else { 686 z = y; 687 } 688 689 v = a->a; 690 xb = x; 691 692 for (i=0; i<mbs; i++) { 693 n = ai[1] - ai[0]; /* length of i_th row of A */ 694 x1 = xb[0]; 695 ib = aj + *ai; 696 jmin = 0; 697 if (*ib == i) { /* (diag of A)*x */ 698 z[i] += *v++ * x[*ib++]; jmin++; 699 } 700 for (j=jmin; j<n; j++) { 701 cval = *ib; 702 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 703 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 704 } 705 xb++; ai++; 706 } 707 708 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 709 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 710 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 711 712 PetscLogFlops(2*(a->nz*2 - A->m)); 713 PetscFunctionReturn(0); 714 } 715 716 #undef __FUNCT__ 717 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2" 718 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 719 { 720 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 721 PetscScalar *x,*y,*z,*xb,x1,x2; 722 MatScalar *v; 723 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 724 725 PetscFunctionBegin; 726 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 727 if (yy != xx) { 728 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 729 } else { 730 y = x; 731 } 732 if (zz != yy) { 733 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 734 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 735 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 736 } else { 737 z = y; 738 } 739 740 v = a->a; 741 xb = x; 742 743 for (i=0; i<mbs; i++) { 744 n = ai[1] - ai[0]; /* length of i_th block row of A */ 745 x1 = xb[0]; x2 = xb[1]; 746 ib = aj + *ai; 747 jmin = 0; 748 if (*ib == i){ /* (diag of A)*x */ 749 z[2*i] += v[0]*x1 + v[2]*x2; 750 z[2*i+1] += v[2]*x1 + v[3]*x2; 751 v += 4; jmin++; 752 } 753 for (j=jmin; j<n; j++) { 754 /* (strict lower triangular part of A)*x */ 755 cval = ib[j]*2; 756 z[cval] += v[0]*x1 + v[1]*x2; 757 z[cval+1] += v[2]*x1 + v[3]*x2; 758 /* (strict upper triangular part of A)*x */ 759 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 760 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 761 v += 4; 762 } 763 xb +=2; ai++; 764 } 765 766 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 767 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 768 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 769 770 PetscLogFlops(4*(a->nz*2 - A->m)); 771 PetscFunctionReturn(0); 772 } 773 774 #undef __FUNCT__ 775 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3" 776 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 777 { 778 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 779 PetscScalar *x,*y,*z,*xb,x1,x2,x3; 780 MatScalar *v; 781 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 782 783 PetscFunctionBegin; 784 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 785 if (yy != xx) { 786 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 787 } else { 788 y = x; 789 } 790 if (zz != yy) { 791 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 792 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 793 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 794 } else { 795 z = y; 796 } 797 798 v = a->a; 799 xb = x; 800 801 for (i=0; i<mbs; i++) { 802 n = ai[1] - ai[0]; /* length of i_th block row of A */ 803 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 804 ib = aj + *ai; 805 jmin = 0; 806 if (*ib == i){ /* (diag of A)*x */ 807 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 808 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 809 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 810 v += 9; jmin++; 811 } 812 for (j=jmin; j<n; j++) { 813 /* (strict lower triangular part of A)*x */ 814 cval = ib[j]*3; 815 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 816 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 817 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 818 /* (strict upper triangular part of A)*x */ 819 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 820 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 821 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 822 v += 9; 823 } 824 xb +=3; ai++; 825 } 826 827 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 828 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 829 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 830 831 PetscLogFlops(18*(a->nz*2 - A->m)); 832 PetscFunctionReturn(0); 833 } 834 835 #undef __FUNCT__ 836 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4" 837 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 838 { 839 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 840 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4; 841 MatScalar *v; 842 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 843 844 PetscFunctionBegin; 845 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 846 if (yy != xx) { 847 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 848 } else { 849 y = x; 850 } 851 if (zz != yy) { 852 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 853 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 854 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 855 } else { 856 z = y; 857 } 858 859 v = a->a; 860 xb = x; 861 862 for (i=0; i<mbs; i++) { 863 n = ai[1] - ai[0]; /* length of i_th block row of A */ 864 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 865 ib = aj + *ai; 866 jmin = 0; 867 if (*ib == i){ /* (diag of A)*x */ 868 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 869 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 870 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 871 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 872 v += 16; jmin++; 873 } 874 for (j=jmin; j<n; j++) { 875 /* (strict lower triangular part of A)*x */ 876 cval = ib[j]*4; 877 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 878 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 879 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 880 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 881 /* (strict upper triangular part of A)*x */ 882 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 883 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 884 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 885 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 886 v += 16; 887 } 888 xb +=4; ai++; 889 } 890 891 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 892 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 893 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 894 895 PetscLogFlops(32*(a->nz*2 - A->m)); 896 PetscFunctionReturn(0); 897 } 898 899 #undef __FUNCT__ 900 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5" 901 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 902 { 903 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 904 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4,x5; 905 MatScalar *v; 906 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 907 908 PetscFunctionBegin; 909 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 910 if (yy != xx) { 911 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 912 } else { 913 y = x; 914 } 915 if (zz != yy) { 916 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 917 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 918 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 919 } else { 920 z = y; 921 } 922 923 v = a->a; 924 xb = x; 925 926 for (i=0; i<mbs; i++) { 927 n = ai[1] - ai[0]; /* length of i_th block row of A */ 928 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 929 ib = aj + *ai; 930 jmin = 0; 931 if (*ib == i){ /* (diag of A)*x */ 932 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 933 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 934 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 935 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 936 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 937 v += 25; jmin++; 938 } 939 for (j=jmin; j<n; j++) { 940 /* (strict lower triangular part of A)*x */ 941 cval = ib[j]*5; 942 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 943 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 944 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 945 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 946 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 947 /* (strict upper triangular part of A)*x */ 948 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 949 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 950 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 951 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 952 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 953 v += 25; 954 } 955 xb +=5; ai++; 956 } 957 958 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 959 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 960 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 961 962 PetscLogFlops(50*(a->nz*2 - A->m)); 963 PetscFunctionReturn(0); 964 } 965 #undef __FUNCT__ 966 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6" 967 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 968 { 969 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 970 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6; 971 MatScalar *v; 972 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 973 974 PetscFunctionBegin; 975 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 976 if (yy != xx) { 977 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 978 } else { 979 y = x; 980 } 981 if (zz != yy) { 982 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 983 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 984 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 985 } else { 986 z = y; 987 } 988 989 v = a->a; 990 xb = x; 991 992 for (i=0; i<mbs; i++) { 993 n = ai[1] - ai[0]; /* length of i_th block row of A */ 994 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 995 ib = aj + *ai; 996 jmin = 0; 997 if (*ib == i){ /* (diag of A)*x */ 998 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 999 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 1000 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 1001 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 1002 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 1003 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 1004 v += 36; jmin++; 1005 } 1006 for (j=jmin; j<n; j++) { 1007 /* (strict lower triangular part of A)*x */ 1008 cval = ib[j]*6; 1009 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 1010 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 1011 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 1012 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 1013 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 1014 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 1015 /* (strict upper triangular part of A)*x */ 1016 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 1017 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 1018 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 1019 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 1020 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 1021 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 1022 v += 36; 1023 } 1024 xb +=6; ai++; 1025 } 1026 1027 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1028 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 1029 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1030 1031 PetscLogFlops(72*(a->nz*2 - A->m)); 1032 PetscFunctionReturn(0); 1033 } 1034 1035 #undef __FUNCT__ 1036 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7" 1037 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1038 { 1039 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1040 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6,x7; 1041 MatScalar *v; 1042 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 1043 1044 PetscFunctionBegin; 1045 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1046 if (yy != xx) { 1047 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 1048 } else { 1049 y = x; 1050 } 1051 if (zz != yy) { 1052 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 1053 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1054 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 1055 } else { 1056 z = y; 1057 } 1058 1059 v = a->a; 1060 xb = x; 1061 1062 for (i=0; i<mbs; i++) { 1063 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1064 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 1065 ib = aj + *ai; 1066 jmin = 0; 1067 if (*ib == i){ /* (diag of A)*x */ 1068 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 1069 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 1070 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 1071 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 1072 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 1073 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 1074 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1075 v += 49; jmin++; 1076 } 1077 for (j=jmin; j<n; j++) { 1078 /* (strict lower triangular part of A)*x */ 1079 cval = ib[j]*7; 1080 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 1081 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 1082 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 1083 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 1084 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 1085 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 1086 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1087 /* (strict upper triangular part of A)*x */ 1088 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 1089 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 1090 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 1091 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 1092 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 1093 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 1094 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 1095 v += 49; 1096 } 1097 xb +=7; ai++; 1098 } 1099 1100 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1101 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 1102 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1103 1104 PetscLogFlops(98*(a->nz*2 - A->m)); 1105 PetscFunctionReturn(0); 1106 } 1107 1108 #undef __FUNCT__ 1109 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N" 1110 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1111 { 1112 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1113 PetscScalar *x,*x_ptr,*y,*z,*z_ptr=0,*xb,*zb,*work,*workt; 1114 MatScalar *v; 1115 PetscErrorCode ierr; 1116 int mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2; 1117 int ncols,k; 1118 1119 PetscFunctionBegin; 1120 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 1121 if (yy != xx) { 1122 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 1123 } else { 1124 y = x; 1125 } 1126 if (zz != yy) { 1127 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 1128 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 1129 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 1130 } else { 1131 z = y; 1132 } 1133 1134 aj = a->j; 1135 v = a->a; 1136 ii = a->i; 1137 1138 if (!a->mult_work) { 1139 ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1140 } 1141 work = a->mult_work; 1142 1143 1144 for (i=0; i<mbs; i++) { 1145 n = ii[1] - ii[0]; ncols = n*bs; 1146 workt = work; idx=aj+ii[0]; 1147 1148 /* upper triangular part */ 1149 for (j=0; j<n; j++) { 1150 xb = x_ptr + bs*(*idx++); 1151 for (k=0; k<bs; k++) workt[k] = xb[k]; 1152 workt += bs; 1153 } 1154 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1155 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1156 1157 /* strict lower triangular part */ 1158 idx = aj+ii[0]; 1159 if (*idx == i){ 1160 ncols -= bs; v += bs2; idx++; n--; 1161 } 1162 if (ncols > 0){ 1163 workt = work; 1164 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1165 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 1166 for (j=0; j<n; j++) { 1167 zb = z_ptr + bs*(*idx++); 1168 /* idx++; */ 1169 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1170 workt += bs; 1171 } 1172 } 1173 1174 x += bs; v += n*bs2; z += bs; ii++; 1175 } 1176 1177 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1178 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 1179 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1180 1181 PetscLogFlops(2*(a->nz*2 - A->m)); 1182 PetscFunctionReturn(0); 1183 } 1184 1185 #undef __FUNCT__ 1186 #define __FUNCT__ "MatMultTranspose_SeqSBAIJ" 1187 PetscErrorCode MatMultTranspose_SeqSBAIJ(Mat A,Vec xx,Vec zz) 1188 { 1189 PetscErrorCode ierr; 1190 1191 PetscFunctionBegin; 1192 ierr = MatMult(A,xx,zz);CHKERRQ(ierr); 1193 PetscFunctionReturn(0); 1194 } 1195 1196 #undef __FUNCT__ 1197 #define __FUNCT__ "MatMultTransposeAdd_SeqSBAIJ" 1198 PetscErrorCode MatMultTransposeAdd_SeqSBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1199 1200 { 1201 PetscErrorCode ierr; 1202 1203 PetscFunctionBegin; 1204 ierr = MatMultAdd(A,xx,yy,zz);CHKERRQ(ierr); 1205 PetscFunctionReturn(0); 1206 } 1207 1208 #undef __FUNCT__ 1209 #define __FUNCT__ "MatScale_SeqSBAIJ" 1210 PetscErrorCode MatScale_SeqSBAIJ(const PetscScalar *alpha,Mat inA) 1211 { 1212 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data; 1213 int one = 1,totalnz = a->bs2*a->nz; 1214 1215 PetscFunctionBegin; 1216 BLscal_(&totalnz,(PetscScalar*)alpha,a->a,&one); 1217 PetscLogFlops(totalnz); 1218 PetscFunctionReturn(0); 1219 } 1220 1221 #undef __FUNCT__ 1222 #define __FUNCT__ "MatNorm_SeqSBAIJ" 1223 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm) 1224 { 1225 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1226 MatScalar *v = a->a; 1227 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1228 int i,j,k,bs = a->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j; 1229 int *jl,*il,jmin,jmax,ierr,nexti,ik,*col; 1230 1231 PetscFunctionBegin; 1232 if (type == NORM_FROBENIUS) { 1233 for (k=0; k<mbs; k++){ 1234 jmin = a->i[k]; jmax = a->i[k+1]; 1235 col = aj + jmin; 1236 if (*col == k){ /* diagonal block */ 1237 for (i=0; i<bs2; i++){ 1238 #if defined(PETSC_USE_COMPLEX) 1239 sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++; 1240 #else 1241 sum_diag += (*v)*(*v); v++; 1242 #endif 1243 } 1244 jmin++; 1245 } 1246 for (j=jmin; j<jmax; j++){ /* off-diagonal blocks */ 1247 for (i=0; i<bs2; i++){ 1248 #if defined(PETSC_USE_COMPLEX) 1249 sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++; 1250 #else 1251 sum_off += (*v)*(*v); v++; 1252 #endif 1253 } 1254 } 1255 } 1256 *norm = sqrt(sum_diag + 2*sum_off); 1257 1258 } else if (type == NORM_INFINITY) { /* maximum row sum */ 1259 ierr = PetscMalloc(mbs*sizeof(int),&il);CHKERRQ(ierr); 1260 ierr = PetscMalloc(mbs*sizeof(int),&jl);CHKERRQ(ierr); 1261 ierr = PetscMalloc(bs*sizeof(PetscReal),&sum);CHKERRQ(ierr); 1262 for (i=0; i<mbs; i++) { 1263 jl[i] = mbs; il[0] = 0; 1264 } 1265 1266 *norm = 0.0; 1267 for (k=0; k<mbs; k++) { /* k_th block row */ 1268 for (j=0; j<bs; j++) sum[j]=0.0; 1269 1270 /*-- col sum --*/ 1271 i = jl[k]; /* first |A(i,k)| to be added */ 1272 /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1273 at step k */ 1274 while (i<mbs){ 1275 nexti = jl[i]; /* next block row to be added */ 1276 ik = il[i]; /* block index of A(i,k) in the array a */ 1277 for (j=0; j<bs; j++){ 1278 v = a->a + ik*bs2 + j*bs; 1279 for (k1=0; k1<bs; k1++) { 1280 sum[j] += PetscAbsScalar(*v); v++; 1281 } 1282 } 1283 /* update il, jl */ 1284 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1285 jmax = a->i[i+1]; 1286 if (jmin < jmax){ 1287 il[i] = jmin; 1288 j = a->j[jmin]; 1289 jl[i] = jl[j]; jl[j]=i; 1290 } 1291 i = nexti; 1292 } 1293 1294 /*-- row sum --*/ 1295 jmin = a->i[k]; jmax = a->i[k+1]; 1296 for (i=jmin; i<jmax; i++) { 1297 for (j=0; j<bs; j++){ 1298 v = a->a + i*bs2 + j; 1299 for (k1=0; k1<bs; k1++){ 1300 sum[j] += PetscAbsScalar(*v); 1301 v += bs; 1302 } 1303 } 1304 } 1305 /* add k_th block row to il, jl */ 1306 col = aj+jmin; 1307 if (*col == k) jmin++; 1308 if (jmin < jmax){ 1309 il[k] = jmin; 1310 j = a->j[jmin]; 1311 jl[k] = jl[j]; jl[j] = k; 1312 } 1313 for (j=0; j<bs; j++){ 1314 if (sum[j] > *norm) *norm = sum[j]; 1315 } 1316 } 1317 ierr = PetscFree(il);CHKERRQ(ierr); 1318 ierr = PetscFree(jl);CHKERRQ(ierr); 1319 ierr = PetscFree(sum);CHKERRQ(ierr); 1320 } else { 1321 SETERRQ(PETSC_ERR_SUP,"No support for this norm yet"); 1322 } 1323 PetscFunctionReturn(0); 1324 } 1325 1326 #undef __FUNCT__ 1327 #define __FUNCT__ "MatEqual_SeqSBAIJ" 1328 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg) 1329 { 1330 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data; 1331 PetscErrorCode ierr; 1332 1333 PetscFunctionBegin; 1334 1335 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1336 if ((A->m != B->m) || (A->n != B->n) || (a->bs != b->bs)|| (a->nz != b->nz)) { 1337 *flg = PETSC_FALSE; 1338 PetscFunctionReturn(0); 1339 } 1340 1341 /* if the a->i are the same */ 1342 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(int),flg);CHKERRQ(ierr); 1343 if (*flg == PETSC_FALSE) { 1344 PetscFunctionReturn(0); 1345 } 1346 1347 /* if a->j are the same */ 1348 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int),flg);CHKERRQ(ierr); 1349 if (*flg == PETSC_FALSE) { 1350 PetscFunctionReturn(0); 1351 } 1352 /* if a->a are the same */ 1353 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1354 1355 PetscFunctionReturn(0); 1356 } 1357 1358 #undef __FUNCT__ 1359 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ" 1360 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v) 1361 { 1362 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1363 PetscErrorCode ierr; 1364 int i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 1365 PetscScalar *x,zero = 0.0; 1366 MatScalar *aa,*aa_j; 1367 1368 PetscFunctionBegin; 1369 bs = a->bs; 1370 if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1"); 1371 1372 aa = a->a; 1373 ai = a->i; 1374 aj = a->j; 1375 ambs = a->mbs; 1376 bs2 = a->bs2; 1377 1378 ierr = VecSet(&zero,v);CHKERRQ(ierr); 1379 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1380 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1381 if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1382 for (i=0; i<ambs; i++) { 1383 j=ai[i]; 1384 if (aj[j] == i) { /* if this is a diagonal element */ 1385 row = i*bs; 1386 aa_j = aa + j*bs2; 1387 if (A->factor && bs==1){ 1388 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k]; 1389 } else { 1390 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1391 } 1392 } 1393 } 1394 1395 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1396 PetscFunctionReturn(0); 1397 } 1398 1399 #undef __FUNCT__ 1400 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ" 1401 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr) 1402 { 1403 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1404 PetscScalar *l,*r,x,*li,*ri; 1405 MatScalar *aa,*v; 1406 PetscErrorCode ierr; 1407 int i,j,k,lm,rn,M,m,*ai,*aj,mbs,tmp,bs,bs2; 1408 1409 PetscFunctionBegin; 1410 ai = a->i; 1411 aj = a->j; 1412 aa = a->a; 1413 m = A->m; 1414 bs = a->bs; 1415 mbs = a->mbs; 1416 bs2 = a->bs2; 1417 1418 if (ll != rr) { 1419 SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n"); 1420 } 1421 if (ll) { 1422 ierr = VecGetArray(ll,&l);CHKERRQ(ierr); 1423 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1424 if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 1425 for (i=0; i<mbs; i++) { /* for each block row */ 1426 M = ai[i+1] - ai[i]; 1427 li = l + i*bs; 1428 v = aa + bs2*ai[i]; 1429 for (j=0; j<M; j++) { /* for each block */ 1430 for (k=0; k<bs2; k++) { 1431 (*v++) *= li[k%bs]; 1432 } 1433 #ifdef CONT 1434 /* will be used to replace the above loop */ 1435 ri = l + bs*aj[ai[i]+j]; 1436 for (k=0; k<bs; k++) { /* column value */ 1437 x = ri[k]; 1438 for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x; 1439 } 1440 #endif 1441 1442 } 1443 } 1444 ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr); 1445 PetscLogFlops(2*a->nz); 1446 } 1447 /* will be deleted */ 1448 if (rr) { 1449 ierr = VecGetArray(rr,&r);CHKERRQ(ierr); 1450 ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr); 1451 if (rn != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 1452 for (i=0; i<mbs; i++) { /* for each block row */ 1453 M = ai[i+1] - ai[i]; 1454 v = aa + bs2*ai[i]; 1455 for (j=0; j<M; j++) { /* for each block */ 1456 ri = r + bs*aj[ai[i]+j]; 1457 for (k=0; k<bs; k++) { 1458 x = ri[k]; 1459 for (tmp=0; tmp<bs; tmp++) (*v++) *= x; 1460 } 1461 } 1462 } 1463 ierr = VecRestoreArray(rr,&r);CHKERRQ(ierr); 1464 PetscLogFlops(a->nz); 1465 } 1466 PetscFunctionReturn(0); 1467 } 1468 1469 #undef __FUNCT__ 1470 #define __FUNCT__ "MatGetInfo_SeqSBAIJ" 1471 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info) 1472 { 1473 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1474 1475 PetscFunctionBegin; 1476 info->rows_global = (double)A->m; 1477 info->columns_global = (double)A->m; 1478 info->rows_local = (double)A->m; 1479 info->columns_local = (double)A->m; 1480 info->block_size = a->bs2; 1481 info->nz_allocated = a->maxnz; /*num. of nonzeros in upper triangular part */ 1482 info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */ 1483 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 1484 info->assemblies = A->num_ass; 1485 info->mallocs = a->reallocs; 1486 info->memory = A->mem; 1487 if (A->factor) { 1488 info->fill_ratio_given = A->info.fill_ratio_given; 1489 info->fill_ratio_needed = A->info.fill_ratio_needed; 1490 info->factor_mallocs = A->info.factor_mallocs; 1491 } else { 1492 info->fill_ratio_given = 0; 1493 info->fill_ratio_needed = 0; 1494 info->factor_mallocs = 0; 1495 } 1496 PetscFunctionReturn(0); 1497 } 1498 1499 1500 #undef __FUNCT__ 1501 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ" 1502 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 1503 { 1504 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1505 PetscErrorCode ierr; 1506 1507 PetscFunctionBegin; 1508 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 1509 PetscFunctionReturn(0); 1510 } 1511 1512 #undef __FUNCT__ 1513 #define __FUNCT__ "MatGetRowMax_SeqSBAIJ" 1514 PetscErrorCode MatGetRowMax_SeqSBAIJ(Mat A,Vec v) 1515 { 1516 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1517 PetscErrorCode ierr; 1518 int i,j,n,row,col,bs,*ai,*aj,mbs; 1519 PetscReal atmp; 1520 MatScalar *aa; 1521 PetscScalar zero = 0.0,*x; 1522 int ncols,brow,bcol,krow,kcol; 1523 1524 PetscFunctionBegin; 1525 if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1526 bs = a->bs; 1527 aa = a->a; 1528 ai = a->i; 1529 aj = a->j; 1530 mbs = a->mbs; 1531 1532 ierr = VecSet(&zero,v);CHKERRQ(ierr); 1533 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1534 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1535 if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1536 for (i=0; i<mbs; i++) { 1537 ncols = ai[1] - ai[0]; ai++; 1538 brow = bs*i; 1539 for (j=0; j<ncols; j++){ 1540 bcol = bs*(*aj); 1541 for (kcol=0; kcol<bs; kcol++){ 1542 col = bcol + kcol; /* col index */ 1543 for (krow=0; krow<bs; krow++){ 1544 atmp = PetscAbsScalar(*aa); aa++; 1545 row = brow + krow; /* row index */ 1546 /* printf("val[%d,%d]: %g\n",row,col,atmp); */ 1547 if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1548 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 1549 } 1550 } 1551 aj++; 1552 } 1553 } 1554 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1555 PetscFunctionReturn(0); 1556 } 1557