1 /*$Id: sbaij2.c,v 1.32 2001/08/07 03:03:01 balay Exp $*/ 2 3 #include "src/mat/impls/baij/seq/baij.h" 4 #include "src/inline/spops.h" 5 #include "src/inline/ilu.h" 6 #include "petscbt.h" 7 #include "src/mat/impls/sbaij/seq/sbaij.h" 8 9 #undef __FUNCT__ 10 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ" 11 int MatIncreaseOverlap_SeqSBAIJ(Mat A,int is_max,IS is[],int ov) 12 { 13 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 14 int brow,i,j,k,l,mbs,n,*idx,ierr,*nidx,isz,bcol, 15 start,end,*ai,*aj,bs,*nidx2; 16 PetscBT table; 17 PetscBT table0; 18 19 PetscFunctionBegin; 20 mbs = a->mbs; 21 ai = a->i; 22 aj = a->j; 23 bs = a->bs; 24 25 if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 26 27 ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr); 28 ierr = PetscMalloc((mbs+1)*sizeof(int),&nidx);CHKERRQ(ierr); 29 ierr = PetscMalloc((A->m+1)*sizeof(int),&nidx2);CHKERRQ(ierr); 30 ierr = PetscBTCreate(mbs,table0);CHKERRQ(ierr); 31 32 for (i=0; i<is_max; i++) { /* for each is */ 33 isz = 0; 34 ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr); 35 36 /* Extract the indices, assume there can be duplicate entries */ 37 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 38 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 39 40 /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */ 41 for (j=0; j<n ; ++j){ 42 brow = idx[j]/bs; /* convert the indices into block indices */ 43 if (brow >= mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 44 if(!PetscBTLookupSet(table,brow)) { nidx[isz++] = brow;} 45 } 46 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 47 ierr = ISDestroy(is[i]);CHKERRQ(ierr); 48 49 k = 0; 50 for (j=0; j<ov; j++){ /* for each overlap */ 51 /* set table0 for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 52 ierr = PetscBTMemzero(mbs,table0);CHKERRQ(ierr); 53 for (l=k; l<isz; l++) PetscBTSet(table0,nidx[l]); 54 55 n = isz; /* length of the updated is[i] */ 56 for (brow=0; brow<mbs; brow++){ 57 start = ai[brow]; end = ai[brow+1]; 58 if (PetscBTLookup(table0,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */ 59 for (l = start; l<end ; l++){ 60 bcol = aj[l]; 61 if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;} 62 } 63 k++; 64 if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 65 } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 66 for (l = start; l<end ; l++){ 67 bcol = aj[l]; 68 if (PetscBTLookup(table0,bcol)){ 69 if (!PetscBTLookupSet(table,brow)) {nidx[isz++] = brow;} 70 break; /* for l = start; l<end ; l++) */ 71 } 72 } 73 } 74 } 75 } /* for each overlap */ 76 77 /* expand the Index Set */ 78 for (j=0; j<isz; j++) { 79 for (k=0; k<bs; k++) 80 nidx2[j*bs+k] = nidx[j]*bs+k; 81 } 82 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr); 83 } 84 ierr = PetscBTDestroy(table);CHKERRQ(ierr); 85 ierr = PetscFree(nidx);CHKERRQ(ierr); 86 ierr = PetscFree(nidx2);CHKERRQ(ierr); 87 ierr = PetscBTDestroy(table0);CHKERRQ(ierr); 88 #ifdef VERSION_1 89 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 90 int brow,i,j,k,l,mbs,n,*idx,ierr,*nidx,isz,bcol, 91 start,end,*ai,*aj,bs,*nidx2,*colsearch; 92 PetscBT table; 93 94 PetscFunctionBegin; 95 mbs = a->mbs; 96 ai = a->i; 97 aj = a->j; 98 bs = a->bs; 99 100 if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 101 102 ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr); 103 ierr = PetscMalloc((2*mbs+1)*sizeof(int),&nidx);CHKERRQ(ierr); 104 colsearch = nidx + mbs + 1; 105 ierr = PetscMalloc((A->m+1)*sizeof(int),&nidx2);CHKERRQ(ierr); 106 107 for (i=0; i<is_max; i++) { 108 109 /* Initialise the two local arrays */ 110 isz = 0; 111 ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr); 112 113 /* Extract the indices, assume there can be duplicate entries */ 114 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 115 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 116 117 /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */ 118 for (j=0; j<n ; ++j){ 119 bcol = idx[j]/bs; /* convert the indices into block indices */ 120 if (bcol >= mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 121 if(!PetscBTLookupSet(table,bcol)) { nidx[isz++] = bcol;} 122 } 123 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 124 ierr = ISDestroy(is[i]);CHKERRQ(ierr); 125 126 k = 0; 127 for (j=0; j<ov; j++){ /* for each overlap */ 128 /* initialize colsearch - 129 colsearch[i] points to the possible non-zero (i,active_col)-th entry in the array aj */ 130 for (brow=0; brow<mbs; brow++) colsearch[brow] = ai[brow]; 131 132 /* sort nidx[k:isz-1] - needed by col search */ 133 ierr = PetscSortInt(isz-k,nidx+k);CHKERRQ(ierr); 134 135 n = isz; 136 for (; k<n ; k++){ /* do only those brows in nidx[k], which are not done yet */ 137 brow = nidx[k]; 138 /* search block column */ 139 for (l=0; l<brow; l++){ 140 if (colsearch[l]<ai[l+1]){ 141 bcol = aj[colsearch[l]]; 142 while (bcol < brow ) { 143 colsearch[l]++; 144 if (colsearch[l]<ai[l+1]){ 145 bcol = aj[colsearch[l]]; 146 } else { 147 bcol = mbs; break; 148 } 149 } 150 if (bcol==brow){ 151 if (!PetscBTLookupSet(table,l)) {nidx[isz++] = l;} 152 colsearch[l]++; 153 } 154 } 155 } 156 /* search block row */ 157 start = ai[brow]; 158 end = ai[brow+1]; 159 for (l = start; l<end ; l++){ 160 bcol = aj[l]; 161 if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;} 162 } 163 } 164 } /* for each overlap */ 165 166 /* expand the Index Set */ 167 for (j=0; j<isz; j++) { 168 for (k=0; k<bs; k++) 169 nidx2[j*bs+k] = nidx[j]*bs+k; 170 } 171 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr); 172 } 173 ierr = PetscBTDestroy(table);CHKERRQ(ierr); 174 ierr = PetscFree(nidx);CHKERRQ(ierr); 175 ierr = PetscFree(nidx2);CHKERRQ(ierr); 176 #endif /* VERSION_1 */ 177 PetscFunctionReturn(0); 178 } 179 180 #undef __FUNCT__ 181 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private" 182 int MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B) 183 { 184 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c; 185 int *smap,i,k,kstart,kend,ierr,oldcols = a->mbs,*lens; 186 int row,mat_i,*mat_j,tcol,*mat_ilen; 187 int *irow,nrows,*ssmap,bs=a->bs,bs2=a->bs2; 188 int *aj = a->j,*ai = a->i; 189 MatScalar *mat_a; 190 Mat C; 191 PetscTruth flag; 192 193 PetscFunctionBegin; 194 195 if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro"); 196 ierr = ISSorted(iscol,(PetscTruth*)&i);CHKERRQ(ierr); 197 if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 198 199 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 200 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 201 202 ierr = PetscMalloc((1+oldcols)*sizeof(int),&smap);CHKERRQ(ierr); 203 ssmap = smap; 204 ierr = PetscMalloc((1+nrows)*sizeof(int),&lens);CHKERRQ(ierr); 205 ierr = PetscMemzero(smap,oldcols*sizeof(int));CHKERRQ(ierr); 206 for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */ 207 /* determine lens of each row */ 208 for (i=0; i<nrows; i++) { 209 kstart = ai[irow[i]]; 210 kend = kstart + a->ilen[irow[i]]; 211 lens[i] = 0; 212 for (k=kstart; k<kend; k++) { 213 if (ssmap[aj[k]]) { 214 lens[i]++; 215 } 216 } 217 } 218 /* Create and fill new matrix */ 219 if (scall == MAT_REUSE_MATRIX) { 220 c = (Mat_SeqSBAIJ *)((*B)->data); 221 222 if (c->mbs!=nrows || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 223 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(int),&flag);CHKERRQ(ierr); 224 if (flag == PETSC_FALSE) { 225 SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 226 } 227 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(int));CHKERRQ(ierr); 228 C = *B; 229 } else { 230 ierr = MatCreate(A->comm,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE,&C);CHKERRQ(ierr); 231 ierr = MatSetType(C,A->type_name);CHKERRQ(ierr); 232 ierr = MatSeqSBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr); 233 } 234 c = (Mat_SeqSBAIJ *)(C->data); 235 for (i=0; i<nrows; i++) { 236 row = irow[i]; 237 kstart = ai[row]; 238 kend = kstart + a->ilen[row]; 239 mat_i = c->i[i]; 240 mat_j = c->j + mat_i; 241 mat_a = c->a + mat_i*bs2; 242 mat_ilen = c->ilen + i; 243 for (k=kstart; k<kend; k++) { 244 if ((tcol=ssmap[a->j[k]])) { 245 *mat_j++ = tcol - 1; 246 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 247 mat_a += bs2; 248 (*mat_ilen)++; 249 } 250 } 251 } 252 253 /* Free work space */ 254 ierr = PetscFree(smap);CHKERRQ(ierr); 255 ierr = PetscFree(lens);CHKERRQ(ierr); 256 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 257 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 258 259 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 260 *B = C; 261 PetscFunctionReturn(0); 262 } 263 264 #undef __FUNCT__ 265 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ" 266 int MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B) 267 { 268 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 269 IS is1; 270 int *vary,*iary,*irow,nrows,i,ierr,bs=a->bs,count; 271 272 PetscFunctionBegin; 273 if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro"); 274 275 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 276 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 277 278 /* Verify if the indices corespond to each element in a block 279 and form the IS with compressed IS */ 280 ierr = PetscMalloc(2*(a->mbs+1)*sizeof(int),&vary);CHKERRQ(ierr); 281 iary = vary + a->mbs; 282 ierr = PetscMemzero(vary,(a->mbs)*sizeof(int));CHKERRQ(ierr); 283 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 284 285 count = 0; 286 for (i=0; i<a->mbs; i++) { 287 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(1,"Index set does not match blocks"); 288 if (vary[i]==bs) iary[count++] = i; 289 } 290 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr); 291 292 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 293 ierr = PetscFree(vary);CHKERRQ(ierr); 294 295 ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);CHKERRQ(ierr); 296 ISDestroy(is1); 297 PetscFunctionReturn(0); 298 } 299 300 #undef __FUNCT__ 301 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ" 302 int MatGetSubMatrices_SeqSBAIJ(Mat A,int n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 303 { 304 int ierr,i; 305 306 PetscFunctionBegin; 307 if (scall == MAT_INITIAL_MATRIX) { 308 ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr); 309 } 310 311 for (i=0; i<n; i++) { 312 ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr); 313 } 314 PetscFunctionReturn(0); 315 } 316 317 /* -------------------------------------------------------*/ 318 /* Should check that shapes of vectors and matrices match */ 319 /* -------------------------------------------------------*/ 320 #include "petscblaslapack.h" 321 322 #undef __FUNCT__ 323 #define __FUNCT__ "MatMult_SeqSBAIJ_1" 324 int MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz) 325 { 326 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 327 PetscScalar *x,*z,*xb,x1,zero=0.0; 328 MatScalar *v; 329 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 330 331 PetscFunctionBegin; 332 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 333 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 334 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 335 336 v = a->a; 337 xb = x; 338 339 for (i=0; i<mbs; i++) { 340 n = ai[1] - ai[0]; /* length of i_th row of A */ 341 x1 = xb[0]; 342 ib = aj + *ai; 343 jmin = 0; 344 if (*ib == i) { /* (diag of A)*x */ 345 z[i] += *v++ * x[*ib++]; 346 jmin++; 347 } 348 for (j=jmin; j<n; j++) { 349 cval = *ib; 350 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 351 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 352 } 353 xb++; ai++; 354 } 355 356 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 357 ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 358 PetscLogFlops(2*(a->nz*2 - A->m) - A->m); /* nz = (nz+m)/2 */ 359 PetscFunctionReturn(0); 360 } 361 362 #undef __FUNCT__ 363 #define __FUNCT__ "MatMult_SeqSBAIJ_2" 364 int MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz) 365 { 366 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 367 PetscScalar *x,*z,*xb,x1,x2,zero=0.0; 368 MatScalar *v; 369 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 370 371 372 PetscFunctionBegin; 373 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 374 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 375 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 376 377 v = a->a; 378 xb = x; 379 380 for (i=0; i<mbs; i++) { 381 n = ai[1] - ai[0]; /* length of i_th block row of A */ 382 x1 = xb[0]; x2 = xb[1]; 383 ib = aj + *ai; 384 jmin = 0; 385 if (*ib == i){ /* (diag of A)*x */ 386 z[2*i] += v[0]*x1 + v[2]*x2; 387 z[2*i+1] += v[2]*x1 + v[3]*x2; 388 v += 4; jmin++; 389 } 390 for (j=jmin; j<n; j++) { 391 /* (strict lower triangular part of A)*x */ 392 cval = ib[j]*2; 393 z[cval] += v[0]*x1 + v[1]*x2; 394 z[cval+1] += v[2]*x1 + v[3]*x2; 395 /* (strict upper triangular part of A)*x */ 396 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 397 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 398 v += 4; 399 } 400 xb +=2; ai++; 401 } 402 403 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 404 ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 405 PetscLogFlops(8*(a->nz*2 - A->m) - A->m); 406 PetscFunctionReturn(0); 407 } 408 409 #undef __FUNCT__ 410 #define __FUNCT__ "MatMult_SeqSBAIJ_3" 411 int MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz) 412 { 413 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 414 PetscScalar *x,*z,*xb,x1,x2,x3,zero=0.0; 415 MatScalar *v; 416 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 417 418 419 PetscFunctionBegin; 420 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 421 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 422 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 423 424 v = a->a; 425 xb = x; 426 427 for (i=0; i<mbs; i++) { 428 n = ai[1] - ai[0]; /* length of i_th block row of A */ 429 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 430 ib = aj + *ai; 431 jmin = 0; 432 if (*ib == i){ /* (diag of A)*x */ 433 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 434 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 435 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 436 v += 9; jmin++; 437 } 438 for (j=jmin; j<n; j++) { 439 /* (strict lower triangular part of A)*x */ 440 cval = ib[j]*3; 441 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 442 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 443 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 444 /* (strict upper triangular part of A)*x */ 445 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 446 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 447 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 448 v += 9; 449 } 450 xb +=3; ai++; 451 } 452 453 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 454 ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 455 PetscLogFlops(18*(a->nz*2 - A->m) - A->m); 456 PetscFunctionReturn(0); 457 } 458 459 #undef __FUNCT__ 460 #define __FUNCT__ "MatMult_SeqSBAIJ_4" 461 int MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz) 462 { 463 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 464 PetscScalar *x,*z,*xb,x1,x2,x3,x4,zero=0.0; 465 MatScalar *v; 466 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 467 468 PetscFunctionBegin; 469 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 470 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 471 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 472 473 v = a->a; 474 xb = x; 475 476 for (i=0; i<mbs; i++) { 477 n = ai[1] - ai[0]; /* length of i_th block row of A */ 478 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 479 ib = aj + *ai; 480 jmin = 0; 481 if (*ib == i){ /* (diag of A)*x */ 482 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 483 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 484 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 485 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 486 v += 16; jmin++; 487 } 488 for (j=jmin; j<n; j++) { 489 /* (strict lower triangular part of A)*x */ 490 cval = ib[j]*4; 491 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 492 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 493 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 494 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 495 /* (strict upper triangular part of A)*x */ 496 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 497 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 498 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 499 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 500 v += 16; 501 } 502 xb +=4; ai++; 503 } 504 505 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 506 ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 507 PetscLogFlops(32*(a->nz*2 - A->m) - A->m); 508 PetscFunctionReturn(0); 509 } 510 511 #undef __FUNCT__ 512 #define __FUNCT__ "MatMult_SeqSBAIJ_5" 513 int MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz) 514 { 515 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 516 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0; 517 MatScalar *v; 518 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 519 520 PetscFunctionBegin; 521 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 522 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 523 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 524 525 v = a->a; 526 xb = x; 527 528 for (i=0; i<mbs; i++) { 529 n = ai[1] - ai[0]; /* length of i_th block row of A */ 530 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 531 ib = aj + *ai; 532 jmin = 0; 533 if (*ib == i){ /* (diag of A)*x */ 534 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 535 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 536 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 537 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 538 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 539 v += 25; jmin++; 540 } 541 for (j=jmin; j<n; j++) { 542 /* (strict lower triangular part of A)*x */ 543 cval = ib[j]*5; 544 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 545 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 546 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 547 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 548 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 549 /* (strict upper triangular part of A)*x */ 550 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 551 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 552 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 553 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 554 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 555 v += 25; 556 } 557 xb +=5; ai++; 558 } 559 560 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 561 ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 562 PetscLogFlops(50*(a->nz*2 - A->m) - A->m); 563 PetscFunctionReturn(0); 564 } 565 566 567 #undef __FUNCT__ 568 #define __FUNCT__ "MatMult_SeqSBAIJ_6" 569 int MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz) 570 { 571 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 572 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0; 573 MatScalar *v; 574 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 575 576 PetscFunctionBegin; 577 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 578 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 579 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 580 581 v = a->a; 582 xb = x; 583 584 for (i=0; i<mbs; i++) { 585 n = ai[1] - ai[0]; /* length of i_th block row of A */ 586 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 587 ib = aj + *ai; 588 jmin = 0; 589 if (*ib == i){ /* (diag of A)*x */ 590 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 591 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 592 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 593 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 594 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 595 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 596 v += 36; jmin++; 597 } 598 for (j=jmin; j<n; j++) { 599 /* (strict lower triangular part of A)*x */ 600 cval = ib[j]*6; 601 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 602 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 603 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 604 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 605 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 606 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 607 /* (strict upper triangular part of A)*x */ 608 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 609 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 610 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 611 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 612 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 613 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 614 v += 36; 615 } 616 xb +=6; ai++; 617 } 618 619 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 620 ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 621 PetscLogFlops(72*(a->nz*2 - A->m) - A->m); 622 PetscFunctionReturn(0); 623 } 624 #undef __FUNCT__ 625 #define __FUNCT__ "MatMult_SeqSBAIJ_7" 626 int MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz) 627 { 628 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 629 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0; 630 MatScalar *v; 631 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 632 633 PetscFunctionBegin; 634 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 635 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 636 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 637 638 v = a->a; 639 xb = x; 640 641 for (i=0; i<mbs; i++) { 642 n = ai[1] - ai[0]; /* length of i_th block row of A */ 643 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 644 ib = aj + *ai; 645 jmin = 0; 646 if (*ib == i){ /* (diag of A)*x */ 647 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 648 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 649 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 650 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 651 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 652 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 653 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 654 v += 49; jmin++; 655 } 656 for (j=jmin; j<n; j++) { 657 /* (strict lower triangular part of A)*x */ 658 cval = ib[j]*7; 659 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 660 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 661 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 662 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 663 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 664 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 665 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 666 /* (strict upper triangular part of A)*x */ 667 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 668 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 669 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 670 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 671 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 672 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 673 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 674 v += 49; 675 } 676 xb +=7; ai++; 677 } 678 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 679 ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 680 PetscLogFlops(98*(a->nz*2 - A->m) - A->m); 681 PetscFunctionReturn(0); 682 } 683 684 /* 685 This will not work with MatScalar == float because it calls the BLAS 686 */ 687 #undef __FUNCT__ 688 #define __FUNCT__ "MatMult_SeqSBAIJ_N" 689 int MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz) 690 { 691 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 692 PetscScalar *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0; 693 MatScalar *v; 694 int ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2; 695 int ncols,k; 696 697 PetscFunctionBegin; 698 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 699 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); x_ptr=x; 700 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); z_ptr=z; 701 702 aj = a->j; 703 v = a->a; 704 ii = a->i; 705 706 if (!a->mult_work) { 707 ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 708 } 709 work = a->mult_work; 710 711 for (i=0; i<mbs; i++) { 712 n = ii[1] - ii[0]; ncols = n*bs; 713 workt = work; idx=aj+ii[0]; 714 715 /* upper triangular part */ 716 for (j=0; j<n; j++) { 717 xb = x_ptr + bs*(*idx++); 718 for (k=0; k<bs; k++) workt[k] = xb[k]; 719 workt += bs; 720 } 721 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 722 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 723 724 /* strict lower triangular part */ 725 idx = aj+ii[0]; 726 if (*idx == i){ 727 ncols -= bs; v += bs2; idx++; n--; 728 } 729 730 if (ncols > 0){ 731 workt = work; 732 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 733 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 734 for (j=0; j<n; j++) { 735 zb = z_ptr + bs*(*idx++); 736 for (k=0; k<bs; k++) zb[k] += workt[k] ; 737 workt += bs; 738 } 739 } 740 x += bs; v += n*bs2; z += bs; ii++; 741 } 742 743 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 744 ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 745 PetscLogFlops(2*(a->nz*2 - A->m)*bs2 - A->m); 746 PetscFunctionReturn(0); 747 } 748 749 #undef __FUNCT__ 750 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1" 751 int MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 752 { 753 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 754 PetscScalar *x,*y,*z,*xb,x1; 755 MatScalar *v; 756 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 757 758 PetscFunctionBegin; 759 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 760 if (yy != xx) { 761 ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr); 762 } else { 763 y = x; 764 } 765 if (zz != yy) { 766 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 767 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 768 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 769 } else { 770 z = y; 771 } 772 773 v = a->a; 774 xb = x; 775 776 for (i=0; i<mbs; i++) { 777 n = ai[1] - ai[0]; /* length of i_th row of A */ 778 x1 = xb[0]; 779 ib = aj + *ai; 780 jmin = 0; 781 if (*ib == i) { /* (diag of A)*x */ 782 z[i] += *v++ * x[*ib++]; jmin++; 783 } 784 for (j=jmin; j<n; j++) { 785 cval = *ib; 786 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 787 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 788 } 789 xb++; ai++; 790 } 791 792 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 793 if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr); 794 if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 795 796 PetscLogFlops(2*(a->nz*2 - A->m)); 797 PetscFunctionReturn(0); 798 } 799 800 #undef __FUNCT__ 801 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2" 802 int MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 803 { 804 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 805 PetscScalar *x,*y,*z,*xb,x1,x2; 806 MatScalar *v; 807 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 808 809 PetscFunctionBegin; 810 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 811 if (yy != xx) { 812 ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr); 813 } else { 814 y = x; 815 } 816 if (zz != yy) { 817 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 818 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 819 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 820 } else { 821 z = y; 822 } 823 824 v = a->a; 825 xb = x; 826 827 for (i=0; i<mbs; i++) { 828 n = ai[1] - ai[0]; /* length of i_th block row of A */ 829 x1 = xb[0]; x2 = xb[1]; 830 ib = aj + *ai; 831 jmin = 0; 832 if (*ib == i){ /* (diag of A)*x */ 833 z[2*i] += v[0]*x1 + v[2]*x2; 834 z[2*i+1] += v[2]*x1 + v[3]*x2; 835 v += 4; jmin++; 836 } 837 for (j=jmin; j<n; j++) { 838 /* (strict lower triangular part of A)*x */ 839 cval = ib[j]*2; 840 z[cval] += v[0]*x1 + v[1]*x2; 841 z[cval+1] += v[2]*x1 + v[3]*x2; 842 /* (strict upper triangular part of A)*x */ 843 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 844 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 845 v += 4; 846 } 847 xb +=2; ai++; 848 } 849 850 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 851 if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr); 852 if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 853 854 PetscLogFlops(4*(a->nz*2 - A->m)); 855 PetscFunctionReturn(0); 856 } 857 858 #undef __FUNCT__ 859 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3" 860 int MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 861 { 862 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 863 PetscScalar *x,*y,*z,*xb,x1,x2,x3; 864 MatScalar *v; 865 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 866 867 PetscFunctionBegin; 868 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 869 if (yy != xx) { 870 ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr); 871 } else { 872 y = x; 873 } 874 if (zz != yy) { 875 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 876 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 877 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 878 } else { 879 z = y; 880 } 881 882 v = a->a; 883 xb = x; 884 885 for (i=0; i<mbs; i++) { 886 n = ai[1] - ai[0]; /* length of i_th block row of A */ 887 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 888 ib = aj + *ai; 889 jmin = 0; 890 if (*ib == i){ /* (diag of A)*x */ 891 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 892 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 893 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 894 v += 9; jmin++; 895 } 896 for (j=jmin; j<n; j++) { 897 /* (strict lower triangular part of A)*x */ 898 cval = ib[j]*3; 899 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 900 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 901 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 902 /* (strict upper triangular part of A)*x */ 903 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 904 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 905 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 906 v += 9; 907 } 908 xb +=3; ai++; 909 } 910 911 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 912 if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr); 913 if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 914 915 PetscLogFlops(18*(a->nz*2 - A->m)); 916 PetscFunctionReturn(0); 917 } 918 919 #undef __FUNCT__ 920 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4" 921 int MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 922 { 923 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 924 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4; 925 MatScalar *v; 926 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 927 928 PetscFunctionBegin; 929 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 930 if (yy != xx) { 931 ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr); 932 } else { 933 y = x; 934 } 935 if (zz != yy) { 936 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 937 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 938 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 939 } else { 940 z = y; 941 } 942 943 v = a->a; 944 xb = x; 945 946 for (i=0; i<mbs; i++) { 947 n = ai[1] - ai[0]; /* length of i_th block row of A */ 948 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 949 ib = aj + *ai; 950 jmin = 0; 951 if (*ib == i){ /* (diag of A)*x */ 952 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 953 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 954 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 955 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 956 v += 16; jmin++; 957 } 958 for (j=jmin; j<n; j++) { 959 /* (strict lower triangular part of A)*x */ 960 cval = ib[j]*4; 961 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 962 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 963 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 964 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 965 /* (strict upper triangular part of A)*x */ 966 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 967 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 968 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 969 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 970 v += 16; 971 } 972 xb +=4; ai++; 973 } 974 975 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 976 if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr); 977 if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 978 979 PetscLogFlops(32*(a->nz*2 - A->m)); 980 PetscFunctionReturn(0); 981 } 982 983 #undef __FUNCT__ 984 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5" 985 int MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 986 { 987 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 988 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4,x5; 989 MatScalar *v; 990 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 991 992 PetscFunctionBegin; 993 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 994 if (yy != xx) { 995 ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr); 996 } else { 997 y = x; 998 } 999 if (zz != yy) { 1000 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 1001 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 1002 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 1003 } else { 1004 z = y; 1005 } 1006 1007 v = a->a; 1008 xb = x; 1009 1010 for (i=0; i<mbs; i++) { 1011 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1012 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 1013 ib = aj + *ai; 1014 jmin = 0; 1015 if (*ib == i){ /* (diag of A)*x */ 1016 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 1017 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 1018 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 1019 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 1020 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 1021 v += 25; jmin++; 1022 } 1023 for (j=jmin; j<n; j++) { 1024 /* (strict lower triangular part of A)*x */ 1025 cval = ib[j]*5; 1026 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1027 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1028 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 1029 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 1030 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 1031 /* (strict upper triangular part of A)*x */ 1032 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 1033 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 1034 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 1035 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 1036 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 1037 v += 25; 1038 } 1039 xb +=5; ai++; 1040 } 1041 1042 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 1043 if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr); 1044 if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 1045 1046 PetscLogFlops(50*(a->nz*2 - A->m)); 1047 PetscFunctionReturn(0); 1048 } 1049 #undef __FUNCT__ 1050 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6" 1051 int MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 1052 { 1053 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1054 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6; 1055 MatScalar *v; 1056 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 1057 1058 PetscFunctionBegin; 1059 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 1060 if (yy != xx) { 1061 ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr); 1062 } else { 1063 y = x; 1064 } 1065 if (zz != yy) { 1066 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 1067 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 1068 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 1069 } else { 1070 z = y; 1071 } 1072 1073 v = a->a; 1074 xb = x; 1075 1076 for (i=0; i<mbs; i++) { 1077 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1078 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 1079 ib = aj + *ai; 1080 jmin = 0; 1081 if (*ib == i){ /* (diag of A)*x */ 1082 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 1083 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 1084 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 1085 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 1086 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 1087 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 1088 v += 36; jmin++; 1089 } 1090 for (j=jmin; j<n; j++) { 1091 /* (strict lower triangular part of A)*x */ 1092 cval = ib[j]*6; 1093 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 1094 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 1095 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 1096 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 1097 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 1098 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 1099 /* (strict upper triangular part of A)*x */ 1100 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 1101 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 1102 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 1103 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 1104 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 1105 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 1106 v += 36; 1107 } 1108 xb +=6; ai++; 1109 } 1110 1111 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 1112 if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr); 1113 if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 1114 1115 PetscLogFlops(72*(a->nz*2 - A->m)); 1116 PetscFunctionReturn(0); 1117 } 1118 1119 #undef __FUNCT__ 1120 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7" 1121 int MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1122 { 1123 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1124 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6,x7; 1125 MatScalar *v; 1126 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 1127 1128 PetscFunctionBegin; 1129 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 1130 if (yy != xx) { 1131 ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr); 1132 } else { 1133 y = x; 1134 } 1135 if (zz != yy) { 1136 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 1137 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 1138 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 1139 } else { 1140 z = y; 1141 } 1142 1143 v = a->a; 1144 xb = x; 1145 1146 for (i=0; i<mbs; i++) { 1147 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1148 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 1149 ib = aj + *ai; 1150 jmin = 0; 1151 if (*ib == i){ /* (diag of A)*x */ 1152 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 1153 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 1154 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 1155 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 1156 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 1157 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 1158 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1159 v += 49; jmin++; 1160 } 1161 for (j=jmin; j<n; j++) { 1162 /* (strict lower triangular part of A)*x */ 1163 cval = ib[j]*7; 1164 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 1165 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 1166 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 1167 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 1168 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 1169 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 1170 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1171 /* (strict upper triangular part of A)*x */ 1172 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 1173 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 1174 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 1175 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 1176 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 1177 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 1178 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 1179 v += 49; 1180 } 1181 xb +=7; ai++; 1182 } 1183 1184 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 1185 if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr); 1186 if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 1187 1188 PetscLogFlops(98*(a->nz*2 - A->m)); 1189 PetscFunctionReturn(0); 1190 } 1191 1192 #undef __FUNCT__ 1193 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N" 1194 int MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1195 { 1196 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1197 PetscScalar *x,*x_ptr,*y,*z,*z_ptr=0,*xb,*zb,*work,*workt; 1198 MatScalar *v; 1199 int ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2; 1200 int ncols,k; 1201 1202 PetscFunctionBegin; 1203 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); x_ptr=x; 1204 if (yy != xx) { 1205 ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr); 1206 } else { 1207 y = x; 1208 } 1209 if (zz != yy) { 1210 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 1211 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); z_ptr=z; 1212 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 1213 } else { 1214 z = y; 1215 } 1216 1217 aj = a->j; 1218 v = a->a; 1219 ii = a->i; 1220 1221 if (!a->mult_work) { 1222 ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1223 } 1224 work = a->mult_work; 1225 1226 1227 for (i=0; i<mbs; i++) { 1228 n = ii[1] - ii[0]; ncols = n*bs; 1229 workt = work; idx=aj+ii[0]; 1230 1231 /* upper triangular part */ 1232 for (j=0; j<n; j++) { 1233 xb = x_ptr + bs*(*idx++); 1234 for (k=0; k<bs; k++) workt[k] = xb[k]; 1235 workt += bs; 1236 } 1237 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1238 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1239 1240 /* strict lower triangular part */ 1241 idx = aj+ii[0]; 1242 if (*idx == i){ 1243 ncols -= bs; v += bs2; idx++; n--; 1244 } 1245 if (ncols > 0){ 1246 workt = work; 1247 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1248 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 1249 for (j=0; j<n; j++) { 1250 zb = z_ptr + bs*(*idx++); 1251 /* idx++; */ 1252 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1253 workt += bs; 1254 } 1255 } 1256 1257 x += bs; v += n*bs2; z += bs; ii++; 1258 } 1259 1260 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 1261 if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr); 1262 if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 1263 1264 PetscLogFlops(2*(a->nz*2 - A->m)); 1265 PetscFunctionReturn(0); 1266 } 1267 1268 #undef __FUNCT__ 1269 #define __FUNCT__ "MatMultTranspose_SeqSBAIJ" 1270 int MatMultTranspose_SeqSBAIJ(Mat A,Vec xx,Vec zz) 1271 { 1272 int ierr; 1273 1274 PetscFunctionBegin; 1275 ierr = MatMult(A,xx,zz);CHKERRQ(ierr); 1276 PetscFunctionReturn(0); 1277 } 1278 1279 #undef __FUNCT__ 1280 #define __FUNCT__ "MatMultTransposeAdd_SeqSBAIJ" 1281 int MatMultTransposeAdd_SeqSBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1282 1283 { 1284 int ierr; 1285 1286 PetscFunctionBegin; 1287 ierr = MatMultAdd(A,xx,yy,zz);CHKERRQ(ierr); 1288 PetscFunctionReturn(0); 1289 } 1290 1291 #undef __FUNCT__ 1292 #define __FUNCT__ "MatScale_SeqSBAIJ" 1293 int MatScale_SeqSBAIJ(const PetscScalar *alpha,Mat inA) 1294 { 1295 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data; 1296 int one = 1,totalnz = a->bs2*a->nz; 1297 1298 PetscFunctionBegin; 1299 BLscal_(&totalnz,(PetscScalar*)alpha,a->a,&one); 1300 PetscLogFlops(totalnz); 1301 PetscFunctionReturn(0); 1302 } 1303 1304 #undef __FUNCT__ 1305 #define __FUNCT__ "MatNorm_SeqSBAIJ" 1306 int MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm) 1307 { 1308 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1309 MatScalar *v = a->a; 1310 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1311 int i,j,k,bs = a->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j; 1312 int *jl,*il,jmin,jmax,ierr,nexti,ik,*col; 1313 1314 PetscFunctionBegin; 1315 if (type == NORM_FROBENIUS) { 1316 for (k=0; k<mbs; k++){ 1317 jmin = a->i[k]; jmax = a->i[k+1]; 1318 col = aj + jmin; 1319 if (*col == k){ /* diagonal block */ 1320 for (i=0; i<bs2; i++){ 1321 #if defined(PETSC_USE_COMPLEX) 1322 sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++; 1323 #else 1324 sum_diag += (*v)*(*v); v++; 1325 #endif 1326 } 1327 jmin++; 1328 } 1329 for (j=jmin; j<jmax; j++){ /* off-diagonal blocks */ 1330 for (i=0; i<bs2; i++){ 1331 #if defined(PETSC_USE_COMPLEX) 1332 sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++; 1333 #else 1334 sum_off += (*v)*(*v); v++; 1335 #endif 1336 } 1337 } 1338 } 1339 *norm = sqrt(sum_diag + 2*sum_off); 1340 1341 } else if (type == NORM_INFINITY) { /* maximum row sum */ 1342 ierr = PetscMalloc(mbs*sizeof(int),&il);CHKERRQ(ierr); 1343 ierr = PetscMalloc(mbs*sizeof(int),&jl);CHKERRQ(ierr); 1344 ierr = PetscMalloc(bs*sizeof(PetscReal),&sum);CHKERRQ(ierr); 1345 for (i=0; i<mbs; i++) { 1346 jl[i] = mbs; il[0] = 0; 1347 } 1348 1349 *norm = 0.0; 1350 for (k=0; k<mbs; k++) { /* k_th block row */ 1351 for (j=0; j<bs; j++) sum[j]=0.0; 1352 1353 /*-- col sum --*/ 1354 i = jl[k]; /* first |A(i,k)| to be added */ 1355 /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1356 at step k */ 1357 while (i<mbs){ 1358 nexti = jl[i]; /* next block row to be added */ 1359 ik = il[i]; /* block index of A(i,k) in the array a */ 1360 for (j=0; j<bs; j++){ 1361 v = a->a + ik*bs2 + j*bs; 1362 for (k1=0; k1<bs; k1++) { 1363 sum[j] += PetscAbsScalar(*v); v++; 1364 } 1365 } 1366 /* update il, jl */ 1367 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1368 jmax = a->i[i+1]; 1369 if (jmin < jmax){ 1370 il[i] = jmin; 1371 j = a->j[jmin]; 1372 jl[i] = jl[j]; jl[j]=i; 1373 } 1374 i = nexti; 1375 } 1376 1377 /*-- row sum --*/ 1378 jmin = a->i[k]; jmax = a->i[k+1]; 1379 for (i=jmin; i<jmax; i++) { 1380 for (j=0; j<bs; j++){ 1381 v = a->a + i*bs2 + j; 1382 for (k1=0; k1<bs; k1++){ 1383 sum[j] += PetscAbsScalar(*v); 1384 v += bs; 1385 } 1386 } 1387 } 1388 /* add k_th block row to il, jl */ 1389 col = aj+jmin; 1390 if (*col == k) jmin++; 1391 if (jmin < jmax){ 1392 il[k] = jmin; 1393 j = a->j[jmin]; 1394 jl[k] = jl[j]; jl[j] = k; 1395 } 1396 for (j=0; j<bs; j++){ 1397 if (sum[j] > *norm) *norm = sum[j]; 1398 } 1399 } 1400 ierr = PetscFree(il);CHKERRQ(ierr); 1401 ierr = PetscFree(jl);CHKERRQ(ierr); 1402 ierr = PetscFree(sum);CHKERRQ(ierr); 1403 } else { 1404 SETERRQ(PETSC_ERR_SUP,"No support for this norm yet"); 1405 } 1406 PetscFunctionReturn(0); 1407 } 1408 1409 #undef __FUNCT__ 1410 #define __FUNCT__ "MatEqual_SeqSBAIJ" 1411 int MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg) 1412 { 1413 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data; 1414 int ierr; 1415 1416 PetscFunctionBegin; 1417 1418 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1419 if ((A->m != B->m) || (A->n != B->n) || (a->bs != b->bs)|| (a->nz != b->nz)) { 1420 *flg = PETSC_FALSE; 1421 PetscFunctionReturn(0); 1422 } 1423 1424 /* if the a->i are the same */ 1425 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(int),flg);CHKERRQ(ierr); 1426 if (*flg == PETSC_FALSE) { 1427 PetscFunctionReturn(0); 1428 } 1429 1430 /* if a->j are the same */ 1431 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int),flg);CHKERRQ(ierr); 1432 if (*flg == PETSC_FALSE) { 1433 PetscFunctionReturn(0); 1434 } 1435 /* if a->a are the same */ 1436 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1437 1438 PetscFunctionReturn(0); 1439 } 1440 1441 #undef __FUNCT__ 1442 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ" 1443 int MatGetDiagonal_SeqSBAIJ(Mat A,Vec v) 1444 { 1445 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1446 int ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 1447 PetscScalar *x,zero = 0.0; 1448 MatScalar *aa,*aa_j; 1449 1450 PetscFunctionBegin; 1451 bs = a->bs; 1452 if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1"); 1453 1454 aa = a->a; 1455 ai = a->i; 1456 aj = a->j; 1457 ambs = a->mbs; 1458 bs2 = a->bs2; 1459 1460 ierr = VecSet(&zero,v);CHKERRQ(ierr); 1461 ierr = VecGetArrayFast(v,&x);CHKERRQ(ierr); 1462 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1463 if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1464 for (i=0; i<ambs; i++) { 1465 j=ai[i]; 1466 if (aj[j] == i) { /* if this is a diagonal element */ 1467 row = i*bs; 1468 aa_j = aa + j*bs2; 1469 if (A->factor && bs==1){ 1470 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k]; 1471 } else { 1472 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1473 } 1474 } 1475 } 1476 1477 ierr = VecRestoreArrayFast(v,&x);CHKERRQ(ierr); 1478 PetscFunctionReturn(0); 1479 } 1480 1481 #undef __FUNCT__ 1482 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ" 1483 int MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr) 1484 { 1485 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1486 PetscScalar *l,*r,x,*li,*ri; 1487 MatScalar *aa,*v; 1488 int ierr,i,j,k,lm,rn,M,m,*ai,*aj,mbs,tmp,bs,bs2; 1489 1490 PetscFunctionBegin; 1491 ai = a->i; 1492 aj = a->j; 1493 aa = a->a; 1494 m = A->m; 1495 bs = a->bs; 1496 mbs = a->mbs; 1497 bs2 = a->bs2; 1498 1499 if (ll != rr) { 1500 SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n"); 1501 } 1502 if (ll) { 1503 ierr = VecGetArrayFast(ll,&l);CHKERRQ(ierr); 1504 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1505 if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 1506 for (i=0; i<mbs; i++) { /* for each block row */ 1507 M = ai[i+1] - ai[i]; 1508 li = l + i*bs; 1509 v = aa + bs2*ai[i]; 1510 for (j=0; j<M; j++) { /* for each block */ 1511 for (k=0; k<bs2; k++) { 1512 (*v++) *= li[k%bs]; 1513 } 1514 #ifdef CONT 1515 /* will be used to replace the above loop */ 1516 ri = l + bs*aj[ai[i]+j]; 1517 for (k=0; k<bs; k++) { /* column value */ 1518 x = ri[k]; 1519 for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x; 1520 } 1521 #endif 1522 1523 } 1524 } 1525 ierr = VecRestoreArrayFast(ll,&l);CHKERRQ(ierr); 1526 PetscLogFlops(2*a->nz); 1527 } 1528 /* will be deleted */ 1529 if (rr) { 1530 ierr = VecGetArrayFast(rr,&r);CHKERRQ(ierr); 1531 ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr); 1532 if (rn != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 1533 for (i=0; i<mbs; i++) { /* for each block row */ 1534 M = ai[i+1] - ai[i]; 1535 v = aa + bs2*ai[i]; 1536 for (j=0; j<M; j++) { /* for each block */ 1537 ri = r + bs*aj[ai[i]+j]; 1538 for (k=0; k<bs; k++) { 1539 x = ri[k]; 1540 for (tmp=0; tmp<bs; tmp++) (*v++) *= x; 1541 } 1542 } 1543 } 1544 ierr = VecRestoreArrayFast(rr,&r);CHKERRQ(ierr); 1545 PetscLogFlops(a->nz); 1546 } 1547 PetscFunctionReturn(0); 1548 } 1549 1550 #undef __FUNCT__ 1551 #define __FUNCT__ "MatGetInfo_SeqSBAIJ" 1552 int MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info) 1553 { 1554 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1555 1556 PetscFunctionBegin; 1557 info->rows_global = (double)A->m; 1558 info->columns_global = (double)A->m; 1559 info->rows_local = (double)A->m; 1560 info->columns_local = (double)A->m; 1561 info->block_size = a->bs2; 1562 info->nz_allocated = a->maxnz; /*num. of nonzeros in upper triangular part */ 1563 info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */ 1564 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 1565 info->assemblies = A->num_ass; 1566 info->mallocs = a->reallocs; 1567 info->memory = A->mem; 1568 if (A->factor) { 1569 info->fill_ratio_given = A->info.fill_ratio_given; 1570 info->fill_ratio_needed = A->info.fill_ratio_needed; 1571 info->factor_mallocs = A->info.factor_mallocs; 1572 } else { 1573 info->fill_ratio_given = 0; 1574 info->fill_ratio_needed = 0; 1575 info->factor_mallocs = 0; 1576 } 1577 PetscFunctionReturn(0); 1578 } 1579 1580 1581 #undef __FUNCT__ 1582 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ" 1583 int MatZeroEntries_SeqSBAIJ(Mat A) 1584 { 1585 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1586 int ierr; 1587 1588 PetscFunctionBegin; 1589 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 1590 PetscFunctionReturn(0); 1591 } 1592 1593 #undef __FUNCT__ 1594 #define __FUNCT__ "MatGetRowMax_SeqSBAIJ" 1595 int MatGetRowMax_SeqSBAIJ(Mat A,Vec v) 1596 { 1597 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1598 int ierr,i,j,n,row,col,bs,*ai,*aj,mbs; 1599 PetscReal atmp; 1600 MatScalar *aa; 1601 PetscScalar zero = 0.0,*x; 1602 int ncols,brow,bcol,krow,kcol; 1603 1604 PetscFunctionBegin; 1605 if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1606 bs = a->bs; 1607 aa = a->a; 1608 ai = a->i; 1609 aj = a->j; 1610 mbs = a->mbs; 1611 1612 ierr = VecSet(&zero,v);CHKERRQ(ierr); 1613 ierr = VecGetArrayFast(v,&x);CHKERRQ(ierr); 1614 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1615 if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1616 for (i=0; i<mbs; i++) { 1617 ncols = ai[1] - ai[0]; ai++; 1618 brow = bs*i; 1619 for (j=0; j<ncols; j++){ 1620 bcol = bs*(*aj); 1621 for (kcol=0; kcol<bs; kcol++){ 1622 col = bcol + kcol; /* col index */ 1623 for (krow=0; krow<bs; krow++){ 1624 atmp = PetscAbsScalar(*aa); aa++; 1625 row = brow + krow; /* row index */ 1626 /* printf("val[%d,%d]: %g\n",row,col,atmp); */ 1627 if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1628 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 1629 } 1630 } 1631 aj++; 1632 } 1633 } 1634 ierr = VecRestoreArrayFast(v,&x);CHKERRQ(ierr); 1635 PetscFunctionReturn(0); 1636 } 1637