1 /*$Id: sbaij2.c,v 1.32 2001/08/07 03:03:01 balay Exp $*/ 2 3 #include "src/mat/impls/baij/seq/baij.h" 4 #include "src/inline/spops.h" 5 #include "src/inline/ilu.h" 6 #include "petscbt.h" 7 #include "src/mat/impls/sbaij/seq/sbaij.h" 8 9 #undef __FUNCT__ 10 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ" 11 int MatIncreaseOverlap_SeqSBAIJ(Mat A,int is_max,IS is[],int ov) 12 { 13 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 14 int brow,i,j,k,l,mbs,n,*idx,ierr,*nidx,isz,bcol, 15 start,end,*ai,*aj,bs,*nidx2,*colsearch; 16 PetscBT table; 17 18 PetscFunctionBegin; 19 mbs = a->mbs; 20 ai = a->i; 21 aj = a->j; 22 bs = a->bs; 23 24 if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 25 26 ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr); 27 ierr = PetscMalloc((2*mbs+1)*sizeof(int),&nidx);CHKERRQ(ierr); 28 colsearch = nidx + mbs + 1; 29 ierr = PetscMalloc((A->m+1)*sizeof(int),&nidx2);CHKERRQ(ierr); 30 31 for (i=0; i<is_max; i++) { 32 33 /* Initialise the two local arrays */ 34 isz = 0; 35 ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr); 36 37 /* Extract the indices, assume there can be duplicate entries */ 38 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 39 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 40 41 /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */ 42 for (j=0; j<n ; ++j){ 43 bcol = idx[j]/bs; /* convert the indices into block indices */ 44 if (bcol >= mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 45 if(!PetscBTLookupSet(table,bcol)) { nidx[isz++] = bcol;} 46 } 47 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 48 ierr = ISDestroy(is[i]);CHKERRQ(ierr); 49 50 k = 0; 51 for (j=0; j<ov; j++){ /* for each overlap */ 52 /* initialize colsearch - 53 colsearch[i] points to the possible non-zero (i,active_col)-th entry in the array aj */ 54 for (brow=0; brow<mbs; brow++) colsearch[brow] = ai[brow]; 55 56 /* sort nidx[k:isz-1] - needed by col search */ 57 ierr = PetscSortInt(isz-k,nidx+k); 58 59 n = isz; 60 for (; k<n ; k++){ /* do only those brows in nidx[k], which are not done yet */ 61 brow = nidx[k]; 62 /* search block column */ 63 for (l=0; l<brow; l++){ 64 if (colsearch[l]<ai[l+1]){ 65 bcol = aj[colsearch[l]]; 66 while (bcol < brow ) { 67 colsearch[l]++; 68 if (colsearch[l]<ai[l+1]){ 69 bcol = aj[colsearch[l]]; 70 } else { 71 bcol = mbs; break; 72 } 73 } 74 if (bcol==brow){ 75 if (!PetscBTLookupSet(table,l)) {nidx[isz++] = l;} 76 colsearch[l]++; 77 } 78 } 79 } 80 /* search block row */ 81 start = ai[brow]; 82 end = ai[brow+1]; 83 for (l = start; l<end ; l++){ 84 bcol = aj[l]; 85 if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;} 86 } 87 } 88 } /* for each overlap */ 89 90 /* expand the Index Set */ 91 for (j=0; j<isz; j++) { 92 for (k=0; k<bs; k++) 93 nidx2[j*bs+k] = nidx[j]*bs+k; 94 } 95 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr); 96 } 97 ierr = PetscBTDestroy(table);CHKERRQ(ierr); 98 ierr = PetscFree(nidx);CHKERRQ(ierr); 99 ierr = PetscFree(nidx2);CHKERRQ(ierr); 100 PetscFunctionReturn(0); 101 } 102 103 #undef __FUNCT__ 104 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private" 105 int MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B) 106 { 107 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c; 108 int *smap,i,k,kstart,kend,ierr,oldcols = a->mbs,*lens; 109 int row,mat_i,*mat_j,tcol,*mat_ilen; 110 int *irow,nrows,*ssmap,bs=a->bs,bs2=a->bs2; 111 int *aj = a->j,*ai = a->i; 112 MatScalar *mat_a; 113 Mat C; 114 PetscTruth flag; 115 116 PetscFunctionBegin; 117 118 if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro"); 119 ierr = ISSorted(iscol,(PetscTruth*)&i);CHKERRQ(ierr); 120 if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 121 122 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 123 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 124 125 ierr = PetscMalloc((1+oldcols)*sizeof(int),&smap);CHKERRQ(ierr); 126 ssmap = smap; 127 ierr = PetscMalloc((1+nrows)*sizeof(int),&lens);CHKERRQ(ierr); 128 ierr = PetscMemzero(smap,oldcols*sizeof(int));CHKERRQ(ierr); 129 for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */ 130 /* determine lens of each row */ 131 for (i=0; i<nrows; i++) { 132 kstart = ai[irow[i]]; 133 kend = kstart + a->ilen[irow[i]]; 134 lens[i] = 0; 135 for (k=kstart; k<kend; k++) { 136 if (ssmap[aj[k]]) { 137 lens[i]++; 138 } 139 } 140 } 141 /* Create and fill new matrix */ 142 if (scall == MAT_REUSE_MATRIX) { 143 c = (Mat_SeqSBAIJ *)((*B)->data); 144 145 if (c->mbs!=nrows || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 146 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(int),&flag);CHKERRQ(ierr); 147 if (flag == PETSC_FALSE) { 148 SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 149 } 150 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(int));CHKERRQ(ierr); 151 C = *B; 152 } else { 153 ierr = MatCreate(A->comm,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE,&C);CHKERRQ(ierr); 154 ierr = MatSetType(C,A->type_name);CHKERRQ(ierr); 155 ierr = MatSeqSBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr); 156 } 157 c = (Mat_SeqSBAIJ *)(C->data); 158 for (i=0; i<nrows; i++) { 159 row = irow[i]; 160 kstart = ai[row]; 161 kend = kstart + a->ilen[row]; 162 mat_i = c->i[i]; 163 mat_j = c->j + mat_i; 164 mat_a = c->a + mat_i*bs2; 165 mat_ilen = c->ilen + i; 166 for (k=kstart; k<kend; k++) { 167 if ((tcol=ssmap[a->j[k]])) { 168 *mat_j++ = tcol - 1; 169 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 170 mat_a += bs2; 171 (*mat_ilen)++; 172 } 173 } 174 } 175 176 /* Free work space */ 177 ierr = PetscFree(smap);CHKERRQ(ierr); 178 ierr = PetscFree(lens);CHKERRQ(ierr); 179 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 180 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 181 182 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 183 *B = C; 184 PetscFunctionReturn(0); 185 } 186 187 #undef __FUNCT__ 188 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ" 189 int MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B) 190 { 191 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 192 IS is1; 193 int *vary,*iary,*irow,nrows,i,ierr,bs=a->bs,count; 194 195 PetscFunctionBegin; 196 if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro"); 197 198 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 199 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 200 201 /* Verify if the indices corespond to each element in a block 202 and form the IS with compressed IS */ 203 ierr = PetscMalloc(2*(a->mbs+1)*sizeof(int),&vary);CHKERRQ(ierr); 204 iary = vary + a->mbs; 205 ierr = PetscMemzero(vary,(a->mbs)*sizeof(int));CHKERRQ(ierr); 206 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 207 208 count = 0; 209 for (i=0; i<a->mbs; i++) { 210 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(1,"Index set does not match blocks"); 211 if (vary[i]==bs) iary[count++] = i; 212 } 213 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr); 214 215 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 216 ierr = PetscFree(vary);CHKERRQ(ierr); 217 218 ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);CHKERRQ(ierr); 219 ISDestroy(is1); 220 PetscFunctionReturn(0); 221 } 222 223 #undef __FUNCT__ 224 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ" 225 int MatGetSubMatrices_SeqSBAIJ(Mat A,int n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 226 { 227 int ierr,i; 228 229 PetscFunctionBegin; 230 if (scall == MAT_INITIAL_MATRIX) { 231 ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr); 232 } 233 234 for (i=0; i<n; i++) { 235 ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr); 236 } 237 PetscFunctionReturn(0); 238 } 239 240 /* -------------------------------------------------------*/ 241 /* Should check that shapes of vectors and matrices match */ 242 /* -------------------------------------------------------*/ 243 #include "petscblaslapack.h" 244 245 #undef __FUNCT__ 246 #define __FUNCT__ "MatMult_SeqSBAIJ_1" 247 int MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz) 248 { 249 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 250 PetscScalar *x,*z,*xb,x1,zero=0.0; 251 MatScalar *v; 252 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 253 254 PetscFunctionBegin; 255 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 256 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 257 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 258 259 v = a->a; 260 xb = x; 261 262 for (i=0; i<mbs; i++) { 263 n = ai[1] - ai[0]; /* length of i_th row of A */ 264 x1 = xb[0]; 265 ib = aj + *ai; 266 jmin = 0; 267 if (*ib == i) { /* (diag of A)*x */ 268 z[i] += *v++ * x[*ib++]; 269 jmin++; 270 } 271 for (j=jmin; j<n; j++) { 272 cval = *ib; 273 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 274 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 275 } 276 xb++; ai++; 277 } 278 279 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 280 ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 281 PetscLogFlops(2*(a->nz*2 - A->m) - A->m); /* nz = (nz+m)/2 */ 282 PetscFunctionReturn(0); 283 } 284 285 #undef __FUNCT__ 286 #define __FUNCT__ "MatMult_SeqSBAIJ_2" 287 int MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz) 288 { 289 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 290 PetscScalar *x,*z,*xb,x1,x2,zero=0.0; 291 MatScalar *v; 292 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 293 294 295 PetscFunctionBegin; 296 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 297 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 298 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 299 300 v = a->a; 301 xb = x; 302 303 for (i=0; i<mbs; i++) { 304 n = ai[1] - ai[0]; /* length of i_th block row of A */ 305 x1 = xb[0]; x2 = xb[1]; 306 ib = aj + *ai; 307 jmin = 0; 308 if (*ib == i){ /* (diag of A)*x */ 309 z[2*i] += v[0]*x1 + v[2]*x2; 310 z[2*i+1] += v[2]*x1 + v[3]*x2; 311 v += 4; jmin++; 312 } 313 for (j=jmin; j<n; j++) { 314 /* (strict lower triangular part of A)*x */ 315 cval = ib[j]*2; 316 z[cval] += v[0]*x1 + v[1]*x2; 317 z[cval+1] += v[2]*x1 + v[3]*x2; 318 /* (strict upper triangular part of A)*x */ 319 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 320 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 321 v += 4; 322 } 323 xb +=2; ai++; 324 } 325 326 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 327 ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 328 PetscLogFlops(8*(a->nz*2 - A->m) - A->m); 329 PetscFunctionReturn(0); 330 } 331 332 #undef __FUNCT__ 333 #define __FUNCT__ "MatMult_SeqSBAIJ_3" 334 int MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz) 335 { 336 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 337 PetscScalar *x,*z,*xb,x1,x2,x3,zero=0.0; 338 MatScalar *v; 339 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 340 341 342 PetscFunctionBegin; 343 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 344 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 345 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 346 347 v = a->a; 348 xb = x; 349 350 for (i=0; i<mbs; i++) { 351 n = ai[1] - ai[0]; /* length of i_th block row of A */ 352 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 353 ib = aj + *ai; 354 jmin = 0; 355 if (*ib == i){ /* (diag of A)*x */ 356 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 357 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 358 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 359 v += 9; jmin++; 360 } 361 for (j=jmin; j<n; j++) { 362 /* (strict lower triangular part of A)*x */ 363 cval = ib[j]*3; 364 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 365 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 366 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 367 /* (strict upper triangular part of A)*x */ 368 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 369 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 370 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 371 v += 9; 372 } 373 xb +=3; ai++; 374 } 375 376 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 377 ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 378 PetscLogFlops(18*(a->nz*2 - A->m) - A->m); 379 PetscFunctionReturn(0); 380 } 381 382 #undef __FUNCT__ 383 #define __FUNCT__ "MatMult_SeqSBAIJ_4" 384 int MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz) 385 { 386 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 387 PetscScalar *x,*z,*xb,x1,x2,x3,x4,zero=0.0; 388 MatScalar *v; 389 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 390 391 PetscFunctionBegin; 392 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 393 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 394 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 395 396 v = a->a; 397 xb = x; 398 399 for (i=0; i<mbs; i++) { 400 n = ai[1] - ai[0]; /* length of i_th block row of A */ 401 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 402 ib = aj + *ai; 403 jmin = 0; 404 if (*ib == i){ /* (diag of A)*x */ 405 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 406 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 407 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 408 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 409 v += 16; jmin++; 410 } 411 for (j=jmin; j<n; j++) { 412 /* (strict lower triangular part of A)*x */ 413 cval = ib[j]*4; 414 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 415 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 416 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 417 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 418 /* (strict upper triangular part of A)*x */ 419 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 420 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 421 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 422 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 423 v += 16; 424 } 425 xb +=4; ai++; 426 } 427 428 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 429 ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 430 PetscLogFlops(32*(a->nz*2 - A->m) - A->m); 431 PetscFunctionReturn(0); 432 } 433 434 #undef __FUNCT__ 435 #define __FUNCT__ "MatMult_SeqSBAIJ_5" 436 int MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz) 437 { 438 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 439 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0; 440 MatScalar *v; 441 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 442 443 PetscFunctionBegin; 444 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 445 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 446 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 447 448 v = a->a; 449 xb = x; 450 451 for (i=0; i<mbs; i++) { 452 n = ai[1] - ai[0]; /* length of i_th block row of A */ 453 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 454 ib = aj + *ai; 455 jmin = 0; 456 if (*ib == i){ /* (diag of A)*x */ 457 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 458 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 459 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 460 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 461 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 462 v += 25; jmin++; 463 } 464 for (j=jmin; j<n; j++) { 465 /* (strict lower triangular part of A)*x */ 466 cval = ib[j]*5; 467 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 468 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 469 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 470 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 471 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 472 /* (strict upper triangular part of A)*x */ 473 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 474 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 475 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 476 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 477 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 478 v += 25; 479 } 480 xb +=5; ai++; 481 } 482 483 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 484 ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 485 PetscLogFlops(50*(a->nz*2 - A->m) - A->m); 486 PetscFunctionReturn(0); 487 } 488 489 490 #undef __FUNCT__ 491 #define __FUNCT__ "MatMult_SeqSBAIJ_6" 492 int MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz) 493 { 494 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 495 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0; 496 MatScalar *v; 497 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 498 499 PetscFunctionBegin; 500 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 501 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 502 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 503 504 v = a->a; 505 xb = x; 506 507 for (i=0; i<mbs; i++) { 508 n = ai[1] - ai[0]; /* length of i_th block row of A */ 509 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 510 ib = aj + *ai; 511 jmin = 0; 512 if (*ib == i){ /* (diag of A)*x */ 513 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 514 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 515 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 516 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 517 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 518 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 519 v += 36; jmin++; 520 } 521 for (j=jmin; j<n; j++) { 522 /* (strict lower triangular part of A)*x */ 523 cval = ib[j]*6; 524 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 525 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 526 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 527 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 528 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 529 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 530 /* (strict upper triangular part of A)*x */ 531 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 532 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 533 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 534 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 535 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 536 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 537 v += 36; 538 } 539 xb +=6; ai++; 540 } 541 542 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 543 ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 544 PetscLogFlops(72*(a->nz*2 - A->m) - A->m); 545 PetscFunctionReturn(0); 546 } 547 #undef __FUNCT__ 548 #define __FUNCT__ "MatMult_SeqSBAIJ_7" 549 int MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz) 550 { 551 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 552 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0; 553 MatScalar *v; 554 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 555 556 PetscFunctionBegin; 557 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 558 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 559 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 560 561 v = a->a; 562 xb = x; 563 564 for (i=0; i<mbs; i++) { 565 n = ai[1] - ai[0]; /* length of i_th block row of A */ 566 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 567 ib = aj + *ai; 568 jmin = 0; 569 if (*ib == i){ /* (diag of A)*x */ 570 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 571 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 572 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 573 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 574 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 575 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 576 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 577 v += 49; jmin++; 578 } 579 for (j=jmin; j<n; j++) { 580 /* (strict lower triangular part of A)*x */ 581 cval = ib[j]*7; 582 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 583 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 584 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 585 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 586 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 587 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 588 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 589 /* (strict upper triangular part of A)*x */ 590 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 591 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 592 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 593 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 594 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 595 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 596 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 597 v += 49; 598 } 599 xb +=7; ai++; 600 } 601 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 602 ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 603 PetscLogFlops(98*(a->nz*2 - A->m) - A->m); 604 PetscFunctionReturn(0); 605 } 606 607 /* 608 This will not work with MatScalar == float because it calls the BLAS 609 */ 610 #undef __FUNCT__ 611 #define __FUNCT__ "MatMult_SeqSBAIJ_N" 612 int MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz) 613 { 614 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 615 PetscScalar *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0; 616 MatScalar *v; 617 int ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2; 618 int ncols,k; 619 620 PetscFunctionBegin; 621 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 622 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); x_ptr=x; 623 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); z_ptr=z; 624 625 aj = a->j; 626 v = a->a; 627 ii = a->i; 628 629 if (!a->mult_work) { 630 ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 631 } 632 work = a->mult_work; 633 634 for (i=0; i<mbs; i++) { 635 n = ii[1] - ii[0]; ncols = n*bs; 636 workt = work; idx=aj+ii[0]; 637 638 /* upper triangular part */ 639 for (j=0; j<n; j++) { 640 xb = x_ptr + bs*(*idx++); 641 for (k=0; k<bs; k++) workt[k] = xb[k]; 642 workt += bs; 643 } 644 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 645 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 646 647 /* strict lower triangular part */ 648 idx = aj+ii[0]; 649 if (*idx == i){ 650 ncols -= bs; v += bs2; idx++; n--; 651 } 652 653 if (ncols > 0){ 654 workt = work; 655 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 656 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 657 for (j=0; j<n; j++) { 658 zb = z_ptr + bs*(*idx++); 659 for (k=0; k<bs; k++) zb[k] += workt[k] ; 660 workt += bs; 661 } 662 } 663 x += bs; v += n*bs2; z += bs; ii++; 664 } 665 666 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 667 ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 668 PetscLogFlops(2*(a->nz*2 - A->m)*bs2 - A->m); 669 PetscFunctionReturn(0); 670 } 671 672 #undef __FUNCT__ 673 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1" 674 int MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 675 { 676 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 677 PetscScalar *x,*y,*z,*xb,x1; 678 MatScalar *v; 679 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 680 681 PetscFunctionBegin; 682 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 683 if (yy != xx) { 684 ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr); 685 } else { 686 y = x; 687 } 688 if (zz != yy) { 689 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 690 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 691 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 692 } else { 693 z = y; 694 } 695 696 v = a->a; 697 xb = x; 698 699 for (i=0; i<mbs; i++) { 700 n = ai[1] - ai[0]; /* length of i_th row of A */ 701 x1 = xb[0]; 702 ib = aj + *ai; 703 jmin = 0; 704 if (*ib == i) { /* (diag of A)*x */ 705 z[i] += *v++ * x[*ib++]; jmin++; 706 } 707 for (j=jmin; j<n; j++) { 708 cval = *ib; 709 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 710 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 711 } 712 xb++; ai++; 713 } 714 715 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 716 if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr); 717 if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 718 719 PetscLogFlops(2*(a->nz*2 - A->m)); 720 PetscFunctionReturn(0); 721 } 722 723 #undef __FUNCT__ 724 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2" 725 int MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 726 { 727 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 728 PetscScalar *x,*y,*z,*xb,x1,x2; 729 MatScalar *v; 730 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 731 732 PetscFunctionBegin; 733 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 734 if (yy != xx) { 735 ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr); 736 } else { 737 y = x; 738 } 739 if (zz != yy) { 740 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 741 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 742 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 743 } else { 744 z = y; 745 } 746 747 v = a->a; 748 xb = x; 749 750 for (i=0; i<mbs; i++) { 751 n = ai[1] - ai[0]; /* length of i_th block row of A */ 752 x1 = xb[0]; x2 = xb[1]; 753 ib = aj + *ai; 754 jmin = 0; 755 if (*ib == i){ /* (diag of A)*x */ 756 z[2*i] += v[0]*x1 + v[2]*x2; 757 z[2*i+1] += v[2]*x1 + v[3]*x2; 758 v += 4; jmin++; 759 } 760 for (j=jmin; j<n; j++) { 761 /* (strict lower triangular part of A)*x */ 762 cval = ib[j]*2; 763 z[cval] += v[0]*x1 + v[1]*x2; 764 z[cval+1] += v[2]*x1 + v[3]*x2; 765 /* (strict upper triangular part of A)*x */ 766 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 767 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 768 v += 4; 769 } 770 xb +=2; ai++; 771 } 772 773 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 774 if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr); 775 if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 776 777 PetscLogFlops(4*(a->nz*2 - A->m)); 778 PetscFunctionReturn(0); 779 } 780 781 #undef __FUNCT__ 782 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3" 783 int MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 784 { 785 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 786 PetscScalar *x,*y,*z,*xb,x1,x2,x3; 787 MatScalar *v; 788 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 789 790 PetscFunctionBegin; 791 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 792 if (yy != xx) { 793 ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr); 794 } else { 795 y = x; 796 } 797 if (zz != yy) { 798 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 799 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 800 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 801 } else { 802 z = y; 803 } 804 805 v = a->a; 806 xb = x; 807 808 for (i=0; i<mbs; i++) { 809 n = ai[1] - ai[0]; /* length of i_th block row of A */ 810 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 811 ib = aj + *ai; 812 jmin = 0; 813 if (*ib == i){ /* (diag of A)*x */ 814 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 815 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 816 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 817 v += 9; jmin++; 818 } 819 for (j=jmin; j<n; j++) { 820 /* (strict lower triangular part of A)*x */ 821 cval = ib[j]*3; 822 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 823 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 824 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 825 /* (strict upper triangular part of A)*x */ 826 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 827 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 828 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 829 v += 9; 830 } 831 xb +=3; ai++; 832 } 833 834 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 835 if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr); 836 if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 837 838 PetscLogFlops(18*(a->nz*2 - A->m)); 839 PetscFunctionReturn(0); 840 } 841 842 #undef __FUNCT__ 843 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4" 844 int MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 845 { 846 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 847 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4; 848 MatScalar *v; 849 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 850 851 PetscFunctionBegin; 852 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 853 if (yy != xx) { 854 ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr); 855 } else { 856 y = x; 857 } 858 if (zz != yy) { 859 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 860 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 861 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 862 } else { 863 z = y; 864 } 865 866 v = a->a; 867 xb = x; 868 869 for (i=0; i<mbs; i++) { 870 n = ai[1] - ai[0]; /* length of i_th block row of A */ 871 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 872 ib = aj + *ai; 873 jmin = 0; 874 if (*ib == i){ /* (diag of A)*x */ 875 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 876 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 877 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 878 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 879 v += 16; jmin++; 880 } 881 for (j=jmin; j<n; j++) { 882 /* (strict lower triangular part of A)*x */ 883 cval = ib[j]*4; 884 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 885 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 886 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 887 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 888 /* (strict upper triangular part of A)*x */ 889 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 890 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 891 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 892 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 893 v += 16; 894 } 895 xb +=4; ai++; 896 } 897 898 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 899 if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr); 900 if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 901 902 PetscLogFlops(32*(a->nz*2 - A->m)); 903 PetscFunctionReturn(0); 904 } 905 906 #undef __FUNCT__ 907 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5" 908 int MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 909 { 910 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 911 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4,x5; 912 MatScalar *v; 913 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 914 915 PetscFunctionBegin; 916 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 917 if (yy != xx) { 918 ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr); 919 } else { 920 y = x; 921 } 922 if (zz != yy) { 923 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 924 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 925 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 926 } else { 927 z = y; 928 } 929 930 v = a->a; 931 xb = x; 932 933 for (i=0; i<mbs; i++) { 934 n = ai[1] - ai[0]; /* length of i_th block row of A */ 935 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 936 ib = aj + *ai; 937 jmin = 0; 938 if (*ib == i){ /* (diag of A)*x */ 939 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 940 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 941 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 942 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 943 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 944 v += 25; jmin++; 945 } 946 for (j=jmin; j<n; j++) { 947 /* (strict lower triangular part of A)*x */ 948 cval = ib[j]*5; 949 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 950 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 951 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 952 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 953 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 954 /* (strict upper triangular part of A)*x */ 955 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 956 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 957 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 958 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 959 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 960 v += 25; 961 } 962 xb +=5; ai++; 963 } 964 965 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 966 if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr); 967 if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 968 969 PetscLogFlops(50*(a->nz*2 - A->m)); 970 PetscFunctionReturn(0); 971 } 972 #undef __FUNCT__ 973 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6" 974 int MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 975 { 976 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 977 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6; 978 MatScalar *v; 979 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 980 981 PetscFunctionBegin; 982 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 983 if (yy != xx) { 984 ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr); 985 } else { 986 y = x; 987 } 988 if (zz != yy) { 989 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 990 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 991 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 992 } else { 993 z = y; 994 } 995 996 v = a->a; 997 xb = x; 998 999 for (i=0; i<mbs; i++) { 1000 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1001 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 1002 ib = aj + *ai; 1003 jmin = 0; 1004 if (*ib == i){ /* (diag of A)*x */ 1005 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 1006 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 1007 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 1008 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 1009 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 1010 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 1011 v += 36; jmin++; 1012 } 1013 for (j=jmin; j<n; j++) { 1014 /* (strict lower triangular part of A)*x */ 1015 cval = ib[j]*6; 1016 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 1017 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 1018 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 1019 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 1020 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 1021 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 1022 /* (strict upper triangular part of A)*x */ 1023 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 1024 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 1025 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 1026 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 1027 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 1028 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 1029 v += 36; 1030 } 1031 xb +=6; ai++; 1032 } 1033 1034 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 1035 if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr); 1036 if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 1037 1038 PetscLogFlops(72*(a->nz*2 - A->m)); 1039 PetscFunctionReturn(0); 1040 } 1041 1042 #undef __FUNCT__ 1043 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7" 1044 int MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1045 { 1046 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1047 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6,x7; 1048 MatScalar *v; 1049 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 1050 1051 PetscFunctionBegin; 1052 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 1053 if (yy != xx) { 1054 ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr); 1055 } else { 1056 y = x; 1057 } 1058 if (zz != yy) { 1059 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 1060 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 1061 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 1062 } else { 1063 z = y; 1064 } 1065 1066 v = a->a; 1067 xb = x; 1068 1069 for (i=0; i<mbs; i++) { 1070 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1071 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 1072 ib = aj + *ai; 1073 jmin = 0; 1074 if (*ib == i){ /* (diag of A)*x */ 1075 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 1076 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 1077 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 1078 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 1079 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 1080 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 1081 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1082 v += 49; jmin++; 1083 } 1084 for (j=jmin; j<n; j++) { 1085 /* (strict lower triangular part of A)*x */ 1086 cval = ib[j]*7; 1087 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 1088 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 1089 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 1090 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 1091 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 1092 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 1093 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1094 /* (strict upper triangular part of A)*x */ 1095 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 1096 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 1097 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 1098 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 1099 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 1100 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 1101 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 1102 v += 49; 1103 } 1104 xb +=7; ai++; 1105 } 1106 1107 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 1108 if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr); 1109 if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 1110 1111 PetscLogFlops(98*(a->nz*2 - A->m)); 1112 PetscFunctionReturn(0); 1113 } 1114 1115 #undef __FUNCT__ 1116 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N" 1117 int MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1118 { 1119 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1120 PetscScalar *x,*x_ptr,*y,*z,*z_ptr=0,*xb,*zb,*work,*workt; 1121 MatScalar *v; 1122 int ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2; 1123 int ncols,k; 1124 1125 PetscFunctionBegin; 1126 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); x_ptr=x; 1127 if (yy != xx) { 1128 ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr); 1129 } else { 1130 y = x; 1131 } 1132 if (zz != yy) { 1133 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 1134 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); z_ptr=z; 1135 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 1136 } else { 1137 z = y; 1138 } 1139 1140 aj = a->j; 1141 v = a->a; 1142 ii = a->i; 1143 1144 if (!a->mult_work) { 1145 ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1146 } 1147 work = a->mult_work; 1148 1149 1150 for (i=0; i<mbs; i++) { 1151 n = ii[1] - ii[0]; ncols = n*bs; 1152 workt = work; idx=aj+ii[0]; 1153 1154 /* upper triangular part */ 1155 for (j=0; j<n; j++) { 1156 xb = x_ptr + bs*(*idx++); 1157 for (k=0; k<bs; k++) workt[k] = xb[k]; 1158 workt += bs; 1159 } 1160 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1161 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1162 1163 /* strict lower triangular part */ 1164 idx = aj+ii[0]; 1165 if (*idx == i){ 1166 ncols -= bs; v += bs2; idx++; n--; 1167 } 1168 if (ncols > 0){ 1169 workt = work; 1170 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1171 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 1172 for (j=0; j<n; j++) { 1173 zb = z_ptr + bs*(*idx++); 1174 /* idx++; */ 1175 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1176 workt += bs; 1177 } 1178 } 1179 1180 x += bs; v += n*bs2; z += bs; ii++; 1181 } 1182 1183 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 1184 if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr); 1185 if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 1186 1187 PetscLogFlops(2*(a->nz*2 - A->m)); 1188 PetscFunctionReturn(0); 1189 } 1190 1191 #undef __FUNCT__ 1192 #define __FUNCT__ "MatMultTranspose_SeqSBAIJ" 1193 int MatMultTranspose_SeqSBAIJ(Mat A,Vec xx,Vec zz) 1194 { 1195 int ierr; 1196 1197 PetscFunctionBegin; 1198 ierr = MatMult(A,xx,zz);CHKERRQ(ierr); 1199 PetscFunctionReturn(0); 1200 } 1201 1202 #undef __FUNCT__ 1203 #define __FUNCT__ "MatMultTransposeAdd_SeqSBAIJ" 1204 int MatMultTransposeAdd_SeqSBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1205 1206 { 1207 int ierr; 1208 1209 PetscFunctionBegin; 1210 ierr = MatMultAdd(A,xx,yy,zz);CHKERRQ(ierr); 1211 PetscFunctionReturn(0); 1212 } 1213 1214 #undef __FUNCT__ 1215 #define __FUNCT__ "MatScale_SeqSBAIJ" 1216 int MatScale_SeqSBAIJ(const PetscScalar *alpha,Mat inA) 1217 { 1218 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data; 1219 int one = 1,totalnz = a->bs2*a->nz; 1220 1221 PetscFunctionBegin; 1222 BLscal_(&totalnz,(PetscScalar*)alpha,a->a,&one); 1223 PetscLogFlops(totalnz); 1224 PetscFunctionReturn(0); 1225 } 1226 1227 #undef __FUNCT__ 1228 #define __FUNCT__ "MatNorm_SeqSBAIJ" 1229 int MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm) 1230 { 1231 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1232 MatScalar *v = a->a; 1233 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1234 int i,j,k,bs = a->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j; 1235 int *jl,*il,jmin,jmax,ierr,nexti,ik,*col; 1236 1237 PetscFunctionBegin; 1238 if (type == NORM_FROBENIUS) { 1239 for (k=0; k<mbs; k++){ 1240 jmin = a->i[k]; jmax = a->i[k+1]; 1241 col = aj + jmin; 1242 if (*col == k){ /* diagonal block */ 1243 for (i=0; i<bs2; i++){ 1244 #if defined(PETSC_USE_COMPLEX) 1245 sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++; 1246 #else 1247 sum_diag += (*v)*(*v); v++; 1248 #endif 1249 } 1250 jmin++; 1251 } 1252 for (j=jmin; j<jmax; j++){ /* off-diagonal blocks */ 1253 for (i=0; i<bs2; i++){ 1254 #if defined(PETSC_USE_COMPLEX) 1255 sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++; 1256 #else 1257 sum_off += (*v)*(*v); v++; 1258 #endif 1259 } 1260 } 1261 } 1262 *norm = sqrt(sum_diag + 2*sum_off); 1263 1264 } else if (type == NORM_INFINITY) { /* maximum row sum */ 1265 ierr = PetscMalloc(mbs*sizeof(int),&il);CHKERRQ(ierr); 1266 ierr = PetscMalloc(mbs*sizeof(int),&jl);CHKERRQ(ierr); 1267 ierr = PetscMalloc(bs*sizeof(PetscReal),&sum);CHKERRQ(ierr); 1268 for (i=0; i<mbs; i++) { 1269 jl[i] = mbs; il[0] = 0; 1270 } 1271 1272 *norm = 0.0; 1273 for (k=0; k<mbs; k++) { /* k_th block row */ 1274 for (j=0; j<bs; j++) sum[j]=0.0; 1275 1276 /*-- col sum --*/ 1277 i = jl[k]; /* first |A(i,k)| to be added */ 1278 /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1279 at step k */ 1280 while (i<mbs){ 1281 nexti = jl[i]; /* next block row to be added */ 1282 ik = il[i]; /* block index of A(i,k) in the array a */ 1283 for (j=0; j<bs; j++){ 1284 v = a->a + ik*bs2 + j*bs; 1285 for (k1=0; k1<bs; k1++) { 1286 sum[j] += PetscAbsScalar(*v); v++; 1287 } 1288 } 1289 /* update il, jl */ 1290 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1291 jmax = a->i[i+1]; 1292 if (jmin < jmax){ 1293 il[i] = jmin; 1294 j = a->j[jmin]; 1295 jl[i] = jl[j]; jl[j]=i; 1296 } 1297 i = nexti; 1298 } 1299 1300 /*-- row sum --*/ 1301 jmin = a->i[k]; jmax = a->i[k+1]; 1302 for (i=jmin; i<jmax; i++) { 1303 for (j=0; j<bs; j++){ 1304 v = a->a + i*bs2 + j; 1305 for (k1=0; k1<bs; k1++){ 1306 sum[j] += PetscAbsScalar(*v); 1307 v += bs; 1308 } 1309 } 1310 } 1311 /* add k_th block row to il, jl */ 1312 col = aj+jmin; 1313 if (*col == k) jmin++; 1314 if (jmin < jmax){ 1315 il[k] = jmin; 1316 j = a->j[jmin]; 1317 jl[k] = jl[j]; jl[j] = k; 1318 } 1319 for (j=0; j<bs; j++){ 1320 if (sum[j] > *norm) *norm = sum[j]; 1321 } 1322 } 1323 ierr = PetscFree(il);CHKERRQ(ierr); 1324 ierr = PetscFree(jl);CHKERRQ(ierr); 1325 ierr = PetscFree(sum);CHKERRQ(ierr); 1326 } else { 1327 SETERRQ(PETSC_ERR_SUP,"No support for this norm yet"); 1328 } 1329 PetscFunctionReturn(0); 1330 } 1331 1332 #undef __FUNCT__ 1333 #define __FUNCT__ "MatEqual_SeqSBAIJ" 1334 int MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg) 1335 { 1336 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data; 1337 int ierr; 1338 1339 PetscFunctionBegin; 1340 1341 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1342 if ((A->m != B->m) || (A->n != B->n) || (a->bs != b->bs)|| (a->nz != b->nz)) { 1343 *flg = PETSC_FALSE; 1344 PetscFunctionReturn(0); 1345 } 1346 1347 /* if the a->i are the same */ 1348 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(int),flg);CHKERRQ(ierr); 1349 if (*flg == PETSC_FALSE) { 1350 PetscFunctionReturn(0); 1351 } 1352 1353 /* if a->j are the same */ 1354 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int),flg);CHKERRQ(ierr); 1355 if (*flg == PETSC_FALSE) { 1356 PetscFunctionReturn(0); 1357 } 1358 /* if a->a are the same */ 1359 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1360 1361 PetscFunctionReturn(0); 1362 } 1363 1364 #undef __FUNCT__ 1365 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ" 1366 int MatGetDiagonal_SeqSBAIJ(Mat A,Vec v) 1367 { 1368 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1369 int ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 1370 PetscScalar *x,zero = 0.0; 1371 MatScalar *aa,*aa_j; 1372 1373 PetscFunctionBegin; 1374 bs = a->bs; 1375 if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1"); 1376 1377 aa = a->a; 1378 ai = a->i; 1379 aj = a->j; 1380 ambs = a->mbs; 1381 bs2 = a->bs2; 1382 1383 ierr = VecSet(&zero,v);CHKERRQ(ierr); 1384 ierr = VecGetArrayFast(v,&x);CHKERRQ(ierr); 1385 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1386 if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1387 for (i=0; i<ambs; i++) { 1388 j=ai[i]; 1389 if (aj[j] == i) { /* if this is a diagonal element */ 1390 row = i*bs; 1391 aa_j = aa + j*bs2; 1392 if (A->factor && bs==1){ 1393 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k]; 1394 } else { 1395 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1396 } 1397 } 1398 } 1399 1400 ierr = VecRestoreArrayFast(v,&x);CHKERRQ(ierr); 1401 PetscFunctionReturn(0); 1402 } 1403 1404 #undef __FUNCT__ 1405 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ" 1406 int MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr) 1407 { 1408 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1409 PetscScalar *l,*r,x,*li,*ri; 1410 MatScalar *aa,*v; 1411 int ierr,i,j,k,lm,rn,M,m,*ai,*aj,mbs,tmp,bs,bs2; 1412 1413 PetscFunctionBegin; 1414 ai = a->i; 1415 aj = a->j; 1416 aa = a->a; 1417 m = A->m; 1418 bs = a->bs; 1419 mbs = a->mbs; 1420 bs2 = a->bs2; 1421 1422 if (ll != rr) { 1423 SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n"); 1424 } 1425 if (ll) { 1426 ierr = VecGetArrayFast(ll,&l);CHKERRQ(ierr); 1427 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1428 if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 1429 for (i=0; i<mbs; i++) { /* for each block row */ 1430 M = ai[i+1] - ai[i]; 1431 li = l + i*bs; 1432 v = aa + bs2*ai[i]; 1433 for (j=0; j<M; j++) { /* for each block */ 1434 for (k=0; k<bs2; k++) { 1435 (*v++) *= li[k%bs]; 1436 } 1437 #ifdef CONT 1438 /* will be used to replace the above loop */ 1439 ri = l + bs*aj[ai[i]+j]; 1440 for (k=0; k<bs; k++) { /* column value */ 1441 x = ri[k]; 1442 for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x; 1443 } 1444 #endif 1445 1446 } 1447 } 1448 ierr = VecRestoreArrayFast(ll,&l);CHKERRQ(ierr); 1449 PetscLogFlops(2*a->nz); 1450 } 1451 /* will be deleted */ 1452 if (rr) { 1453 ierr = VecGetArrayFast(rr,&r);CHKERRQ(ierr); 1454 ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr); 1455 if (rn != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 1456 for (i=0; i<mbs; i++) { /* for each block row */ 1457 M = ai[i+1] - ai[i]; 1458 v = aa + bs2*ai[i]; 1459 for (j=0; j<M; j++) { /* for each block */ 1460 ri = r + bs*aj[ai[i]+j]; 1461 for (k=0; k<bs; k++) { 1462 x = ri[k]; 1463 for (tmp=0; tmp<bs; tmp++) (*v++) *= x; 1464 } 1465 } 1466 } 1467 ierr = VecRestoreArrayFast(rr,&r);CHKERRQ(ierr); 1468 PetscLogFlops(a->nz); 1469 } 1470 PetscFunctionReturn(0); 1471 } 1472 1473 #undef __FUNCT__ 1474 #define __FUNCT__ "MatGetInfo_SeqSBAIJ" 1475 int MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info) 1476 { 1477 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1478 1479 PetscFunctionBegin; 1480 info->rows_global = (double)A->m; 1481 info->columns_global = (double)A->m; 1482 info->rows_local = (double)A->m; 1483 info->columns_local = (double)A->m; 1484 info->block_size = a->bs2; 1485 info->nz_allocated = a->maxnz; /*num. of nonzeros in upper triangular part */ 1486 info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */ 1487 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 1488 info->assemblies = A->num_ass; 1489 info->mallocs = a->reallocs; 1490 info->memory = A->mem; 1491 if (A->factor) { 1492 info->fill_ratio_given = A->info.fill_ratio_given; 1493 info->fill_ratio_needed = A->info.fill_ratio_needed; 1494 info->factor_mallocs = A->info.factor_mallocs; 1495 } else { 1496 info->fill_ratio_given = 0; 1497 info->fill_ratio_needed = 0; 1498 info->factor_mallocs = 0; 1499 } 1500 PetscFunctionReturn(0); 1501 } 1502 1503 1504 #undef __FUNCT__ 1505 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ" 1506 int MatZeroEntries_SeqSBAIJ(Mat A) 1507 { 1508 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1509 int ierr; 1510 1511 PetscFunctionBegin; 1512 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 1513 PetscFunctionReturn(0); 1514 } 1515 1516 #undef __FUNCT__ 1517 #define __FUNCT__ "MatGetRowMax_SeqSBAIJ" 1518 int MatGetRowMax_SeqSBAIJ(Mat A,Vec v) 1519 { 1520 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1521 int ierr,i,j,n,row,col,bs,*ai,*aj,mbs; 1522 PetscReal atmp; 1523 MatScalar *aa; 1524 PetscScalar zero = 0.0,*x; 1525 int ncols,brow,bcol,krow,kcol; 1526 1527 PetscFunctionBegin; 1528 if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1529 bs = a->bs; 1530 aa = a->a; 1531 ai = a->i; 1532 aj = a->j; 1533 mbs = a->mbs; 1534 1535 ierr = VecSet(&zero,v);CHKERRQ(ierr); 1536 ierr = VecGetArrayFast(v,&x);CHKERRQ(ierr); 1537 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1538 if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1539 for (i=0; i<mbs; i++) { 1540 ncols = ai[1] - ai[0]; ai++; 1541 brow = bs*i; 1542 for (j=0; j<ncols; j++){ 1543 bcol = bs*(*aj); 1544 for (kcol=0; kcol<bs; kcol++){ 1545 col = bcol + kcol; /* col index */ 1546 for (krow=0; krow<bs; krow++){ 1547 atmp = PetscAbsScalar(*aa); aa++; 1548 row = brow + krow; /* row index */ 1549 /* printf("val[%d,%d]: %g\n",row,col,atmp); */ 1550 if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1551 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 1552 } 1553 } 1554 aj++; 1555 } 1556 } 1557 ierr = VecRestoreArrayFast(v,&x);CHKERRQ(ierr); 1558 PetscFunctionReturn(0); 1559 } 1560