1 /*$Id: sbaij2.c,v 1.32 2001/08/07 03:03:01 balay Exp $*/ 2 3 #include "src/mat/impls/baij/seq/baij.h" 4 #include "src/inline/spops.h" 5 #include "src/inline/ilu.h" 6 #include "petscbt.h" 7 #include "src/mat/impls/sbaij/seq/sbaij.h" 8 9 /* MatIncreaseOverlap_SeqSBAIJ() is vertually same as MatIncreaseOverlap_SeqBAIJ() 10 except in the line 1: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 11 and is slightly polished. 12 Should the two function be combined into a single piece of code? */ 13 #undef __FUNCT__ 14 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ" 15 int MatIncreaseOverlap_SeqSBAIJ(Mat A,int is_max,IS is[],int ov) 16 { 17 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 18 int brow,i,j,k,l,mbs,n,*idx,ierr,*nidx,isz,bcol; 19 int start,end,*ai,*aj,bs,*nidx2; 20 PetscBT table; 21 22 PetscFunctionBegin; 23 mbs = a->mbs; 24 ai = a->i; 25 aj = a->j; 26 bs = a->bs; 27 28 if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 29 30 ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr); 31 ierr = PetscMalloc((mbs+1)*sizeof(int),&nidx);CHKERRQ(ierr); 32 ierr = PetscMalloc((A->m+1)*sizeof(int),&nidx2);CHKERRQ(ierr); 33 34 for (i=0; i<is_max; i++) { 35 /* Initialise the two local arrays */ 36 isz = 0; 37 ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr); 38 39 /* Extract the indices, assume there can be duplicate entries */ 40 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 41 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 42 43 /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */ 44 for (j=0; j<n ; ++j){ 45 bcol = idx[j]/bs; /* convert the indices into block indices */ 46 if (bcol >= mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 47 if(!PetscBTLookupSet(table,bcol)) { nidx[isz++] = bcol;} 48 } 49 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 50 ierr = ISDestroy(is[i]);CHKERRQ(ierr); 51 52 k = 0; 53 for (j=0; j<ov; j++){ /* for each overlap*/ 54 n = isz; 55 for (; k<n ; k++){ /* do only those brows in nidx[k], which are not done yet */ 56 brow = nidx[k]; 57 start = ai[brow]; 58 end = ai[brow+1]; 59 for (l = start; l<end ; l++){ 60 bcol = aj[l]; 61 if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;} 62 } 63 } 64 } 65 /* expand the Index Set */ 66 for (j=0; j<isz; j++) { 67 for (k=0; k<bs; k++) 68 nidx2[j*bs+k] = nidx[j]*bs+k; 69 } 70 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr); 71 } 72 ierr = PetscBTDestroy(table);CHKERRQ(ierr); 73 ierr = PetscFree(nidx);CHKERRQ(ierr); 74 ierr = PetscFree(nidx2);CHKERRQ(ierr); 75 PetscFunctionReturn(0); 76 } 77 78 #undef __FUNCT__ 79 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private" 80 int MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B) 81 { 82 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c; 83 int *smap,i,k,kstart,kend,ierr,oldcols = a->mbs,*lens; 84 int row,mat_i,*mat_j,tcol,*mat_ilen; 85 int *irow,nrows,*ssmap,bs=a->bs,bs2=a->bs2; 86 int *aj = a->j,*ai = a->i; 87 MatScalar *mat_a; 88 Mat C; 89 PetscTruth flag; 90 91 PetscFunctionBegin; 92 93 if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro"); 94 ierr = ISSorted(iscol,(PetscTruth*)&i);CHKERRQ(ierr); 95 if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 96 97 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 98 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 99 100 ierr = PetscMalloc((1+oldcols)*sizeof(int),&smap);CHKERRQ(ierr); 101 ssmap = smap; 102 ierr = PetscMalloc((1+nrows)*sizeof(int),&lens);CHKERRQ(ierr); 103 ierr = PetscMemzero(smap,oldcols*sizeof(int));CHKERRQ(ierr); 104 for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */ 105 /* determine lens of each row */ 106 for (i=0; i<nrows; i++) { 107 kstart = ai[irow[i]]; 108 kend = kstart + a->ilen[irow[i]]; 109 lens[i] = 0; 110 for (k=kstart; k<kend; k++) { 111 if (ssmap[aj[k]]) { 112 lens[i]++; 113 } 114 } 115 } 116 /* Create and fill new matrix */ 117 if (scall == MAT_REUSE_MATRIX) { 118 c = (Mat_SeqSBAIJ *)((*B)->data); 119 120 if (c->mbs!=nrows || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 121 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(int),&flag);CHKERRQ(ierr); 122 if (flag == PETSC_FALSE) { 123 SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 124 } 125 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(int));CHKERRQ(ierr); 126 C = *B; 127 } else { 128 ierr = MatCreate(A->comm,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE,&C);CHKERRQ(ierr); 129 ierr = MatSetType(C,A->type_name);CHKERRQ(ierr); 130 ierr = MatSeqSBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr); 131 } 132 c = (Mat_SeqSBAIJ *)(C->data); 133 for (i=0; i<nrows; i++) { 134 row = irow[i]; 135 kstart = ai[row]; 136 kend = kstart + a->ilen[row]; 137 mat_i = c->i[i]; 138 mat_j = c->j + mat_i; 139 mat_a = c->a + mat_i*bs2; 140 mat_ilen = c->ilen + i; 141 for (k=kstart; k<kend; k++) { 142 if ((tcol=ssmap[a->j[k]])) { 143 *mat_j++ = tcol - 1; 144 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 145 mat_a += bs2; 146 (*mat_ilen)++; 147 } 148 } 149 } 150 151 /* Free work space */ 152 ierr = PetscFree(smap);CHKERRQ(ierr); 153 ierr = PetscFree(lens);CHKERRQ(ierr); 154 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 155 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 156 157 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 158 *B = C; 159 PetscFunctionReturn(0); 160 } 161 162 #undef __FUNCT__ 163 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ" 164 int MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B) 165 { 166 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 167 IS is1; 168 int *vary,*iary,*irow,nrows,i,ierr,bs=a->bs,count; 169 170 PetscFunctionBegin; 171 if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro"); 172 173 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 174 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 175 176 /* Verify if the indices corespond to each element in a block 177 and form the IS with compressed IS */ 178 ierr = PetscMalloc(2*(a->mbs+1)*sizeof(int),&vary);CHKERRQ(ierr); 179 iary = vary + a->mbs; 180 ierr = PetscMemzero(vary,(a->mbs)*sizeof(int));CHKERRQ(ierr); 181 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 182 183 count = 0; 184 for (i=0; i<a->mbs; i++) { 185 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(1,"Index set does not match blocks"); 186 if (vary[i]==bs) iary[count++] = i; 187 } 188 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr); 189 190 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 191 ierr = PetscFree(vary);CHKERRQ(ierr); 192 193 ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);CHKERRQ(ierr); 194 ISDestroy(is1); 195 PetscFunctionReturn(0); 196 } 197 198 #undef __FUNCT__ 199 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ" 200 int MatGetSubMatrices_SeqSBAIJ(Mat A,int n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 201 { 202 int ierr,i; 203 204 PetscFunctionBegin; 205 if (scall == MAT_INITIAL_MATRIX) { 206 ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr); 207 } 208 209 for (i=0; i<n; i++) { 210 ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr); 211 } 212 PetscFunctionReturn(0); 213 } 214 215 /* -------------------------------------------------------*/ 216 /* Should check that shapes of vectors and matrices match */ 217 /* -------------------------------------------------------*/ 218 #include "petscblaslapack.h" 219 220 #undef __FUNCT__ 221 #define __FUNCT__ "MatMult_SeqSBAIJ_1" 222 int MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz) 223 { 224 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 225 PetscScalar *x,*z,*xb,x1,zero=0.0; 226 MatScalar *v; 227 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 228 229 PetscFunctionBegin; 230 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 231 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 232 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 233 234 v = a->a; 235 xb = x; 236 237 for (i=0; i<mbs; i++) { 238 n = ai[1] - ai[0]; /* length of i_th row of A */ 239 x1 = xb[0]; 240 ib = aj + *ai; 241 jmin = 0; 242 if (*ib == i) { /* (diag of A)*x */ 243 z[i] += *v++ * x[*ib++]; 244 jmin++; 245 } 246 for (j=jmin; j<n; j++) { 247 cval = *ib; 248 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 249 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 250 } 251 xb++; ai++; 252 } 253 254 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 255 ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 256 PetscLogFlops(2*(a->nz*2 - A->m) - A->m); /* nz = (nz+m)/2 */ 257 PetscFunctionReturn(0); 258 } 259 260 #undef __FUNCT__ 261 #define __FUNCT__ "MatMult_SeqSBAIJ_2" 262 int MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz) 263 { 264 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 265 PetscScalar *x,*z,*xb,x1,x2,zero=0.0; 266 MatScalar *v; 267 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 268 269 270 PetscFunctionBegin; 271 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 272 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 273 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 274 275 v = a->a; 276 xb = x; 277 278 for (i=0; i<mbs; i++) { 279 n = ai[1] - ai[0]; /* length of i_th block row of A */ 280 x1 = xb[0]; x2 = xb[1]; 281 ib = aj + *ai; 282 jmin = 0; 283 if (*ib == i){ /* (diag of A)*x */ 284 z[2*i] += v[0]*x1 + v[2]*x2; 285 z[2*i+1] += v[2]*x1 + v[3]*x2; 286 v += 4; jmin++; 287 } 288 for (j=jmin; j<n; j++) { 289 /* (strict lower triangular part of A)*x */ 290 cval = ib[j]*2; 291 z[cval] += v[0]*x1 + v[1]*x2; 292 z[cval+1] += v[2]*x1 + v[3]*x2; 293 /* (strict upper triangular part of A)*x */ 294 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 295 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 296 v += 4; 297 } 298 xb +=2; ai++; 299 } 300 301 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 302 ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 303 PetscLogFlops(8*(a->nz*2 - A->m) - A->m); 304 PetscFunctionReturn(0); 305 } 306 307 #undef __FUNCT__ 308 #define __FUNCT__ "MatMult_SeqSBAIJ_3" 309 int MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz) 310 { 311 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 312 PetscScalar *x,*z,*xb,x1,x2,x3,zero=0.0; 313 MatScalar *v; 314 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 315 316 317 PetscFunctionBegin; 318 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 319 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 320 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 321 322 v = a->a; 323 xb = x; 324 325 for (i=0; i<mbs; i++) { 326 n = ai[1] - ai[0]; /* length of i_th block row of A */ 327 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 328 ib = aj + *ai; 329 jmin = 0; 330 if (*ib == i){ /* (diag of A)*x */ 331 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 332 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 333 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 334 v += 9; jmin++; 335 } 336 for (j=jmin; j<n; j++) { 337 /* (strict lower triangular part of A)*x */ 338 cval = ib[j]*3; 339 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 340 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 341 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 342 /* (strict upper triangular part of A)*x */ 343 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 344 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 345 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 346 v += 9; 347 } 348 xb +=3; ai++; 349 } 350 351 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 352 ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 353 PetscLogFlops(18*(a->nz*2 - A->m) - A->m); 354 PetscFunctionReturn(0); 355 } 356 357 #undef __FUNCT__ 358 #define __FUNCT__ "MatMult_SeqSBAIJ_4" 359 int MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz) 360 { 361 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 362 PetscScalar *x,*z,*xb,x1,x2,x3,x4,zero=0.0; 363 MatScalar *v; 364 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 365 366 PetscFunctionBegin; 367 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 368 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 369 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 370 371 v = a->a; 372 xb = x; 373 374 for (i=0; i<mbs; i++) { 375 n = ai[1] - ai[0]; /* length of i_th block row of A */ 376 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 377 ib = aj + *ai; 378 jmin = 0; 379 if (*ib == i){ /* (diag of A)*x */ 380 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 381 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 382 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 383 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 384 v += 16; jmin++; 385 } 386 for (j=jmin; j<n; j++) { 387 /* (strict lower triangular part of A)*x */ 388 cval = ib[j]*4; 389 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 390 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 391 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 392 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 393 /* (strict upper triangular part of A)*x */ 394 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 395 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 396 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 397 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 398 v += 16; 399 } 400 xb +=4; ai++; 401 } 402 403 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 404 ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 405 PetscLogFlops(32*(a->nz*2 - A->m) - A->m); 406 PetscFunctionReturn(0); 407 } 408 409 #undef __FUNCT__ 410 #define __FUNCT__ "MatMult_SeqSBAIJ_5" 411 int MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz) 412 { 413 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 414 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0; 415 MatScalar *v; 416 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 417 418 PetscFunctionBegin; 419 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 420 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 421 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 422 423 v = a->a; 424 xb = x; 425 426 for (i=0; i<mbs; i++) { 427 n = ai[1] - ai[0]; /* length of i_th block row of A */ 428 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 429 ib = aj + *ai; 430 jmin = 0; 431 if (*ib == i){ /* (diag of A)*x */ 432 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 433 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 434 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 435 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 436 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 437 v += 25; jmin++; 438 } 439 for (j=jmin; j<n; j++) { 440 /* (strict lower triangular part of A)*x */ 441 cval = ib[j]*5; 442 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 443 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 444 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 445 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 446 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 447 /* (strict upper triangular part of A)*x */ 448 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 449 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 450 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 451 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 452 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 453 v += 25; 454 } 455 xb +=5; ai++; 456 } 457 458 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 459 ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 460 PetscLogFlops(50*(a->nz*2 - A->m) - A->m); 461 PetscFunctionReturn(0); 462 } 463 464 465 #undef __FUNCT__ 466 #define __FUNCT__ "MatMult_SeqSBAIJ_6" 467 int MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz) 468 { 469 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 470 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0; 471 MatScalar *v; 472 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 473 474 PetscFunctionBegin; 475 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 476 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 477 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 478 479 v = a->a; 480 xb = x; 481 482 for (i=0; i<mbs; i++) { 483 n = ai[1] - ai[0]; /* length of i_th block row of A */ 484 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 485 ib = aj + *ai; 486 jmin = 0; 487 if (*ib == i){ /* (diag of A)*x */ 488 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 489 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 490 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 491 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 492 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 493 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 494 v += 36; jmin++; 495 } 496 for (j=jmin; j<n; j++) { 497 /* (strict lower triangular part of A)*x */ 498 cval = ib[j]*6; 499 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 500 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 501 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 502 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 503 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 504 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 505 /* (strict upper triangular part of A)*x */ 506 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 507 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 508 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 509 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 510 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 511 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 512 v += 36; 513 } 514 xb +=6; ai++; 515 } 516 517 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 518 ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 519 PetscLogFlops(72*(a->nz*2 - A->m) - A->m); 520 PetscFunctionReturn(0); 521 } 522 #undef __FUNCT__ 523 #define __FUNCT__ "MatMult_SeqSBAIJ_7" 524 int MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz) 525 { 526 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 527 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0; 528 MatScalar *v; 529 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 530 531 PetscFunctionBegin; 532 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 533 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 534 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 535 536 v = a->a; 537 xb = x; 538 539 for (i=0; i<mbs; i++) { 540 n = ai[1] - ai[0]; /* length of i_th block row of A */ 541 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 542 ib = aj + *ai; 543 jmin = 0; 544 if (*ib == i){ /* (diag of A)*x */ 545 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 546 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 547 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 548 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 549 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 550 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 551 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 552 v += 49; jmin++; 553 } 554 for (j=jmin; j<n; j++) { 555 /* (strict lower triangular part of A)*x */ 556 cval = ib[j]*7; 557 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 558 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 559 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 560 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 561 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 562 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 563 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 564 /* (strict upper triangular part of A)*x */ 565 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 566 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 567 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 568 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 569 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 570 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 571 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 572 v += 49; 573 } 574 xb +=7; ai++; 575 } 576 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 577 ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 578 PetscLogFlops(98*(a->nz*2 - A->m) - A->m); 579 PetscFunctionReturn(0); 580 } 581 582 /* 583 This will not work with MatScalar == float because it calls the BLAS 584 */ 585 #undef __FUNCT__ 586 #define __FUNCT__ "MatMult_SeqSBAIJ_N" 587 int MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz) 588 { 589 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 590 PetscScalar *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0; 591 MatScalar *v; 592 int ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2; 593 int ncols,k; 594 595 PetscFunctionBegin; 596 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 597 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); x_ptr=x; 598 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); z_ptr=z; 599 600 aj = a->j; 601 v = a->a; 602 ii = a->i; 603 604 if (!a->mult_work) { 605 ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 606 } 607 work = a->mult_work; 608 609 for (i=0; i<mbs; i++) { 610 n = ii[1] - ii[0]; ncols = n*bs; 611 workt = work; idx=aj+ii[0]; 612 613 /* upper triangular part */ 614 for (j=0; j<n; j++) { 615 xb = x_ptr + bs*(*idx++); 616 for (k=0; k<bs; k++) workt[k] = xb[k]; 617 workt += bs; 618 } 619 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 620 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 621 622 /* strict lower triangular part */ 623 idx = aj+ii[0]; 624 if (*idx == i){ 625 ncols -= bs; v += bs2; idx++; n--; 626 } 627 628 if (ncols > 0){ 629 workt = work; 630 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 631 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 632 for (j=0; j<n; j++) { 633 zb = z_ptr + bs*(*idx++); 634 for (k=0; k<bs; k++) zb[k] += workt[k] ; 635 workt += bs; 636 } 637 } 638 x += bs; v += n*bs2; z += bs; ii++; 639 } 640 641 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 642 ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 643 PetscLogFlops(2*(a->nz*2 - A->m)*bs2 - A->m); 644 PetscFunctionReturn(0); 645 } 646 647 #undef __FUNCT__ 648 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1" 649 int MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 650 { 651 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 652 PetscScalar *x,*y,*z,*xb,x1; 653 MatScalar *v; 654 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 655 656 PetscFunctionBegin; 657 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 658 if (yy != xx) { 659 ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr); 660 } else { 661 y = x; 662 } 663 if (zz != yy) { 664 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 665 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 666 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 667 } else { 668 z = y; 669 } 670 671 v = a->a; 672 xb = x; 673 674 for (i=0; i<mbs; i++) { 675 n = ai[1] - ai[0]; /* length of i_th row of A */ 676 x1 = xb[0]; 677 ib = aj + *ai; 678 jmin = 0; 679 if (*ib == i) { /* (diag of A)*x */ 680 z[i] += *v++ * x[*ib++]; jmin++; 681 } 682 for (j=jmin; j<n; j++) { 683 cval = *ib; 684 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 685 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 686 } 687 xb++; ai++; 688 } 689 690 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 691 if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr); 692 if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 693 694 PetscLogFlops(2*(a->nz*2 - A->m)); 695 PetscFunctionReturn(0); 696 } 697 698 #undef __FUNCT__ 699 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2" 700 int MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 701 { 702 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 703 PetscScalar *x,*y,*z,*xb,x1,x2; 704 MatScalar *v; 705 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 706 707 PetscFunctionBegin; 708 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 709 if (yy != xx) { 710 ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr); 711 } else { 712 y = x; 713 } 714 if (zz != yy) { 715 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 716 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 717 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 718 } else { 719 z = y; 720 } 721 722 v = a->a; 723 xb = x; 724 725 for (i=0; i<mbs; i++) { 726 n = ai[1] - ai[0]; /* length of i_th block row of A */ 727 x1 = xb[0]; x2 = xb[1]; 728 ib = aj + *ai; 729 jmin = 0; 730 if (*ib == i){ /* (diag of A)*x */ 731 z[2*i] += v[0]*x1 + v[2]*x2; 732 z[2*i+1] += v[2]*x1 + v[3]*x2; 733 v += 4; jmin++; 734 } 735 for (j=jmin; j<n; j++) { 736 /* (strict lower triangular part of A)*x */ 737 cval = ib[j]*2; 738 z[cval] += v[0]*x1 + v[1]*x2; 739 z[cval+1] += v[2]*x1 + v[3]*x2; 740 /* (strict upper triangular part of A)*x */ 741 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 742 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 743 v += 4; 744 } 745 xb +=2; ai++; 746 } 747 748 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 749 if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr); 750 if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 751 752 PetscLogFlops(4*(a->nz*2 - A->m)); 753 PetscFunctionReturn(0); 754 } 755 756 #undef __FUNCT__ 757 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3" 758 int MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 759 { 760 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 761 PetscScalar *x,*y,*z,*xb,x1,x2,x3; 762 MatScalar *v; 763 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 764 765 PetscFunctionBegin; 766 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 767 if (yy != xx) { 768 ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr); 769 } else { 770 y = x; 771 } 772 if (zz != yy) { 773 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 774 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 775 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 776 } else { 777 z = y; 778 } 779 780 v = a->a; 781 xb = x; 782 783 for (i=0; i<mbs; i++) { 784 n = ai[1] - ai[0]; /* length of i_th block row of A */ 785 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 786 ib = aj + *ai; 787 jmin = 0; 788 if (*ib == i){ /* (diag of A)*x */ 789 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 790 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 791 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 792 v += 9; jmin++; 793 } 794 for (j=jmin; j<n; j++) { 795 /* (strict lower triangular part of A)*x */ 796 cval = ib[j]*3; 797 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 798 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 799 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 800 /* (strict upper triangular part of A)*x */ 801 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 802 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 803 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 804 v += 9; 805 } 806 xb +=3; ai++; 807 } 808 809 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 810 if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr); 811 if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 812 813 PetscLogFlops(18*(a->nz*2 - A->m)); 814 PetscFunctionReturn(0); 815 } 816 817 #undef __FUNCT__ 818 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4" 819 int MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 820 { 821 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 822 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4; 823 MatScalar *v; 824 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 825 826 PetscFunctionBegin; 827 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 828 if (yy != xx) { 829 ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr); 830 } else { 831 y = x; 832 } 833 if (zz != yy) { 834 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 835 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 836 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 837 } else { 838 z = y; 839 } 840 841 v = a->a; 842 xb = x; 843 844 for (i=0; i<mbs; i++) { 845 n = ai[1] - ai[0]; /* length of i_th block row of A */ 846 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 847 ib = aj + *ai; 848 jmin = 0; 849 if (*ib == i){ /* (diag of A)*x */ 850 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 851 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 852 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 853 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 854 v += 16; jmin++; 855 } 856 for (j=jmin; j<n; j++) { 857 /* (strict lower triangular part of A)*x */ 858 cval = ib[j]*4; 859 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 860 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 861 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 862 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 863 /* (strict upper triangular part of A)*x */ 864 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 865 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 866 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 867 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 868 v += 16; 869 } 870 xb +=4; ai++; 871 } 872 873 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 874 if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr); 875 if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 876 877 PetscLogFlops(32*(a->nz*2 - A->m)); 878 PetscFunctionReturn(0); 879 } 880 881 #undef __FUNCT__ 882 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5" 883 int MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 884 { 885 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 886 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4,x5; 887 MatScalar *v; 888 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 889 890 PetscFunctionBegin; 891 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 892 if (yy != xx) { 893 ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr); 894 } else { 895 y = x; 896 } 897 if (zz != yy) { 898 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 899 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 900 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 901 } else { 902 z = y; 903 } 904 905 v = a->a; 906 xb = x; 907 908 for (i=0; i<mbs; i++) { 909 n = ai[1] - ai[0]; /* length of i_th block row of A */ 910 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 911 ib = aj + *ai; 912 jmin = 0; 913 if (*ib == i){ /* (diag of A)*x */ 914 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 915 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 916 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 917 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 918 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 919 v += 25; jmin++; 920 } 921 for (j=jmin; j<n; j++) { 922 /* (strict lower triangular part of A)*x */ 923 cval = ib[j]*5; 924 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 925 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 926 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 927 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 928 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 929 /* (strict upper triangular part of A)*x */ 930 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 931 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 932 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 933 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 934 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 935 v += 25; 936 } 937 xb +=5; ai++; 938 } 939 940 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 941 if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr); 942 if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 943 944 PetscLogFlops(50*(a->nz*2 - A->m)); 945 PetscFunctionReturn(0); 946 } 947 #undef __FUNCT__ 948 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6" 949 int MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 950 { 951 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 952 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6; 953 MatScalar *v; 954 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 955 956 PetscFunctionBegin; 957 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 958 if (yy != xx) { 959 ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr); 960 } else { 961 y = x; 962 } 963 if (zz != yy) { 964 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 965 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 966 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 967 } else { 968 z = y; 969 } 970 971 v = a->a; 972 xb = x; 973 974 for (i=0; i<mbs; i++) { 975 n = ai[1] - ai[0]; /* length of i_th block row of A */ 976 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 977 ib = aj + *ai; 978 jmin = 0; 979 if (*ib == i){ /* (diag of A)*x */ 980 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 981 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 982 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 983 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 984 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 985 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 986 v += 36; jmin++; 987 } 988 for (j=jmin; j<n; j++) { 989 /* (strict lower triangular part of A)*x */ 990 cval = ib[j]*6; 991 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 992 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 993 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 994 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 995 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 996 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 997 /* (strict upper triangular part of A)*x */ 998 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 999 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 1000 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 1001 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 1002 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 1003 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 1004 v += 36; 1005 } 1006 xb +=6; ai++; 1007 } 1008 1009 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 1010 if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr); 1011 if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 1012 1013 PetscLogFlops(72*(a->nz*2 - A->m)); 1014 PetscFunctionReturn(0); 1015 } 1016 1017 #undef __FUNCT__ 1018 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7" 1019 int MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1020 { 1021 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1022 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6,x7; 1023 MatScalar *v; 1024 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 1025 1026 PetscFunctionBegin; 1027 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 1028 if (yy != xx) { 1029 ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr); 1030 } else { 1031 y = x; 1032 } 1033 if (zz != yy) { 1034 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 1035 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 1036 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 1037 } else { 1038 z = y; 1039 } 1040 1041 v = a->a; 1042 xb = x; 1043 1044 for (i=0; i<mbs; i++) { 1045 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1046 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 1047 ib = aj + *ai; 1048 jmin = 0; 1049 if (*ib == i){ /* (diag of A)*x */ 1050 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 1051 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 1052 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 1053 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 1054 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 1055 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 1056 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1057 v += 49; jmin++; 1058 } 1059 for (j=jmin; j<n; j++) { 1060 /* (strict lower triangular part of A)*x */ 1061 cval = ib[j]*7; 1062 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 1063 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 1064 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 1065 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 1066 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 1067 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 1068 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1069 /* (strict upper triangular part of A)*x */ 1070 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 1071 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 1072 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 1073 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 1074 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 1075 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 1076 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 1077 v += 49; 1078 } 1079 xb +=7; ai++; 1080 } 1081 1082 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 1083 if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr); 1084 if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 1085 1086 PetscLogFlops(98*(a->nz*2 - A->m)); 1087 PetscFunctionReturn(0); 1088 } 1089 1090 #undef __FUNCT__ 1091 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N" 1092 int MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1093 { 1094 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1095 PetscScalar *x,*x_ptr,*y,*z,*z_ptr=0,*xb,*zb,*work,*workt; 1096 MatScalar *v; 1097 int ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2; 1098 int ncols,k; 1099 1100 PetscFunctionBegin; 1101 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); x_ptr=x; 1102 if (yy != xx) { 1103 ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr); 1104 } else { 1105 y = x; 1106 } 1107 if (zz != yy) { 1108 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 1109 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); z_ptr=z; 1110 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 1111 } else { 1112 z = y; 1113 } 1114 1115 aj = a->j; 1116 v = a->a; 1117 ii = a->i; 1118 1119 if (!a->mult_work) { 1120 ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1121 } 1122 work = a->mult_work; 1123 1124 1125 for (i=0; i<mbs; i++) { 1126 n = ii[1] - ii[0]; ncols = n*bs; 1127 workt = work; idx=aj+ii[0]; 1128 1129 /* upper triangular part */ 1130 for (j=0; j<n; j++) { 1131 xb = x_ptr + bs*(*idx++); 1132 for (k=0; k<bs; k++) workt[k] = xb[k]; 1133 workt += bs; 1134 } 1135 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1136 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1137 1138 /* strict lower triangular part */ 1139 idx = aj+ii[0]; 1140 if (*idx == i){ 1141 ncols -= bs; v += bs2; idx++; n--; 1142 } 1143 if (ncols > 0){ 1144 workt = work; 1145 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1146 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 1147 for (j=0; j<n; j++) { 1148 zb = z_ptr + bs*(*idx++); 1149 /* idx++; */ 1150 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1151 workt += bs; 1152 } 1153 } 1154 1155 x += bs; v += n*bs2; z += bs; ii++; 1156 } 1157 1158 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 1159 if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr); 1160 if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 1161 1162 PetscLogFlops(2*(a->nz*2 - A->m)); 1163 PetscFunctionReturn(0); 1164 } 1165 1166 #undef __FUNCT__ 1167 #define __FUNCT__ "MatMultTranspose_SeqSBAIJ" 1168 int MatMultTranspose_SeqSBAIJ(Mat A,Vec xx,Vec zz) 1169 { 1170 int ierr; 1171 1172 PetscFunctionBegin; 1173 ierr = MatMult(A,xx,zz);CHKERRQ(ierr); 1174 PetscFunctionReturn(0); 1175 } 1176 1177 #undef __FUNCT__ 1178 #define __FUNCT__ "MatMultTransposeAdd_SeqSBAIJ" 1179 int MatMultTransposeAdd_SeqSBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1180 1181 { 1182 int ierr; 1183 1184 PetscFunctionBegin; 1185 ierr = MatMultAdd(A,xx,yy,zz);CHKERRQ(ierr); 1186 PetscFunctionReturn(0); 1187 } 1188 1189 #undef __FUNCT__ 1190 #define __FUNCT__ "MatScale_SeqSBAIJ" 1191 int MatScale_SeqSBAIJ(const PetscScalar *alpha,Mat inA) 1192 { 1193 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data; 1194 int one = 1,totalnz = a->bs2*a->nz; 1195 1196 PetscFunctionBegin; 1197 BLscal_(&totalnz,(PetscScalar*)alpha,a->a,&one); 1198 PetscLogFlops(totalnz); 1199 PetscFunctionReturn(0); 1200 } 1201 1202 #undef __FUNCT__ 1203 #define __FUNCT__ "MatNorm_SeqSBAIJ" 1204 int MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm) 1205 { 1206 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1207 MatScalar *v = a->a; 1208 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1209 int i,j,k,bs = a->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j; 1210 int *jl,*il,jmin,jmax,ierr,nexti,ik,*col; 1211 1212 PetscFunctionBegin; 1213 if (type == NORM_FROBENIUS) { 1214 for (k=0; k<mbs; k++){ 1215 jmin = a->i[k]; jmax = a->i[k+1]; 1216 col = aj + jmin; 1217 if (*col == k){ /* diagonal block */ 1218 for (i=0; i<bs2; i++){ 1219 #if defined(PETSC_USE_COMPLEX) 1220 sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++; 1221 #else 1222 sum_diag += (*v)*(*v); v++; 1223 #endif 1224 } 1225 jmin++; 1226 } 1227 for (j=jmin; j<jmax; j++){ /* off-diagonal blocks */ 1228 for (i=0; i<bs2; i++){ 1229 #if defined(PETSC_USE_COMPLEX) 1230 sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++; 1231 #else 1232 sum_off += (*v)*(*v); v++; 1233 #endif 1234 } 1235 } 1236 } 1237 *norm = sqrt(sum_diag + 2*sum_off); 1238 1239 } else if (type == NORM_INFINITY) { /* maximum row sum */ 1240 ierr = PetscMalloc(mbs*sizeof(int),&il);CHKERRQ(ierr); 1241 ierr = PetscMalloc(mbs*sizeof(int),&jl);CHKERRQ(ierr); 1242 ierr = PetscMalloc(bs*sizeof(PetscReal),&sum);CHKERRQ(ierr); 1243 for (i=0; i<mbs; i++) { 1244 jl[i] = mbs; il[0] = 0; 1245 } 1246 1247 *norm = 0.0; 1248 for (k=0; k<mbs; k++) { /* k_th block row */ 1249 for (j=0; j<bs; j++) sum[j]=0.0; 1250 1251 /*-- col sum --*/ 1252 i = jl[k]; /* first |A(i,k)| to be added */ 1253 /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1254 at step k */ 1255 while (i<mbs){ 1256 nexti = jl[i]; /* next block row to be added */ 1257 ik = il[i]; /* block index of A(i,k) in the array a */ 1258 for (j=0; j<bs; j++){ 1259 v = a->a + ik*bs2 + j*bs; 1260 for (k1=0; k1<bs; k1++) { 1261 sum[j] += PetscAbsScalar(*v); v++; 1262 } 1263 } 1264 /* update il, jl */ 1265 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1266 jmax = a->i[i+1]; 1267 if (jmin < jmax){ 1268 il[i] = jmin; 1269 j = a->j[jmin]; 1270 jl[i] = jl[j]; jl[j]=i; 1271 } 1272 i = nexti; 1273 } 1274 1275 /*-- row sum --*/ 1276 jmin = a->i[k]; jmax = a->i[k+1]; 1277 for (i=jmin; i<jmax; i++) { 1278 for (j=0; j<bs; j++){ 1279 v = a->a + i*bs2 + j; 1280 for (k1=0; k1<bs; k1++){ 1281 sum[j] += PetscAbsScalar(*v); 1282 v += bs; 1283 } 1284 } 1285 } 1286 /* add k_th block row to il, jl */ 1287 col = aj+jmin; 1288 if (*col == k) jmin++; 1289 if (jmin < jmax){ 1290 il[k] = jmin; 1291 j = a->j[jmin]; 1292 jl[k] = jl[j]; jl[j] = k; 1293 } 1294 for (j=0; j<bs; j++){ 1295 if (sum[j] > *norm) *norm = sum[j]; 1296 } 1297 } 1298 ierr = PetscFree(il);CHKERRQ(ierr); 1299 ierr = PetscFree(jl);CHKERRQ(ierr); 1300 ierr = PetscFree(sum);CHKERRQ(ierr); 1301 } else { 1302 SETERRQ(PETSC_ERR_SUP,"No support for this norm yet"); 1303 } 1304 PetscFunctionReturn(0); 1305 } 1306 1307 #undef __FUNCT__ 1308 #define __FUNCT__ "MatEqual_SeqSBAIJ" 1309 int MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg) 1310 { 1311 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data; 1312 int ierr; 1313 1314 PetscFunctionBegin; 1315 1316 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1317 if ((A->m != B->m) || (A->n != B->n) || (a->bs != b->bs)|| (a->nz != b->nz)) { 1318 *flg = PETSC_FALSE; 1319 PetscFunctionReturn(0); 1320 } 1321 1322 /* if the a->i are the same */ 1323 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(int),flg);CHKERRQ(ierr); 1324 if (*flg == PETSC_FALSE) { 1325 PetscFunctionReturn(0); 1326 } 1327 1328 /* if a->j are the same */ 1329 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int),flg);CHKERRQ(ierr); 1330 if (*flg == PETSC_FALSE) { 1331 PetscFunctionReturn(0); 1332 } 1333 /* if a->a are the same */ 1334 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1335 1336 PetscFunctionReturn(0); 1337 } 1338 1339 #undef __FUNCT__ 1340 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ" 1341 int MatGetDiagonal_SeqSBAIJ(Mat A,Vec v) 1342 { 1343 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1344 int ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 1345 PetscScalar *x,zero = 0.0; 1346 MatScalar *aa,*aa_j; 1347 1348 PetscFunctionBegin; 1349 bs = a->bs; 1350 if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1"); 1351 1352 aa = a->a; 1353 ai = a->i; 1354 aj = a->j; 1355 ambs = a->mbs; 1356 bs2 = a->bs2; 1357 1358 ierr = VecSet(&zero,v);CHKERRQ(ierr); 1359 ierr = VecGetArrayFast(v,&x);CHKERRQ(ierr); 1360 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1361 if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1362 for (i=0; i<ambs; i++) { 1363 j=ai[i]; 1364 if (aj[j] == i) { /* if this is a diagonal element */ 1365 row = i*bs; 1366 aa_j = aa + j*bs2; 1367 if (A->factor && bs==1){ 1368 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k]; 1369 } else { 1370 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1371 } 1372 } 1373 } 1374 1375 ierr = VecRestoreArrayFast(v,&x);CHKERRQ(ierr); 1376 PetscFunctionReturn(0); 1377 } 1378 1379 #undef __FUNCT__ 1380 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ" 1381 int MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr) 1382 { 1383 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1384 PetscScalar *l,*r,x,*li,*ri; 1385 MatScalar *aa,*v; 1386 int ierr,i,j,k,lm,rn,M,m,*ai,*aj,mbs,tmp,bs,bs2; 1387 1388 PetscFunctionBegin; 1389 ai = a->i; 1390 aj = a->j; 1391 aa = a->a; 1392 m = A->m; 1393 bs = a->bs; 1394 mbs = a->mbs; 1395 bs2 = a->bs2; 1396 1397 if (ll != rr) { 1398 SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n"); 1399 } 1400 if (ll) { 1401 ierr = VecGetArrayFast(ll,&l);CHKERRQ(ierr); 1402 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1403 if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 1404 for (i=0; i<mbs; i++) { /* for each block row */ 1405 M = ai[i+1] - ai[i]; 1406 li = l + i*bs; 1407 v = aa + bs2*ai[i]; 1408 for (j=0; j<M; j++) { /* for each block */ 1409 for (k=0; k<bs2; k++) { 1410 (*v++) *= li[k%bs]; 1411 } 1412 #ifdef CONT 1413 /* will be used to replace the above loop */ 1414 ri = l + bs*aj[ai[i]+j]; 1415 for (k=0; k<bs; k++) { /* column value */ 1416 x = ri[k]; 1417 for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x; 1418 } 1419 #endif 1420 1421 } 1422 } 1423 ierr = VecRestoreArrayFast(ll,&l);CHKERRQ(ierr); 1424 PetscLogFlops(2*a->nz); 1425 } 1426 /* will be deleted */ 1427 if (rr) { 1428 ierr = VecGetArrayFast(rr,&r);CHKERRQ(ierr); 1429 ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr); 1430 if (rn != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 1431 for (i=0; i<mbs; i++) { /* for each block row */ 1432 M = ai[i+1] - ai[i]; 1433 v = aa + bs2*ai[i]; 1434 for (j=0; j<M; j++) { /* for each block */ 1435 ri = r + bs*aj[ai[i]+j]; 1436 for (k=0; k<bs; k++) { 1437 x = ri[k]; 1438 for (tmp=0; tmp<bs; tmp++) (*v++) *= x; 1439 } 1440 } 1441 } 1442 ierr = VecRestoreArrayFast(rr,&r);CHKERRQ(ierr); 1443 PetscLogFlops(a->nz); 1444 } 1445 PetscFunctionReturn(0); 1446 } 1447 1448 #undef __FUNCT__ 1449 #define __FUNCT__ "MatGetInfo_SeqSBAIJ" 1450 int MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info) 1451 { 1452 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1453 1454 PetscFunctionBegin; 1455 info->rows_global = (double)A->m; 1456 info->columns_global = (double)A->m; 1457 info->rows_local = (double)A->m; 1458 info->columns_local = (double)A->m; 1459 info->block_size = a->bs2; 1460 info->nz_allocated = a->maxnz; /*num. of nonzeros in upper triangular part */ 1461 info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */ 1462 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 1463 info->assemblies = A->num_ass; 1464 info->mallocs = a->reallocs; 1465 info->memory = A->mem; 1466 if (A->factor) { 1467 info->fill_ratio_given = A->info.fill_ratio_given; 1468 info->fill_ratio_needed = A->info.fill_ratio_needed; 1469 info->factor_mallocs = A->info.factor_mallocs; 1470 } else { 1471 info->fill_ratio_given = 0; 1472 info->fill_ratio_needed = 0; 1473 info->factor_mallocs = 0; 1474 } 1475 PetscFunctionReturn(0); 1476 } 1477 1478 1479 #undef __FUNCT__ 1480 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ" 1481 int MatZeroEntries_SeqSBAIJ(Mat A) 1482 { 1483 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1484 int ierr; 1485 1486 PetscFunctionBegin; 1487 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 1488 PetscFunctionReturn(0); 1489 } 1490 1491 #undef __FUNCT__ 1492 #define __FUNCT__ "MatGetRowMax_SeqSBAIJ" 1493 int MatGetRowMax_SeqSBAIJ(Mat A,Vec v) 1494 { 1495 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1496 int ierr,i,j,n,row,col,bs,*ai,*aj,mbs; 1497 PetscReal atmp; 1498 MatScalar *aa; 1499 PetscScalar zero = 0.0,*x; 1500 int ncols,brow,bcol,krow,kcol; 1501 1502 PetscFunctionBegin; 1503 if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1504 bs = a->bs; 1505 aa = a->a; 1506 ai = a->i; 1507 aj = a->j; 1508 mbs = a->mbs; 1509 1510 ierr = VecSet(&zero,v);CHKERRQ(ierr); 1511 ierr = VecGetArrayFast(v,&x);CHKERRQ(ierr); 1512 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1513 if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1514 for (i=0; i<mbs; i++) { 1515 ncols = ai[1] - ai[0]; ai++; 1516 brow = bs*i; 1517 for (j=0; j<ncols; j++){ 1518 bcol = bs*(*aj); 1519 for (kcol=0; kcol<bs; kcol++){ 1520 col = bcol + kcol; /* col index */ 1521 for (krow=0; krow<bs; krow++){ 1522 atmp = PetscAbsScalar(*aa); aa++; 1523 row = brow + krow; /* row index */ 1524 /* printf("val[%d,%d]: %g\n",row,col,atmp); */ 1525 if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1526 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 1527 } 1528 } 1529 aj++; 1530 } 1531 } 1532 ierr = VecRestoreArrayFast(v,&x);CHKERRQ(ierr); 1533 PetscFunctionReturn(0); 1534 } 1535