1 2 #include "src/mat/impls/baij/seq/baij.h" 3 #include "src/inline/spops.h" 4 #include "src/inline/ilu.h" 5 #include "petscbt.h" 6 #include "src/mat/impls/sbaij/seq/sbaij.h" 7 8 #undef __FUNCT__ 9 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ" 10 int MatIncreaseOverlap_SeqSBAIJ(Mat A,int is_max,IS is[],int ov) 11 { 12 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 13 int brow,i,j,k,l,mbs,n,*idx,ierr,*nidx,isz,bcol,bcol_max, 14 start,end,*ai,*aj,bs,*nidx2; 15 PetscBT table; 16 PetscBT table0; 17 18 PetscFunctionBegin; 19 mbs = a->mbs; 20 ai = a->i; 21 aj = a->j; 22 bs = a->bs; 23 24 if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 25 26 ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr); 27 ierr = PetscMalloc((mbs+1)*sizeof(int),&nidx);CHKERRQ(ierr); 28 ierr = PetscMalloc((A->m+1)*sizeof(int),&nidx2);CHKERRQ(ierr); 29 ierr = PetscBTCreate(mbs,table0);CHKERRQ(ierr); 30 31 for (i=0; i<is_max; i++) { /* for each is */ 32 isz = 0; 33 ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr); 34 35 /* Extract the indices, assume there can be duplicate entries */ 36 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 37 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 38 39 /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */ 40 bcol_max = 0; 41 for (j=0; j<n ; ++j){ 42 brow = idx[j]/bs; /* convert the indices into block indices */ 43 if (brow >= mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 44 if(!PetscBTLookupSet(table,brow)) { 45 nidx[isz++] = brow; 46 if (bcol_max < brow) bcol_max = brow; 47 } 48 } 49 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 50 ierr = ISDestroy(is[i]);CHKERRQ(ierr); 51 52 k = 0; 53 for (j=0; j<ov; j++){ /* for each overlap */ 54 /* set table0 for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 55 ierr = PetscBTMemzero(mbs,table0);CHKERRQ(ierr); 56 for (l=k; l<isz; l++) PetscBTSet(table0,nidx[l]); 57 58 n = isz; /* length of the updated is[i] */ 59 for (brow=0; brow<mbs; brow++){ 60 start = ai[brow]; end = ai[brow+1]; 61 if (PetscBTLookup(table0,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */ 62 for (l = start; l<end ; l++){ 63 bcol = aj[l]; 64 if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;} 65 } 66 k++; 67 if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 68 } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 69 for (l = start; l<end ; l++){ 70 bcol = aj[l]; 71 if (bcol > bcol_max) break; 72 if (PetscBTLookup(table0,bcol)){ 73 if (!PetscBTLookupSet(table,brow)) {nidx[isz++] = brow;} 74 break; /* for l = start; l<end ; l++) */ 75 } 76 } 77 } 78 } 79 } /* for each overlap */ 80 81 /* expand the Index Set */ 82 for (j=0; j<isz; j++) { 83 for (k=0; k<bs; k++) 84 nidx2[j*bs+k] = nidx[j]*bs+k; 85 } 86 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr); 87 } 88 ierr = PetscBTDestroy(table);CHKERRQ(ierr); 89 ierr = PetscFree(nidx);CHKERRQ(ierr); 90 ierr = PetscFree(nidx2);CHKERRQ(ierr); 91 ierr = PetscBTDestroy(table0);CHKERRQ(ierr); 92 PetscFunctionReturn(0); 93 } 94 95 #undef __FUNCT__ 96 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private" 97 int MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B) 98 { 99 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c; 100 int *smap,i,k,kstart,kend,ierr,oldcols = a->mbs,*lens; 101 int row,mat_i,*mat_j,tcol,*mat_ilen; 102 int *irow,nrows,*ssmap,bs=a->bs,bs2=a->bs2; 103 int *aj = a->j,*ai = a->i; 104 MatScalar *mat_a; 105 Mat C; 106 PetscTruth flag; 107 108 PetscFunctionBegin; 109 110 if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro"); 111 ierr = ISSorted(iscol,(PetscTruth*)&i);CHKERRQ(ierr); 112 if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 113 114 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 115 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 116 117 ierr = PetscMalloc((1+oldcols)*sizeof(int),&smap);CHKERRQ(ierr); 118 ssmap = smap; 119 ierr = PetscMalloc((1+nrows)*sizeof(int),&lens);CHKERRQ(ierr); 120 ierr = PetscMemzero(smap,oldcols*sizeof(int));CHKERRQ(ierr); 121 for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */ 122 /* determine lens of each row */ 123 for (i=0; i<nrows; i++) { 124 kstart = ai[irow[i]]; 125 kend = kstart + a->ilen[irow[i]]; 126 lens[i] = 0; 127 for (k=kstart; k<kend; k++) { 128 if (ssmap[aj[k]]) { 129 lens[i]++; 130 } 131 } 132 } 133 /* Create and fill new matrix */ 134 if (scall == MAT_REUSE_MATRIX) { 135 c = (Mat_SeqSBAIJ *)((*B)->data); 136 137 if (c->mbs!=nrows || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 138 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(int),&flag);CHKERRQ(ierr); 139 if (flag == PETSC_FALSE) { 140 SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 141 } 142 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(int));CHKERRQ(ierr); 143 C = *B; 144 } else { 145 ierr = MatCreate(A->comm,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE,&C);CHKERRQ(ierr); 146 ierr = MatSetType(C,A->type_name);CHKERRQ(ierr); 147 ierr = MatSeqSBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr); 148 } 149 c = (Mat_SeqSBAIJ *)(C->data); 150 for (i=0; i<nrows; i++) { 151 row = irow[i]; 152 kstart = ai[row]; 153 kend = kstart + a->ilen[row]; 154 mat_i = c->i[i]; 155 mat_j = c->j + mat_i; 156 mat_a = c->a + mat_i*bs2; 157 mat_ilen = c->ilen + i; 158 for (k=kstart; k<kend; k++) { 159 if ((tcol=ssmap[a->j[k]])) { 160 *mat_j++ = tcol - 1; 161 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 162 mat_a += bs2; 163 (*mat_ilen)++; 164 } 165 } 166 } 167 168 /* Free work space */ 169 ierr = PetscFree(smap);CHKERRQ(ierr); 170 ierr = PetscFree(lens);CHKERRQ(ierr); 171 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 172 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 173 174 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 175 *B = C; 176 PetscFunctionReturn(0); 177 } 178 179 #undef __FUNCT__ 180 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ" 181 int MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B) 182 { 183 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 184 IS is1; 185 int *vary,*iary,*irow,nrows,i,ierr,bs=a->bs,count; 186 187 PetscFunctionBegin; 188 if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro"); 189 190 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 191 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 192 193 /* Verify if the indices corespond to each element in a block 194 and form the IS with compressed IS */ 195 ierr = PetscMalloc(2*(a->mbs+1)*sizeof(int),&vary);CHKERRQ(ierr); 196 iary = vary + a->mbs; 197 ierr = PetscMemzero(vary,(a->mbs)*sizeof(int));CHKERRQ(ierr); 198 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 199 200 count = 0; 201 for (i=0; i<a->mbs; i++) { 202 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(1,"Index set does not match blocks"); 203 if (vary[i]==bs) iary[count++] = i; 204 } 205 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr); 206 207 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 208 ierr = PetscFree(vary);CHKERRQ(ierr); 209 210 ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);CHKERRQ(ierr); 211 ISDestroy(is1); 212 PetscFunctionReturn(0); 213 } 214 215 #undef __FUNCT__ 216 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ" 217 int MatGetSubMatrices_SeqSBAIJ(Mat A,int n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 218 { 219 int ierr,i; 220 221 PetscFunctionBegin; 222 if (scall == MAT_INITIAL_MATRIX) { 223 ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr); 224 } 225 226 for (i=0; i<n; i++) { 227 ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr); 228 } 229 PetscFunctionReturn(0); 230 } 231 232 /* -------------------------------------------------------*/ 233 /* Should check that shapes of vectors and matrices match */ 234 /* -------------------------------------------------------*/ 235 #include "petscblaslapack.h" 236 237 #undef __FUNCT__ 238 #define __FUNCT__ "MatMult_SeqSBAIJ_1" 239 int MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz) 240 { 241 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 242 PetscScalar *x,*z,*xb,x1,zero=0.0; 243 MatScalar *v; 244 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 245 246 PetscFunctionBegin; 247 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 248 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 249 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 250 251 v = a->a; 252 xb = x; 253 254 for (i=0; i<mbs; i++) { 255 n = ai[1] - ai[0]; /* length of i_th row of A */ 256 x1 = xb[0]; 257 ib = aj + *ai; 258 jmin = 0; 259 if (*ib == i) { /* (diag of A)*x */ 260 z[i] += *v++ * x[*ib++]; 261 jmin++; 262 } 263 for (j=jmin; j<n; j++) { 264 cval = *ib; 265 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 266 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 267 } 268 xb++; ai++; 269 } 270 271 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 272 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 273 PetscLogFlops(2*(a->nz*2 - A->m) - A->m); /* nz = (nz+m)/2 */ 274 PetscFunctionReturn(0); 275 } 276 277 #undef __FUNCT__ 278 #define __FUNCT__ "MatMult_SeqSBAIJ_2" 279 int MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz) 280 { 281 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 282 PetscScalar *x,*z,*xb,x1,x2,zero=0.0; 283 MatScalar *v; 284 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 285 286 287 PetscFunctionBegin; 288 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 289 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 290 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 291 292 v = a->a; 293 xb = x; 294 295 for (i=0; i<mbs; i++) { 296 n = ai[1] - ai[0]; /* length of i_th block row of A */ 297 x1 = xb[0]; x2 = xb[1]; 298 ib = aj + *ai; 299 jmin = 0; 300 if (*ib == i){ /* (diag of A)*x */ 301 z[2*i] += v[0]*x1 + v[2]*x2; 302 z[2*i+1] += v[2]*x1 + v[3]*x2; 303 v += 4; jmin++; 304 } 305 for (j=jmin; j<n; j++) { 306 /* (strict lower triangular part of A)*x */ 307 cval = ib[j]*2; 308 z[cval] += v[0]*x1 + v[1]*x2; 309 z[cval+1] += v[2]*x1 + v[3]*x2; 310 /* (strict upper triangular part of A)*x */ 311 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 312 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 313 v += 4; 314 } 315 xb +=2; ai++; 316 } 317 318 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 319 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 320 PetscLogFlops(8*(a->nz*2 - A->m) - A->m); 321 PetscFunctionReturn(0); 322 } 323 324 #undef __FUNCT__ 325 #define __FUNCT__ "MatMult_SeqSBAIJ_3" 326 int MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz) 327 { 328 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 329 PetscScalar *x,*z,*xb,x1,x2,x3,zero=0.0; 330 MatScalar *v; 331 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 332 333 334 PetscFunctionBegin; 335 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 336 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 337 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 338 339 v = a->a; 340 xb = x; 341 342 for (i=0; i<mbs; i++) { 343 n = ai[1] - ai[0]; /* length of i_th block row of A */ 344 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 345 ib = aj + *ai; 346 jmin = 0; 347 if (*ib == i){ /* (diag of A)*x */ 348 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 349 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 350 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 351 v += 9; jmin++; 352 } 353 for (j=jmin; j<n; j++) { 354 /* (strict lower triangular part of A)*x */ 355 cval = ib[j]*3; 356 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 357 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 358 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 359 /* (strict upper triangular part of A)*x */ 360 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 361 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 362 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 363 v += 9; 364 } 365 xb +=3; ai++; 366 } 367 368 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 369 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 370 PetscLogFlops(18*(a->nz*2 - A->m) - A->m); 371 PetscFunctionReturn(0); 372 } 373 374 #undef __FUNCT__ 375 #define __FUNCT__ "MatMult_SeqSBAIJ_4" 376 int MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz) 377 { 378 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 379 PetscScalar *x,*z,*xb,x1,x2,x3,x4,zero=0.0; 380 MatScalar *v; 381 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 382 383 PetscFunctionBegin; 384 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 385 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 386 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 387 388 v = a->a; 389 xb = x; 390 391 for (i=0; i<mbs; i++) { 392 n = ai[1] - ai[0]; /* length of i_th block row of A */ 393 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 394 ib = aj + *ai; 395 jmin = 0; 396 if (*ib == i){ /* (diag of A)*x */ 397 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 398 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 399 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 400 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 401 v += 16; jmin++; 402 } 403 for (j=jmin; j<n; j++) { 404 /* (strict lower triangular part of A)*x */ 405 cval = ib[j]*4; 406 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 407 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 408 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 409 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 410 /* (strict upper triangular part of A)*x */ 411 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 412 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 413 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 414 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 415 v += 16; 416 } 417 xb +=4; ai++; 418 } 419 420 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 421 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 422 PetscLogFlops(32*(a->nz*2 - A->m) - A->m); 423 PetscFunctionReturn(0); 424 } 425 426 #undef __FUNCT__ 427 #define __FUNCT__ "MatMult_SeqSBAIJ_5" 428 int MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz) 429 { 430 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 431 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0; 432 MatScalar *v; 433 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 434 435 PetscFunctionBegin; 436 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 437 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 438 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 439 440 v = a->a; 441 xb = x; 442 443 for (i=0; i<mbs; i++) { 444 n = ai[1] - ai[0]; /* length of i_th block row of A */ 445 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 446 ib = aj + *ai; 447 jmin = 0; 448 if (*ib == i){ /* (diag of A)*x */ 449 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 450 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 451 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 452 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 453 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 454 v += 25; jmin++; 455 } 456 for (j=jmin; j<n; j++) { 457 /* (strict lower triangular part of A)*x */ 458 cval = ib[j]*5; 459 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 460 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 461 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 462 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 463 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 464 /* (strict upper triangular part of A)*x */ 465 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 466 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 467 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 468 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 469 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 470 v += 25; 471 } 472 xb +=5; ai++; 473 } 474 475 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 476 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 477 PetscLogFlops(50*(a->nz*2 - A->m) - A->m); 478 PetscFunctionReturn(0); 479 } 480 481 482 #undef __FUNCT__ 483 #define __FUNCT__ "MatMult_SeqSBAIJ_6" 484 int MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz) 485 { 486 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 487 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0; 488 MatScalar *v; 489 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 490 491 PetscFunctionBegin; 492 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 493 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 494 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 495 496 v = a->a; 497 xb = x; 498 499 for (i=0; i<mbs; i++) { 500 n = ai[1] - ai[0]; /* length of i_th block row of A */ 501 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 502 ib = aj + *ai; 503 jmin = 0; 504 if (*ib == i){ /* (diag of A)*x */ 505 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 506 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 507 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 508 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 509 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 510 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 511 v += 36; jmin++; 512 } 513 for (j=jmin; j<n; j++) { 514 /* (strict lower triangular part of A)*x */ 515 cval = ib[j]*6; 516 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 517 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 518 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 519 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 520 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 521 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 522 /* (strict upper triangular part of A)*x */ 523 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 524 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 525 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 526 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 527 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 528 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 529 v += 36; 530 } 531 xb +=6; ai++; 532 } 533 534 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 535 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 536 PetscLogFlops(72*(a->nz*2 - A->m) - A->m); 537 PetscFunctionReturn(0); 538 } 539 #undef __FUNCT__ 540 #define __FUNCT__ "MatMult_SeqSBAIJ_7" 541 int MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz) 542 { 543 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 544 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0; 545 MatScalar *v; 546 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 547 548 PetscFunctionBegin; 549 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 550 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 551 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 552 553 v = a->a; 554 xb = x; 555 556 for (i=0; i<mbs; i++) { 557 n = ai[1] - ai[0]; /* length of i_th block row of A */ 558 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 559 ib = aj + *ai; 560 jmin = 0; 561 if (*ib == i){ /* (diag of A)*x */ 562 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 563 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 564 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 565 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 566 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 567 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 568 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 569 v += 49; jmin++; 570 } 571 for (j=jmin; j<n; j++) { 572 /* (strict lower triangular part of A)*x */ 573 cval = ib[j]*7; 574 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 575 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 576 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 577 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 578 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 579 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 580 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 581 /* (strict upper triangular part of A)*x */ 582 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 583 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 584 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 585 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 586 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 587 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 588 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 589 v += 49; 590 } 591 xb +=7; ai++; 592 } 593 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 594 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 595 PetscLogFlops(98*(a->nz*2 - A->m) - A->m); 596 PetscFunctionReturn(0); 597 } 598 599 /* 600 This will not work with MatScalar == float because it calls the BLAS 601 */ 602 #undef __FUNCT__ 603 #define __FUNCT__ "MatMult_SeqSBAIJ_N" 604 int MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz) 605 { 606 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 607 PetscScalar *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0; 608 MatScalar *v; 609 int ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2; 610 int ncols,k; 611 612 PetscFunctionBegin; 613 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 614 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 615 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 616 617 aj = a->j; 618 v = a->a; 619 ii = a->i; 620 621 if (!a->mult_work) { 622 ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 623 } 624 work = a->mult_work; 625 626 for (i=0; i<mbs; i++) { 627 n = ii[1] - ii[0]; ncols = n*bs; 628 workt = work; idx=aj+ii[0]; 629 630 /* upper triangular part */ 631 for (j=0; j<n; j++) { 632 xb = x_ptr + bs*(*idx++); 633 for (k=0; k<bs; k++) workt[k] = xb[k]; 634 workt += bs; 635 } 636 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 637 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 638 639 /* strict lower triangular part */ 640 idx = aj+ii[0]; 641 if (*idx == i){ 642 ncols -= bs; v += bs2; idx++; n--; 643 } 644 645 if (ncols > 0){ 646 workt = work; 647 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 648 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 649 for (j=0; j<n; j++) { 650 zb = z_ptr + bs*(*idx++); 651 for (k=0; k<bs; k++) zb[k] += workt[k] ; 652 workt += bs; 653 } 654 } 655 x += bs; v += n*bs2; z += bs; ii++; 656 } 657 658 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 659 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 660 PetscLogFlops(2*(a->nz*2 - A->m)*bs2 - A->m); 661 PetscFunctionReturn(0); 662 } 663 664 #undef __FUNCT__ 665 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1" 666 int MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 667 { 668 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 669 PetscScalar *x,*y,*z,*xb,x1; 670 MatScalar *v; 671 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 672 673 PetscFunctionBegin; 674 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 675 if (yy != xx) { 676 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 677 } else { 678 y = x; 679 } 680 if (zz != yy) { 681 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 682 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 683 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 684 } else { 685 z = y; 686 } 687 688 v = a->a; 689 xb = x; 690 691 for (i=0; i<mbs; i++) { 692 n = ai[1] - ai[0]; /* length of i_th row of A */ 693 x1 = xb[0]; 694 ib = aj + *ai; 695 jmin = 0; 696 if (*ib == i) { /* (diag of A)*x */ 697 z[i] += *v++ * x[*ib++]; jmin++; 698 } 699 for (j=jmin; j<n; j++) { 700 cval = *ib; 701 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 702 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 703 } 704 xb++; ai++; 705 } 706 707 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 708 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 709 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 710 711 PetscLogFlops(2*(a->nz*2 - A->m)); 712 PetscFunctionReturn(0); 713 } 714 715 #undef __FUNCT__ 716 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2" 717 int MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 718 { 719 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 720 PetscScalar *x,*y,*z,*xb,x1,x2; 721 MatScalar *v; 722 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 723 724 PetscFunctionBegin; 725 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 726 if (yy != xx) { 727 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 728 } else { 729 y = x; 730 } 731 if (zz != yy) { 732 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 733 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 734 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 735 } else { 736 z = y; 737 } 738 739 v = a->a; 740 xb = x; 741 742 for (i=0; i<mbs; i++) { 743 n = ai[1] - ai[0]; /* length of i_th block row of A */ 744 x1 = xb[0]; x2 = xb[1]; 745 ib = aj + *ai; 746 jmin = 0; 747 if (*ib == i){ /* (diag of A)*x */ 748 z[2*i] += v[0]*x1 + v[2]*x2; 749 z[2*i+1] += v[2]*x1 + v[3]*x2; 750 v += 4; jmin++; 751 } 752 for (j=jmin; j<n; j++) { 753 /* (strict lower triangular part of A)*x */ 754 cval = ib[j]*2; 755 z[cval] += v[0]*x1 + v[1]*x2; 756 z[cval+1] += v[2]*x1 + v[3]*x2; 757 /* (strict upper triangular part of A)*x */ 758 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 759 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 760 v += 4; 761 } 762 xb +=2; ai++; 763 } 764 765 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 766 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 767 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 768 769 PetscLogFlops(4*(a->nz*2 - A->m)); 770 PetscFunctionReturn(0); 771 } 772 773 #undef __FUNCT__ 774 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3" 775 int MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 776 { 777 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 778 PetscScalar *x,*y,*z,*xb,x1,x2,x3; 779 MatScalar *v; 780 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 781 782 PetscFunctionBegin; 783 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 784 if (yy != xx) { 785 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 786 } else { 787 y = x; 788 } 789 if (zz != yy) { 790 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 791 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 792 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 793 } else { 794 z = y; 795 } 796 797 v = a->a; 798 xb = x; 799 800 for (i=0; i<mbs; i++) { 801 n = ai[1] - ai[0]; /* length of i_th block row of A */ 802 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 803 ib = aj + *ai; 804 jmin = 0; 805 if (*ib == i){ /* (diag of A)*x */ 806 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 807 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 808 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 809 v += 9; jmin++; 810 } 811 for (j=jmin; j<n; j++) { 812 /* (strict lower triangular part of A)*x */ 813 cval = ib[j]*3; 814 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 815 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 816 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 817 /* (strict upper triangular part of A)*x */ 818 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 819 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 820 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 821 v += 9; 822 } 823 xb +=3; ai++; 824 } 825 826 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 827 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 828 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 829 830 PetscLogFlops(18*(a->nz*2 - A->m)); 831 PetscFunctionReturn(0); 832 } 833 834 #undef __FUNCT__ 835 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4" 836 int MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 837 { 838 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 839 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4; 840 MatScalar *v; 841 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 842 843 PetscFunctionBegin; 844 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 845 if (yy != xx) { 846 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 847 } else { 848 y = x; 849 } 850 if (zz != yy) { 851 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 852 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 853 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 854 } else { 855 z = y; 856 } 857 858 v = a->a; 859 xb = x; 860 861 for (i=0; i<mbs; i++) { 862 n = ai[1] - ai[0]; /* length of i_th block row of A */ 863 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 864 ib = aj + *ai; 865 jmin = 0; 866 if (*ib == i){ /* (diag of A)*x */ 867 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 868 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 869 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 870 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 871 v += 16; jmin++; 872 } 873 for (j=jmin; j<n; j++) { 874 /* (strict lower triangular part of A)*x */ 875 cval = ib[j]*4; 876 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 877 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 878 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 879 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 880 /* (strict upper triangular part of A)*x */ 881 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 882 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 883 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 884 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 885 v += 16; 886 } 887 xb +=4; ai++; 888 } 889 890 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 891 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 892 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 893 894 PetscLogFlops(32*(a->nz*2 - A->m)); 895 PetscFunctionReturn(0); 896 } 897 898 #undef __FUNCT__ 899 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5" 900 int MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 901 { 902 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 903 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4,x5; 904 MatScalar *v; 905 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 906 907 PetscFunctionBegin; 908 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 909 if (yy != xx) { 910 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 911 } else { 912 y = x; 913 } 914 if (zz != yy) { 915 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 916 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 917 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 918 } else { 919 z = y; 920 } 921 922 v = a->a; 923 xb = x; 924 925 for (i=0; i<mbs; i++) { 926 n = ai[1] - ai[0]; /* length of i_th block row of A */ 927 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 928 ib = aj + *ai; 929 jmin = 0; 930 if (*ib == i){ /* (diag of A)*x */ 931 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 932 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 933 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 934 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 935 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 936 v += 25; jmin++; 937 } 938 for (j=jmin; j<n; j++) { 939 /* (strict lower triangular part of A)*x */ 940 cval = ib[j]*5; 941 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 942 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 943 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 944 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 945 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 946 /* (strict upper triangular part of A)*x */ 947 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 948 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 949 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 950 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 951 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 952 v += 25; 953 } 954 xb +=5; ai++; 955 } 956 957 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 958 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 959 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 960 961 PetscLogFlops(50*(a->nz*2 - A->m)); 962 PetscFunctionReturn(0); 963 } 964 #undef __FUNCT__ 965 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6" 966 int MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 967 { 968 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 969 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6; 970 MatScalar *v; 971 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 972 973 PetscFunctionBegin; 974 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 975 if (yy != xx) { 976 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 977 } else { 978 y = x; 979 } 980 if (zz != yy) { 981 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 982 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 983 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 984 } else { 985 z = y; 986 } 987 988 v = a->a; 989 xb = x; 990 991 for (i=0; i<mbs; i++) { 992 n = ai[1] - ai[0]; /* length of i_th block row of A */ 993 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 994 ib = aj + *ai; 995 jmin = 0; 996 if (*ib == i){ /* (diag of A)*x */ 997 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 998 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 999 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 1000 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 1001 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 1002 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 1003 v += 36; jmin++; 1004 } 1005 for (j=jmin; j<n; j++) { 1006 /* (strict lower triangular part of A)*x */ 1007 cval = ib[j]*6; 1008 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 1009 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 1010 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 1011 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 1012 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 1013 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 1014 /* (strict upper triangular part of A)*x */ 1015 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 1016 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 1017 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 1018 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 1019 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 1020 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 1021 v += 36; 1022 } 1023 xb +=6; ai++; 1024 } 1025 1026 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1027 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 1028 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1029 1030 PetscLogFlops(72*(a->nz*2 - A->m)); 1031 PetscFunctionReturn(0); 1032 } 1033 1034 #undef __FUNCT__ 1035 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7" 1036 int MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1037 { 1038 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1039 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6,x7; 1040 MatScalar *v; 1041 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 1042 1043 PetscFunctionBegin; 1044 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1045 if (yy != xx) { 1046 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 1047 } else { 1048 y = x; 1049 } 1050 if (zz != yy) { 1051 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 1052 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1053 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 1054 } else { 1055 z = y; 1056 } 1057 1058 v = a->a; 1059 xb = x; 1060 1061 for (i=0; i<mbs; i++) { 1062 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1063 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 1064 ib = aj + *ai; 1065 jmin = 0; 1066 if (*ib == i){ /* (diag of A)*x */ 1067 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 1068 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 1069 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 1070 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 1071 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 1072 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 1073 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1074 v += 49; jmin++; 1075 } 1076 for (j=jmin; j<n; j++) { 1077 /* (strict lower triangular part of A)*x */ 1078 cval = ib[j]*7; 1079 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 1080 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 1081 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 1082 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 1083 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 1084 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 1085 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1086 /* (strict upper triangular part of A)*x */ 1087 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 1088 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 1089 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 1090 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 1091 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 1092 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 1093 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 1094 v += 49; 1095 } 1096 xb +=7; ai++; 1097 } 1098 1099 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1100 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 1101 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1102 1103 PetscLogFlops(98*(a->nz*2 - A->m)); 1104 PetscFunctionReturn(0); 1105 } 1106 1107 #undef __FUNCT__ 1108 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N" 1109 int MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1110 { 1111 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1112 PetscScalar *x,*x_ptr,*y,*z,*z_ptr=0,*xb,*zb,*work,*workt; 1113 MatScalar *v; 1114 int ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2; 1115 int ncols,k; 1116 1117 PetscFunctionBegin; 1118 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 1119 if (yy != xx) { 1120 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 1121 } else { 1122 y = x; 1123 } 1124 if (zz != yy) { 1125 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 1126 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 1127 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 1128 } else { 1129 z = y; 1130 } 1131 1132 aj = a->j; 1133 v = a->a; 1134 ii = a->i; 1135 1136 if (!a->mult_work) { 1137 ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1138 } 1139 work = a->mult_work; 1140 1141 1142 for (i=0; i<mbs; i++) { 1143 n = ii[1] - ii[0]; ncols = n*bs; 1144 workt = work; idx=aj+ii[0]; 1145 1146 /* upper triangular part */ 1147 for (j=0; j<n; j++) { 1148 xb = x_ptr + bs*(*idx++); 1149 for (k=0; k<bs; k++) workt[k] = xb[k]; 1150 workt += bs; 1151 } 1152 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1153 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1154 1155 /* strict lower triangular part */ 1156 idx = aj+ii[0]; 1157 if (*idx == i){ 1158 ncols -= bs; v += bs2; idx++; n--; 1159 } 1160 if (ncols > 0){ 1161 workt = work; 1162 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1163 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 1164 for (j=0; j<n; j++) { 1165 zb = z_ptr + bs*(*idx++); 1166 /* idx++; */ 1167 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1168 workt += bs; 1169 } 1170 } 1171 1172 x += bs; v += n*bs2; z += bs; ii++; 1173 } 1174 1175 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1176 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 1177 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1178 1179 PetscLogFlops(2*(a->nz*2 - A->m)); 1180 PetscFunctionReturn(0); 1181 } 1182 1183 #undef __FUNCT__ 1184 #define __FUNCT__ "MatMultTranspose_SeqSBAIJ" 1185 int MatMultTranspose_SeqSBAIJ(Mat A,Vec xx,Vec zz) 1186 { 1187 int ierr; 1188 1189 PetscFunctionBegin; 1190 ierr = MatMult(A,xx,zz);CHKERRQ(ierr); 1191 PetscFunctionReturn(0); 1192 } 1193 1194 #undef __FUNCT__ 1195 #define __FUNCT__ "MatMultTransposeAdd_SeqSBAIJ" 1196 int MatMultTransposeAdd_SeqSBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1197 1198 { 1199 int ierr; 1200 1201 PetscFunctionBegin; 1202 ierr = MatMultAdd(A,xx,yy,zz);CHKERRQ(ierr); 1203 PetscFunctionReturn(0); 1204 } 1205 1206 #undef __FUNCT__ 1207 #define __FUNCT__ "MatScale_SeqSBAIJ" 1208 int MatScale_SeqSBAIJ(const PetscScalar *alpha,Mat inA) 1209 { 1210 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data; 1211 int one = 1,totalnz = a->bs2*a->nz; 1212 1213 PetscFunctionBegin; 1214 BLscal_(&totalnz,(PetscScalar*)alpha,a->a,&one); 1215 PetscLogFlops(totalnz); 1216 PetscFunctionReturn(0); 1217 } 1218 1219 #undef __FUNCT__ 1220 #define __FUNCT__ "MatNorm_SeqSBAIJ" 1221 int MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm) 1222 { 1223 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1224 MatScalar *v = a->a; 1225 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1226 int i,j,k,bs = a->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j; 1227 int *jl,*il,jmin,jmax,ierr,nexti,ik,*col; 1228 1229 PetscFunctionBegin; 1230 if (type == NORM_FROBENIUS) { 1231 for (k=0; k<mbs; k++){ 1232 jmin = a->i[k]; jmax = a->i[k+1]; 1233 col = aj + jmin; 1234 if (*col == k){ /* diagonal block */ 1235 for (i=0; i<bs2; i++){ 1236 #if defined(PETSC_USE_COMPLEX) 1237 sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++; 1238 #else 1239 sum_diag += (*v)*(*v); v++; 1240 #endif 1241 } 1242 jmin++; 1243 } 1244 for (j=jmin; j<jmax; j++){ /* off-diagonal blocks */ 1245 for (i=0; i<bs2; i++){ 1246 #if defined(PETSC_USE_COMPLEX) 1247 sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++; 1248 #else 1249 sum_off += (*v)*(*v); v++; 1250 #endif 1251 } 1252 } 1253 } 1254 *norm = sqrt(sum_diag + 2*sum_off); 1255 1256 } else if (type == NORM_INFINITY) { /* maximum row sum */ 1257 ierr = PetscMalloc(mbs*sizeof(int),&il);CHKERRQ(ierr); 1258 ierr = PetscMalloc(mbs*sizeof(int),&jl);CHKERRQ(ierr); 1259 ierr = PetscMalloc(bs*sizeof(PetscReal),&sum);CHKERRQ(ierr); 1260 for (i=0; i<mbs; i++) { 1261 jl[i] = mbs; il[0] = 0; 1262 } 1263 1264 *norm = 0.0; 1265 for (k=0; k<mbs; k++) { /* k_th block row */ 1266 for (j=0; j<bs; j++) sum[j]=0.0; 1267 1268 /*-- col sum --*/ 1269 i = jl[k]; /* first |A(i,k)| to be added */ 1270 /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1271 at step k */ 1272 while (i<mbs){ 1273 nexti = jl[i]; /* next block row to be added */ 1274 ik = il[i]; /* block index of A(i,k) in the array a */ 1275 for (j=0; j<bs; j++){ 1276 v = a->a + ik*bs2 + j*bs; 1277 for (k1=0; k1<bs; k1++) { 1278 sum[j] += PetscAbsScalar(*v); v++; 1279 } 1280 } 1281 /* update il, jl */ 1282 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1283 jmax = a->i[i+1]; 1284 if (jmin < jmax){ 1285 il[i] = jmin; 1286 j = a->j[jmin]; 1287 jl[i] = jl[j]; jl[j]=i; 1288 } 1289 i = nexti; 1290 } 1291 1292 /*-- row sum --*/ 1293 jmin = a->i[k]; jmax = a->i[k+1]; 1294 for (i=jmin; i<jmax; i++) { 1295 for (j=0; j<bs; j++){ 1296 v = a->a + i*bs2 + j; 1297 for (k1=0; k1<bs; k1++){ 1298 sum[j] += PetscAbsScalar(*v); 1299 v += bs; 1300 } 1301 } 1302 } 1303 /* add k_th block row to il, jl */ 1304 col = aj+jmin; 1305 if (*col == k) jmin++; 1306 if (jmin < jmax){ 1307 il[k] = jmin; 1308 j = a->j[jmin]; 1309 jl[k] = jl[j]; jl[j] = k; 1310 } 1311 for (j=0; j<bs; j++){ 1312 if (sum[j] > *norm) *norm = sum[j]; 1313 } 1314 } 1315 ierr = PetscFree(il);CHKERRQ(ierr); 1316 ierr = PetscFree(jl);CHKERRQ(ierr); 1317 ierr = PetscFree(sum);CHKERRQ(ierr); 1318 } else { 1319 SETERRQ(PETSC_ERR_SUP,"No support for this norm yet"); 1320 } 1321 PetscFunctionReturn(0); 1322 } 1323 1324 #undef __FUNCT__ 1325 #define __FUNCT__ "MatEqual_SeqSBAIJ" 1326 int MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg) 1327 { 1328 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data; 1329 int ierr; 1330 1331 PetscFunctionBegin; 1332 1333 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1334 if ((A->m != B->m) || (A->n != B->n) || (a->bs != b->bs)|| (a->nz != b->nz)) { 1335 *flg = PETSC_FALSE; 1336 PetscFunctionReturn(0); 1337 } 1338 1339 /* if the a->i are the same */ 1340 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(int),flg);CHKERRQ(ierr); 1341 if (*flg == PETSC_FALSE) { 1342 PetscFunctionReturn(0); 1343 } 1344 1345 /* if a->j are the same */ 1346 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int),flg);CHKERRQ(ierr); 1347 if (*flg == PETSC_FALSE) { 1348 PetscFunctionReturn(0); 1349 } 1350 /* if a->a are the same */ 1351 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1352 1353 PetscFunctionReturn(0); 1354 } 1355 1356 #undef __FUNCT__ 1357 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ" 1358 int MatGetDiagonal_SeqSBAIJ(Mat A,Vec v) 1359 { 1360 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1361 int ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 1362 PetscScalar *x,zero = 0.0; 1363 MatScalar *aa,*aa_j; 1364 1365 PetscFunctionBegin; 1366 bs = a->bs; 1367 if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1"); 1368 1369 aa = a->a; 1370 ai = a->i; 1371 aj = a->j; 1372 ambs = a->mbs; 1373 bs2 = a->bs2; 1374 1375 ierr = VecSet(&zero,v);CHKERRQ(ierr); 1376 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1377 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1378 if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1379 for (i=0; i<ambs; i++) { 1380 j=ai[i]; 1381 if (aj[j] == i) { /* if this is a diagonal element */ 1382 row = i*bs; 1383 aa_j = aa + j*bs2; 1384 if (A->factor && bs==1){ 1385 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k]; 1386 } else { 1387 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1388 } 1389 } 1390 } 1391 1392 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1393 PetscFunctionReturn(0); 1394 } 1395 1396 #undef __FUNCT__ 1397 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ" 1398 int MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr) 1399 { 1400 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1401 PetscScalar *l,*r,x,*li,*ri; 1402 MatScalar *aa,*v; 1403 int ierr,i,j,k,lm,rn,M,m,*ai,*aj,mbs,tmp,bs,bs2; 1404 1405 PetscFunctionBegin; 1406 ai = a->i; 1407 aj = a->j; 1408 aa = a->a; 1409 m = A->m; 1410 bs = a->bs; 1411 mbs = a->mbs; 1412 bs2 = a->bs2; 1413 1414 if (ll != rr) { 1415 SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n"); 1416 } 1417 if (ll) { 1418 ierr = VecGetArray(ll,&l);CHKERRQ(ierr); 1419 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1420 if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 1421 for (i=0; i<mbs; i++) { /* for each block row */ 1422 M = ai[i+1] - ai[i]; 1423 li = l + i*bs; 1424 v = aa + bs2*ai[i]; 1425 for (j=0; j<M; j++) { /* for each block */ 1426 for (k=0; k<bs2; k++) { 1427 (*v++) *= li[k%bs]; 1428 } 1429 #ifdef CONT 1430 /* will be used to replace the above loop */ 1431 ri = l + bs*aj[ai[i]+j]; 1432 for (k=0; k<bs; k++) { /* column value */ 1433 x = ri[k]; 1434 for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x; 1435 } 1436 #endif 1437 1438 } 1439 } 1440 ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr); 1441 PetscLogFlops(2*a->nz); 1442 } 1443 /* will be deleted */ 1444 if (rr) { 1445 ierr = VecGetArray(rr,&r);CHKERRQ(ierr); 1446 ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr); 1447 if (rn != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 1448 for (i=0; i<mbs; i++) { /* for each block row */ 1449 M = ai[i+1] - ai[i]; 1450 v = aa + bs2*ai[i]; 1451 for (j=0; j<M; j++) { /* for each block */ 1452 ri = r + bs*aj[ai[i]+j]; 1453 for (k=0; k<bs; k++) { 1454 x = ri[k]; 1455 for (tmp=0; tmp<bs; tmp++) (*v++) *= x; 1456 } 1457 } 1458 } 1459 ierr = VecRestoreArray(rr,&r);CHKERRQ(ierr); 1460 PetscLogFlops(a->nz); 1461 } 1462 PetscFunctionReturn(0); 1463 } 1464 1465 #undef __FUNCT__ 1466 #define __FUNCT__ "MatGetInfo_SeqSBAIJ" 1467 int MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info) 1468 { 1469 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1470 1471 PetscFunctionBegin; 1472 info->rows_global = (double)A->m; 1473 info->columns_global = (double)A->m; 1474 info->rows_local = (double)A->m; 1475 info->columns_local = (double)A->m; 1476 info->block_size = a->bs2; 1477 info->nz_allocated = a->maxnz; /*num. of nonzeros in upper triangular part */ 1478 info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */ 1479 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 1480 info->assemblies = A->num_ass; 1481 info->mallocs = a->reallocs; 1482 info->memory = A->mem; 1483 if (A->factor) { 1484 info->fill_ratio_given = A->info.fill_ratio_given; 1485 info->fill_ratio_needed = A->info.fill_ratio_needed; 1486 info->factor_mallocs = A->info.factor_mallocs; 1487 } else { 1488 info->fill_ratio_given = 0; 1489 info->fill_ratio_needed = 0; 1490 info->factor_mallocs = 0; 1491 } 1492 PetscFunctionReturn(0); 1493 } 1494 1495 1496 #undef __FUNCT__ 1497 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ" 1498 int MatZeroEntries_SeqSBAIJ(Mat A) 1499 { 1500 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1501 int ierr; 1502 1503 PetscFunctionBegin; 1504 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 1505 PetscFunctionReturn(0); 1506 } 1507 1508 #undef __FUNCT__ 1509 #define __FUNCT__ "MatGetRowMax_SeqSBAIJ" 1510 int MatGetRowMax_SeqSBAIJ(Mat A,Vec v) 1511 { 1512 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1513 int ierr,i,j,n,row,col,bs,*ai,*aj,mbs; 1514 PetscReal atmp; 1515 MatScalar *aa; 1516 PetscScalar zero = 0.0,*x; 1517 int ncols,brow,bcol,krow,kcol; 1518 1519 PetscFunctionBegin; 1520 if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1521 bs = a->bs; 1522 aa = a->a; 1523 ai = a->i; 1524 aj = a->j; 1525 mbs = a->mbs; 1526 1527 ierr = VecSet(&zero,v);CHKERRQ(ierr); 1528 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1529 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1530 if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1531 for (i=0; i<mbs; i++) { 1532 ncols = ai[1] - ai[0]; ai++; 1533 brow = bs*i; 1534 for (j=0; j<ncols; j++){ 1535 bcol = bs*(*aj); 1536 for (kcol=0; kcol<bs; kcol++){ 1537 col = bcol + kcol; /* col index */ 1538 for (krow=0; krow<bs; krow++){ 1539 atmp = PetscAbsScalar(*aa); aa++; 1540 row = brow + krow; /* row index */ 1541 /* printf("val[%d,%d]: %g\n",row,col,atmp); */ 1542 if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1543 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 1544 } 1545 } 1546 aj++; 1547 } 1548 } 1549 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1550 PetscFunctionReturn(0); 1551 } 1552