1 #define PETSCMAT_DLL 2 3 #include "src/mat/impls/baij/seq/baij.h" 4 #include "src/inline/spops.h" 5 #include "src/inline/ilu.h" 6 #include "petscbt.h" 7 #include "src/mat/impls/sbaij/seq/sbaij.h" 8 9 #undef __FUNCT__ 10 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ" 11 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 12 { 13 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 14 PetscErrorCode ierr; 15 PetscInt brow,i,j,k,l,mbs,n,*idx,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2; 16 PetscBT table,table0; 17 18 PetscFunctionBegin; 19 if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 20 mbs = a->mbs; 21 ai = a->i; 22 aj = a->j; 23 bs = A->bs; 24 ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr); 25 ierr = PetscMalloc((mbs+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr); 26 ierr = PetscMalloc((A->m+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr); 27 ierr = PetscBTCreate(mbs,table0);CHKERRQ(ierr); 28 29 for (i=0; i<is_max; i++) { /* for each is */ 30 isz = 0; 31 ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr); 32 33 /* Extract the indices, assume there can be duplicate entries */ 34 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 35 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 36 37 /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */ 38 bcol_max = 0; 39 for (j=0; j<n ; ++j){ 40 brow = idx[j]/bs; /* convert the indices into block indices */ 41 if (brow >= mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 42 if(!PetscBTLookupSet(table,brow)) { 43 nidx[isz++] = brow; 44 if (bcol_max < brow) bcol_max = brow; 45 } 46 } 47 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 48 ierr = ISDestroy(is[i]);CHKERRQ(ierr); 49 50 k = 0; 51 for (j=0; j<ov; j++){ /* for each overlap */ 52 /* set table0 for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 53 ierr = PetscBTMemzero(mbs,table0);CHKERRQ(ierr); 54 for (l=k; l<isz; l++) { ierr = PetscBTSet(table0,nidx[l]);CHKERRQ(ierr); } 55 56 n = isz; /* length of the updated is[i] */ 57 for (brow=0; brow<mbs; brow++){ 58 start = ai[brow]; end = ai[brow+1]; 59 if (PetscBTLookup(table0,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */ 60 for (l = start; l<end ; l++){ 61 bcol = aj[l]; 62 if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;} 63 } 64 k++; 65 if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 66 } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 67 for (l = start; l<end ; l++){ 68 bcol = aj[l]; 69 if (bcol > bcol_max) break; 70 if (PetscBTLookup(table0,bcol)){ 71 if (!PetscBTLookupSet(table,brow)) {nidx[isz++] = brow;} 72 break; /* for l = start; l<end ; l++) */ 73 } 74 } 75 } 76 } 77 } /* for each overlap */ 78 79 /* expand the Index Set */ 80 for (j=0; j<isz; j++) { 81 for (k=0; k<bs; k++) 82 nidx2[j*bs+k] = nidx[j]*bs+k; 83 } 84 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr); 85 } 86 ierr = PetscBTDestroy(table);CHKERRQ(ierr); 87 ierr = PetscFree(nidx);CHKERRQ(ierr); 88 ierr = PetscFree(nidx2);CHKERRQ(ierr); 89 ierr = PetscBTDestroy(table0);CHKERRQ(ierr); 90 PetscFunctionReturn(0); 91 } 92 93 #undef __FUNCT__ 94 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private" 95 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B) 96 { 97 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c; 98 PetscErrorCode ierr; 99 PetscInt *smap,i,k,kstart,kend,oldcols = a->mbs,*lens; 100 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 101 PetscInt *irow,nrows,*ssmap,bs=A->bs,bs2=a->bs2; 102 PetscInt *aj = a->j,*ai = a->i; 103 MatScalar *mat_a; 104 Mat C; 105 PetscTruth flag; 106 107 PetscFunctionBegin; 108 if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro"); 109 ierr = ISSorted(iscol,(PetscTruth*)&i);CHKERRQ(ierr); 110 if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 111 112 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 113 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 114 115 ierr = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr); 116 ssmap = smap; 117 ierr = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr); 118 ierr = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr); 119 for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */ 120 /* determine lens of each row */ 121 for (i=0; i<nrows; i++) { 122 kstart = ai[irow[i]]; 123 kend = kstart + a->ilen[irow[i]]; 124 lens[i] = 0; 125 for (k=kstart; k<kend; k++) { 126 if (ssmap[aj[k]]) { 127 lens[i]++; 128 } 129 } 130 } 131 /* Create and fill new matrix */ 132 if (scall == MAT_REUSE_MATRIX) { 133 c = (Mat_SeqSBAIJ *)((*B)->data); 134 135 if (c->mbs!=nrows || (*B)->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 136 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 137 if (!flag) { 138 SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 139 } 140 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 141 C = *B; 142 } else { 143 ierr = MatCreate(A->comm,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE,&C);CHKERRQ(ierr); 144 ierr = MatSetType(C,A->type_name);CHKERRQ(ierr); 145 ierr = MatSeqSBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr); 146 } 147 c = (Mat_SeqSBAIJ *)(C->data); 148 for (i=0; i<nrows; i++) { 149 row = irow[i]; 150 kstart = ai[row]; 151 kend = kstart + a->ilen[row]; 152 mat_i = c->i[i]; 153 mat_j = c->j + mat_i; 154 mat_a = c->a + mat_i*bs2; 155 mat_ilen = c->ilen + i; 156 for (k=kstart; k<kend; k++) { 157 if ((tcol=ssmap[a->j[k]])) { 158 *mat_j++ = tcol - 1; 159 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 160 mat_a += bs2; 161 (*mat_ilen)++; 162 } 163 } 164 } 165 166 /* Free work space */ 167 ierr = PetscFree(smap);CHKERRQ(ierr); 168 ierr = PetscFree(lens);CHKERRQ(ierr); 169 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 170 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 171 172 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 173 *B = C; 174 PetscFunctionReturn(0); 175 } 176 177 #undef __FUNCT__ 178 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ" 179 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B) 180 { 181 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 182 IS is1; 183 PetscErrorCode ierr; 184 PetscInt *vary,*iary,*irow,nrows,i,bs=A->bs,count; 185 186 PetscFunctionBegin; 187 if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro"); 188 189 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 190 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 191 192 /* Verify if the indices corespond to each element in a block 193 and form the IS with compressed IS */ 194 ierr = PetscMalloc(2*(a->mbs+1)*sizeof(PetscInt),&vary);CHKERRQ(ierr); 195 iary = vary + a->mbs; 196 ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr); 197 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 198 199 count = 0; 200 for (i=0; i<a->mbs; i++) { 201 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_INCOMP,"Index set does not match blocks"); 202 if (vary[i]==bs) iary[count++] = i; 203 } 204 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr); 205 206 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 207 ierr = PetscFree(vary);CHKERRQ(ierr); 208 209 ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);CHKERRQ(ierr); 210 ISDestroy(is1); 211 PetscFunctionReturn(0); 212 } 213 214 #undef __FUNCT__ 215 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ" 216 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 217 { 218 PetscErrorCode ierr; 219 PetscInt i; 220 221 PetscFunctionBegin; 222 if (scall == MAT_INITIAL_MATRIX) { 223 ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr); 224 } 225 226 for (i=0; i<n; i++) { 227 ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr); 228 } 229 PetscFunctionReturn(0); 230 } 231 232 /* -------------------------------------------------------*/ 233 /* Should check that shapes of vectors and matrices match */ 234 /* -------------------------------------------------------*/ 235 #include "petscblaslapack.h" 236 237 #undef __FUNCT__ 238 #define __FUNCT__ "MatMult_SeqSBAIJ_1" 239 PetscErrorCode MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz) 240 { 241 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 242 PetscScalar *x,*z,*xb,x1,zero=0.0; 243 MatScalar *v; 244 PetscErrorCode ierr; 245 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 246 247 PetscFunctionBegin; 248 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 249 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 250 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 251 252 v = a->a; 253 xb = x; 254 255 for (i=0; i<mbs; i++) { 256 n = ai[1] - ai[0]; /* length of i_th row of A */ 257 x1 = xb[0]; 258 ib = aj + *ai; 259 jmin = 0; 260 if (*ib == i) { /* (diag of A)*x */ 261 z[i] += *v++ * x[*ib++]; 262 jmin++; 263 } 264 for (j=jmin; j<n; j++) { 265 cval = *ib; 266 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 267 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 268 } 269 xb++; ai++; 270 } 271 272 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 273 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 274 ierr = PetscLogFlops(2*(a->nz*2 - A->m) - A->m);CHKERRQ(ierr); /* nz = (nz+m)/2 */ 275 PetscFunctionReturn(0); 276 } 277 278 #undef __FUNCT__ 279 #define __FUNCT__ "MatMult_SeqSBAIJ_2" 280 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz) 281 { 282 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 283 PetscScalar *x,*z,*xb,x1,x2,zero=0.0; 284 MatScalar *v; 285 PetscErrorCode ierr; 286 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 287 288 PetscFunctionBegin; 289 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 290 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 291 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 292 293 v = a->a; 294 xb = x; 295 296 for (i=0; i<mbs; i++) { 297 n = ai[1] - ai[0]; /* length of i_th block row of A */ 298 x1 = xb[0]; x2 = xb[1]; 299 ib = aj + *ai; 300 jmin = 0; 301 if (*ib == i){ /* (diag of A)*x */ 302 z[2*i] += v[0]*x1 + v[2]*x2; 303 z[2*i+1] += v[2]*x1 + v[3]*x2; 304 v += 4; jmin++; 305 } 306 for (j=jmin; j<n; j++) { 307 /* (strict lower triangular part of A)*x */ 308 cval = ib[j]*2; 309 z[cval] += v[0]*x1 + v[1]*x2; 310 z[cval+1] += v[2]*x1 + v[3]*x2; 311 /* (strict upper triangular part of A)*x */ 312 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 313 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 314 v += 4; 315 } 316 xb +=2; ai++; 317 } 318 319 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 320 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 321 ierr = PetscLogFlops(8*(a->nz*2 - A->m) - A->m);CHKERRQ(ierr); 322 PetscFunctionReturn(0); 323 } 324 325 #undef __FUNCT__ 326 #define __FUNCT__ "MatMult_SeqSBAIJ_3" 327 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz) 328 { 329 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 330 PetscScalar *x,*z,*xb,x1,x2,x3,zero=0.0; 331 MatScalar *v; 332 PetscErrorCode ierr; 333 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 334 335 PetscFunctionBegin; 336 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 337 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 338 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 339 340 v = a->a; 341 xb = x; 342 343 for (i=0; i<mbs; i++) { 344 n = ai[1] - ai[0]; /* length of i_th block row of A */ 345 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 346 ib = aj + *ai; 347 jmin = 0; 348 if (*ib == i){ /* (diag of A)*x */ 349 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 350 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 351 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 352 v += 9; jmin++; 353 } 354 for (j=jmin; j<n; j++) { 355 /* (strict lower triangular part of A)*x */ 356 cval = ib[j]*3; 357 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 358 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 359 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 360 /* (strict upper triangular part of A)*x */ 361 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 362 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 363 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 364 v += 9; 365 } 366 xb +=3; ai++; 367 } 368 369 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 370 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 371 ierr = PetscLogFlops(18*(a->nz*2 - A->m) - A->m);CHKERRQ(ierr); 372 PetscFunctionReturn(0); 373 } 374 375 #undef __FUNCT__ 376 #define __FUNCT__ "MatMult_SeqSBAIJ_4" 377 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz) 378 { 379 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 380 PetscScalar *x,*z,*xb,x1,x2,x3,x4,zero=0.0; 381 MatScalar *v; 382 PetscErrorCode ierr; 383 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 384 385 PetscFunctionBegin; 386 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 387 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 388 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 389 390 v = a->a; 391 xb = x; 392 393 for (i=0; i<mbs; i++) { 394 n = ai[1] - ai[0]; /* length of i_th block row of A */ 395 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 396 ib = aj + *ai; 397 jmin = 0; 398 if (*ib == i){ /* (diag of A)*x */ 399 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 400 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 401 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 402 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 403 v += 16; jmin++; 404 } 405 for (j=jmin; j<n; j++) { 406 /* (strict lower triangular part of A)*x */ 407 cval = ib[j]*4; 408 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 409 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 410 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 411 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 412 /* (strict upper triangular part of A)*x */ 413 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 414 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 415 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 416 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 417 v += 16; 418 } 419 xb +=4; ai++; 420 } 421 422 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 423 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 424 ierr = PetscLogFlops(32*(a->nz*2 - A->m) - A->m);CHKERRQ(ierr); 425 PetscFunctionReturn(0); 426 } 427 428 #undef __FUNCT__ 429 #define __FUNCT__ "MatMult_SeqSBAIJ_5" 430 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz) 431 { 432 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 433 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0; 434 MatScalar *v; 435 PetscErrorCode ierr; 436 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 437 438 PetscFunctionBegin; 439 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 440 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 441 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 442 443 v = a->a; 444 xb = x; 445 446 for (i=0; i<mbs; i++) { 447 n = ai[1] - ai[0]; /* length of i_th block row of A */ 448 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 449 ib = aj + *ai; 450 jmin = 0; 451 if (*ib == i){ /* (diag of A)*x */ 452 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 453 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 454 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 455 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 456 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 457 v += 25; jmin++; 458 } 459 for (j=jmin; j<n; j++) { 460 /* (strict lower triangular part of A)*x */ 461 cval = ib[j]*5; 462 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 463 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 464 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 465 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 466 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 467 /* (strict upper triangular part of A)*x */ 468 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 469 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 470 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 471 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 472 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 473 v += 25; 474 } 475 xb +=5; ai++; 476 } 477 478 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 479 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 480 ierr = PetscLogFlops(50*(a->nz*2 - A->m) - A->m);CHKERRQ(ierr); 481 PetscFunctionReturn(0); 482 } 483 484 485 #undef __FUNCT__ 486 #define __FUNCT__ "MatMult_SeqSBAIJ_6" 487 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz) 488 { 489 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 490 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0; 491 MatScalar *v; 492 PetscErrorCode ierr; 493 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 494 495 PetscFunctionBegin; 496 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 497 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 498 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 499 500 v = a->a; 501 xb = x; 502 503 for (i=0; i<mbs; i++) { 504 n = ai[1] - ai[0]; /* length of i_th block row of A */ 505 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 506 ib = aj + *ai; 507 jmin = 0; 508 if (*ib == i){ /* (diag of A)*x */ 509 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 510 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 511 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 512 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 513 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 514 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 515 v += 36; jmin++; 516 } 517 for (j=jmin; j<n; j++) { 518 /* (strict lower triangular part of A)*x */ 519 cval = ib[j]*6; 520 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 521 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 522 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 523 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 524 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 525 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 526 /* (strict upper triangular part of A)*x */ 527 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 528 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 529 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 530 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 531 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 532 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 533 v += 36; 534 } 535 xb +=6; ai++; 536 } 537 538 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 539 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 540 ierr = PetscLogFlops(72*(a->nz*2 - A->m) - A->m);CHKERRQ(ierr); 541 PetscFunctionReturn(0); 542 } 543 #undef __FUNCT__ 544 #define __FUNCT__ "MatMult_SeqSBAIJ_7" 545 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz) 546 { 547 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 548 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0; 549 MatScalar *v; 550 PetscErrorCode ierr; 551 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 552 553 PetscFunctionBegin; 554 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 555 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 556 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 557 558 v = a->a; 559 xb = x; 560 561 for (i=0; i<mbs; i++) { 562 n = ai[1] - ai[0]; /* length of i_th block row of A */ 563 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 564 ib = aj + *ai; 565 jmin = 0; 566 if (*ib == i){ /* (diag of A)*x */ 567 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 568 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 569 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 570 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 571 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 572 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 573 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 574 v += 49; jmin++; 575 } 576 for (j=jmin; j<n; j++) { 577 /* (strict lower triangular part of A)*x */ 578 cval = ib[j]*7; 579 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 580 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 581 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 582 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 583 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 584 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 585 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 586 /* (strict upper triangular part of A)*x */ 587 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 588 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 589 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 590 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 591 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 592 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 593 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 594 v += 49; 595 } 596 xb +=7; ai++; 597 } 598 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 599 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 600 ierr = PetscLogFlops(98*(a->nz*2 - A->m) - A->m);CHKERRQ(ierr); 601 PetscFunctionReturn(0); 602 } 603 604 /* 605 This will not work with MatScalar == float because it calls the BLAS 606 */ 607 #undef __FUNCT__ 608 #define __FUNCT__ "MatMult_SeqSBAIJ_N" 609 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz) 610 { 611 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 612 PetscScalar *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0; 613 MatScalar *v; 614 PetscErrorCode ierr; 615 PetscInt mbs=a->mbs,i,*idx,*aj,*ii,bs=A->bs,j,n,bs2=a->bs2,ncols,k; 616 617 PetscFunctionBegin; 618 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 619 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 620 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 621 622 aj = a->j; 623 v = a->a; 624 ii = a->i; 625 626 if (!a->mult_work) { 627 ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 628 } 629 work = a->mult_work; 630 631 for (i=0; i<mbs; i++) { 632 n = ii[1] - ii[0]; ncols = n*bs; 633 workt = work; idx=aj+ii[0]; 634 635 /* upper triangular part */ 636 for (j=0; j<n; j++) { 637 xb = x_ptr + bs*(*idx++); 638 for (k=0; k<bs; k++) workt[k] = xb[k]; 639 workt += bs; 640 } 641 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 642 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 643 644 /* strict lower triangular part */ 645 idx = aj+ii[0]; 646 if (*idx == i){ 647 ncols -= bs; v += bs2; idx++; n--; 648 } 649 650 if (ncols > 0){ 651 workt = work; 652 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 653 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 654 for (j=0; j<n; j++) { 655 zb = z_ptr + bs*(*idx++); 656 for (k=0; k<bs; k++) zb[k] += workt[k] ; 657 workt += bs; 658 } 659 } 660 x += bs; v += n*bs2; z += bs; ii++; 661 } 662 663 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 664 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 665 ierr = PetscLogFlops(2*(a->nz*2 - A->m)*bs2 - A->m);CHKERRQ(ierr); 666 PetscFunctionReturn(0); 667 } 668 669 #undef __FUNCT__ 670 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1" 671 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 672 { 673 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 674 PetscScalar *x,*y,*z,*xb,x1; 675 MatScalar *v; 676 PetscErrorCode ierr; 677 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 678 679 PetscFunctionBegin; 680 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 681 if (yy != xx) { 682 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 683 } else { 684 y = x; 685 } 686 if (zz != yy) { 687 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 688 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 689 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 690 } else { 691 z = y; 692 } 693 694 v = a->a; 695 xb = x; 696 697 for (i=0; i<mbs; i++) { 698 n = ai[1] - ai[0]; /* length of i_th row of A */ 699 x1 = xb[0]; 700 ib = aj + *ai; 701 jmin = 0; 702 if (*ib == i) { /* (diag of A)*x */ 703 z[i] += *v++ * x[*ib++]; jmin++; 704 } 705 for (j=jmin; j<n; j++) { 706 cval = *ib; 707 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 708 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 709 } 710 xb++; ai++; 711 } 712 713 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 714 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 715 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 716 717 ierr = PetscLogFlops(2*(a->nz*2 - A->m));CHKERRQ(ierr); 718 PetscFunctionReturn(0); 719 } 720 721 #undef __FUNCT__ 722 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2" 723 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 724 { 725 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 726 PetscScalar *x,*y,*z,*xb,x1,x2; 727 MatScalar *v; 728 PetscErrorCode ierr; 729 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 730 731 PetscFunctionBegin; 732 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 733 if (yy != xx) { 734 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 735 } else { 736 y = x; 737 } 738 if (zz != yy) { 739 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 740 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 741 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 742 } else { 743 z = y; 744 } 745 746 v = a->a; 747 xb = x; 748 749 for (i=0; i<mbs; i++) { 750 n = ai[1] - ai[0]; /* length of i_th block row of A */ 751 x1 = xb[0]; x2 = xb[1]; 752 ib = aj + *ai; 753 jmin = 0; 754 if (*ib == i){ /* (diag of A)*x */ 755 z[2*i] += v[0]*x1 + v[2]*x2; 756 z[2*i+1] += v[2]*x1 + v[3]*x2; 757 v += 4; jmin++; 758 } 759 for (j=jmin; j<n; j++) { 760 /* (strict lower triangular part of A)*x */ 761 cval = ib[j]*2; 762 z[cval] += v[0]*x1 + v[1]*x2; 763 z[cval+1] += v[2]*x1 + v[3]*x2; 764 /* (strict upper triangular part of A)*x */ 765 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 766 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 767 v += 4; 768 } 769 xb +=2; ai++; 770 } 771 772 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 773 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 774 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 775 776 ierr = PetscLogFlops(4*(a->nz*2 - A->m));CHKERRQ(ierr); 777 PetscFunctionReturn(0); 778 } 779 780 #undef __FUNCT__ 781 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3" 782 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 783 { 784 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 785 PetscScalar *x,*y,*z,*xb,x1,x2,x3; 786 MatScalar *v; 787 PetscErrorCode ierr; 788 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 789 790 PetscFunctionBegin; 791 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 792 if (yy != xx) { 793 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 794 } else { 795 y = x; 796 } 797 if (zz != yy) { 798 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 799 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 800 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 801 } else { 802 z = y; 803 } 804 805 v = a->a; 806 xb = x; 807 808 for (i=0; i<mbs; i++) { 809 n = ai[1] - ai[0]; /* length of i_th block row of A */ 810 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 811 ib = aj + *ai; 812 jmin = 0; 813 if (*ib == i){ /* (diag of A)*x */ 814 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 815 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 816 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 817 v += 9; jmin++; 818 } 819 for (j=jmin; j<n; j++) { 820 /* (strict lower triangular part of A)*x */ 821 cval = ib[j]*3; 822 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 823 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 824 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 825 /* (strict upper triangular part of A)*x */ 826 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 827 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 828 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 829 v += 9; 830 } 831 xb +=3; ai++; 832 } 833 834 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 835 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 836 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 837 838 ierr = PetscLogFlops(18*(a->nz*2 - A->m));CHKERRQ(ierr); 839 PetscFunctionReturn(0); 840 } 841 842 #undef __FUNCT__ 843 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4" 844 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 845 { 846 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 847 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4; 848 MatScalar *v; 849 PetscErrorCode ierr; 850 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 851 852 PetscFunctionBegin; 853 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 854 if (yy != xx) { 855 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 856 } else { 857 y = x; 858 } 859 if (zz != yy) { 860 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 861 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 862 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 863 } else { 864 z = y; 865 } 866 867 v = a->a; 868 xb = x; 869 870 for (i=0; i<mbs; i++) { 871 n = ai[1] - ai[0]; /* length of i_th block row of A */ 872 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 873 ib = aj + *ai; 874 jmin = 0; 875 if (*ib == i){ /* (diag of A)*x */ 876 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 877 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 878 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 879 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 880 v += 16; jmin++; 881 } 882 for (j=jmin; j<n; j++) { 883 /* (strict lower triangular part of A)*x */ 884 cval = ib[j]*4; 885 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 886 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 887 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 888 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 889 /* (strict upper triangular part of A)*x */ 890 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 891 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 892 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 893 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 894 v += 16; 895 } 896 xb +=4; ai++; 897 } 898 899 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 900 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 901 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 902 903 ierr = PetscLogFlops(32*(a->nz*2 - A->m));CHKERRQ(ierr); 904 PetscFunctionReturn(0); 905 } 906 907 #undef __FUNCT__ 908 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5" 909 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 910 { 911 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 912 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4,x5; 913 MatScalar *v; 914 PetscErrorCode ierr; 915 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 916 917 PetscFunctionBegin; 918 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 919 if (yy != xx) { 920 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 921 } else { 922 y = x; 923 } 924 if (zz != yy) { 925 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 926 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 927 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 928 } else { 929 z = y; 930 } 931 932 v = a->a; 933 xb = x; 934 935 for (i=0; i<mbs; i++) { 936 n = ai[1] - ai[0]; /* length of i_th block row of A */ 937 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 938 ib = aj + *ai; 939 jmin = 0; 940 if (*ib == i){ /* (diag of A)*x */ 941 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 942 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 943 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 944 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 945 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 946 v += 25; jmin++; 947 } 948 for (j=jmin; j<n; j++) { 949 /* (strict lower triangular part of A)*x */ 950 cval = ib[j]*5; 951 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 952 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 953 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 954 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 955 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 956 /* (strict upper triangular part of A)*x */ 957 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 958 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 959 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 960 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 961 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 962 v += 25; 963 } 964 xb +=5; ai++; 965 } 966 967 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 968 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 969 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 970 971 ierr = PetscLogFlops(50*(a->nz*2 - A->m));CHKERRQ(ierr); 972 PetscFunctionReturn(0); 973 } 974 #undef __FUNCT__ 975 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6" 976 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 977 { 978 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 979 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6; 980 MatScalar *v; 981 PetscErrorCode ierr; 982 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 983 984 PetscFunctionBegin; 985 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 986 if (yy != xx) { 987 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 988 } else { 989 y = x; 990 } 991 if (zz != yy) { 992 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 993 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 994 } else { 995 z = y; 996 } 997 998 v = a->a; 999 xb = x; 1000 1001 for (i=0; i<mbs; i++) { 1002 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1003 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 1004 ib = aj + *ai; 1005 jmin = 0; 1006 if (*ib == i){ /* (diag of A)*x */ 1007 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 1008 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 1009 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 1010 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 1011 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 1012 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 1013 v += 36; jmin++; 1014 } 1015 for (j=jmin; j<n; j++) { 1016 /* (strict lower triangular part of A)*x */ 1017 cval = ib[j]*6; 1018 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 1019 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 1020 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 1021 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 1022 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 1023 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 1024 /* (strict upper triangular part of A)*x */ 1025 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 1026 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 1027 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 1028 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 1029 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 1030 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 1031 v += 36; 1032 } 1033 xb +=6; ai++; 1034 } 1035 1036 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1037 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 1038 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1039 1040 ierr = PetscLogFlops(72*(a->nz*2 - A->m));CHKERRQ(ierr); 1041 PetscFunctionReturn(0); 1042 } 1043 1044 #undef __FUNCT__ 1045 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7" 1046 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1047 { 1048 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1049 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6,x7; 1050 MatScalar *v; 1051 PetscErrorCode ierr; 1052 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 1053 1054 PetscFunctionBegin; 1055 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1056 if (yy != xx) { 1057 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 1058 } else { 1059 y = x; 1060 } 1061 if (zz != yy) { 1062 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1063 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 1064 } else { 1065 z = y; 1066 } 1067 1068 v = a->a; 1069 xb = x; 1070 1071 for (i=0; i<mbs; i++) { 1072 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1073 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 1074 ib = aj + *ai; 1075 jmin = 0; 1076 if (*ib == i){ /* (diag of A)*x */ 1077 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 1078 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 1079 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 1080 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 1081 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 1082 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 1083 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1084 v += 49; jmin++; 1085 } 1086 for (j=jmin; j<n; j++) { 1087 /* (strict lower triangular part of A)*x */ 1088 cval = ib[j]*7; 1089 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 1090 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 1091 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 1092 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 1093 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 1094 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 1095 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1096 /* (strict upper triangular part of A)*x */ 1097 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 1098 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 1099 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 1100 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 1101 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 1102 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 1103 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 1104 v += 49; 1105 } 1106 xb +=7; ai++; 1107 } 1108 1109 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1110 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 1111 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1112 1113 ierr = PetscLogFlops(98*(a->nz*2 - A->m));CHKERRQ(ierr); 1114 PetscFunctionReturn(0); 1115 } 1116 1117 #undef __FUNCT__ 1118 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N" 1119 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1120 { 1121 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1122 PetscScalar *x,*x_ptr,*y,*z,*z_ptr=0,*xb,*zb,*work,*workt; 1123 MatScalar *v; 1124 PetscErrorCode ierr; 1125 PetscInt mbs=a->mbs,i,*idx,*aj,*ii,bs=A->bs,j,n,bs2=a->bs2,ncols,k; 1126 1127 PetscFunctionBegin; 1128 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 1129 if (yy != xx) { 1130 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 1131 } else { 1132 y = x; 1133 } 1134 if (zz != yy) { 1135 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 1136 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 1137 } else { 1138 z = y; 1139 } 1140 1141 aj = a->j; 1142 v = a->a; 1143 ii = a->i; 1144 1145 if (!a->mult_work) { 1146 ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1147 } 1148 work = a->mult_work; 1149 1150 1151 for (i=0; i<mbs; i++) { 1152 n = ii[1] - ii[0]; ncols = n*bs; 1153 workt = work; idx=aj+ii[0]; 1154 1155 /* upper triangular part */ 1156 for (j=0; j<n; j++) { 1157 xb = x_ptr + bs*(*idx++); 1158 for (k=0; k<bs; k++) workt[k] = xb[k]; 1159 workt += bs; 1160 } 1161 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1162 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1163 1164 /* strict lower triangular part */ 1165 idx = aj+ii[0]; 1166 if (*idx == i){ 1167 ncols -= bs; v += bs2; idx++; n--; 1168 } 1169 if (ncols > 0){ 1170 workt = work; 1171 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1172 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 1173 for (j=0; j<n; j++) { 1174 zb = z_ptr + bs*(*idx++); 1175 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1176 workt += bs; 1177 } 1178 } 1179 1180 x += bs; v += n*bs2; z += bs; ii++; 1181 } 1182 1183 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1184 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 1185 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1186 1187 ierr = PetscLogFlops(2*(a->nz*2 - A->m));CHKERRQ(ierr); 1188 PetscFunctionReturn(0); 1189 } 1190 1191 #undef __FUNCT__ 1192 #define __FUNCT__ "MatScale_SeqSBAIJ" 1193 PetscErrorCode MatScale_SeqSBAIJ(const PetscScalar *alpha,Mat inA) 1194 { 1195 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data; 1196 PetscBLASInt one = 1,totalnz = (PetscBLASInt)a->bs2*a->nz; 1197 PetscErrorCode ierr; 1198 1199 PetscFunctionBegin; 1200 BLASscal_(&totalnz,(PetscScalar*)alpha,a->a,&one); 1201 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 1202 PetscFunctionReturn(0); 1203 } 1204 1205 #undef __FUNCT__ 1206 #define __FUNCT__ "MatNorm_SeqSBAIJ" 1207 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm) 1208 { 1209 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1210 MatScalar *v = a->a; 1211 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1212 PetscInt i,j,k,bs = A->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j; 1213 PetscErrorCode ierr; 1214 PetscInt *jl,*il,jmin,jmax,nexti,ik,*col; 1215 1216 PetscFunctionBegin; 1217 if (type == NORM_FROBENIUS) { 1218 for (k=0; k<mbs; k++){ 1219 jmin = a->i[k]; jmax = a->i[k+1]; 1220 col = aj + jmin; 1221 if (*col == k){ /* diagonal block */ 1222 for (i=0; i<bs2; i++){ 1223 #if defined(PETSC_USE_COMPLEX) 1224 sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++; 1225 #else 1226 sum_diag += (*v)*(*v); v++; 1227 #endif 1228 } 1229 jmin++; 1230 } 1231 for (j=jmin; j<jmax; j++){ /* off-diagonal blocks */ 1232 for (i=0; i<bs2; i++){ 1233 #if defined(PETSC_USE_COMPLEX) 1234 sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++; 1235 #else 1236 sum_off += (*v)*(*v); v++; 1237 #endif 1238 } 1239 } 1240 } 1241 *norm = sqrt(sum_diag + 2*sum_off); 1242 } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 1243 ierr = PetscMalloc((2*mbs+1)*sizeof(PetscInt)+bs*sizeof(PetscReal),&il);CHKERRQ(ierr); 1244 jl = il + mbs; 1245 sum = (PetscReal*)(jl + mbs); 1246 for (i=0; i<mbs; i++) jl[i] = mbs; 1247 il[0] = 0; 1248 1249 *norm = 0.0; 1250 for (k=0; k<mbs; k++) { /* k_th block row */ 1251 for (j=0; j<bs; j++) sum[j]=0.0; 1252 /*-- col sum --*/ 1253 i = jl[k]; /* first |A(i,k)| to be added */ 1254 /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1255 at step k */ 1256 while (i<mbs){ 1257 nexti = jl[i]; /* next block row to be added */ 1258 ik = il[i]; /* block index of A(i,k) in the array a */ 1259 for (j=0; j<bs; j++){ 1260 v = a->a + ik*bs2 + j*bs; 1261 for (k1=0; k1<bs; k1++) { 1262 sum[j] += PetscAbsScalar(*v); v++; 1263 } 1264 } 1265 /* update il, jl */ 1266 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1267 jmax = a->i[i+1]; 1268 if (jmin < jmax){ 1269 il[i] = jmin; 1270 j = a->j[jmin]; 1271 jl[i] = jl[j]; jl[j]=i; 1272 } 1273 i = nexti; 1274 } 1275 /*-- row sum --*/ 1276 jmin = a->i[k]; jmax = a->i[k+1]; 1277 for (i=jmin; i<jmax; i++) { 1278 for (j=0; j<bs; j++){ 1279 v = a->a + i*bs2 + j; 1280 for (k1=0; k1<bs; k1++){ 1281 sum[j] += PetscAbsScalar(*v); v += bs; 1282 } 1283 } 1284 } 1285 /* add k_th block row to il, jl */ 1286 col = aj+jmin; 1287 if (*col == k) jmin++; 1288 if (jmin < jmax){ 1289 il[k] = jmin; 1290 j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k; 1291 } 1292 for (j=0; j<bs; j++){ 1293 if (sum[j] > *norm) *norm = sum[j]; 1294 } 1295 } 1296 ierr = PetscFree(il);CHKERRQ(ierr); 1297 } else { 1298 SETERRQ(PETSC_ERR_SUP,"No support for this norm yet"); 1299 } 1300 PetscFunctionReturn(0); 1301 } 1302 1303 #undef __FUNCT__ 1304 #define __FUNCT__ "MatEqual_SeqSBAIJ" 1305 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg) 1306 { 1307 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data; 1308 PetscErrorCode ierr; 1309 1310 PetscFunctionBegin; 1311 1312 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1313 if ((A->m != B->m) || (A->n != B->n) || (A->bs != B->bs)|| (a->nz != b->nz)) { 1314 *flg = PETSC_FALSE; 1315 PetscFunctionReturn(0); 1316 } 1317 1318 /* if the a->i are the same */ 1319 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1320 if (!*flg) { 1321 PetscFunctionReturn(0); 1322 } 1323 1324 /* if a->j are the same */ 1325 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1326 if (!*flg) { 1327 PetscFunctionReturn(0); 1328 } 1329 /* if a->a are the same */ 1330 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->bs)*(A->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1331 PetscFunctionReturn(0); 1332 } 1333 1334 #undef __FUNCT__ 1335 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ" 1336 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v) 1337 { 1338 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1339 PetscErrorCode ierr; 1340 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 1341 PetscScalar *x,zero = 0.0; 1342 MatScalar *aa,*aa_j; 1343 1344 PetscFunctionBegin; 1345 bs = A->bs; 1346 if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1"); 1347 1348 aa = a->a; 1349 ai = a->i; 1350 aj = a->j; 1351 ambs = a->mbs; 1352 bs2 = a->bs2; 1353 1354 ierr = VecSet(&zero,v);CHKERRQ(ierr); 1355 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1356 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1357 if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1358 for (i=0; i<ambs; i++) { 1359 j=ai[i]; 1360 if (aj[j] == i) { /* if this is a diagonal element */ 1361 row = i*bs; 1362 aa_j = aa + j*bs2; 1363 if (A->factor && bs==1){ 1364 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k]; 1365 } else { 1366 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1367 } 1368 } 1369 } 1370 1371 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1372 PetscFunctionReturn(0); 1373 } 1374 1375 #undef __FUNCT__ 1376 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ" 1377 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr) 1378 { 1379 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1380 PetscScalar *l,x,*li,*ri; 1381 MatScalar *aa,*v; 1382 PetscErrorCode ierr; 1383 PetscInt i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2; 1384 PetscTruth flg; 1385 1386 PetscFunctionBegin; 1387 if (ll != rr){ 1388 ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr); 1389 if (!flg) 1390 SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n"); 1391 } 1392 if (!ll) PetscFunctionReturn(0); 1393 ai = a->i; 1394 aj = a->j; 1395 aa = a->a; 1396 m = A->m; 1397 bs = A->bs; 1398 mbs = a->mbs; 1399 bs2 = a->bs2; 1400 1401 ierr = VecGetArray(ll,&l);CHKERRQ(ierr); 1402 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1403 if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 1404 for (i=0; i<mbs; i++) { /* for each block row */ 1405 M = ai[i+1] - ai[i]; 1406 li = l + i*bs; 1407 v = aa + bs2*ai[i]; 1408 for (j=0; j<M; j++) { /* for each block */ 1409 ri = l + bs*aj[ai[i]+j]; 1410 for (k=0; k<bs; k++) { 1411 x = ri[k]; 1412 for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x; 1413 } 1414 } 1415 } 1416 ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr); 1417 ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr); 1418 PetscFunctionReturn(0); 1419 } 1420 1421 #undef __FUNCT__ 1422 #define __FUNCT__ "MatGetInfo_SeqSBAIJ" 1423 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info) 1424 { 1425 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1426 1427 PetscFunctionBegin; 1428 info->rows_global = (double)A->m; 1429 info->columns_global = (double)A->m; 1430 info->rows_local = (double)A->m; 1431 info->columns_local = (double)A->m; 1432 info->block_size = a->bs2; 1433 info->nz_allocated = a->maxnz; /*num. of nonzeros in upper triangular part */ 1434 info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */ 1435 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 1436 info->assemblies = A->num_ass; 1437 info->mallocs = a->reallocs; 1438 info->memory = A->mem; 1439 if (A->factor) { 1440 info->fill_ratio_given = A->info.fill_ratio_given; 1441 info->fill_ratio_needed = A->info.fill_ratio_needed; 1442 info->factor_mallocs = A->info.factor_mallocs; 1443 } else { 1444 info->fill_ratio_given = 0; 1445 info->fill_ratio_needed = 0; 1446 info->factor_mallocs = 0; 1447 } 1448 PetscFunctionReturn(0); 1449 } 1450 1451 1452 #undef __FUNCT__ 1453 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ" 1454 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 1455 { 1456 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1457 PetscErrorCode ierr; 1458 1459 PetscFunctionBegin; 1460 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 1461 PetscFunctionReturn(0); 1462 } 1463 1464 #undef __FUNCT__ 1465 #define __FUNCT__ "MatGetRowMax_SeqSBAIJ" 1466 PetscErrorCode MatGetRowMax_SeqSBAIJ(Mat A,Vec v) 1467 { 1468 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1469 PetscErrorCode ierr; 1470 PetscInt i,j,n,row,col,bs,*ai,*aj,mbs; 1471 PetscReal atmp; 1472 MatScalar *aa; 1473 PetscScalar zero = 0.0,*x; 1474 PetscInt ncols,brow,bcol,krow,kcol; 1475 1476 PetscFunctionBegin; 1477 if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1478 bs = A->bs; 1479 aa = a->a; 1480 ai = a->i; 1481 aj = a->j; 1482 mbs = a->mbs; 1483 1484 ierr = VecSet(&zero,v);CHKERRQ(ierr); 1485 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1486 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1487 if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1488 for (i=0; i<mbs; i++) { 1489 ncols = ai[1] - ai[0]; ai++; 1490 brow = bs*i; 1491 for (j=0; j<ncols; j++){ 1492 bcol = bs*(*aj); 1493 for (kcol=0; kcol<bs; kcol++){ 1494 col = bcol + kcol; /* col index */ 1495 for (krow=0; krow<bs; krow++){ 1496 atmp = PetscAbsScalar(*aa); aa++; 1497 row = brow + krow; /* row index */ 1498 if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1499 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 1500 } 1501 } 1502 aj++; 1503 } 1504 } 1505 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1506 PetscFunctionReturn(0); 1507 } 1508