1 2 #include "src/mat/impls/baij/seq/baij.h" 3 #include "src/inline/spops.h" 4 #include "src/inline/ilu.h" 5 #include "petscbt.h" 6 #include "src/mat/impls/sbaij/seq/sbaij.h" 7 8 #undef __FUNCT__ 9 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ" 10 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 11 { 12 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 13 PetscErrorCode ierr; 14 PetscInt brow,i,j,k,l,mbs,n,*idx,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2; 15 PetscBT table,table0; 16 17 PetscFunctionBegin; 18 if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 19 mbs = a->mbs; 20 ai = a->i; 21 aj = a->j; 22 bs = A->bs; 23 ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr); 24 ierr = PetscMalloc((mbs+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr); 25 ierr = PetscMalloc((A->m+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr); 26 ierr = PetscBTCreate(mbs,table0);CHKERRQ(ierr); 27 28 for (i=0; i<is_max; i++) { /* for each is */ 29 isz = 0; 30 ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr); 31 32 /* Extract the indices, assume there can be duplicate entries */ 33 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 34 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 35 36 /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */ 37 bcol_max = 0; 38 for (j=0; j<n ; ++j){ 39 brow = idx[j]/bs; /* convert the indices into block indices */ 40 if (brow >= mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 41 if(!PetscBTLookupSet(table,brow)) { 42 nidx[isz++] = brow; 43 if (bcol_max < brow) bcol_max = brow; 44 } 45 } 46 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 47 ierr = ISDestroy(is[i]);CHKERRQ(ierr); 48 49 k = 0; 50 for (j=0; j<ov; j++){ /* for each overlap */ 51 /* set table0 for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 52 ierr = PetscBTMemzero(mbs,table0);CHKERRQ(ierr); 53 for (l=k; l<isz; l++) { ierr = PetscBTSet(table0,nidx[l]);CHKERRQ(ierr); } 54 55 n = isz; /* length of the updated is[i] */ 56 for (brow=0; brow<mbs; brow++){ 57 start = ai[brow]; end = ai[brow+1]; 58 if (PetscBTLookup(table0,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */ 59 for (l = start; l<end ; l++){ 60 bcol = aj[l]; 61 if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;} 62 } 63 k++; 64 if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 65 } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 66 for (l = start; l<end ; l++){ 67 bcol = aj[l]; 68 if (bcol > bcol_max) break; 69 if (PetscBTLookup(table0,bcol)){ 70 if (!PetscBTLookupSet(table,brow)) {nidx[isz++] = brow;} 71 break; /* for l = start; l<end ; l++) */ 72 } 73 } 74 } 75 } 76 } /* for each overlap */ 77 78 /* expand the Index Set */ 79 for (j=0; j<isz; j++) { 80 for (k=0; k<bs; k++) 81 nidx2[j*bs+k] = nidx[j]*bs+k; 82 } 83 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr); 84 } 85 ierr = PetscBTDestroy(table);CHKERRQ(ierr); 86 ierr = PetscFree(nidx);CHKERRQ(ierr); 87 ierr = PetscFree(nidx2);CHKERRQ(ierr); 88 ierr = PetscBTDestroy(table0);CHKERRQ(ierr); 89 PetscFunctionReturn(0); 90 } 91 92 #undef __FUNCT__ 93 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private" 94 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B) 95 { 96 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c; 97 PetscErrorCode ierr; 98 PetscInt *smap,i,k,kstart,kend,oldcols = a->mbs,*lens; 99 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 100 PetscInt *irow,nrows,*ssmap,bs=A->bs,bs2=a->bs2; 101 PetscInt *aj = a->j,*ai = a->i; 102 MatScalar *mat_a; 103 Mat C; 104 PetscTruth flag; 105 106 PetscFunctionBegin; 107 if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro"); 108 ierr = ISSorted(iscol,(PetscTruth*)&i);CHKERRQ(ierr); 109 if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 110 111 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 112 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 113 114 ierr = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr); 115 ssmap = smap; 116 ierr = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr); 117 ierr = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr); 118 for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */ 119 /* determine lens of each row */ 120 for (i=0; i<nrows; i++) { 121 kstart = ai[irow[i]]; 122 kend = kstart + a->ilen[irow[i]]; 123 lens[i] = 0; 124 for (k=kstart; k<kend; k++) { 125 if (ssmap[aj[k]]) { 126 lens[i]++; 127 } 128 } 129 } 130 /* Create and fill new matrix */ 131 if (scall == MAT_REUSE_MATRIX) { 132 c = (Mat_SeqSBAIJ *)((*B)->data); 133 134 if (c->mbs!=nrows || (*B)->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 135 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 136 if (!flag) { 137 SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 138 } 139 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 140 C = *B; 141 } else { 142 ierr = MatCreate(A->comm,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE,&C);CHKERRQ(ierr); 143 ierr = MatSetType(C,A->type_name);CHKERRQ(ierr); 144 ierr = MatSeqSBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr); 145 } 146 c = (Mat_SeqSBAIJ *)(C->data); 147 for (i=0; i<nrows; i++) { 148 row = irow[i]; 149 kstart = ai[row]; 150 kend = kstart + a->ilen[row]; 151 mat_i = c->i[i]; 152 mat_j = c->j + mat_i; 153 mat_a = c->a + mat_i*bs2; 154 mat_ilen = c->ilen + i; 155 for (k=kstart; k<kend; k++) { 156 if ((tcol=ssmap[a->j[k]])) { 157 *mat_j++ = tcol - 1; 158 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 159 mat_a += bs2; 160 (*mat_ilen)++; 161 } 162 } 163 } 164 165 /* Free work space */ 166 ierr = PetscFree(smap);CHKERRQ(ierr); 167 ierr = PetscFree(lens);CHKERRQ(ierr); 168 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 169 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 170 171 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 172 *B = C; 173 PetscFunctionReturn(0); 174 } 175 176 #undef __FUNCT__ 177 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ" 178 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B) 179 { 180 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 181 IS is1; 182 PetscErrorCode ierr; 183 PetscInt *vary,*iary,*irow,nrows,i,bs=A->bs,count; 184 185 PetscFunctionBegin; 186 if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro"); 187 188 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 189 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 190 191 /* Verify if the indices corespond to each element in a block 192 and form the IS with compressed IS */ 193 ierr = PetscMalloc(2*(a->mbs+1)*sizeof(PetscInt),&vary);CHKERRQ(ierr); 194 iary = vary + a->mbs; 195 ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr); 196 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 197 198 count = 0; 199 for (i=0; i<a->mbs; i++) { 200 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_INCOMP,"Index set does not match blocks"); 201 if (vary[i]==bs) iary[count++] = i; 202 } 203 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr); 204 205 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 206 ierr = PetscFree(vary);CHKERRQ(ierr); 207 208 ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);CHKERRQ(ierr); 209 ISDestroy(is1); 210 PetscFunctionReturn(0); 211 } 212 213 #undef __FUNCT__ 214 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ" 215 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 216 { 217 PetscErrorCode ierr; 218 PetscInt i; 219 220 PetscFunctionBegin; 221 if (scall == MAT_INITIAL_MATRIX) { 222 ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr); 223 } 224 225 for (i=0; i<n; i++) { 226 ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr); 227 } 228 PetscFunctionReturn(0); 229 } 230 231 /* -------------------------------------------------------*/ 232 /* Should check that shapes of vectors and matrices match */ 233 /* -------------------------------------------------------*/ 234 #include "petscblaslapack.h" 235 236 #undef __FUNCT__ 237 #define __FUNCT__ "MatMult_SeqSBAIJ_1" 238 PetscErrorCode MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz) 239 { 240 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 241 PetscScalar *x,*z,*xb,x1,zero=0.0; 242 MatScalar *v; 243 PetscErrorCode ierr; 244 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 245 246 PetscFunctionBegin; 247 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 248 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 249 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 250 251 v = a->a; 252 xb = x; 253 254 for (i=0; i<mbs; i++) { 255 n = ai[1] - ai[0]; /* length of i_th row of A */ 256 x1 = xb[0]; 257 ib = aj + *ai; 258 jmin = 0; 259 if (*ib == i) { /* (diag of A)*x */ 260 z[i] += *v++ * x[*ib++]; 261 jmin++; 262 } 263 for (j=jmin; j<n; j++) { 264 cval = *ib; 265 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 266 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 267 } 268 xb++; ai++; 269 } 270 271 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 272 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 273 ierr = PetscLogFlops(2*(a->nz*2 - A->m) - A->m);CHKERRQ(ierr); /* nz = (nz+m)/2 */ 274 PetscFunctionReturn(0); 275 } 276 277 #undef __FUNCT__ 278 #define __FUNCT__ "MatMult_SeqSBAIJ_2" 279 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz) 280 { 281 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 282 PetscScalar *x,*z,*xb,x1,x2,zero=0.0; 283 MatScalar *v; 284 PetscErrorCode ierr; 285 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 286 287 PetscFunctionBegin; 288 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 289 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 290 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 291 292 v = a->a; 293 xb = x; 294 295 for (i=0; i<mbs; i++) { 296 n = ai[1] - ai[0]; /* length of i_th block row of A */ 297 x1 = xb[0]; x2 = xb[1]; 298 ib = aj + *ai; 299 jmin = 0; 300 if (*ib == i){ /* (diag of A)*x */ 301 z[2*i] += v[0]*x1 + v[2]*x2; 302 z[2*i+1] += v[2]*x1 + v[3]*x2; 303 v += 4; jmin++; 304 } 305 for (j=jmin; j<n; j++) { 306 /* (strict lower triangular part of A)*x */ 307 cval = ib[j]*2; 308 z[cval] += v[0]*x1 + v[1]*x2; 309 z[cval+1] += v[2]*x1 + v[3]*x2; 310 /* (strict upper triangular part of A)*x */ 311 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 312 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 313 v += 4; 314 } 315 xb +=2; ai++; 316 } 317 318 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 319 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 320 ierr = PetscLogFlops(8*(a->nz*2 - A->m) - A->m);CHKERRQ(ierr); 321 PetscFunctionReturn(0); 322 } 323 324 #undef __FUNCT__ 325 #define __FUNCT__ "MatMult_SeqSBAIJ_3" 326 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz) 327 { 328 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 329 PetscScalar *x,*z,*xb,x1,x2,x3,zero=0.0; 330 MatScalar *v; 331 PetscErrorCode ierr; 332 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 333 334 PetscFunctionBegin; 335 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 336 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 337 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 338 339 v = a->a; 340 xb = x; 341 342 for (i=0; i<mbs; i++) { 343 n = ai[1] - ai[0]; /* length of i_th block row of A */ 344 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 345 ib = aj + *ai; 346 jmin = 0; 347 if (*ib == i){ /* (diag of A)*x */ 348 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 349 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 350 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 351 v += 9; jmin++; 352 } 353 for (j=jmin; j<n; j++) { 354 /* (strict lower triangular part of A)*x */ 355 cval = ib[j]*3; 356 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 357 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 358 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 359 /* (strict upper triangular part of A)*x */ 360 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 361 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 362 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 363 v += 9; 364 } 365 xb +=3; ai++; 366 } 367 368 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 369 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 370 ierr = PetscLogFlops(18*(a->nz*2 - A->m) - A->m);CHKERRQ(ierr); 371 PetscFunctionReturn(0); 372 } 373 374 #undef __FUNCT__ 375 #define __FUNCT__ "MatMult_SeqSBAIJ_4" 376 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz) 377 { 378 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 379 PetscScalar *x,*z,*xb,x1,x2,x3,x4,zero=0.0; 380 MatScalar *v; 381 PetscErrorCode ierr; 382 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 383 384 PetscFunctionBegin; 385 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 386 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 387 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 388 389 v = a->a; 390 xb = x; 391 392 for (i=0; i<mbs; i++) { 393 n = ai[1] - ai[0]; /* length of i_th block row of A */ 394 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 395 ib = aj + *ai; 396 jmin = 0; 397 if (*ib == i){ /* (diag of A)*x */ 398 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 399 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 400 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 401 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 402 v += 16; jmin++; 403 } 404 for (j=jmin; j<n; j++) { 405 /* (strict lower triangular part of A)*x */ 406 cval = ib[j]*4; 407 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 408 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 409 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 410 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 411 /* (strict upper triangular part of A)*x */ 412 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 413 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 414 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 415 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 416 v += 16; 417 } 418 xb +=4; ai++; 419 } 420 421 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 422 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 423 ierr = PetscLogFlops(32*(a->nz*2 - A->m) - A->m);CHKERRQ(ierr); 424 PetscFunctionReturn(0); 425 } 426 427 #undef __FUNCT__ 428 #define __FUNCT__ "MatMult_SeqSBAIJ_5" 429 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz) 430 { 431 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 432 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0; 433 MatScalar *v; 434 PetscErrorCode ierr; 435 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 436 437 PetscFunctionBegin; 438 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 439 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 440 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 441 442 v = a->a; 443 xb = x; 444 445 for (i=0; i<mbs; i++) { 446 n = ai[1] - ai[0]; /* length of i_th block row of A */ 447 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 448 ib = aj + *ai; 449 jmin = 0; 450 if (*ib == i){ /* (diag of A)*x */ 451 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 452 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 453 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 454 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 455 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 456 v += 25; jmin++; 457 } 458 for (j=jmin; j<n; j++) { 459 /* (strict lower triangular part of A)*x */ 460 cval = ib[j]*5; 461 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 462 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 463 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 464 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 465 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 466 /* (strict upper triangular part of A)*x */ 467 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 468 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 469 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 470 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 471 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 472 v += 25; 473 } 474 xb +=5; ai++; 475 } 476 477 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 478 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 479 ierr = PetscLogFlops(50*(a->nz*2 - A->m) - A->m);CHKERRQ(ierr); 480 PetscFunctionReturn(0); 481 } 482 483 484 #undef __FUNCT__ 485 #define __FUNCT__ "MatMult_SeqSBAIJ_6" 486 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz) 487 { 488 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 489 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0; 490 MatScalar *v; 491 PetscErrorCode ierr; 492 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 493 494 PetscFunctionBegin; 495 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 496 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 497 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 498 499 v = a->a; 500 xb = x; 501 502 for (i=0; i<mbs; i++) { 503 n = ai[1] - ai[0]; /* length of i_th block row of A */ 504 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 505 ib = aj + *ai; 506 jmin = 0; 507 if (*ib == i){ /* (diag of A)*x */ 508 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 509 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 510 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 511 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 512 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 513 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 514 v += 36; jmin++; 515 } 516 for (j=jmin; j<n; j++) { 517 /* (strict lower triangular part of A)*x */ 518 cval = ib[j]*6; 519 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 520 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 521 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 522 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 523 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 524 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 525 /* (strict upper triangular part of A)*x */ 526 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 527 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 528 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 529 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 530 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 531 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 532 v += 36; 533 } 534 xb +=6; ai++; 535 } 536 537 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 538 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 539 ierr = PetscLogFlops(72*(a->nz*2 - A->m) - A->m);CHKERRQ(ierr); 540 PetscFunctionReturn(0); 541 } 542 #undef __FUNCT__ 543 #define __FUNCT__ "MatMult_SeqSBAIJ_7" 544 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz) 545 { 546 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 547 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0; 548 MatScalar *v; 549 PetscErrorCode ierr; 550 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 551 552 PetscFunctionBegin; 553 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 554 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 555 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 556 557 v = a->a; 558 xb = x; 559 560 for (i=0; i<mbs; i++) { 561 n = ai[1] - ai[0]; /* length of i_th block row of A */ 562 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 563 ib = aj + *ai; 564 jmin = 0; 565 if (*ib == i){ /* (diag of A)*x */ 566 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 567 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 568 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 569 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 570 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 571 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 572 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 573 v += 49; jmin++; 574 } 575 for (j=jmin; j<n; j++) { 576 /* (strict lower triangular part of A)*x */ 577 cval = ib[j]*7; 578 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 579 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 580 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 581 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 582 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 583 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 584 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 585 /* (strict upper triangular part of A)*x */ 586 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 587 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 588 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 589 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 590 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 591 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 592 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 593 v += 49; 594 } 595 xb +=7; ai++; 596 } 597 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 598 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 599 ierr = PetscLogFlops(98*(a->nz*2 - A->m) - A->m);CHKERRQ(ierr); 600 PetscFunctionReturn(0); 601 } 602 603 /* 604 This will not work with MatScalar == float because it calls the BLAS 605 */ 606 #undef __FUNCT__ 607 #define __FUNCT__ "MatMult_SeqSBAIJ_N" 608 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz) 609 { 610 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 611 PetscScalar *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0; 612 MatScalar *v; 613 PetscErrorCode ierr; 614 PetscInt mbs=a->mbs,i,*idx,*aj,*ii,bs=A->bs,j,n,bs2=a->bs2,ncols,k; 615 616 PetscFunctionBegin; 617 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 618 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 619 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 620 621 aj = a->j; 622 v = a->a; 623 ii = a->i; 624 625 if (!a->mult_work) { 626 ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 627 } 628 work = a->mult_work; 629 630 for (i=0; i<mbs; i++) { 631 n = ii[1] - ii[0]; ncols = n*bs; 632 workt = work; idx=aj+ii[0]; 633 634 /* upper triangular part */ 635 for (j=0; j<n; j++) { 636 xb = x_ptr + bs*(*idx++); 637 for (k=0; k<bs; k++) workt[k] = xb[k]; 638 workt += bs; 639 } 640 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 641 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 642 643 /* strict lower triangular part */ 644 idx = aj+ii[0]; 645 if (*idx == i){ 646 ncols -= bs; v += bs2; idx++; n--; 647 } 648 649 if (ncols > 0){ 650 workt = work; 651 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 652 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 653 for (j=0; j<n; j++) { 654 zb = z_ptr + bs*(*idx++); 655 for (k=0; k<bs; k++) zb[k] += workt[k] ; 656 workt += bs; 657 } 658 } 659 x += bs; v += n*bs2; z += bs; ii++; 660 } 661 662 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 663 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 664 ierr = PetscLogFlops(2*(a->nz*2 - A->m)*bs2 - A->m);CHKERRQ(ierr); 665 PetscFunctionReturn(0); 666 } 667 668 #undef __FUNCT__ 669 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1" 670 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 671 { 672 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 673 PetscScalar *x,*y,*z,*xb,x1; 674 MatScalar *v; 675 PetscErrorCode ierr; 676 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 677 678 PetscFunctionBegin; 679 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 680 if (yy != xx) { 681 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 682 } else { 683 y = x; 684 } 685 if (zz != yy) { 686 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 687 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 688 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 689 } else { 690 z = y; 691 } 692 693 v = a->a; 694 xb = x; 695 696 for (i=0; i<mbs; i++) { 697 n = ai[1] - ai[0]; /* length of i_th row of A */ 698 x1 = xb[0]; 699 ib = aj + *ai; 700 jmin = 0; 701 if (*ib == i) { /* (diag of A)*x */ 702 z[i] += *v++ * x[*ib++]; jmin++; 703 } 704 for (j=jmin; j<n; j++) { 705 cval = *ib; 706 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 707 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 708 } 709 xb++; ai++; 710 } 711 712 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 713 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 714 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 715 716 ierr = PetscLogFlops(2*(a->nz*2 - A->m));CHKERRQ(ierr); 717 PetscFunctionReturn(0); 718 } 719 720 #undef __FUNCT__ 721 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2" 722 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 723 { 724 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 725 PetscScalar *x,*y,*z,*xb,x1,x2; 726 MatScalar *v; 727 PetscErrorCode ierr; 728 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 729 730 PetscFunctionBegin; 731 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 732 if (yy != xx) { 733 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 734 } else { 735 y = x; 736 } 737 if (zz != yy) { 738 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 739 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 740 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 741 } else { 742 z = y; 743 } 744 745 v = a->a; 746 xb = x; 747 748 for (i=0; i<mbs; i++) { 749 n = ai[1] - ai[0]; /* length of i_th block row of A */ 750 x1 = xb[0]; x2 = xb[1]; 751 ib = aj + *ai; 752 jmin = 0; 753 if (*ib == i){ /* (diag of A)*x */ 754 z[2*i] += v[0]*x1 + v[2]*x2; 755 z[2*i+1] += v[2]*x1 + v[3]*x2; 756 v += 4; jmin++; 757 } 758 for (j=jmin; j<n; j++) { 759 /* (strict lower triangular part of A)*x */ 760 cval = ib[j]*2; 761 z[cval] += v[0]*x1 + v[1]*x2; 762 z[cval+1] += v[2]*x1 + v[3]*x2; 763 /* (strict upper triangular part of A)*x */ 764 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 765 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 766 v += 4; 767 } 768 xb +=2; ai++; 769 } 770 771 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 772 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 773 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 774 775 ierr = PetscLogFlops(4*(a->nz*2 - A->m));CHKERRQ(ierr); 776 PetscFunctionReturn(0); 777 } 778 779 #undef __FUNCT__ 780 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3" 781 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 782 { 783 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 784 PetscScalar *x,*y,*z,*xb,x1,x2,x3; 785 MatScalar *v; 786 PetscErrorCode ierr; 787 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 788 789 PetscFunctionBegin; 790 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 791 if (yy != xx) { 792 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 793 } else { 794 y = x; 795 } 796 if (zz != yy) { 797 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 798 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 799 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 800 } else { 801 z = y; 802 } 803 804 v = a->a; 805 xb = x; 806 807 for (i=0; i<mbs; i++) { 808 n = ai[1] - ai[0]; /* length of i_th block row of A */ 809 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 810 ib = aj + *ai; 811 jmin = 0; 812 if (*ib == i){ /* (diag of A)*x */ 813 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 814 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 815 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 816 v += 9; jmin++; 817 } 818 for (j=jmin; j<n; j++) { 819 /* (strict lower triangular part of A)*x */ 820 cval = ib[j]*3; 821 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 822 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 823 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 824 /* (strict upper triangular part of A)*x */ 825 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 826 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 827 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 828 v += 9; 829 } 830 xb +=3; ai++; 831 } 832 833 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 834 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 835 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 836 837 ierr = PetscLogFlops(18*(a->nz*2 - A->m));CHKERRQ(ierr); 838 PetscFunctionReturn(0); 839 } 840 841 #undef __FUNCT__ 842 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4" 843 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 844 { 845 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 846 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4; 847 MatScalar *v; 848 PetscErrorCode ierr; 849 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 850 851 PetscFunctionBegin; 852 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 853 if (yy != xx) { 854 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 855 } else { 856 y = x; 857 } 858 if (zz != yy) { 859 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 860 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 861 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 862 } else { 863 z = y; 864 } 865 866 v = a->a; 867 xb = x; 868 869 for (i=0; i<mbs; i++) { 870 n = ai[1] - ai[0]; /* length of i_th block row of A */ 871 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 872 ib = aj + *ai; 873 jmin = 0; 874 if (*ib == i){ /* (diag of A)*x */ 875 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 876 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 877 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 878 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 879 v += 16; jmin++; 880 } 881 for (j=jmin; j<n; j++) { 882 /* (strict lower triangular part of A)*x */ 883 cval = ib[j]*4; 884 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 885 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 886 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 887 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 888 /* (strict upper triangular part of A)*x */ 889 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 890 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 891 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 892 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 893 v += 16; 894 } 895 xb +=4; ai++; 896 } 897 898 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 899 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 900 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 901 902 ierr = PetscLogFlops(32*(a->nz*2 - A->m));CHKERRQ(ierr); 903 PetscFunctionReturn(0); 904 } 905 906 #undef __FUNCT__ 907 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5" 908 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 909 { 910 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 911 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4,x5; 912 MatScalar *v; 913 PetscErrorCode ierr; 914 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 915 916 PetscFunctionBegin; 917 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 918 if (yy != xx) { 919 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 920 } else { 921 y = x; 922 } 923 if (zz != yy) { 924 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 925 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 926 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 927 } else { 928 z = y; 929 } 930 931 v = a->a; 932 xb = x; 933 934 for (i=0; i<mbs; i++) { 935 n = ai[1] - ai[0]; /* length of i_th block row of A */ 936 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 937 ib = aj + *ai; 938 jmin = 0; 939 if (*ib == i){ /* (diag of A)*x */ 940 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 941 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 942 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 943 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 944 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 945 v += 25; jmin++; 946 } 947 for (j=jmin; j<n; j++) { 948 /* (strict lower triangular part of A)*x */ 949 cval = ib[j]*5; 950 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 951 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 952 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 953 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 954 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 955 /* (strict upper triangular part of A)*x */ 956 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 957 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 958 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 959 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 960 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 961 v += 25; 962 } 963 xb +=5; ai++; 964 } 965 966 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 967 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 968 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 969 970 ierr = PetscLogFlops(50*(a->nz*2 - A->m));CHKERRQ(ierr); 971 PetscFunctionReturn(0); 972 } 973 #undef __FUNCT__ 974 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6" 975 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 976 { 977 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 978 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6; 979 MatScalar *v; 980 PetscErrorCode ierr; 981 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 982 983 PetscFunctionBegin; 984 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 985 if (yy != xx) { 986 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 987 } else { 988 y = x; 989 } 990 if (zz != yy) { 991 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 992 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 993 } else { 994 z = y; 995 } 996 997 v = a->a; 998 xb = x; 999 1000 for (i=0; i<mbs; i++) { 1001 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1002 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 1003 ib = aj + *ai; 1004 jmin = 0; 1005 if (*ib == i){ /* (diag of A)*x */ 1006 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 1007 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 1008 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 1009 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 1010 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 1011 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 1012 v += 36; jmin++; 1013 } 1014 for (j=jmin; j<n; j++) { 1015 /* (strict lower triangular part of A)*x */ 1016 cval = ib[j]*6; 1017 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 1018 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 1019 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 1020 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 1021 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 1022 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 1023 /* (strict upper triangular part of A)*x */ 1024 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 1025 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 1026 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 1027 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 1028 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 1029 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 1030 v += 36; 1031 } 1032 xb +=6; ai++; 1033 } 1034 1035 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1036 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 1037 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1038 1039 ierr = PetscLogFlops(72*(a->nz*2 - A->m));CHKERRQ(ierr); 1040 PetscFunctionReturn(0); 1041 } 1042 1043 #undef __FUNCT__ 1044 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7" 1045 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1046 { 1047 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1048 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6,x7; 1049 MatScalar *v; 1050 PetscErrorCode ierr; 1051 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 1052 1053 PetscFunctionBegin; 1054 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1055 if (yy != xx) { 1056 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 1057 } else { 1058 y = x; 1059 } 1060 if (zz != yy) { 1061 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1062 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 1063 } else { 1064 z = y; 1065 } 1066 1067 v = a->a; 1068 xb = x; 1069 1070 for (i=0; i<mbs; i++) { 1071 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1072 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 1073 ib = aj + *ai; 1074 jmin = 0; 1075 if (*ib == i){ /* (diag of A)*x */ 1076 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 1077 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 1078 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 1079 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 1080 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 1081 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 1082 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1083 v += 49; jmin++; 1084 } 1085 for (j=jmin; j<n; j++) { 1086 /* (strict lower triangular part of A)*x */ 1087 cval = ib[j]*7; 1088 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 1089 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 1090 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 1091 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 1092 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 1093 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 1094 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1095 /* (strict upper triangular part of A)*x */ 1096 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 1097 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 1098 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 1099 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 1100 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 1101 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 1102 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 1103 v += 49; 1104 } 1105 xb +=7; ai++; 1106 } 1107 1108 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1109 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 1110 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1111 1112 ierr = PetscLogFlops(98*(a->nz*2 - A->m));CHKERRQ(ierr); 1113 PetscFunctionReturn(0); 1114 } 1115 1116 #undef __FUNCT__ 1117 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N" 1118 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1119 { 1120 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1121 PetscScalar *x,*x_ptr,*y,*z,*z_ptr=0,*xb,*zb,*work,*workt; 1122 MatScalar *v; 1123 PetscErrorCode ierr; 1124 PetscInt mbs=a->mbs,i,*idx,*aj,*ii,bs=A->bs,j,n,bs2=a->bs2,ncols,k; 1125 1126 PetscFunctionBegin; 1127 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 1128 if (yy != xx) { 1129 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 1130 } else { 1131 y = x; 1132 } 1133 if (zz != yy) { 1134 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 1135 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 1136 } else { 1137 z = y; 1138 } 1139 1140 aj = a->j; 1141 v = a->a; 1142 ii = a->i; 1143 1144 if (!a->mult_work) { 1145 ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1146 } 1147 work = a->mult_work; 1148 1149 1150 for (i=0; i<mbs; i++) { 1151 n = ii[1] - ii[0]; ncols = n*bs; 1152 workt = work; idx=aj+ii[0]; 1153 1154 /* upper triangular part */ 1155 for (j=0; j<n; j++) { 1156 xb = x_ptr + bs*(*idx++); 1157 for (k=0; k<bs; k++) workt[k] = xb[k]; 1158 workt += bs; 1159 } 1160 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1161 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1162 1163 /* strict lower triangular part */ 1164 idx = aj+ii[0]; 1165 if (*idx == i){ 1166 ncols -= bs; v += bs2; idx++; n--; 1167 } 1168 if (ncols > 0){ 1169 workt = work; 1170 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1171 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 1172 for (j=0; j<n; j++) { 1173 zb = z_ptr + bs*(*idx++); 1174 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1175 workt += bs; 1176 } 1177 } 1178 1179 x += bs; v += n*bs2; z += bs; ii++; 1180 } 1181 1182 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1183 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 1184 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1185 1186 ierr = PetscLogFlops(2*(a->nz*2 - A->m));CHKERRQ(ierr); 1187 PetscFunctionReturn(0); 1188 } 1189 1190 #undef __FUNCT__ 1191 #define __FUNCT__ "MatScale_SeqSBAIJ" 1192 PetscErrorCode MatScale_SeqSBAIJ(const PetscScalar *alpha,Mat inA) 1193 { 1194 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data; 1195 PetscBLASInt one = 1,totalnz = (PetscBLASInt)a->bs2*a->nz; 1196 PetscErrorCode ierr; 1197 1198 PetscFunctionBegin; 1199 BLASscal_(&totalnz,(PetscScalar*)alpha,a->a,&one); 1200 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 1201 PetscFunctionReturn(0); 1202 } 1203 1204 #undef __FUNCT__ 1205 #define __FUNCT__ "MatNorm_SeqSBAIJ" 1206 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm) 1207 { 1208 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1209 MatScalar *v = a->a; 1210 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1211 PetscInt i,j,k,bs = A->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j; 1212 PetscErrorCode ierr; 1213 PetscInt *jl,*il,jmin,jmax,nexti,ik,*col; 1214 1215 PetscFunctionBegin; 1216 if (type == NORM_FROBENIUS) { 1217 for (k=0; k<mbs; k++){ 1218 jmin = a->i[k]; jmax = a->i[k+1]; 1219 col = aj + jmin; 1220 if (*col == k){ /* diagonal block */ 1221 for (i=0; i<bs2; i++){ 1222 #if defined(PETSC_USE_COMPLEX) 1223 sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++; 1224 #else 1225 sum_diag += (*v)*(*v); v++; 1226 #endif 1227 } 1228 jmin++; 1229 } 1230 for (j=jmin; j<jmax; j++){ /* off-diagonal blocks */ 1231 for (i=0; i<bs2; i++){ 1232 #if defined(PETSC_USE_COMPLEX) 1233 sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++; 1234 #else 1235 sum_off += (*v)*(*v); v++; 1236 #endif 1237 } 1238 } 1239 } 1240 *norm = sqrt(sum_diag + 2*sum_off); 1241 1242 } else if (type == NORM_INFINITY) { /* maximum row sum */ 1243 ierr = PetscMalloc(mbs*sizeof(PetscInt),&il);CHKERRQ(ierr); 1244 ierr = PetscMalloc(mbs*sizeof(PetscInt),&jl);CHKERRQ(ierr); 1245 ierr = PetscMalloc(bs*sizeof(PetscReal),&sum);CHKERRQ(ierr); 1246 for (i=0; i<mbs; i++) { 1247 jl[i] = mbs; il[0] = 0; 1248 } 1249 1250 *norm = 0.0; 1251 for (k=0; k<mbs; k++) { /* k_th block row */ 1252 for (j=0; j<bs; j++) sum[j]=0.0; 1253 1254 /*-- col sum --*/ 1255 i = jl[k]; /* first |A(i,k)| to be added */ 1256 /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1257 at step k */ 1258 while (i<mbs){ 1259 nexti = jl[i]; /* next block row to be added */ 1260 ik = il[i]; /* block index of A(i,k) in the array a */ 1261 for (j=0; j<bs; j++){ 1262 v = a->a + ik*bs2 + j*bs; 1263 for (k1=0; k1<bs; k1++) { 1264 sum[j] += PetscAbsScalar(*v); v++; 1265 } 1266 } 1267 /* update il, jl */ 1268 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1269 jmax = a->i[i+1]; 1270 if (jmin < jmax){ 1271 il[i] = jmin; 1272 j = a->j[jmin]; 1273 jl[i] = jl[j]; jl[j]=i; 1274 } 1275 i = nexti; 1276 } 1277 1278 /*-- row sum --*/ 1279 jmin = a->i[k]; jmax = a->i[k+1]; 1280 for (i=jmin; i<jmax; i++) { 1281 for (j=0; j<bs; j++){ 1282 v = a->a + i*bs2 + j; 1283 for (k1=0; k1<bs; k1++){ 1284 sum[j] += PetscAbsScalar(*v); 1285 v += bs; 1286 } 1287 } 1288 } 1289 /* add k_th block row to il, jl */ 1290 col = aj+jmin; 1291 if (*col == k) jmin++; 1292 if (jmin < jmax){ 1293 il[k] = jmin; 1294 j = a->j[jmin]; 1295 jl[k] = jl[j]; jl[j] = k; 1296 } 1297 for (j=0; j<bs; j++){ 1298 if (sum[j] > *norm) *norm = sum[j]; 1299 } 1300 } 1301 ierr = PetscFree(il);CHKERRQ(ierr); 1302 ierr = PetscFree(jl);CHKERRQ(ierr); 1303 ierr = PetscFree(sum);CHKERRQ(ierr); 1304 } else { 1305 SETERRQ(PETSC_ERR_SUP,"No support for this norm yet"); 1306 } 1307 PetscFunctionReturn(0); 1308 } 1309 1310 #undef __FUNCT__ 1311 #define __FUNCT__ "MatEqual_SeqSBAIJ" 1312 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg) 1313 { 1314 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data; 1315 PetscErrorCode ierr; 1316 1317 PetscFunctionBegin; 1318 1319 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1320 if ((A->m != B->m) || (A->n != B->n) || (A->bs != B->bs)|| (a->nz != b->nz)) { 1321 *flg = PETSC_FALSE; 1322 PetscFunctionReturn(0); 1323 } 1324 1325 /* if the a->i are the same */ 1326 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1327 if (!*flg) { 1328 PetscFunctionReturn(0); 1329 } 1330 1331 /* if a->j are the same */ 1332 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1333 if (!*flg) { 1334 PetscFunctionReturn(0); 1335 } 1336 /* if a->a are the same */ 1337 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->bs)*(A->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1338 PetscFunctionReturn(0); 1339 } 1340 1341 #undef __FUNCT__ 1342 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ" 1343 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v) 1344 { 1345 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1346 PetscErrorCode ierr; 1347 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 1348 PetscScalar *x,zero = 0.0; 1349 MatScalar *aa,*aa_j; 1350 1351 PetscFunctionBegin; 1352 bs = A->bs; 1353 if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1"); 1354 1355 aa = a->a; 1356 ai = a->i; 1357 aj = a->j; 1358 ambs = a->mbs; 1359 bs2 = a->bs2; 1360 1361 ierr = VecSet(&zero,v);CHKERRQ(ierr); 1362 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1363 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1364 if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1365 for (i=0; i<ambs; i++) { 1366 j=ai[i]; 1367 if (aj[j] == i) { /* if this is a diagonal element */ 1368 row = i*bs; 1369 aa_j = aa + j*bs2; 1370 if (A->factor && bs==1){ 1371 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k]; 1372 } else { 1373 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1374 } 1375 } 1376 } 1377 1378 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1379 PetscFunctionReturn(0); 1380 } 1381 1382 #undef __FUNCT__ 1383 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ" 1384 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr) 1385 { 1386 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1387 PetscScalar *l,x,*li,*ri; 1388 MatScalar *aa,*v; 1389 PetscErrorCode ierr; 1390 PetscInt i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2; 1391 PetscTruth flg; 1392 1393 PetscFunctionBegin; 1394 if (ll != rr){ 1395 ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr); 1396 if (!flg) 1397 SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n"); 1398 } 1399 if (!ll) PetscFunctionReturn(0); 1400 ai = a->i; 1401 aj = a->j; 1402 aa = a->a; 1403 m = A->m; 1404 bs = A->bs; 1405 mbs = a->mbs; 1406 bs2 = a->bs2; 1407 1408 ierr = VecGetArray(ll,&l);CHKERRQ(ierr); 1409 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1410 if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 1411 for (i=0; i<mbs; i++) { /* for each block row */ 1412 M = ai[i+1] - ai[i]; 1413 li = l + i*bs; 1414 v = aa + bs2*ai[i]; 1415 for (j=0; j<M; j++) { /* for each block */ 1416 ri = l + bs*aj[ai[i]+j]; 1417 for (k=0; k<bs; k++) { 1418 x = ri[k]; 1419 for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x; 1420 } 1421 } 1422 } 1423 ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr); 1424 ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr); 1425 PetscFunctionReturn(0); 1426 } 1427 1428 #undef __FUNCT__ 1429 #define __FUNCT__ "MatGetInfo_SeqSBAIJ" 1430 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info) 1431 { 1432 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1433 1434 PetscFunctionBegin; 1435 info->rows_global = (double)A->m; 1436 info->columns_global = (double)A->m; 1437 info->rows_local = (double)A->m; 1438 info->columns_local = (double)A->m; 1439 info->block_size = a->bs2; 1440 info->nz_allocated = a->maxnz; /*num. of nonzeros in upper triangular part */ 1441 info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */ 1442 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 1443 info->assemblies = A->num_ass; 1444 info->mallocs = a->reallocs; 1445 info->memory = A->mem; 1446 if (A->factor) { 1447 info->fill_ratio_given = A->info.fill_ratio_given; 1448 info->fill_ratio_needed = A->info.fill_ratio_needed; 1449 info->factor_mallocs = A->info.factor_mallocs; 1450 } else { 1451 info->fill_ratio_given = 0; 1452 info->fill_ratio_needed = 0; 1453 info->factor_mallocs = 0; 1454 } 1455 PetscFunctionReturn(0); 1456 } 1457 1458 1459 #undef __FUNCT__ 1460 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ" 1461 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 1462 { 1463 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1464 PetscErrorCode ierr; 1465 1466 PetscFunctionBegin; 1467 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 1468 PetscFunctionReturn(0); 1469 } 1470 1471 #undef __FUNCT__ 1472 #define __FUNCT__ "MatGetRowMax_SeqSBAIJ" 1473 PetscErrorCode MatGetRowMax_SeqSBAIJ(Mat A,Vec v) 1474 { 1475 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1476 PetscErrorCode ierr; 1477 PetscInt i,j,n,row,col,bs,*ai,*aj,mbs; 1478 PetscReal atmp; 1479 MatScalar *aa; 1480 PetscScalar zero = 0.0,*x; 1481 PetscInt ncols,brow,bcol,krow,kcol; 1482 1483 PetscFunctionBegin; 1484 if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1485 bs = A->bs; 1486 aa = a->a; 1487 ai = a->i; 1488 aj = a->j; 1489 mbs = a->mbs; 1490 1491 ierr = VecSet(&zero,v);CHKERRQ(ierr); 1492 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1493 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1494 if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1495 for (i=0; i<mbs; i++) { 1496 ncols = ai[1] - ai[0]; ai++; 1497 brow = bs*i; 1498 for (j=0; j<ncols; j++){ 1499 bcol = bs*(*aj); 1500 for (kcol=0; kcol<bs; kcol++){ 1501 col = bcol + kcol; /* col index */ 1502 for (krow=0; krow<bs; krow++){ 1503 atmp = PetscAbsScalar(*aa); aa++; 1504 row = brow + krow; /* row index */ 1505 if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1506 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 1507 } 1508 } 1509 aj++; 1510 } 1511 } 1512 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1513 PetscFunctionReturn(0); 1514 } 1515