1 2 #include "src/mat/impls/baij/seq/baij.h" 3 #include "src/inline/spops.h" 4 #include "src/inline/ilu.h" 5 #include "petscbt.h" 6 #include "src/mat/impls/sbaij/seq/sbaij.h" 7 8 #undef __FUNCT__ 9 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ" 10 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 11 { 12 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 13 PetscErrorCode ierr; 14 PetscInt brow,i,j,k,l,mbs,n,*idx,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2; 15 PetscBT table,table0; 16 17 PetscFunctionBegin; 18 mbs = a->mbs; 19 ai = a->i; 20 aj = a->j; 21 bs = A->bs; 22 23 if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 24 25 ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr); 26 ierr = PetscMalloc((mbs+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr); 27 ierr = PetscMalloc((A->m+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr); 28 ierr = PetscBTCreate(mbs,table0);CHKERRQ(ierr); 29 30 for (i=0; i<is_max; i++) { /* for each is */ 31 isz = 0; 32 ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr); 33 34 /* Extract the indices, assume there can be duplicate entries */ 35 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 36 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 37 38 /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */ 39 bcol_max = 0; 40 for (j=0; j<n ; ++j){ 41 brow = idx[j]/bs; /* convert the indices into block indices */ 42 if (brow >= mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 43 if(!PetscBTLookupSet(table,brow)) { 44 nidx[isz++] = brow; 45 if (bcol_max < brow) bcol_max = brow; 46 } 47 } 48 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 49 ierr = ISDestroy(is[i]);CHKERRQ(ierr); 50 51 k = 0; 52 for (j=0; j<ov; j++){ /* for each overlap */ 53 /* set table0 for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 54 ierr = PetscBTMemzero(mbs,table0);CHKERRQ(ierr); 55 for (l=k; l<isz; l++) PetscBTSet(table0,nidx[l]); 56 57 n = isz; /* length of the updated is[i] */ 58 for (brow=0; brow<mbs; brow++){ 59 start = ai[brow]; end = ai[brow+1]; 60 if (PetscBTLookup(table0,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */ 61 for (l = start; l<end ; l++){ 62 bcol = aj[l]; 63 if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;} 64 } 65 k++; 66 if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 67 } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 68 for (l = start; l<end ; l++){ 69 bcol = aj[l]; 70 if (bcol > bcol_max) break; 71 if (PetscBTLookup(table0,bcol)){ 72 if (!PetscBTLookupSet(table,brow)) {nidx[isz++] = brow;} 73 break; /* for l = start; l<end ; l++) */ 74 } 75 } 76 } 77 } 78 } /* for each overlap */ 79 80 /* expand the Index Set */ 81 for (j=0; j<isz; j++) { 82 for (k=0; k<bs; k++) 83 nidx2[j*bs+k] = nidx[j]*bs+k; 84 } 85 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr); 86 } 87 ierr = PetscBTDestroy(table);CHKERRQ(ierr); 88 ierr = PetscFree(nidx);CHKERRQ(ierr); 89 ierr = PetscFree(nidx2);CHKERRQ(ierr); 90 ierr = PetscBTDestroy(table0);CHKERRQ(ierr); 91 PetscFunctionReturn(0); 92 } 93 94 #undef __FUNCT__ 95 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private" 96 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B) 97 { 98 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c; 99 PetscErrorCode ierr; 100 PetscInt *smap,i,k,kstart,kend,oldcols = a->mbs,*lens; 101 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 102 PetscInt *irow,nrows,*ssmap,bs=A->bs,bs2=a->bs2; 103 PetscInt *aj = a->j,*ai = a->i; 104 MatScalar *mat_a; 105 Mat C; 106 PetscTruth flag; 107 108 PetscFunctionBegin; 109 110 if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro"); 111 ierr = ISSorted(iscol,(PetscTruth*)&i);CHKERRQ(ierr); 112 if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 113 114 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 115 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 116 117 ierr = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr); 118 ssmap = smap; 119 ierr = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr); 120 ierr = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr); 121 for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */ 122 /* determine lens of each row */ 123 for (i=0; i<nrows; i++) { 124 kstart = ai[irow[i]]; 125 kend = kstart + a->ilen[irow[i]]; 126 lens[i] = 0; 127 for (k=kstart; k<kend; k++) { 128 if (ssmap[aj[k]]) { 129 lens[i]++; 130 } 131 } 132 } 133 /* Create and fill new matrix */ 134 if (scall == MAT_REUSE_MATRIX) { 135 c = (Mat_SeqSBAIJ *)((*B)->data); 136 137 if (c->mbs!=nrows || (*B)->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 138 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 139 if (flag == PETSC_FALSE) { 140 SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 141 } 142 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 143 C = *B; 144 } else { 145 ierr = MatCreate(A->comm,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE,&C);CHKERRQ(ierr); 146 ierr = MatSetType(C,A->type_name);CHKERRQ(ierr); 147 ierr = MatSeqSBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr); 148 } 149 c = (Mat_SeqSBAIJ *)(C->data); 150 for (i=0; i<nrows; i++) { 151 row = irow[i]; 152 kstart = ai[row]; 153 kend = kstart + a->ilen[row]; 154 mat_i = c->i[i]; 155 mat_j = c->j + mat_i; 156 mat_a = c->a + mat_i*bs2; 157 mat_ilen = c->ilen + i; 158 for (k=kstart; k<kend; k++) { 159 if ((tcol=ssmap[a->j[k]])) { 160 *mat_j++ = tcol - 1; 161 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 162 mat_a += bs2; 163 (*mat_ilen)++; 164 } 165 } 166 } 167 168 /* Free work space */ 169 ierr = PetscFree(smap);CHKERRQ(ierr); 170 ierr = PetscFree(lens);CHKERRQ(ierr); 171 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 172 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 173 174 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 175 *B = C; 176 PetscFunctionReturn(0); 177 } 178 179 #undef __FUNCT__ 180 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ" 181 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B) 182 { 183 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 184 IS is1; 185 PetscErrorCode ierr; 186 PetscInt *vary,*iary,*irow,nrows,i,bs=A->bs,count; 187 188 PetscFunctionBegin; 189 if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro"); 190 191 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 192 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 193 194 /* Verify if the indices corespond to each element in a block 195 and form the IS with compressed IS */ 196 ierr = PetscMalloc(2*(a->mbs+1)*sizeof(PetscInt),&vary);CHKERRQ(ierr); 197 iary = vary + a->mbs; 198 ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr); 199 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 200 201 count = 0; 202 for (i=0; i<a->mbs; i++) { 203 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_INCOMP,"Index set does not match blocks"); 204 if (vary[i]==bs) iary[count++] = i; 205 } 206 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr); 207 208 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 209 ierr = PetscFree(vary);CHKERRQ(ierr); 210 211 ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);CHKERRQ(ierr); 212 ISDestroy(is1); 213 PetscFunctionReturn(0); 214 } 215 216 #undef __FUNCT__ 217 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ" 218 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 219 { 220 PetscErrorCode ierr; 221 PetscInt i; 222 223 PetscFunctionBegin; 224 if (scall == MAT_INITIAL_MATRIX) { 225 ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr); 226 } 227 228 for (i=0; i<n; i++) { 229 ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr); 230 } 231 PetscFunctionReturn(0); 232 } 233 234 /* -------------------------------------------------------*/ 235 /* Should check that shapes of vectors and matrices match */ 236 /* -------------------------------------------------------*/ 237 #include "petscblaslapack.h" 238 239 #undef __FUNCT__ 240 #define __FUNCT__ "MatMult_SeqSBAIJ_1" 241 PetscErrorCode MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz) 242 { 243 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 244 PetscScalar *x,*z,*xb,x1,zero=0.0; 245 MatScalar *v; 246 PetscErrorCode ierr; 247 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 248 249 PetscFunctionBegin; 250 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 251 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 252 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 253 254 v = a->a; 255 xb = x; 256 257 for (i=0; i<mbs; i++) { 258 n = ai[1] - ai[0]; /* length of i_th row of A */ 259 x1 = xb[0]; 260 ib = aj + *ai; 261 jmin = 0; 262 if (*ib == i) { /* (diag of A)*x */ 263 z[i] += *v++ * x[*ib++]; 264 jmin++; 265 } 266 for (j=jmin; j<n; j++) { 267 cval = *ib; 268 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 269 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 270 } 271 xb++; ai++; 272 } 273 274 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 275 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 276 PetscLogFlops(2*(a->nz*2 - A->m) - A->m); /* nz = (nz+m)/2 */ 277 PetscFunctionReturn(0); 278 } 279 280 #undef __FUNCT__ 281 #define __FUNCT__ "MatMult_SeqSBAIJ_2" 282 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz) 283 { 284 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 285 PetscScalar *x,*z,*xb,x1,x2,zero=0.0; 286 MatScalar *v; 287 PetscErrorCode ierr; 288 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 289 290 291 PetscFunctionBegin; 292 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 293 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 294 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 295 296 v = a->a; 297 xb = x; 298 299 for (i=0; i<mbs; i++) { 300 n = ai[1] - ai[0]; /* length of i_th block row of A */ 301 x1 = xb[0]; x2 = xb[1]; 302 ib = aj + *ai; 303 jmin = 0; 304 if (*ib == i){ /* (diag of A)*x */ 305 z[2*i] += v[0]*x1 + v[2]*x2; 306 z[2*i+1] += v[2]*x1 + v[3]*x2; 307 v += 4; jmin++; 308 } 309 for (j=jmin; j<n; j++) { 310 /* (strict lower triangular part of A)*x */ 311 cval = ib[j]*2; 312 z[cval] += v[0]*x1 + v[1]*x2; 313 z[cval+1] += v[2]*x1 + v[3]*x2; 314 /* (strict upper triangular part of A)*x */ 315 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 316 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 317 v += 4; 318 } 319 xb +=2; ai++; 320 } 321 322 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 323 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 324 PetscLogFlops(8*(a->nz*2 - A->m) - A->m); 325 PetscFunctionReturn(0); 326 } 327 328 #undef __FUNCT__ 329 #define __FUNCT__ "MatMult_SeqSBAIJ_3" 330 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz) 331 { 332 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 333 PetscScalar *x,*z,*xb,x1,x2,x3,zero=0.0; 334 MatScalar *v; 335 PetscErrorCode ierr; 336 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 337 338 339 PetscFunctionBegin; 340 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 341 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 342 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 343 344 v = a->a; 345 xb = x; 346 347 for (i=0; i<mbs; i++) { 348 n = ai[1] - ai[0]; /* length of i_th block row of A */ 349 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 350 ib = aj + *ai; 351 jmin = 0; 352 if (*ib == i){ /* (diag of A)*x */ 353 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 354 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 355 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 356 v += 9; jmin++; 357 } 358 for (j=jmin; j<n; j++) { 359 /* (strict lower triangular part of A)*x */ 360 cval = ib[j]*3; 361 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 362 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 363 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 364 /* (strict upper triangular part of A)*x */ 365 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 366 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 367 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 368 v += 9; 369 } 370 xb +=3; ai++; 371 } 372 373 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 374 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 375 PetscLogFlops(18*(a->nz*2 - A->m) - A->m); 376 PetscFunctionReturn(0); 377 } 378 379 #undef __FUNCT__ 380 #define __FUNCT__ "MatMult_SeqSBAIJ_4" 381 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz) 382 { 383 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 384 PetscScalar *x,*z,*xb,x1,x2,x3,x4,zero=0.0; 385 MatScalar *v; 386 PetscErrorCode ierr; 387 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 388 389 PetscFunctionBegin; 390 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 391 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 392 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 393 394 v = a->a; 395 xb = x; 396 397 for (i=0; i<mbs; i++) { 398 n = ai[1] - ai[0]; /* length of i_th block row of A */ 399 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 400 ib = aj + *ai; 401 jmin = 0; 402 if (*ib == i){ /* (diag of A)*x */ 403 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 404 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 405 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 406 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 407 v += 16; jmin++; 408 } 409 for (j=jmin; j<n; j++) { 410 /* (strict lower triangular part of A)*x */ 411 cval = ib[j]*4; 412 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 413 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 414 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 415 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 416 /* (strict upper triangular part of A)*x */ 417 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 418 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 419 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 420 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 421 v += 16; 422 } 423 xb +=4; ai++; 424 } 425 426 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 427 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 428 PetscLogFlops(32*(a->nz*2 - A->m) - A->m); 429 PetscFunctionReturn(0); 430 } 431 432 #undef __FUNCT__ 433 #define __FUNCT__ "MatMult_SeqSBAIJ_5" 434 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz) 435 { 436 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 437 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0; 438 MatScalar *v; 439 PetscErrorCode ierr; 440 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 441 442 PetscFunctionBegin; 443 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 444 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 445 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 446 447 v = a->a; 448 xb = x; 449 450 for (i=0; i<mbs; i++) { 451 n = ai[1] - ai[0]; /* length of i_th block row of A */ 452 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 453 ib = aj + *ai; 454 jmin = 0; 455 if (*ib == i){ /* (diag of A)*x */ 456 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 457 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 458 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 459 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 460 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 461 v += 25; jmin++; 462 } 463 for (j=jmin; j<n; j++) { 464 /* (strict lower triangular part of A)*x */ 465 cval = ib[j]*5; 466 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 467 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 468 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 469 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 470 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 471 /* (strict upper triangular part of A)*x */ 472 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 473 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 474 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 475 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 476 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 477 v += 25; 478 } 479 xb +=5; ai++; 480 } 481 482 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 483 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 484 PetscLogFlops(50*(a->nz*2 - A->m) - A->m); 485 PetscFunctionReturn(0); 486 } 487 488 489 #undef __FUNCT__ 490 #define __FUNCT__ "MatMult_SeqSBAIJ_6" 491 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz) 492 { 493 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 494 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0; 495 MatScalar *v; 496 PetscErrorCode ierr; 497 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 498 499 PetscFunctionBegin; 500 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 501 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 502 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 503 504 v = a->a; 505 xb = x; 506 507 for (i=0; i<mbs; i++) { 508 n = ai[1] - ai[0]; /* length of i_th block row of A */ 509 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 510 ib = aj + *ai; 511 jmin = 0; 512 if (*ib == i){ /* (diag of A)*x */ 513 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 514 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 515 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 516 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 517 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 518 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 519 v += 36; jmin++; 520 } 521 for (j=jmin; j<n; j++) { 522 /* (strict lower triangular part of A)*x */ 523 cval = ib[j]*6; 524 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 525 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 526 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 527 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 528 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 529 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 530 /* (strict upper triangular part of A)*x */ 531 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 532 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 533 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 534 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 535 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 536 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 537 v += 36; 538 } 539 xb +=6; ai++; 540 } 541 542 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 543 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 544 PetscLogFlops(72*(a->nz*2 - A->m) - A->m); 545 PetscFunctionReturn(0); 546 } 547 #undef __FUNCT__ 548 #define __FUNCT__ "MatMult_SeqSBAIJ_7" 549 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz) 550 { 551 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 552 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0; 553 MatScalar *v; 554 PetscErrorCode ierr; 555 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 556 557 PetscFunctionBegin; 558 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 559 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 560 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 561 562 v = a->a; 563 xb = x; 564 565 for (i=0; i<mbs; i++) { 566 n = ai[1] - ai[0]; /* length of i_th block row of A */ 567 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 568 ib = aj + *ai; 569 jmin = 0; 570 if (*ib == i){ /* (diag of A)*x */ 571 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 572 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 573 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 574 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 575 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 576 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 577 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 578 v += 49; jmin++; 579 } 580 for (j=jmin; j<n; j++) { 581 /* (strict lower triangular part of A)*x */ 582 cval = ib[j]*7; 583 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 584 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 585 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 586 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 587 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 588 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 589 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 590 /* (strict upper triangular part of A)*x */ 591 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 592 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 593 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 594 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 595 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 596 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 597 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 598 v += 49; 599 } 600 xb +=7; ai++; 601 } 602 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 603 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 604 PetscLogFlops(98*(a->nz*2 - A->m) - A->m); 605 PetscFunctionReturn(0); 606 } 607 608 /* 609 This will not work with MatScalar == float because it calls the BLAS 610 */ 611 #undef __FUNCT__ 612 #define __FUNCT__ "MatMult_SeqSBAIJ_N" 613 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz) 614 { 615 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 616 PetscScalar *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0; 617 MatScalar *v; 618 PetscErrorCode ierr; 619 PetscInt mbs=a->mbs,i,*idx,*aj,*ii,bs=A->bs,j,n,bs2=a->bs2,ncols,k; 620 621 PetscFunctionBegin; 622 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 623 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 624 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 625 626 aj = a->j; 627 v = a->a; 628 ii = a->i; 629 630 if (!a->mult_work) { 631 ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 632 } 633 work = a->mult_work; 634 635 for (i=0; i<mbs; i++) { 636 n = ii[1] - ii[0]; ncols = n*bs; 637 workt = work; idx=aj+ii[0]; 638 639 /* upper triangular part */ 640 for (j=0; j<n; j++) { 641 xb = x_ptr + bs*(*idx++); 642 for (k=0; k<bs; k++) workt[k] = xb[k]; 643 workt += bs; 644 } 645 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 646 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 647 648 /* strict lower triangular part */ 649 idx = aj+ii[0]; 650 if (*idx == i){ 651 ncols -= bs; v += bs2; idx++; n--; 652 } 653 654 if (ncols > 0){ 655 workt = work; 656 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 657 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 658 for (j=0; j<n; j++) { 659 zb = z_ptr + bs*(*idx++); 660 for (k=0; k<bs; k++) zb[k] += workt[k] ; 661 workt += bs; 662 } 663 } 664 x += bs; v += n*bs2; z += bs; ii++; 665 } 666 667 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 668 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 669 PetscLogFlops(2*(a->nz*2 - A->m)*bs2 - A->m); 670 PetscFunctionReturn(0); 671 } 672 673 #undef __FUNCT__ 674 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1" 675 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 676 { 677 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 678 PetscScalar *x,*y,*z,*xb,x1; 679 MatScalar *v; 680 PetscErrorCode ierr; 681 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 682 683 PetscFunctionBegin; 684 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 685 if (yy != xx) { 686 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 687 } else { 688 y = x; 689 } 690 if (zz != yy) { 691 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 692 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 693 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 694 } else { 695 z = y; 696 } 697 698 v = a->a; 699 xb = x; 700 701 for (i=0; i<mbs; i++) { 702 n = ai[1] - ai[0]; /* length of i_th row of A */ 703 x1 = xb[0]; 704 ib = aj + *ai; 705 jmin = 0; 706 if (*ib == i) { /* (diag of A)*x */ 707 z[i] += *v++ * x[*ib++]; jmin++; 708 } 709 for (j=jmin; j<n; j++) { 710 cval = *ib; 711 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 712 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 713 } 714 xb++; ai++; 715 } 716 717 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 718 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 719 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 720 721 PetscLogFlops(2*(a->nz*2 - A->m)); 722 PetscFunctionReturn(0); 723 } 724 725 #undef __FUNCT__ 726 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2" 727 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 728 { 729 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 730 PetscScalar *x,*y,*z,*xb,x1,x2; 731 MatScalar *v; 732 PetscErrorCode ierr; 733 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 734 735 PetscFunctionBegin; 736 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 737 if (yy != xx) { 738 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 739 } else { 740 y = x; 741 } 742 if (zz != yy) { 743 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 744 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 745 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 746 } else { 747 z = y; 748 } 749 750 v = a->a; 751 xb = x; 752 753 for (i=0; i<mbs; i++) { 754 n = ai[1] - ai[0]; /* length of i_th block row of A */ 755 x1 = xb[0]; x2 = xb[1]; 756 ib = aj + *ai; 757 jmin = 0; 758 if (*ib == i){ /* (diag of A)*x */ 759 z[2*i] += v[0]*x1 + v[2]*x2; 760 z[2*i+1] += v[2]*x1 + v[3]*x2; 761 v += 4; jmin++; 762 } 763 for (j=jmin; j<n; j++) { 764 /* (strict lower triangular part of A)*x */ 765 cval = ib[j]*2; 766 z[cval] += v[0]*x1 + v[1]*x2; 767 z[cval+1] += v[2]*x1 + v[3]*x2; 768 /* (strict upper triangular part of A)*x */ 769 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 770 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 771 v += 4; 772 } 773 xb +=2; ai++; 774 } 775 776 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 777 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 778 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 779 780 PetscLogFlops(4*(a->nz*2 - A->m)); 781 PetscFunctionReturn(0); 782 } 783 784 #undef __FUNCT__ 785 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3" 786 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 787 { 788 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 789 PetscScalar *x,*y,*z,*xb,x1,x2,x3; 790 MatScalar *v; 791 PetscErrorCode ierr; 792 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 793 794 PetscFunctionBegin; 795 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 796 if (yy != xx) { 797 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 798 } else { 799 y = x; 800 } 801 if (zz != yy) { 802 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 803 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 804 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 805 } else { 806 z = y; 807 } 808 809 v = a->a; 810 xb = x; 811 812 for (i=0; i<mbs; i++) { 813 n = ai[1] - ai[0]; /* length of i_th block row of A */ 814 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 815 ib = aj + *ai; 816 jmin = 0; 817 if (*ib == i){ /* (diag of A)*x */ 818 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 819 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 820 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 821 v += 9; jmin++; 822 } 823 for (j=jmin; j<n; j++) { 824 /* (strict lower triangular part of A)*x */ 825 cval = ib[j]*3; 826 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 827 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 828 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 829 /* (strict upper triangular part of A)*x */ 830 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 831 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 832 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 833 v += 9; 834 } 835 xb +=3; ai++; 836 } 837 838 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 839 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 840 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 841 842 PetscLogFlops(18*(a->nz*2 - A->m)); 843 PetscFunctionReturn(0); 844 } 845 846 #undef __FUNCT__ 847 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4" 848 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 849 { 850 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 851 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4; 852 MatScalar *v; 853 PetscErrorCode ierr; 854 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 855 856 PetscFunctionBegin; 857 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 858 if (yy != xx) { 859 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 860 } else { 861 y = x; 862 } 863 if (zz != yy) { 864 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 865 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 866 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 867 } else { 868 z = y; 869 } 870 871 v = a->a; 872 xb = x; 873 874 for (i=0; i<mbs; i++) { 875 n = ai[1] - ai[0]; /* length of i_th block row of A */ 876 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 877 ib = aj + *ai; 878 jmin = 0; 879 if (*ib == i){ /* (diag of A)*x */ 880 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 881 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 882 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 883 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 884 v += 16; jmin++; 885 } 886 for (j=jmin; j<n; j++) { 887 /* (strict lower triangular part of A)*x */ 888 cval = ib[j]*4; 889 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 890 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 891 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 892 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 893 /* (strict upper triangular part of A)*x */ 894 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 895 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 896 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 897 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 898 v += 16; 899 } 900 xb +=4; ai++; 901 } 902 903 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 904 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 905 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 906 907 PetscLogFlops(32*(a->nz*2 - A->m)); 908 PetscFunctionReturn(0); 909 } 910 911 #undef __FUNCT__ 912 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5" 913 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 914 { 915 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 916 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4,x5; 917 MatScalar *v; 918 PetscErrorCode ierr; 919 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 920 921 PetscFunctionBegin; 922 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 923 if (yy != xx) { 924 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 925 } else { 926 y = x; 927 } 928 if (zz != yy) { 929 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 930 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 931 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 932 } else { 933 z = y; 934 } 935 936 v = a->a; 937 xb = x; 938 939 for (i=0; i<mbs; i++) { 940 n = ai[1] - ai[0]; /* length of i_th block row of A */ 941 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 942 ib = aj + *ai; 943 jmin = 0; 944 if (*ib == i){ /* (diag of A)*x */ 945 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 946 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 947 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 948 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 949 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 950 v += 25; jmin++; 951 } 952 for (j=jmin; j<n; j++) { 953 /* (strict lower triangular part of A)*x */ 954 cval = ib[j]*5; 955 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 956 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 957 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 958 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 959 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 960 /* (strict upper triangular part of A)*x */ 961 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 962 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 963 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 964 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 965 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 966 v += 25; 967 } 968 xb +=5; ai++; 969 } 970 971 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 972 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 973 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 974 975 PetscLogFlops(50*(a->nz*2 - A->m)); 976 PetscFunctionReturn(0); 977 } 978 #undef __FUNCT__ 979 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6" 980 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 981 { 982 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 983 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6; 984 MatScalar *v; 985 PetscErrorCode ierr; 986 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 987 988 PetscFunctionBegin; 989 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 990 if (yy != xx) { 991 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 992 } else { 993 y = x; 994 } 995 if (zz != yy) { 996 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 997 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 998 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 999 } else { 1000 z = y; 1001 } 1002 1003 v = a->a; 1004 xb = x; 1005 1006 for (i=0; i<mbs; i++) { 1007 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1008 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 1009 ib = aj + *ai; 1010 jmin = 0; 1011 if (*ib == i){ /* (diag of A)*x */ 1012 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 1013 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 1014 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 1015 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 1016 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 1017 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 1018 v += 36; jmin++; 1019 } 1020 for (j=jmin; j<n; j++) { 1021 /* (strict lower triangular part of A)*x */ 1022 cval = ib[j]*6; 1023 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 1024 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 1025 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 1026 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 1027 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 1028 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 1029 /* (strict upper triangular part of A)*x */ 1030 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 1031 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 1032 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 1033 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 1034 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 1035 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 1036 v += 36; 1037 } 1038 xb +=6; ai++; 1039 } 1040 1041 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1042 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 1043 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1044 1045 PetscLogFlops(72*(a->nz*2 - A->m)); 1046 PetscFunctionReturn(0); 1047 } 1048 1049 #undef __FUNCT__ 1050 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7" 1051 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1052 { 1053 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1054 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6,x7; 1055 MatScalar *v; 1056 PetscErrorCode ierr; 1057 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 1058 1059 PetscFunctionBegin; 1060 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1061 if (yy != xx) { 1062 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 1063 } else { 1064 y = x; 1065 } 1066 if (zz != yy) { 1067 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 1068 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1069 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 1070 } else { 1071 z = y; 1072 } 1073 1074 v = a->a; 1075 xb = x; 1076 1077 for (i=0; i<mbs; i++) { 1078 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1079 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 1080 ib = aj + *ai; 1081 jmin = 0; 1082 if (*ib == i){ /* (diag of A)*x */ 1083 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 1084 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 1085 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 1086 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 1087 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 1088 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 1089 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1090 v += 49; jmin++; 1091 } 1092 for (j=jmin; j<n; j++) { 1093 /* (strict lower triangular part of A)*x */ 1094 cval = ib[j]*7; 1095 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 1096 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 1097 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 1098 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 1099 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 1100 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 1101 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1102 /* (strict upper triangular part of A)*x */ 1103 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 1104 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 1105 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 1106 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 1107 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 1108 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 1109 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 1110 v += 49; 1111 } 1112 xb +=7; ai++; 1113 } 1114 1115 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1116 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 1117 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1118 1119 PetscLogFlops(98*(a->nz*2 - A->m)); 1120 PetscFunctionReturn(0); 1121 } 1122 1123 #undef __FUNCT__ 1124 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N" 1125 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1126 { 1127 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1128 PetscScalar *x,*x_ptr,*y,*z,*z_ptr=0,*xb,*zb,*work,*workt; 1129 MatScalar *v; 1130 PetscErrorCode ierr; 1131 PetscInt mbs=a->mbs,i,*idx,*aj,*ii,bs=A->bs,j,n,bs2=a->bs2,ncols,k; 1132 1133 PetscFunctionBegin; 1134 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 1135 if (yy != xx) { 1136 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 1137 } else { 1138 y = x; 1139 } 1140 if (zz != yy) { 1141 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 1142 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 1143 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 1144 } else { 1145 z = y; 1146 } 1147 1148 aj = a->j; 1149 v = a->a; 1150 ii = a->i; 1151 1152 if (!a->mult_work) { 1153 ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1154 } 1155 work = a->mult_work; 1156 1157 1158 for (i=0; i<mbs; i++) { 1159 n = ii[1] - ii[0]; ncols = n*bs; 1160 workt = work; idx=aj+ii[0]; 1161 1162 /* upper triangular part */ 1163 for (j=0; j<n; j++) { 1164 xb = x_ptr + bs*(*idx++); 1165 for (k=0; k<bs; k++) workt[k] = xb[k]; 1166 workt += bs; 1167 } 1168 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1169 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1170 1171 /* strict lower triangular part */ 1172 idx = aj+ii[0]; 1173 if (*idx == i){ 1174 ncols -= bs; v += bs2; idx++; n--; 1175 } 1176 if (ncols > 0){ 1177 workt = work; 1178 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1179 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 1180 for (j=0; j<n; j++) { 1181 zb = z_ptr + bs*(*idx++); 1182 /* idx++; */ 1183 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1184 workt += bs; 1185 } 1186 } 1187 1188 x += bs; v += n*bs2; z += bs; ii++; 1189 } 1190 1191 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1192 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 1193 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1194 1195 PetscLogFlops(2*(a->nz*2 - A->m)); 1196 PetscFunctionReturn(0); 1197 } 1198 1199 #undef __FUNCT__ 1200 #define __FUNCT__ "MatScale_SeqSBAIJ" 1201 PetscErrorCode MatScale_SeqSBAIJ(const PetscScalar *alpha,Mat inA) 1202 { 1203 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data; 1204 PetscBLASInt one = 1,totalnz = (PetscBLASInt)a->bs2*a->nz; 1205 1206 PetscFunctionBegin; 1207 BLscal_(&totalnz,(PetscScalar*)alpha,a->a,&one); 1208 PetscLogFlops(totalnz); 1209 PetscFunctionReturn(0); 1210 } 1211 1212 #undef __FUNCT__ 1213 #define __FUNCT__ "MatNorm_SeqSBAIJ" 1214 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm) 1215 { 1216 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1217 MatScalar *v = a->a; 1218 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1219 PetscInt i,j,k,bs = A->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j; 1220 PetscErrorCode ierr; 1221 PetscInt *jl,*il,jmin,jmax,nexti,ik,*col; 1222 1223 PetscFunctionBegin; 1224 if (type == NORM_FROBENIUS) { 1225 for (k=0; k<mbs; k++){ 1226 jmin = a->i[k]; jmax = a->i[k+1]; 1227 col = aj + jmin; 1228 if (*col == k){ /* diagonal block */ 1229 for (i=0; i<bs2; i++){ 1230 #if defined(PETSC_USE_COMPLEX) 1231 sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++; 1232 #else 1233 sum_diag += (*v)*(*v); v++; 1234 #endif 1235 } 1236 jmin++; 1237 } 1238 for (j=jmin; j<jmax; j++){ /* off-diagonal blocks */ 1239 for (i=0; i<bs2; i++){ 1240 #if defined(PETSC_USE_COMPLEX) 1241 sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++; 1242 #else 1243 sum_off += (*v)*(*v); v++; 1244 #endif 1245 } 1246 } 1247 } 1248 *norm = sqrt(sum_diag + 2*sum_off); 1249 1250 } else if (type == NORM_INFINITY) { /* maximum row sum */ 1251 ierr = PetscMalloc(mbs*sizeof(PetscInt),&il);CHKERRQ(ierr); 1252 ierr = PetscMalloc(mbs*sizeof(PetscInt),&jl);CHKERRQ(ierr); 1253 ierr = PetscMalloc(bs*sizeof(PetscReal),&sum);CHKERRQ(ierr); 1254 for (i=0; i<mbs; i++) { 1255 jl[i] = mbs; il[0] = 0; 1256 } 1257 1258 *norm = 0.0; 1259 for (k=0; k<mbs; k++) { /* k_th block row */ 1260 for (j=0; j<bs; j++) sum[j]=0.0; 1261 1262 /*-- col sum --*/ 1263 i = jl[k]; /* first |A(i,k)| to be added */ 1264 /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1265 at step k */ 1266 while (i<mbs){ 1267 nexti = jl[i]; /* next block row to be added */ 1268 ik = il[i]; /* block index of A(i,k) in the array a */ 1269 for (j=0; j<bs; j++){ 1270 v = a->a + ik*bs2 + j*bs; 1271 for (k1=0; k1<bs; k1++) { 1272 sum[j] += PetscAbsScalar(*v); v++; 1273 } 1274 } 1275 /* update il, jl */ 1276 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1277 jmax = a->i[i+1]; 1278 if (jmin < jmax){ 1279 il[i] = jmin; 1280 j = a->j[jmin]; 1281 jl[i] = jl[j]; jl[j]=i; 1282 } 1283 i = nexti; 1284 } 1285 1286 /*-- row sum --*/ 1287 jmin = a->i[k]; jmax = a->i[k+1]; 1288 for (i=jmin; i<jmax; i++) { 1289 for (j=0; j<bs; j++){ 1290 v = a->a + i*bs2 + j; 1291 for (k1=0; k1<bs; k1++){ 1292 sum[j] += PetscAbsScalar(*v); 1293 v += bs; 1294 } 1295 } 1296 } 1297 /* add k_th block row to il, jl */ 1298 col = aj+jmin; 1299 if (*col == k) jmin++; 1300 if (jmin < jmax){ 1301 il[k] = jmin; 1302 j = a->j[jmin]; 1303 jl[k] = jl[j]; jl[j] = k; 1304 } 1305 for (j=0; j<bs; j++){ 1306 if (sum[j] > *norm) *norm = sum[j]; 1307 } 1308 } 1309 ierr = PetscFree(il);CHKERRQ(ierr); 1310 ierr = PetscFree(jl);CHKERRQ(ierr); 1311 ierr = PetscFree(sum);CHKERRQ(ierr); 1312 } else { 1313 SETERRQ(PETSC_ERR_SUP,"No support for this norm yet"); 1314 } 1315 PetscFunctionReturn(0); 1316 } 1317 1318 #undef __FUNCT__ 1319 #define __FUNCT__ "MatEqual_SeqSBAIJ" 1320 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg) 1321 { 1322 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data; 1323 PetscErrorCode ierr; 1324 1325 PetscFunctionBegin; 1326 1327 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1328 if ((A->m != B->m) || (A->n != B->n) || (A->bs != B->bs)|| (a->nz != b->nz)) { 1329 *flg = PETSC_FALSE; 1330 PetscFunctionReturn(0); 1331 } 1332 1333 /* if the a->i are the same */ 1334 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1335 if (*flg == PETSC_FALSE) { 1336 PetscFunctionReturn(0); 1337 } 1338 1339 /* if a->j are the same */ 1340 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1341 if (*flg == PETSC_FALSE) { 1342 PetscFunctionReturn(0); 1343 } 1344 /* if a->a are the same */ 1345 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->bs)*(A->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1346 1347 PetscFunctionReturn(0); 1348 } 1349 1350 #undef __FUNCT__ 1351 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ" 1352 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v) 1353 { 1354 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1355 PetscErrorCode ierr; 1356 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 1357 PetscScalar *x,zero = 0.0; 1358 MatScalar *aa,*aa_j; 1359 1360 PetscFunctionBegin; 1361 bs = A->bs; 1362 if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1"); 1363 1364 aa = a->a; 1365 ai = a->i; 1366 aj = a->j; 1367 ambs = a->mbs; 1368 bs2 = a->bs2; 1369 1370 ierr = VecSet(&zero,v);CHKERRQ(ierr); 1371 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1372 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1373 if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1374 for (i=0; i<ambs; i++) { 1375 j=ai[i]; 1376 if (aj[j] == i) { /* if this is a diagonal element */ 1377 row = i*bs; 1378 aa_j = aa + j*bs2; 1379 if (A->factor && bs==1){ 1380 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k]; 1381 } else { 1382 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1383 } 1384 } 1385 } 1386 1387 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1388 PetscFunctionReturn(0); 1389 } 1390 1391 #undef __FUNCT__ 1392 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ" 1393 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr) 1394 { 1395 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1396 PetscScalar *l,*r,x,*li,*ri; 1397 MatScalar *aa,*v; 1398 PetscErrorCode ierr; 1399 PetscInt i,j,k,lm,rn,M,m,*ai,*aj,mbs,tmp,bs,bs2; 1400 1401 PetscFunctionBegin; 1402 ai = a->i; 1403 aj = a->j; 1404 aa = a->a; 1405 m = A->m; 1406 bs = A->bs; 1407 mbs = a->mbs; 1408 bs2 = a->bs2; 1409 1410 if (ll != rr) { 1411 SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n"); 1412 } 1413 if (ll) { 1414 ierr = VecGetArray(ll,&l);CHKERRQ(ierr); 1415 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1416 if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 1417 for (i=0; i<mbs; i++) { /* for each block row */ 1418 M = ai[i+1] - ai[i]; 1419 li = l + i*bs; 1420 v = aa + bs2*ai[i]; 1421 for (j=0; j<M; j++) { /* for each block */ 1422 for (k=0; k<bs2; k++) { 1423 (*v++) *= li[k%bs]; 1424 } 1425 #ifdef CONT 1426 /* will be used to replace the above loop */ 1427 ri = l + bs*aj[ai[i]+j]; 1428 for (k=0; k<bs; k++) { /* column value */ 1429 x = ri[k]; 1430 for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x; 1431 } 1432 #endif 1433 1434 } 1435 } 1436 ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr); 1437 PetscLogFlops(2*a->nz); 1438 } 1439 /* will be deleted */ 1440 if (rr) { 1441 ierr = VecGetArray(rr,&r);CHKERRQ(ierr); 1442 ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr); 1443 if (rn != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 1444 for (i=0; i<mbs; i++) { /* for each block row */ 1445 M = ai[i+1] - ai[i]; 1446 v = aa + bs2*ai[i]; 1447 for (j=0; j<M; j++) { /* for each block */ 1448 ri = r + bs*aj[ai[i]+j]; 1449 for (k=0; k<bs; k++) { 1450 x = ri[k]; 1451 for (tmp=0; tmp<bs; tmp++) (*v++) *= x; 1452 } 1453 } 1454 } 1455 ierr = VecRestoreArray(rr,&r);CHKERRQ(ierr); 1456 PetscLogFlops(a->nz); 1457 } 1458 PetscFunctionReturn(0); 1459 } 1460 1461 #undef __FUNCT__ 1462 #define __FUNCT__ "MatGetInfo_SeqSBAIJ" 1463 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info) 1464 { 1465 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1466 1467 PetscFunctionBegin; 1468 info->rows_global = (double)A->m; 1469 info->columns_global = (double)A->m; 1470 info->rows_local = (double)A->m; 1471 info->columns_local = (double)A->m; 1472 info->block_size = a->bs2; 1473 info->nz_allocated = a->maxnz; /*num. of nonzeros in upper triangular part */ 1474 info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */ 1475 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 1476 info->assemblies = A->num_ass; 1477 info->mallocs = a->reallocs; 1478 info->memory = A->mem; 1479 if (A->factor) { 1480 info->fill_ratio_given = A->info.fill_ratio_given; 1481 info->fill_ratio_needed = A->info.fill_ratio_needed; 1482 info->factor_mallocs = A->info.factor_mallocs; 1483 } else { 1484 info->fill_ratio_given = 0; 1485 info->fill_ratio_needed = 0; 1486 info->factor_mallocs = 0; 1487 } 1488 PetscFunctionReturn(0); 1489 } 1490 1491 1492 #undef __FUNCT__ 1493 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ" 1494 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 1495 { 1496 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1497 PetscErrorCode ierr; 1498 1499 PetscFunctionBegin; 1500 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 1501 PetscFunctionReturn(0); 1502 } 1503 1504 #undef __FUNCT__ 1505 #define __FUNCT__ "MatGetRowMax_SeqSBAIJ" 1506 PetscErrorCode MatGetRowMax_SeqSBAIJ(Mat A,Vec v) 1507 { 1508 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1509 PetscErrorCode ierr; 1510 PetscInt i,j,n,row,col,bs,*ai,*aj,mbs; 1511 PetscReal atmp; 1512 MatScalar *aa; 1513 PetscScalar zero = 0.0,*x; 1514 PetscInt ncols,brow,bcol,krow,kcol; 1515 1516 PetscFunctionBegin; 1517 if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1518 bs = A->bs; 1519 aa = a->a; 1520 ai = a->i; 1521 aj = a->j; 1522 mbs = a->mbs; 1523 1524 ierr = VecSet(&zero,v);CHKERRQ(ierr); 1525 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1526 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1527 if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1528 for (i=0; i<mbs; i++) { 1529 ncols = ai[1] - ai[0]; ai++; 1530 brow = bs*i; 1531 for (j=0; j<ncols; j++){ 1532 bcol = bs*(*aj); 1533 for (kcol=0; kcol<bs; kcol++){ 1534 col = bcol + kcol; /* col index */ 1535 for (krow=0; krow<bs; krow++){ 1536 atmp = PetscAbsScalar(*aa); aa++; 1537 row = brow + krow; /* row index */ 1538 if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1539 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 1540 } 1541 } 1542 aj++; 1543 } 1544 } 1545 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1546 PetscFunctionReturn(0); 1547 } 1548