1 #define PETSCMAT_DLL 2 3 #include "../src/mat/impls/baij/seq/baij.h" 4 #include "../src/mat/blockinvert.h" 5 #include "petscbt.h" 6 7 #undef __FUNCT__ 8 #define __FUNCT__ "MatIncreaseOverlap_SeqBAIJ" 9 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 10 { 11 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 12 PetscErrorCode ierr; 13 PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival; 14 const PetscInt *idx; 15 PetscInt start,end,*ai,*aj,bs,*nidx2; 16 PetscBT table; 17 18 PetscFunctionBegin; 19 m = a->mbs; 20 ai = a->i; 21 aj = a->j; 22 bs = A->rmap->bs; 23 24 if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 25 26 ierr = PetscBTCreate(m,table);CHKERRQ(ierr); 27 ierr = PetscMalloc((m+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr); 28 ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr); 29 30 for (i=0; i<is_max; i++) { 31 /* Initialise the two local arrays */ 32 isz = 0; 33 ierr = PetscBTMemzero(m,table);CHKERRQ(ierr); 34 35 /* Extract the indices, assume there can be duplicate entries */ 36 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 37 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 38 39 /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 40 for (j=0; j<n ; ++j){ 41 ival = idx[j]/bs; /* convert the indices into block indices */ 42 if (ival>=m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 43 if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;} 44 } 45 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 46 ierr = ISDestroy(is[i]);CHKERRQ(ierr); 47 48 k = 0; 49 for (j=0; j<ov; j++){ /* for each overlap*/ 50 n = isz; 51 for (; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */ 52 row = nidx[k]; 53 start = ai[row]; 54 end = ai[row+1]; 55 for (l = start; l<end ; l++){ 56 val = aj[l]; 57 if (!PetscBTLookupSet(table,val)) {nidx[isz++] = val;} 58 } 59 } 60 } 61 /* expand the Index Set */ 62 for (j=0; j<isz; j++) { 63 for (k=0; k<bs; k++) 64 nidx2[j*bs+k] = nidx[j]*bs+k; 65 } 66 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr); 67 } 68 ierr = PetscBTDestroy(table);CHKERRQ(ierr); 69 ierr = PetscFree(nidx);CHKERRQ(ierr); 70 ierr = PetscFree(nidx2);CHKERRQ(ierr); 71 PetscFunctionReturn(0); 72 } 73 74 #undef __FUNCT__ 75 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ_Private" 76 PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 77 { 78 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c; 79 PetscErrorCode ierr; 80 PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 81 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 82 const PetscInt *irow,*icol; 83 PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 84 PetscInt *aj = a->j,*ai = a->i; 85 MatScalar *mat_a; 86 Mat C; 87 PetscTruth flag,sorted; 88 89 PetscFunctionBegin; 90 ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr); 91 if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 92 93 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 94 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 95 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 96 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 97 98 ierr = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr); 99 ssmap = smap; 100 ierr = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr); 101 ierr = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr); 102 for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 103 /* determine lens of each row */ 104 for (i=0; i<nrows; i++) { 105 kstart = ai[irow[i]]; 106 kend = kstart + a->ilen[irow[i]]; 107 lens[i] = 0; 108 for (k=kstart; k<kend; k++) { 109 if (ssmap[aj[k]]) { 110 lens[i]++; 111 } 112 } 113 } 114 /* Create and fill new matrix */ 115 if (scall == MAT_REUSE_MATRIX) { 116 c = (Mat_SeqBAIJ *)((*B)->data); 117 118 if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 119 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 120 if (!flag) { 121 SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 122 } 123 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 124 C = *B; 125 } else { 126 ierr = MatCreate(((PetscObject)A)->comm,&C);CHKERRQ(ierr); 127 ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 128 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 129 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);CHKERRQ(ierr); 130 } 131 c = (Mat_SeqBAIJ *)(C->data); 132 for (i=0; i<nrows; i++) { 133 row = irow[i]; 134 kstart = ai[row]; 135 kend = kstart + a->ilen[row]; 136 mat_i = c->i[i]; 137 mat_j = c->j + mat_i; 138 mat_a = c->a + mat_i*bs2; 139 mat_ilen = c->ilen + i; 140 for (k=kstart; k<kend; k++) { 141 if ((tcol=ssmap[a->j[k]])) { 142 *mat_j++ = tcol - 1; 143 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 144 mat_a += bs2; 145 (*mat_ilen)++; 146 } 147 } 148 } 149 150 /* Free work space */ 151 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 152 ierr = PetscFree(smap);CHKERRQ(ierr); 153 ierr = PetscFree(lens);CHKERRQ(ierr); 154 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 155 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 156 157 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 158 *B = C; 159 PetscFunctionReturn(0); 160 } 161 162 #undef __FUNCT__ 163 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ" 164 PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 165 { 166 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 167 IS is1,is2; 168 PetscErrorCode ierr; 169 PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count; 170 const PetscInt *irow,*icol; 171 172 PetscFunctionBegin; 173 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 174 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 175 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 176 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 177 178 /* Verify if the indices corespond to each element in a block 179 and form the IS with compressed IS */ 180 ierr = PetscMalloc2(a->mbs,PetscInt,&vary,a->mbs,PetscInt,&iary);CHKERRQ(ierr); 181 ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr); 182 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 183 count = 0; 184 for (i=0; i<a->mbs; i++) { 185 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 186 if (vary[i]==bs) iary[count++] = i; 187 } 188 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr); 189 190 ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr); 191 for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 192 count = 0; 193 for (i=0; i<a->mbs; i++) { 194 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_PLIB,"Internal error in PETSc"); 195 if (vary[i]==bs) iary[count++] = i; 196 } 197 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is2);CHKERRQ(ierr); 198 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 199 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 200 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 201 202 ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr); 203 ISDestroy(is1); 204 ISDestroy(is2); 205 PetscFunctionReturn(0); 206 } 207 208 #undef __FUNCT__ 209 #define __FUNCT__ "MatGetSubMatrices_SeqBAIJ" 210 PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 211 { 212 PetscErrorCode ierr; 213 PetscInt i; 214 215 PetscFunctionBegin; 216 if (scall == MAT_INITIAL_MATRIX) { 217 ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr); 218 } 219 220 for (i=0; i<n; i++) { 221 ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 222 } 223 PetscFunctionReturn(0); 224 } 225 226 227 /* -------------------------------------------------------*/ 228 /* Should check that shapes of vectors and matrices match */ 229 /* -------------------------------------------------------*/ 230 231 #undef __FUNCT__ 232 #define __FUNCT__ "MatMult_SeqBAIJ_1" 233 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz) 234 { 235 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 236 PetscScalar *z,sum; 237 const PetscScalar *x; 238 const MatScalar *v; 239 PetscErrorCode ierr; 240 PetscInt mbs,i,n,nonzerorow=0; 241 const PetscInt *idx,*ii,*ridx=PETSC_NULL; 242 PetscTruth usecprow=a->compressedrow.use; 243 244 PetscFunctionBegin; 245 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 246 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 247 248 if (usecprow){ 249 mbs = a->compressedrow.nrows; 250 ii = a->compressedrow.i; 251 ridx = a->compressedrow.rindex; 252 ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 253 } else { 254 mbs = a->mbs; 255 ii = a->i; 256 } 257 258 for (i=0; i<mbs; i++) { 259 n = ii[1] - ii[0]; 260 v = a->a + ii[0]; 261 idx = a->j + ii[0]; 262 ii++; 263 sum = 0.0; 264 PetscSparseDensePlusDot(sum,x,v,idx,n); 265 if (usecprow){ 266 z[ridx[i]] = sum; 267 } else { 268 nonzerorow += (n>0); 269 z[i] = sum; 270 } 271 } 272 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 273 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 274 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 275 PetscFunctionReturn(0); 276 } 277 278 #undef __FUNCT__ 279 #define __FUNCT__ "MatMult_SeqBAIJ_2" 280 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) 281 { 282 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 283 PetscScalar *z = 0,sum1,sum2,*zarray; 284 const PetscScalar *x,*xb; 285 PetscScalar x1,x2; 286 const MatScalar *v; 287 PetscErrorCode ierr; 288 PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 289 PetscTruth usecprow=a->compressedrow.use; 290 291 PetscFunctionBegin; 292 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 293 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 294 295 idx = a->j; 296 v = a->a; 297 if (usecprow){ 298 mbs = a->compressedrow.nrows; 299 ii = a->compressedrow.i; 300 ridx = a->compressedrow.rindex; 301 } else { 302 mbs = a->mbs; 303 ii = a->i; 304 z = zarray; 305 } 306 307 for (i=0; i<mbs; i++) { 308 n = ii[1] - ii[0]; ii++; 309 sum1 = 0.0; sum2 = 0.0; 310 nonzerorow += (n>0); 311 for (j=0; j<n; j++) { 312 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 313 sum1 += v[0]*x1 + v[2]*x2; 314 sum2 += v[1]*x1 + v[3]*x2; 315 v += 4; 316 } 317 if (usecprow) z = zarray + 2*ridx[i]; 318 z[0] = sum1; z[1] = sum2; 319 if (!usecprow) z += 2; 320 } 321 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 322 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 323 ierr = PetscLogFlops(8.0*a->nz - 2.0*nonzerorow);CHKERRQ(ierr); 324 PetscFunctionReturn(0); 325 } 326 327 #undef __FUNCT__ 328 #define __FUNCT__ "MatMult_SeqBAIJ_3" 329 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) 330 { 331 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 332 PetscScalar *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray; 333 const PetscScalar *x,*xb; 334 const MatScalar *v; 335 PetscErrorCode ierr; 336 PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 337 PetscTruth usecprow=a->compressedrow.use; 338 339 340 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 341 #pragma disjoint(*v,*z,*xb) 342 #endif 343 344 PetscFunctionBegin; 345 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 346 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 347 348 idx = a->j; 349 v = a->a; 350 if (usecprow){ 351 mbs = a->compressedrow.nrows; 352 ii = a->compressedrow.i; 353 ridx = a->compressedrow.rindex; 354 } else { 355 mbs = a->mbs; 356 ii = a->i; 357 z = zarray; 358 } 359 360 for (i=0; i<mbs; i++) { 361 n = ii[1] - ii[0]; ii++; 362 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 363 nonzerorow += (n>0); 364 for (j=0; j<n; j++) { 365 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 366 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 367 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 368 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 369 v += 9; 370 } 371 if (usecprow) z = zarray + 3*ridx[i]; 372 z[0] = sum1; z[1] = sum2; z[2] = sum3; 373 if (!usecprow) z += 3; 374 } 375 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 376 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 377 ierr = PetscLogFlops(18.0*a->nz - 3.0*nonzerorow);CHKERRQ(ierr); 378 PetscFunctionReturn(0); 379 } 380 381 #undef __FUNCT__ 382 #define __FUNCT__ "MatMult_SeqBAIJ_4" 383 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) 384 { 385 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 386 PetscScalar *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray; 387 const PetscScalar *x,*xb; 388 const MatScalar *v; 389 PetscErrorCode ierr; 390 PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 391 PetscTruth usecprow=a->compressedrow.use; 392 393 PetscFunctionBegin; 394 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 395 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 396 397 idx = a->j; 398 v = a->a; 399 if (usecprow){ 400 mbs = a->compressedrow.nrows; 401 ii = a->compressedrow.i; 402 ridx = a->compressedrow.rindex; 403 } else { 404 mbs = a->mbs; 405 ii = a->i; 406 z = zarray; 407 } 408 409 for (i=0; i<mbs; i++) { 410 n = ii[1] - ii[0]; ii++; 411 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; 412 nonzerorow += (n>0); 413 for (j=0; j<n; j++) { 414 xb = x + 4*(*idx++); 415 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 416 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 417 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 418 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 419 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 420 v += 16; 421 } 422 if (usecprow) z = zarray + 4*ridx[i]; 423 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 424 if (!usecprow) z += 4; 425 } 426 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 427 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 428 ierr = PetscLogFlops(32.0*a->nz - 4.0*nonzerorow);CHKERRQ(ierr); 429 PetscFunctionReturn(0); 430 } 431 432 #undef __FUNCT__ 433 #define __FUNCT__ "MatMult_SeqBAIJ_5" 434 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) 435 { 436 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 437 PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray; 438 const PetscScalar *xb,*x; 439 const MatScalar *v; 440 PetscErrorCode ierr; 441 const PetscInt *idx,*ii,*ridx=PETSC_NULL; 442 PetscInt mbs,i,j,n,nonzerorow=0; 443 PetscTruth usecprow=a->compressedrow.use; 444 445 PetscFunctionBegin; 446 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 447 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 448 449 idx = a->j; 450 v = a->a; 451 if (usecprow){ 452 mbs = a->compressedrow.nrows; 453 ii = a->compressedrow.i; 454 ridx = a->compressedrow.rindex; 455 } else { 456 mbs = a->mbs; 457 ii = a->i; 458 z = zarray; 459 } 460 461 for (i=0; i<mbs; i++) { 462 n = ii[1] - ii[0]; ii++; 463 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 464 nonzerorow += (n>0); 465 for (j=0; j<n; j++) { 466 xb = x + 5*(*idx++); 467 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 468 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 469 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 470 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 471 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 472 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 473 v += 25; 474 } 475 if (usecprow) z = zarray + 5*ridx[i]; 476 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 477 if (!usecprow) z += 5; 478 } 479 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 480 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 481 ierr = PetscLogFlops(50.0*a->nz - 5.0*nonzerorow);CHKERRQ(ierr); 482 PetscFunctionReturn(0); 483 } 484 485 486 #undef __FUNCT__ 487 #define __FUNCT__ "MatMult_SeqBAIJ_6" 488 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz) 489 { 490 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 491 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6; 492 const PetscScalar *x,*xb; 493 PetscScalar x1,x2,x3,x4,x5,x6,*zarray; 494 const MatScalar *v; 495 PetscErrorCode ierr; 496 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 497 PetscTruth usecprow=a->compressedrow.use; 498 499 PetscFunctionBegin; 500 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 501 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 502 503 idx = a->j; 504 v = a->a; 505 if (usecprow){ 506 mbs = a->compressedrow.nrows; 507 ii = a->compressedrow.i; 508 ridx = a->compressedrow.rindex; 509 } else { 510 mbs = a->mbs; 511 ii = a->i; 512 z = zarray; 513 } 514 515 for (i=0; i<mbs; i++) { 516 n = ii[1] - ii[0]; ii++; 517 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; 518 nonzerorow += (n>0); 519 for (j=0; j<n; j++) { 520 xb = x + 6*(*idx++); 521 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 522 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 523 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 524 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 525 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 526 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 527 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 528 v += 36; 529 } 530 if (usecprow) z = zarray + 6*ridx[i]; 531 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 532 if (!usecprow) z += 6; 533 } 534 535 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 536 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 537 ierr = PetscLogFlops(72.0*a->nz - 6.0*nonzerorow);CHKERRQ(ierr); 538 PetscFunctionReturn(0); 539 } 540 541 #undef __FUNCT__ 542 #define __FUNCT__ "MatMult_SeqBAIJ_7" 543 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) 544 { 545 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 546 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 547 const PetscScalar *x,*xb; 548 PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray; 549 const MatScalar *v; 550 PetscErrorCode ierr; 551 PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 552 PetscTruth usecprow=a->compressedrow.use; 553 554 PetscFunctionBegin; 555 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 556 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 557 558 idx = a->j; 559 v = a->a; 560 if (usecprow){ 561 mbs = a->compressedrow.nrows; 562 ii = a->compressedrow.i; 563 ridx = a->compressedrow.rindex; 564 } else { 565 mbs = a->mbs; 566 ii = a->i; 567 z = zarray; 568 } 569 570 for (i=0; i<mbs; i++) { 571 n = ii[1] - ii[0]; ii++; 572 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 573 nonzerorow += (n>0); 574 for (j=0; j<n; j++) { 575 xb = x + 7*(*idx++); 576 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 577 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 578 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 579 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 580 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 581 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 582 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 583 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 584 v += 49; 585 } 586 if (usecprow) z = zarray + 7*ridx[i]; 587 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 588 if (!usecprow) z += 7; 589 } 590 591 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 592 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 593 ierr = PetscLogFlops(98.0*a->nz - 7.0*nonzerorow);CHKERRQ(ierr); 594 PetscFunctionReturn(0); 595 } 596 597 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 598 /* Default MatMult for block size 15 */ 599 600 #undef __FUNCT__ 601 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver1" 602 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz) 603 { 604 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 605 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 606 const PetscScalar *x,*xb; 607 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray; 608 const MatScalar *v; 609 PetscErrorCode ierr; 610 const PetscInt *ii,*ij=a->j,*idx; 611 PetscInt mbs,i,j,k,n,*ridx=PETSC_NULL,nonzerorow=0; 612 PetscTruth usecprow=a->compressedrow.use; 613 614 PetscFunctionBegin; 615 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 616 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 617 618 v = a->a; 619 if (usecprow){ 620 mbs = a->compressedrow.nrows; 621 ii = a->compressedrow.i; 622 ridx = a->compressedrow.rindex; 623 } else { 624 mbs = a->mbs; 625 ii = a->i; 626 z = zarray; 627 } 628 629 for (i=0; i<mbs; i++) { 630 n = ii[i+1] - ii[i]; 631 idx = ij + ii[i]; 632 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 633 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 634 635 nonzerorow += (n>0); 636 for (j=0; j<n; j++) { 637 xb = x + 15*(idx[j]); 638 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 639 x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14]; 640 641 for(k=0;k<15;k++){ 642 sum1 += v[0]*xb[k]; 643 sum2 += v[1]*xb[k]; 644 sum3 += v[2]*xb[k]; 645 sum4 += v[3]*xb[k]; 646 sum5 += v[4]*xb[k]; 647 sum6 += v[5]*xb[k]; 648 sum7 += v[6]*xb[k]; 649 sum8 += v[7]*xb[k]; 650 sum9 += v[8]*xb[k]; 651 sum10 += v[9]*xb[k]; 652 sum11 += v[10]*xb[k]; 653 sum12 += v[11]*xb[k]; 654 sum13 += v[12]*xb[k]; 655 sum14 += v[13]*xb[k]; 656 sum15 += v[14]*xb[k]; 657 v += 15; 658 } 659 } 660 if (usecprow) z = zarray + 15*ridx[i]; 661 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 662 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 663 664 if (!usecprow) z += 15; 665 } 666 667 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 668 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 669 ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr); 670 PetscFunctionReturn(0); 671 } 672 673 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 674 #undef __FUNCT__ 675 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver2" 676 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz) 677 { 678 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 679 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 680 const PetscScalar *x,*xb; 681 PetscScalar x1,x2,x3,x4,*zarray; 682 const MatScalar *v; 683 PetscErrorCode ierr; 684 const PetscInt *ii,*ij=a->j,*idx; 685 PetscInt mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0; 686 PetscTruth usecprow=a->compressedrow.use; 687 688 PetscFunctionBegin; 689 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 690 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 691 692 v = a->a; 693 if (usecprow){ 694 mbs = a->compressedrow.nrows; 695 ii = a->compressedrow.i; 696 ridx = a->compressedrow.rindex; 697 } else { 698 mbs = a->mbs; 699 ii = a->i; 700 z = zarray; 701 } 702 703 for (i=0; i<mbs; i++) { 704 n = ii[i+1] - ii[i]; 705 idx = ij + ii[i]; 706 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 707 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 708 709 nonzerorow += (n>0); 710 for (j=0; j<n; j++) { 711 xb = x + 15*(idx[j]); 712 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 713 714 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 715 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 716 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 717 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 718 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 719 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 720 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 721 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 722 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 723 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 724 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 725 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 726 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 727 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 728 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 729 730 v += 60; 731 732 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 733 734 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 735 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 736 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 737 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 738 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 739 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 740 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 741 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 742 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 743 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 744 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 745 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 746 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 747 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 748 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 749 v += 60; 750 751 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 752 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 753 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 754 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 755 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 756 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 757 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 758 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 759 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 760 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 761 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 762 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 763 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 764 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 765 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 766 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 767 v += 60; 768 769 x1 = xb[12]; x2 = xb[13]; x3 = xb[14]; 770 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3; 771 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3; 772 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3; 773 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3; 774 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3; 775 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3; 776 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3; 777 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3; 778 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3; 779 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3; 780 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3; 781 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3; 782 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3; 783 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3; 784 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3; 785 v += 45; 786 } 787 if (usecprow) z = zarray + 15*ridx[i]; 788 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 789 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 790 791 if (!usecprow) z += 15; 792 } 793 794 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 795 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 796 ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr); 797 PetscFunctionReturn(0); 798 } 799 800 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 801 #undef __FUNCT__ 802 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver3" 803 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz) 804 { 805 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 806 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 807 const PetscScalar *x,*xb; 808 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray; 809 const MatScalar *v; 810 PetscErrorCode ierr; 811 const PetscInt *ii,*ij=a->j,*idx; 812 PetscInt mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0; 813 PetscTruth usecprow=a->compressedrow.use; 814 815 PetscFunctionBegin; 816 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 817 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 818 819 v = a->a; 820 if (usecprow){ 821 mbs = a->compressedrow.nrows; 822 ii = a->compressedrow.i; 823 ridx = a->compressedrow.rindex; 824 } else { 825 mbs = a->mbs; 826 ii = a->i; 827 z = zarray; 828 } 829 830 for (i=0; i<mbs; i++) { 831 n = ii[i+1] - ii[i]; 832 idx = ij + ii[i]; 833 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 834 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 835 836 nonzerorow += (n>0); 837 for (j=0; j<n; j++) { 838 xb = x + 15*(idx[j]); 839 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 840 x8 = xb[7]; 841 842 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8; 843 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8; 844 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8; 845 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8; 846 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8; 847 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8; 848 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8; 849 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8; 850 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8; 851 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8; 852 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8; 853 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8; 854 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8; 855 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8; 856 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8; 857 v += 120; 858 859 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14]; 860 861 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7; 862 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7; 863 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7; 864 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7; 865 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7; 866 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7; 867 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7; 868 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7; 869 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7; 870 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7; 871 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7; 872 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7; 873 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7; 874 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7; 875 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7; 876 v += 105; 877 } 878 if (usecprow) z = zarray + 15*ridx[i]; 879 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 880 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 881 882 if (!usecprow) z += 15; 883 } 884 885 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 886 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 887 ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr); 888 PetscFunctionReturn(0); 889 } 890 891 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ 892 893 #undef __FUNCT__ 894 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver4" 895 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz) 896 { 897 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 898 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 899 const PetscScalar *x,*xb; 900 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray; 901 const MatScalar *v; 902 PetscErrorCode ierr; 903 const PetscInt *ii,*ij=a->j,*idx; 904 PetscInt mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0; 905 PetscTruth usecprow=a->compressedrow.use; 906 907 PetscFunctionBegin; 908 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 909 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 910 911 v = a->a; 912 if (usecprow){ 913 mbs = a->compressedrow.nrows; 914 ii = a->compressedrow.i; 915 ridx = a->compressedrow.rindex; 916 } else { 917 mbs = a->mbs; 918 ii = a->i; 919 z = zarray; 920 } 921 922 for (i=0; i<mbs; i++) { 923 n = ii[i+1] - ii[i]; 924 idx = ij + ii[i]; 925 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 926 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 927 928 nonzerorow += (n>0); 929 for (j=0; j<n; j++) { 930 xb = x + 15*(idx[j]); 931 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 932 x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14]; 933 934 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 935 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 936 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 937 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 938 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 939 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 940 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 941 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 942 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 943 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 944 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 945 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 946 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 947 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 948 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 949 v += 225; 950 } 951 if (usecprow) z = zarray + 15*ridx[i]; 952 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 953 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 954 955 if (!usecprow) z += 15; 956 } 957 958 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 959 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 960 ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr); 961 PetscFunctionReturn(0); 962 } 963 964 965 /* 966 This will not work with MatScalar == float because it calls the BLAS 967 */ 968 #undef __FUNCT__ 969 #define __FUNCT__ "MatMult_SeqBAIJ_N" 970 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz) 971 { 972 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 973 PetscScalar *x,*z = 0,*xb,*work,*workt,*zarray; 974 MatScalar *v; 975 PetscErrorCode ierr; 976 PetscInt mbs=a->mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2; 977 PetscInt ncols,k,*ridx=PETSC_NULL,nonzerorow=0; 978 PetscTruth usecprow=a->compressedrow.use; 979 980 PetscFunctionBegin; 981 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 982 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 983 984 idx = a->j; 985 v = a->a; 986 if (usecprow){ 987 mbs = a->compressedrow.nrows; 988 ii = a->compressedrow.i; 989 ridx = a->compressedrow.rindex; 990 } else { 991 mbs = a->mbs; 992 ii = a->i; 993 z = zarray; 994 } 995 996 if (!a->mult_work) { 997 k = PetscMax(A->rmap->n,A->cmap->n); 998 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 999 } 1000 work = a->mult_work; 1001 for (i=0; i<mbs; i++) { 1002 n = ii[1] - ii[0]; ii++; 1003 ncols = n*bs; 1004 workt = work; 1005 nonzerorow += (n>0); 1006 for (j=0; j<n; j++) { 1007 xb = x + bs*(*idx++); 1008 for (k=0; k<bs; k++) workt[k] = xb[k]; 1009 workt += bs; 1010 } 1011 if (usecprow) z = zarray + bs*ridx[i]; 1012 Kernel_w_gets_Ar_times_v(bs,ncols,work,v,z); 1013 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */ 1014 v += n*bs2; 1015 if (!usecprow) z += bs; 1016 } 1017 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1018 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1019 ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*nonzerorow);CHKERRQ(ierr); 1020 PetscFunctionReturn(0); 1021 } 1022 1023 extern PetscErrorCode VecCopy_Seq(Vec,Vec); 1024 #undef __FUNCT__ 1025 #define __FUNCT__ "MatMultAdd_SeqBAIJ_1" 1026 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 1027 { 1028 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1029 const PetscScalar *x; 1030 PetscScalar *y,*z,sum; 1031 const MatScalar *v; 1032 PetscErrorCode ierr; 1033 PetscInt mbs=a->mbs,i,n,*ridx=PETSC_NULL,nonzerorow=0; 1034 const PetscInt *idx,*ii; 1035 PetscTruth usecprow=a->compressedrow.use; 1036 1037 PetscFunctionBegin; 1038 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 1039 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 1040 if (zz != yy) { 1041 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1042 } else { 1043 z = y; 1044 } 1045 1046 idx = a->j; 1047 v = a->a; 1048 if (usecprow){ 1049 if (zz != yy){ 1050 ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1051 } 1052 mbs = a->compressedrow.nrows; 1053 ii = a->compressedrow.i; 1054 ridx = a->compressedrow.rindex; 1055 } else { 1056 ii = a->i; 1057 } 1058 1059 for (i=0; i<mbs; i++) { 1060 n = ii[1] - ii[0]; 1061 ii++; 1062 if (!usecprow){ 1063 nonzerorow += (n>0); 1064 sum = y[i]; 1065 } else { 1066 sum = y[ridx[i]]; 1067 } 1068 PetscSparseDensePlusDot(sum,x,v,idx,n); 1069 v += n; 1070 idx += n; 1071 if (usecprow){ 1072 z[ridx[i]] = sum; 1073 } else { 1074 z[i] = sum; 1075 } 1076 } 1077 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 1078 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 1079 if (zz != yy) { 1080 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1081 } 1082 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 1083 PetscFunctionReturn(0); 1084 } 1085 1086 #undef __FUNCT__ 1087 #define __FUNCT__ "MatMultAdd_SeqBAIJ_2" 1088 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 1089 { 1090 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1091 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2; 1092 PetscScalar x1,x2,*yarray,*zarray; 1093 MatScalar *v; 1094 PetscErrorCode ierr; 1095 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1096 PetscTruth usecprow=a->compressedrow.use; 1097 1098 PetscFunctionBegin; 1099 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1100 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1101 if (zz != yy) { 1102 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1103 } else { 1104 zarray = yarray; 1105 } 1106 1107 idx = a->j; 1108 v = a->a; 1109 if (usecprow){ 1110 if (zz != yy){ 1111 ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1112 } 1113 mbs = a->compressedrow.nrows; 1114 ii = a->compressedrow.i; 1115 ridx = a->compressedrow.rindex; 1116 if (zz != yy){ 1117 ierr = PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1118 } 1119 } else { 1120 ii = a->i; 1121 y = yarray; 1122 z = zarray; 1123 } 1124 1125 for (i=0; i<mbs; i++) { 1126 n = ii[1] - ii[0]; ii++; 1127 if (usecprow){ 1128 z = zarray + 2*ridx[i]; 1129 y = yarray + 2*ridx[i]; 1130 } 1131 sum1 = y[0]; sum2 = y[1]; 1132 for (j=0; j<n; j++) { 1133 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 1134 sum1 += v[0]*x1 + v[2]*x2; 1135 sum2 += v[1]*x1 + v[3]*x2; 1136 v += 4; 1137 } 1138 z[0] = sum1; z[1] = sum2; 1139 if (!usecprow){ 1140 z += 2; y += 2; 1141 } 1142 } 1143 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1144 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1145 if (zz != yy) { 1146 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1147 } 1148 ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr); 1149 PetscFunctionReturn(0); 1150 } 1151 1152 #undef __FUNCT__ 1153 #define __FUNCT__ "MatMultAdd_SeqBAIJ_3" 1154 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 1155 { 1156 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1157 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray; 1158 MatScalar *v; 1159 PetscErrorCode ierr; 1160 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1161 PetscTruth usecprow=a->compressedrow.use; 1162 1163 PetscFunctionBegin; 1164 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1165 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1166 if (zz != yy) { 1167 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1168 } else { 1169 zarray = yarray; 1170 } 1171 1172 idx = a->j; 1173 v = a->a; 1174 if (usecprow){ 1175 if (zz != yy){ 1176 ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1177 } 1178 mbs = a->compressedrow.nrows; 1179 ii = a->compressedrow.i; 1180 ridx = a->compressedrow.rindex; 1181 } else { 1182 ii = a->i; 1183 y = yarray; 1184 z = zarray; 1185 } 1186 1187 for (i=0; i<mbs; i++) { 1188 n = ii[1] - ii[0]; ii++; 1189 if (usecprow){ 1190 z = zarray + 3*ridx[i]; 1191 y = yarray + 3*ridx[i]; 1192 } 1193 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; 1194 for (j=0; j<n; j++) { 1195 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1196 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 1197 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 1198 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 1199 v += 9; 1200 } 1201 z[0] = sum1; z[1] = sum2; z[2] = sum3; 1202 if (!usecprow){ 1203 z += 3; y += 3; 1204 } 1205 } 1206 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1207 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1208 if (zz != yy) { 1209 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1210 } 1211 ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr); 1212 PetscFunctionReturn(0); 1213 } 1214 1215 #undef __FUNCT__ 1216 #define __FUNCT__ "MatMultAdd_SeqBAIJ_4" 1217 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 1218 { 1219 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1220 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray; 1221 MatScalar *v; 1222 PetscErrorCode ierr; 1223 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1224 PetscTruth usecprow=a->compressedrow.use; 1225 1226 PetscFunctionBegin; 1227 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1228 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1229 if (zz != yy) { 1230 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1231 } else { 1232 zarray = yarray; 1233 } 1234 1235 idx = a->j; 1236 v = a->a; 1237 if (usecprow){ 1238 if (zz != yy){ 1239 ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1240 } 1241 mbs = a->compressedrow.nrows; 1242 ii = a->compressedrow.i; 1243 ridx = a->compressedrow.rindex; 1244 } else { 1245 ii = a->i; 1246 y = yarray; 1247 z = zarray; 1248 } 1249 1250 for (i=0; i<mbs; i++) { 1251 n = ii[1] - ii[0]; ii++; 1252 if (usecprow){ 1253 z = zarray + 4*ridx[i]; 1254 y = yarray + 4*ridx[i]; 1255 } 1256 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; 1257 for (j=0; j<n; j++) { 1258 xb = x + 4*(*idx++); 1259 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1260 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1261 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1262 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1263 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1264 v += 16; 1265 } 1266 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 1267 if (!usecprow){ 1268 z += 4; y += 4; 1269 } 1270 } 1271 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1272 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1273 if (zz != yy) { 1274 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1275 } 1276 ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr); 1277 PetscFunctionReturn(0); 1278 } 1279 1280 #undef __FUNCT__ 1281 #define __FUNCT__ "MatMultAdd_SeqBAIJ_5" 1282 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 1283 { 1284 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1285 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5; 1286 PetscScalar *yarray,*zarray; 1287 MatScalar *v; 1288 PetscErrorCode ierr; 1289 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1290 PetscTruth usecprow=a->compressedrow.use; 1291 1292 PetscFunctionBegin; 1293 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1294 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1295 if (zz != yy) { 1296 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1297 } else { 1298 zarray = yarray; 1299 } 1300 1301 idx = a->j; 1302 v = a->a; 1303 if (usecprow){ 1304 if (zz != yy){ 1305 ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1306 } 1307 mbs = a->compressedrow.nrows; 1308 ii = a->compressedrow.i; 1309 ridx = a->compressedrow.rindex; 1310 } else { 1311 ii = a->i; 1312 y = yarray; 1313 z = zarray; 1314 } 1315 1316 for (i=0; i<mbs; i++) { 1317 n = ii[1] - ii[0]; ii++; 1318 if (usecprow){ 1319 z = zarray + 5*ridx[i]; 1320 y = yarray + 5*ridx[i]; 1321 } 1322 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; 1323 for (j=0; j<n; j++) { 1324 xb = x + 5*(*idx++); 1325 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 1326 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1327 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1328 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1329 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1330 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1331 v += 25; 1332 } 1333 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 1334 if (!usecprow){ 1335 z += 5; y += 5; 1336 } 1337 } 1338 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1339 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1340 if (zz != yy) { 1341 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1342 } 1343 ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr); 1344 PetscFunctionReturn(0); 1345 } 1346 #undef __FUNCT__ 1347 #define __FUNCT__ "MatMultAdd_SeqBAIJ_6" 1348 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 1349 { 1350 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1351 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6; 1352 PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray; 1353 MatScalar *v; 1354 PetscErrorCode ierr; 1355 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1356 PetscTruth usecprow=a->compressedrow.use; 1357 1358 PetscFunctionBegin; 1359 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1360 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1361 if (zz != yy) { 1362 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1363 } else { 1364 zarray = yarray; 1365 } 1366 1367 idx = a->j; 1368 v = a->a; 1369 if (usecprow){ 1370 if (zz != yy){ 1371 ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1372 } 1373 mbs = a->compressedrow.nrows; 1374 ii = a->compressedrow.i; 1375 ridx = a->compressedrow.rindex; 1376 } else { 1377 ii = a->i; 1378 y = yarray; 1379 z = zarray; 1380 } 1381 1382 for (i=0; i<mbs; i++) { 1383 n = ii[1] - ii[0]; ii++; 1384 if (usecprow){ 1385 z = zarray + 6*ridx[i]; 1386 y = yarray + 6*ridx[i]; 1387 } 1388 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; 1389 for (j=0; j<n; j++) { 1390 xb = x + 6*(*idx++); 1391 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 1392 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1393 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1394 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1395 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1396 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1397 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1398 v += 36; 1399 } 1400 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 1401 if (!usecprow){ 1402 z += 6; y += 6; 1403 } 1404 } 1405 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1406 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1407 if (zz != yy) { 1408 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1409 } 1410 ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr); 1411 PetscFunctionReturn(0); 1412 } 1413 1414 #undef __FUNCT__ 1415 #define __FUNCT__ "MatMultAdd_SeqBAIJ_7" 1416 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1417 { 1418 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1419 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 1420 PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray; 1421 MatScalar *v; 1422 PetscErrorCode ierr; 1423 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1424 PetscTruth usecprow=a->compressedrow.use; 1425 1426 PetscFunctionBegin; 1427 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1428 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1429 if (zz != yy) { 1430 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1431 } else { 1432 zarray = yarray; 1433 } 1434 1435 idx = a->j; 1436 v = a->a; 1437 if (usecprow){ 1438 if (zz != yy){ 1439 ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1440 } 1441 mbs = a->compressedrow.nrows; 1442 ii = a->compressedrow.i; 1443 ridx = a->compressedrow.rindex; 1444 } else { 1445 ii = a->i; 1446 y = yarray; 1447 z = zarray; 1448 } 1449 1450 for (i=0; i<mbs; i++) { 1451 n = ii[1] - ii[0]; ii++; 1452 if (usecprow){ 1453 z = zarray + 7*ridx[i]; 1454 y = yarray + 7*ridx[i]; 1455 } 1456 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 1457 for (j=0; j<n; j++) { 1458 xb = x + 7*(*idx++); 1459 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1460 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1461 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1462 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1463 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1464 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1465 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1466 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1467 v += 49; 1468 } 1469 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1470 if (!usecprow){ 1471 z += 7; y += 7; 1472 } 1473 } 1474 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1475 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1476 if (zz != yy) { 1477 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1478 } 1479 ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr); 1480 PetscFunctionReturn(0); 1481 } 1482 1483 #undef __FUNCT__ 1484 #define __FUNCT__ "MatMultAdd_SeqBAIJ_N" 1485 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1486 { 1487 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1488 PetscScalar *x,*z = 0,*xb,*work,*workt,*zarray; 1489 MatScalar *v; 1490 PetscErrorCode ierr; 1491 PetscInt mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2; 1492 PetscInt ncols,k,*ridx=PETSC_NULL; 1493 PetscTruth usecprow=a->compressedrow.use; 1494 1495 PetscFunctionBegin; 1496 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 1497 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1498 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1499 1500 idx = a->j; 1501 v = a->a; 1502 if (usecprow){ 1503 mbs = a->compressedrow.nrows; 1504 ii = a->compressedrow.i; 1505 ridx = a->compressedrow.rindex; 1506 } else { 1507 mbs = a->mbs; 1508 ii = a->i; 1509 z = zarray; 1510 } 1511 1512 if (!a->mult_work) { 1513 k = PetscMax(A->rmap->n,A->cmap->n); 1514 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1515 } 1516 work = a->mult_work; 1517 for (i=0; i<mbs; i++) { 1518 n = ii[1] - ii[0]; ii++; 1519 ncols = n*bs; 1520 workt = work; 1521 for (j=0; j<n; j++) { 1522 xb = x + bs*(*idx++); 1523 for (k=0; k<bs; k++) workt[k] = xb[k]; 1524 workt += bs; 1525 } 1526 if (usecprow) z = zarray + bs*ridx[i]; 1527 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1528 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */ 1529 v += n*bs2; 1530 if (!usecprow){ 1531 z += bs; 1532 } 1533 } 1534 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1535 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1536 ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr); 1537 PetscFunctionReturn(0); 1538 } 1539 1540 #undef __FUNCT__ 1541 #define __FUNCT__ "MatMultHermitianTranspose_SeqBAIJ" 1542 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1543 { 1544 PetscScalar zero = 0.0; 1545 PetscErrorCode ierr; 1546 1547 PetscFunctionBegin; 1548 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1549 ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1550 PetscFunctionReturn(0); 1551 } 1552 1553 #undef __FUNCT__ 1554 #define __FUNCT__ "MatMultTranspose_SeqBAIJ" 1555 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1556 { 1557 PetscScalar zero = 0.0; 1558 PetscErrorCode ierr; 1559 1560 PetscFunctionBegin; 1561 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1562 ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1563 PetscFunctionReturn(0); 1564 } 1565 1566 #undef __FUNCT__ 1567 #define __FUNCT__ "MatMultHermitianTransposeAdd_SeqBAIJ" 1568 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1569 1570 { 1571 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1572 PetscScalar *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5; 1573 MatScalar *v; 1574 PetscErrorCode ierr; 1575 PetscInt mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL; 1576 Mat_CompressedRow cprow = a->compressedrow; 1577 PetscTruth usecprow=cprow.use; 1578 1579 PetscFunctionBegin; 1580 if (yy != zz) { ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); } 1581 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1582 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1583 1584 idx = a->j; 1585 v = a->a; 1586 if (usecprow){ 1587 mbs = cprow.nrows; 1588 ii = cprow.i; 1589 ridx = cprow.rindex; 1590 } else { 1591 mbs=a->mbs; 1592 ii = a->i; 1593 xb = x; 1594 } 1595 1596 switch (bs) { 1597 case 1: 1598 for (i=0; i<mbs; i++) { 1599 if (usecprow) xb = x + ridx[i]; 1600 x1 = xb[0]; 1601 ib = idx + ii[0]; 1602 n = ii[1] - ii[0]; ii++; 1603 for (j=0; j<n; j++) { 1604 rval = ib[j]; 1605 z[rval] += PetscConj(*v) * x1; 1606 v++; 1607 } 1608 if (!usecprow) xb++; 1609 } 1610 break; 1611 case 2: 1612 for (i=0; i<mbs; i++) { 1613 if (usecprow) xb = x + 2*ridx[i]; 1614 x1 = xb[0]; x2 = xb[1]; 1615 ib = idx + ii[0]; 1616 n = ii[1] - ii[0]; ii++; 1617 for (j=0; j<n; j++) { 1618 rval = ib[j]*2; 1619 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2; 1620 z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2; 1621 v += 4; 1622 } 1623 if (!usecprow) xb += 2; 1624 } 1625 break; 1626 case 3: 1627 for (i=0; i<mbs; i++) { 1628 if (usecprow) xb = x + 3*ridx[i]; 1629 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1630 ib = idx + ii[0]; 1631 n = ii[1] - ii[0]; ii++; 1632 for (j=0; j<n; j++) { 1633 rval = ib[j]*3; 1634 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3; 1635 z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3; 1636 z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3; 1637 v += 9; 1638 } 1639 if (!usecprow) xb += 3; 1640 } 1641 break; 1642 case 4: 1643 for (i=0; i<mbs; i++) { 1644 if (usecprow) xb = x + 4*ridx[i]; 1645 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1646 ib = idx + ii[0]; 1647 n = ii[1] - ii[0]; ii++; 1648 for (j=0; j<n; j++) { 1649 rval = ib[j]*4; 1650 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4; 1651 z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4; 1652 z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4; 1653 z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4; 1654 v += 16; 1655 } 1656 if (!usecprow) xb += 4; 1657 } 1658 break; 1659 case 5: 1660 for (i=0; i<mbs; i++) { 1661 if (usecprow) xb = x + 5*ridx[i]; 1662 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1663 x4 = xb[3]; x5 = xb[4]; 1664 ib = idx + ii[0]; 1665 n = ii[1] - ii[0]; ii++; 1666 for (j=0; j<n; j++) { 1667 rval = ib[j]*5; 1668 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5; 1669 z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5; 1670 z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5; 1671 z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5; 1672 z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5; 1673 v += 25; 1674 } 1675 if (!usecprow) xb += 5; 1676 } 1677 break; 1678 default: { /* block sizes larger than 5 by 5 are handled by BLAS */ 1679 PetscInt ncols,k; 1680 PetscScalar *work,*workt,*xtmp; 1681 1682 SETERRQ(PETSC_ERR_SUP,"block size larger than 5 is not supported yet"); 1683 if (!a->mult_work) { 1684 k = PetscMax(A->rmap->n,A->cmap->n); 1685 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1686 } 1687 work = a->mult_work; 1688 xtmp = x; 1689 for (i=0; i<mbs; i++) { 1690 n = ii[1] - ii[0]; ii++; 1691 ncols = n*bs; 1692 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1693 if (usecprow) { 1694 xtmp = x + bs*ridx[i]; 1695 } 1696 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 1697 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 1698 v += n*bs2; 1699 if (!usecprow) xtmp += bs; 1700 workt = work; 1701 for (j=0; j<n; j++) { 1702 zb = z + bs*(*idx++); 1703 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1704 workt += bs; 1705 } 1706 } 1707 } 1708 } 1709 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1710 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1711 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 1712 PetscFunctionReturn(0); 1713 } 1714 1715 #undef __FUNCT__ 1716 #define __FUNCT__ "MatMultTransposeAdd_SeqBAIJ" 1717 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1718 { 1719 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1720 PetscScalar *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5; 1721 MatScalar *v; 1722 PetscErrorCode ierr; 1723 PetscInt mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL; 1724 Mat_CompressedRow cprow = a->compressedrow; 1725 PetscTruth usecprow=cprow.use; 1726 1727 PetscFunctionBegin; 1728 if (yy != zz) { ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); } 1729 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1730 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1731 1732 idx = a->j; 1733 v = a->a; 1734 if (usecprow){ 1735 mbs = cprow.nrows; 1736 ii = cprow.i; 1737 ridx = cprow.rindex; 1738 } else { 1739 mbs=a->mbs; 1740 ii = a->i; 1741 xb = x; 1742 } 1743 1744 switch (bs) { 1745 case 1: 1746 for (i=0; i<mbs; i++) { 1747 if (usecprow) xb = x + ridx[i]; 1748 x1 = xb[0]; 1749 ib = idx + ii[0]; 1750 n = ii[1] - ii[0]; ii++; 1751 for (j=0; j<n; j++) { 1752 rval = ib[j]; 1753 z[rval] += *v * x1; 1754 v++; 1755 } 1756 if (!usecprow) xb++; 1757 } 1758 break; 1759 case 2: 1760 for (i=0; i<mbs; i++) { 1761 if (usecprow) xb = x + 2*ridx[i]; 1762 x1 = xb[0]; x2 = xb[1]; 1763 ib = idx + ii[0]; 1764 n = ii[1] - ii[0]; ii++; 1765 for (j=0; j<n; j++) { 1766 rval = ib[j]*2; 1767 z[rval++] += v[0]*x1 + v[1]*x2; 1768 z[rval++] += v[2]*x1 + v[3]*x2; 1769 v += 4; 1770 } 1771 if (!usecprow) xb += 2; 1772 } 1773 break; 1774 case 3: 1775 for (i=0; i<mbs; i++) { 1776 if (usecprow) xb = x + 3*ridx[i]; 1777 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1778 ib = idx + ii[0]; 1779 n = ii[1] - ii[0]; ii++; 1780 for (j=0; j<n; j++) { 1781 rval = ib[j]*3; 1782 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3; 1783 z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3; 1784 z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3; 1785 v += 9; 1786 } 1787 if (!usecprow) xb += 3; 1788 } 1789 break; 1790 case 4: 1791 for (i=0; i<mbs; i++) { 1792 if (usecprow) xb = x + 4*ridx[i]; 1793 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1794 ib = idx + ii[0]; 1795 n = ii[1] - ii[0]; ii++; 1796 for (j=0; j<n; j++) { 1797 rval = ib[j]*4; 1798 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 1799 z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 1800 z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 1801 z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 1802 v += 16; 1803 } 1804 if (!usecprow) xb += 4; 1805 } 1806 break; 1807 case 5: 1808 for (i=0; i<mbs; i++) { 1809 if (usecprow) xb = x + 5*ridx[i]; 1810 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1811 x4 = xb[3]; x5 = xb[4]; 1812 ib = idx + ii[0]; 1813 n = ii[1] - ii[0]; ii++; 1814 for (j=0; j<n; j++) { 1815 rval = ib[j]*5; 1816 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1817 z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1818 z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 1819 z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 1820 z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 1821 v += 25; 1822 } 1823 if (!usecprow) xb += 5; 1824 } 1825 break; 1826 default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 1827 PetscInt ncols,k; 1828 PetscScalar *work,*workt,*xtmp; 1829 1830 if (!a->mult_work) { 1831 k = PetscMax(A->rmap->n,A->cmap->n); 1832 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1833 } 1834 work = a->mult_work; 1835 xtmp = x; 1836 for (i=0; i<mbs; i++) { 1837 n = ii[1] - ii[0]; ii++; 1838 ncols = n*bs; 1839 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1840 if (usecprow) { 1841 xtmp = x + bs*ridx[i]; 1842 } 1843 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 1844 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 1845 v += n*bs2; 1846 if (!usecprow) xtmp += bs; 1847 workt = work; 1848 for (j=0; j<n; j++) { 1849 zb = z + bs*(*idx++); 1850 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1851 workt += bs; 1852 } 1853 } 1854 } 1855 } 1856 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1857 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1858 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 1859 PetscFunctionReturn(0); 1860 } 1861 1862 #undef __FUNCT__ 1863 #define __FUNCT__ "MatScale_SeqBAIJ" 1864 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha) 1865 { 1866 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data; 1867 PetscInt totalnz = a->bs2*a->nz; 1868 PetscScalar oalpha = alpha; 1869 PetscErrorCode ierr; 1870 PetscBLASInt one = 1,tnz = PetscBLASIntCast(totalnz); 1871 1872 PetscFunctionBegin; 1873 BLASscal_(&tnz,&oalpha,a->a,&one); 1874 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 1875 PetscFunctionReturn(0); 1876 } 1877 1878 #undef __FUNCT__ 1879 #define __FUNCT__ "MatNorm_SeqBAIJ" 1880 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm) 1881 { 1882 PetscErrorCode ierr; 1883 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1884 MatScalar *v = a->a; 1885 PetscReal sum = 0.0; 1886 PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1; 1887 1888 PetscFunctionBegin; 1889 if (type == NORM_FROBENIUS) { 1890 for (i=0; i< bs2*nz; i++) { 1891 #if defined(PETSC_USE_COMPLEX) 1892 sum += PetscRealPart(PetscConj(*v)*(*v)); v++; 1893 #else 1894 sum += (*v)*(*v); v++; 1895 #endif 1896 } 1897 *norm = sqrt(sum); 1898 } else if (type == NORM_1) { /* maximum column sum */ 1899 PetscReal *tmp; 1900 PetscInt *bcol = a->j; 1901 ierr = PetscMalloc((A->cmap->n+1)*sizeof(PetscReal),&tmp);CHKERRQ(ierr); 1902 ierr = PetscMemzero(tmp,A->cmap->n*sizeof(PetscReal));CHKERRQ(ierr); 1903 for (i=0; i<nz; i++){ 1904 for (j=0; j<bs; j++){ 1905 k1 = bs*(*bcol) + j; /* column index */ 1906 for (k=0; k<bs; k++){ 1907 tmp[k1] += PetscAbsScalar(*v); v++; 1908 } 1909 } 1910 bcol++; 1911 } 1912 *norm = 0.0; 1913 for (j=0; j<A->cmap->n; j++) { 1914 if (tmp[j] > *norm) *norm = tmp[j]; 1915 } 1916 ierr = PetscFree(tmp);CHKERRQ(ierr); 1917 } else if (type == NORM_INFINITY) { /* maximum row sum */ 1918 *norm = 0.0; 1919 for (k=0; k<bs; k++) { 1920 for (j=0; j<a->mbs; j++) { 1921 v = a->a + bs2*a->i[j] + k; 1922 sum = 0.0; 1923 for (i=0; i<a->i[j+1]-a->i[j]; i++) { 1924 for (k1=0; k1<bs; k1++){ 1925 sum += PetscAbsScalar(*v); 1926 v += bs; 1927 } 1928 } 1929 if (sum > *norm) *norm = sum; 1930 } 1931 } 1932 } else { 1933 SETERRQ(PETSC_ERR_SUP,"No support for this norm yet"); 1934 } 1935 PetscFunctionReturn(0); 1936 } 1937 1938 1939 #undef __FUNCT__ 1940 #define __FUNCT__ "MatEqual_SeqBAIJ" 1941 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscTruth* flg) 1942 { 1943 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data,*b = (Mat_SeqBAIJ *)B->data; 1944 PetscErrorCode ierr; 1945 1946 PetscFunctionBegin; 1947 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1948 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1949 *flg = PETSC_FALSE; 1950 PetscFunctionReturn(0); 1951 } 1952 1953 /* if the a->i are the same */ 1954 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1955 if (!*flg) { 1956 PetscFunctionReturn(0); 1957 } 1958 1959 /* if a->j are the same */ 1960 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1961 if (!*flg) { 1962 PetscFunctionReturn(0); 1963 } 1964 /* if a->a are the same */ 1965 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1966 PetscFunctionReturn(0); 1967 1968 } 1969 1970 #undef __FUNCT__ 1971 #define __FUNCT__ "MatGetDiagonal_SeqBAIJ" 1972 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v) 1973 { 1974 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1975 PetscErrorCode ierr; 1976 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 1977 PetscScalar *x,zero = 0.0; 1978 MatScalar *aa,*aa_j; 1979 1980 PetscFunctionBegin; 1981 if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1982 bs = A->rmap->bs; 1983 aa = a->a; 1984 ai = a->i; 1985 aj = a->j; 1986 ambs = a->mbs; 1987 bs2 = a->bs2; 1988 1989 ierr = VecSet(v,zero);CHKERRQ(ierr); 1990 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1991 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1992 if (n != A->rmap->N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1993 for (i=0; i<ambs; i++) { 1994 for (j=ai[i]; j<ai[i+1]; j++) { 1995 if (aj[j] == i) { 1996 row = i*bs; 1997 aa_j = aa+j*bs2; 1998 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1999 break; 2000 } 2001 } 2002 } 2003 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 2004 PetscFunctionReturn(0); 2005 } 2006 2007 #undef __FUNCT__ 2008 #define __FUNCT__ "MatDiagonalScale_SeqBAIJ" 2009 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr) 2010 { 2011 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2012 PetscScalar *l,*r,x,*li,*ri; 2013 MatScalar *aa,*v; 2014 PetscErrorCode ierr; 2015 PetscInt i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2; 2016 2017 PetscFunctionBegin; 2018 ai = a->i; 2019 aj = a->j; 2020 aa = a->a; 2021 m = A->rmap->n; 2022 n = A->cmap->n; 2023 bs = A->rmap->bs; 2024 mbs = a->mbs; 2025 bs2 = a->bs2; 2026 if (ll) { 2027 ierr = VecGetArray(ll,&l);CHKERRQ(ierr); 2028 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 2029 if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 2030 for (i=0; i<mbs; i++) { /* for each block row */ 2031 M = ai[i+1] - ai[i]; 2032 li = l + i*bs; 2033 v = aa + bs2*ai[i]; 2034 for (j=0; j<M; j++) { /* for each block */ 2035 for (k=0; k<bs2; k++) { 2036 (*v++) *= li[k%bs]; 2037 } 2038 } 2039 } 2040 ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr); 2041 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2042 } 2043 2044 if (rr) { 2045 ierr = VecGetArray(rr,&r);CHKERRQ(ierr); 2046 ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr); 2047 if (rn != n) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 2048 for (i=0; i<mbs; i++) { /* for each block row */ 2049 M = ai[i+1] - ai[i]; 2050 v = aa + bs2*ai[i]; 2051 for (j=0; j<M; j++) { /* for each block */ 2052 ri = r + bs*aj[ai[i]+j]; 2053 for (k=0; k<bs; k++) { 2054 x = ri[k]; 2055 for (tmp=0; tmp<bs; tmp++) (*v++) *= x; 2056 } 2057 } 2058 } 2059 ierr = VecRestoreArray(rr,&r);CHKERRQ(ierr); 2060 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2061 } 2062 PetscFunctionReturn(0); 2063 } 2064 2065 2066 #undef __FUNCT__ 2067 #define __FUNCT__ "MatGetInfo_SeqBAIJ" 2068 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info) 2069 { 2070 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2071 2072 PetscFunctionBegin; 2073 info->block_size = a->bs2; 2074 info->nz_allocated = a->maxnz; 2075 info->nz_used = a->bs2*a->nz; 2076 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 2077 info->assemblies = A->num_ass; 2078 info->mallocs = a->reallocs; 2079 info->memory = ((PetscObject)A)->mem; 2080 if (A->factor) { 2081 info->fill_ratio_given = A->info.fill_ratio_given; 2082 info->fill_ratio_needed = A->info.fill_ratio_needed; 2083 info->factor_mallocs = A->info.factor_mallocs; 2084 } else { 2085 info->fill_ratio_given = 0; 2086 info->fill_ratio_needed = 0; 2087 info->factor_mallocs = 0; 2088 } 2089 PetscFunctionReturn(0); 2090 } 2091 2092 2093 #undef __FUNCT__ 2094 #define __FUNCT__ "MatZeroEntries_SeqBAIJ" 2095 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 2096 { 2097 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2098 PetscErrorCode ierr; 2099 2100 PetscFunctionBegin; 2101 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 2102 PetscFunctionReturn(0); 2103 } 2104