1 #define PETSCMAT_DLL 2 3 #include "../src/mat/impls/baij/seq/baij.h" 4 #include "../src/mat/blockinvert.h" 5 #include "petscbt.h" 6 7 #undef __FUNCT__ 8 #define __FUNCT__ "MatIncreaseOverlap_SeqBAIJ" 9 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 10 { 11 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 12 PetscErrorCode ierr; 13 PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival; 14 const PetscInt *idx; 15 PetscInt start,end,*ai,*aj,bs,*nidx2; 16 PetscBT table; 17 18 PetscFunctionBegin; 19 m = a->mbs; 20 ai = a->i; 21 aj = a->j; 22 bs = A->rmap->bs; 23 24 if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 25 26 ierr = PetscBTCreate(m,table);CHKERRQ(ierr); 27 ierr = PetscMalloc((m+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr); 28 ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr); 29 30 for (i=0; i<is_max; i++) { 31 /* Initialise the two local arrays */ 32 isz = 0; 33 ierr = PetscBTMemzero(m,table);CHKERRQ(ierr); 34 35 /* Extract the indices, assume there can be duplicate entries */ 36 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 37 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 38 39 /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 40 for (j=0; j<n ; ++j){ 41 ival = idx[j]/bs; /* convert the indices into block indices */ 42 if (ival>=m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 43 if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;} 44 } 45 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 46 ierr = ISDestroy(is[i]);CHKERRQ(ierr); 47 48 k = 0; 49 for (j=0; j<ov; j++){ /* for each overlap*/ 50 n = isz; 51 for (; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */ 52 row = nidx[k]; 53 start = ai[row]; 54 end = ai[row+1]; 55 for (l = start; l<end ; l++){ 56 val = aj[l]; 57 if (!PetscBTLookupSet(table,val)) {nidx[isz++] = val;} 58 } 59 } 60 } 61 /* expand the Index Set */ 62 for (j=0; j<isz; j++) { 63 for (k=0; k<bs; k++) 64 nidx2[j*bs+k] = nidx[j]*bs+k; 65 } 66 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr); 67 } 68 ierr = PetscBTDestroy(table);CHKERRQ(ierr); 69 ierr = PetscFree(nidx);CHKERRQ(ierr); 70 ierr = PetscFree(nidx2);CHKERRQ(ierr); 71 PetscFunctionReturn(0); 72 } 73 74 #undef __FUNCT__ 75 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ_Private" 76 PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 77 { 78 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c; 79 PetscErrorCode ierr; 80 PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 81 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 82 const PetscInt *irow,*icol; 83 PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 84 PetscInt *aj = a->j,*ai = a->i; 85 MatScalar *mat_a; 86 Mat C; 87 PetscTruth flag,sorted; 88 89 PetscFunctionBegin; 90 ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr); 91 if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 92 93 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 94 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 95 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 96 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 97 98 ierr = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr); 99 ssmap = smap; 100 ierr = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr); 101 ierr = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr); 102 for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 103 /* determine lens of each row */ 104 for (i=0; i<nrows; i++) { 105 kstart = ai[irow[i]]; 106 kend = kstart + a->ilen[irow[i]]; 107 lens[i] = 0; 108 for (k=kstart; k<kend; k++) { 109 if (ssmap[aj[k]]) { 110 lens[i]++; 111 } 112 } 113 } 114 /* Create and fill new matrix */ 115 if (scall == MAT_REUSE_MATRIX) { 116 c = (Mat_SeqBAIJ *)((*B)->data); 117 118 if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 119 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 120 if (!flag) { 121 SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 122 } 123 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 124 C = *B; 125 } else { 126 ierr = MatCreate(((PetscObject)A)->comm,&C);CHKERRQ(ierr); 127 ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 128 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 129 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);CHKERRQ(ierr); 130 } 131 c = (Mat_SeqBAIJ *)(C->data); 132 for (i=0; i<nrows; i++) { 133 row = irow[i]; 134 kstart = ai[row]; 135 kend = kstart + a->ilen[row]; 136 mat_i = c->i[i]; 137 mat_j = c->j + mat_i; 138 mat_a = c->a + mat_i*bs2; 139 mat_ilen = c->ilen + i; 140 for (k=kstart; k<kend; k++) { 141 if ((tcol=ssmap[a->j[k]])) { 142 *mat_j++ = tcol - 1; 143 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 144 mat_a += bs2; 145 (*mat_ilen)++; 146 } 147 } 148 } 149 150 /* Free work space */ 151 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 152 ierr = PetscFree(smap);CHKERRQ(ierr); 153 ierr = PetscFree(lens);CHKERRQ(ierr); 154 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 155 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 156 157 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 158 *B = C; 159 PetscFunctionReturn(0); 160 } 161 162 #undef __FUNCT__ 163 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ" 164 PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 165 { 166 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 167 IS is1,is2; 168 PetscErrorCode ierr; 169 PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count; 170 const PetscInt *irow,*icol; 171 172 PetscFunctionBegin; 173 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 174 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 175 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 176 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 177 178 /* Verify if the indices corespond to each element in a block 179 and form the IS with compressed IS */ 180 ierr = PetscMalloc2(a->mbs,PetscInt,&vary,a->mbs,PetscInt,&iary);CHKERRQ(ierr); 181 ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr); 182 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 183 count = 0; 184 for (i=0; i<a->mbs; i++) { 185 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 186 if (vary[i]==bs) iary[count++] = i; 187 } 188 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr); 189 190 ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr); 191 for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 192 count = 0; 193 for (i=0; i<a->mbs; i++) { 194 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_PLIB,"Internal error in PETSc"); 195 if (vary[i]==bs) iary[count++] = i; 196 } 197 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is2);CHKERRQ(ierr); 198 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 199 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 200 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 201 202 ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr); 203 ISDestroy(is1); 204 ISDestroy(is2); 205 PetscFunctionReturn(0); 206 } 207 208 #undef __FUNCT__ 209 #define __FUNCT__ "MatGetSubMatrices_SeqBAIJ" 210 PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 211 { 212 PetscErrorCode ierr; 213 PetscInt i; 214 215 PetscFunctionBegin; 216 if (scall == MAT_INITIAL_MATRIX) { 217 ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr); 218 } 219 220 for (i=0; i<n; i++) { 221 ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 222 } 223 PetscFunctionReturn(0); 224 } 225 226 227 /* -------------------------------------------------------*/ 228 /* Should check that shapes of vectors and matrices match */ 229 /* -------------------------------------------------------*/ 230 #include "petscblaslapack.h" 231 232 #undef __FUNCT__ 233 #define __FUNCT__ "MatMult_SeqBAIJ_1" 234 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz) 235 { 236 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 237 PetscScalar *z,sum; 238 const PetscScalar *x; 239 const MatScalar *v; 240 PetscErrorCode ierr; 241 PetscInt mbs,i,n,nonzerorow=0; 242 const PetscInt *idx,*ii,*ridx=PETSC_NULL; 243 PetscTruth usecprow=a->compressedrow.use; 244 245 PetscFunctionBegin; 246 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 247 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 248 249 if (usecprow){ 250 mbs = a->compressedrow.nrows; 251 ii = a->compressedrow.i; 252 ridx = a->compressedrow.rindex; 253 ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 254 } else { 255 mbs = a->mbs; 256 ii = a->i; 257 } 258 259 for (i=0; i<mbs; i++) { 260 n = ii[1] - ii[0]; 261 v = a->a + ii[0]; 262 idx = a->j + ii[0]; 263 ii++; 264 sum = 0.0; 265 PetscSparseDensePlusDot(sum,x,v,idx,n); 266 if (usecprow){ 267 z[ridx[i]] = sum; 268 } else { 269 nonzerorow += (n>0); 270 z[i] = sum; 271 } 272 } 273 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 274 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 275 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 276 PetscFunctionReturn(0); 277 } 278 279 #undef __FUNCT__ 280 #define __FUNCT__ "MatMult_SeqBAIJ_2" 281 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) 282 { 283 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 284 PetscScalar *z = 0,sum1,sum2,*zarray; 285 const PetscScalar *x,*xb; 286 PetscScalar x1,x2; 287 const MatScalar *v; 288 PetscErrorCode ierr; 289 PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 290 PetscTruth usecprow=a->compressedrow.use; 291 292 PetscFunctionBegin; 293 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 294 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 295 296 idx = a->j; 297 v = a->a; 298 if (usecprow){ 299 mbs = a->compressedrow.nrows; 300 ii = a->compressedrow.i; 301 ridx = a->compressedrow.rindex; 302 } else { 303 mbs = a->mbs; 304 ii = a->i; 305 z = zarray; 306 } 307 308 for (i=0; i<mbs; i++) { 309 n = ii[1] - ii[0]; ii++; 310 sum1 = 0.0; sum2 = 0.0; 311 nonzerorow += (n>0); 312 for (j=0; j<n; j++) { 313 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 314 sum1 += v[0]*x1 + v[2]*x2; 315 sum2 += v[1]*x1 + v[3]*x2; 316 v += 4; 317 } 318 if (usecprow) z = zarray + 2*ridx[i]; 319 z[0] = sum1; z[1] = sum2; 320 if (!usecprow) z += 2; 321 } 322 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 323 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 324 ierr = PetscLogFlops(8.0*a->nz - 2.0*nonzerorow);CHKERRQ(ierr); 325 PetscFunctionReturn(0); 326 } 327 328 #undef __FUNCT__ 329 #define __FUNCT__ "MatMult_SeqBAIJ_3" 330 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) 331 { 332 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 333 PetscScalar *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray; 334 const PetscScalar *x,*xb; 335 const MatScalar *v; 336 PetscErrorCode ierr; 337 PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 338 PetscTruth usecprow=a->compressedrow.use; 339 340 341 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 342 #pragma disjoint(*v,*z,*xb) 343 #endif 344 345 PetscFunctionBegin; 346 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 347 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 348 349 idx = a->j; 350 v = a->a; 351 if (usecprow){ 352 mbs = a->compressedrow.nrows; 353 ii = a->compressedrow.i; 354 ridx = a->compressedrow.rindex; 355 } else { 356 mbs = a->mbs; 357 ii = a->i; 358 z = zarray; 359 } 360 361 for (i=0; i<mbs; i++) { 362 n = ii[1] - ii[0]; ii++; 363 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 364 nonzerorow += (n>0); 365 for (j=0; j<n; j++) { 366 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 367 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 368 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 369 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 370 v += 9; 371 } 372 if (usecprow) z = zarray + 3*ridx[i]; 373 z[0] = sum1; z[1] = sum2; z[2] = sum3; 374 if (!usecprow) z += 3; 375 } 376 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 377 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 378 ierr = PetscLogFlops(18.0*a->nz - 3.0*nonzerorow);CHKERRQ(ierr); 379 PetscFunctionReturn(0); 380 } 381 382 #undef __FUNCT__ 383 #define __FUNCT__ "MatMult_SeqBAIJ_4" 384 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) 385 { 386 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 387 PetscScalar *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray; 388 const PetscScalar *x,*xb; 389 const MatScalar *v; 390 PetscErrorCode ierr; 391 PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 392 PetscTruth usecprow=a->compressedrow.use; 393 394 PetscFunctionBegin; 395 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 396 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 397 398 idx = a->j; 399 v = a->a; 400 if (usecprow){ 401 mbs = a->compressedrow.nrows; 402 ii = a->compressedrow.i; 403 ridx = a->compressedrow.rindex; 404 } else { 405 mbs = a->mbs; 406 ii = a->i; 407 z = zarray; 408 } 409 410 for (i=0; i<mbs; i++) { 411 n = ii[1] - ii[0]; ii++; 412 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; 413 nonzerorow += (n>0); 414 for (j=0; j<n; j++) { 415 xb = x + 4*(*idx++); 416 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 417 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 418 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 419 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 420 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 421 v += 16; 422 } 423 if (usecprow) z = zarray + 4*ridx[i]; 424 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 425 if (!usecprow) z += 4; 426 } 427 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 428 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 429 ierr = PetscLogFlops(32.0*a->nz - 4.0*nonzerorow);CHKERRQ(ierr); 430 PetscFunctionReturn(0); 431 } 432 433 #undef __FUNCT__ 434 #define __FUNCT__ "MatMult_SeqBAIJ_5" 435 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) 436 { 437 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 438 PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray; 439 const PetscScalar *xb,*x; 440 const MatScalar *v; 441 PetscErrorCode ierr; 442 const PetscInt *idx,*ii,*ridx=PETSC_NULL; 443 PetscInt mbs,i,j,n,nonzerorow=0; 444 PetscTruth usecprow=a->compressedrow.use; 445 446 PetscFunctionBegin; 447 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 448 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 449 450 idx = a->j; 451 v = a->a; 452 if (usecprow){ 453 mbs = a->compressedrow.nrows; 454 ii = a->compressedrow.i; 455 ridx = a->compressedrow.rindex; 456 } else { 457 mbs = a->mbs; 458 ii = a->i; 459 z = zarray; 460 } 461 462 for (i=0; i<mbs; i++) { 463 n = ii[1] - ii[0]; ii++; 464 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 465 nonzerorow += (n>0); 466 for (j=0; j<n; j++) { 467 xb = x + 5*(*idx++); 468 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 469 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 470 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 471 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 472 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 473 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 474 v += 25; 475 } 476 if (usecprow) z = zarray + 5*ridx[i]; 477 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 478 if (!usecprow) z += 5; 479 } 480 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 481 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 482 ierr = PetscLogFlops(50.0*a->nz - 5.0*nonzerorow);CHKERRQ(ierr); 483 PetscFunctionReturn(0); 484 } 485 486 487 #undef __FUNCT__ 488 #define __FUNCT__ "MatMult_SeqBAIJ_6" 489 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz) 490 { 491 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 492 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6; 493 const PetscScalar *x,*xb; 494 PetscScalar x1,x2,x3,x4,x5,x6,*zarray; 495 const MatScalar *v; 496 PetscErrorCode ierr; 497 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 498 PetscTruth usecprow=a->compressedrow.use; 499 500 PetscFunctionBegin; 501 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 502 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 503 504 idx = a->j; 505 v = a->a; 506 if (usecprow){ 507 mbs = a->compressedrow.nrows; 508 ii = a->compressedrow.i; 509 ridx = a->compressedrow.rindex; 510 } else { 511 mbs = a->mbs; 512 ii = a->i; 513 z = zarray; 514 } 515 516 for (i=0; i<mbs; i++) { 517 n = ii[1] - ii[0]; ii++; 518 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; 519 nonzerorow += (n>0); 520 for (j=0; j<n; j++) { 521 xb = x + 6*(*idx++); 522 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 523 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 524 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 525 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 526 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 527 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 528 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 529 v += 36; 530 } 531 if (usecprow) z = zarray + 6*ridx[i]; 532 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 533 if (!usecprow) z += 6; 534 } 535 536 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 537 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 538 ierr = PetscLogFlops(72.0*a->nz - 6.0*nonzerorow);CHKERRQ(ierr); 539 PetscFunctionReturn(0); 540 } 541 #undef __FUNCT__ 542 #define __FUNCT__ "MatMult_SeqBAIJ_7" 543 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) 544 { 545 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 546 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 547 const PetscScalar *x,*xb; 548 PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray; 549 const MatScalar *v; 550 PetscErrorCode ierr; 551 PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0; 552 PetscTruth usecprow=a->compressedrow.use; 553 554 PetscFunctionBegin; 555 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 556 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 557 558 idx = a->j; 559 v = a->a; 560 if (usecprow){ 561 mbs = a->compressedrow.nrows; 562 ii = a->compressedrow.i; 563 ridx = a->compressedrow.rindex; 564 } else { 565 mbs = a->mbs; 566 ii = a->i; 567 z = zarray; 568 } 569 570 for (i=0; i<mbs; i++) { 571 n = ii[1] - ii[0]; ii++; 572 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 573 nonzerorow += (n>0); 574 for (j=0; j<n; j++) { 575 xb = x + 7*(*idx++); 576 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 577 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 578 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 579 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 580 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 581 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 582 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 583 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 584 v += 49; 585 } 586 if (usecprow) z = zarray + 7*ridx[i]; 587 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 588 if (!usecprow) z += 7; 589 } 590 591 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 592 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 593 ierr = PetscLogFlops(98.0*a->nz - 7.0*nonzerorow);CHKERRQ(ierr); 594 PetscFunctionReturn(0); 595 } 596 597 /* 598 This will not work with MatScalar == float because it calls the BLAS 599 */ 600 #undef __FUNCT__ 601 #define __FUNCT__ "MatMult_SeqBAIJ_N" 602 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz) 603 { 604 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 605 PetscScalar *x,*z = 0,*xb,*work,*workt,*zarray; 606 MatScalar *v; 607 PetscErrorCode ierr; 608 PetscInt mbs=a->mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2; 609 PetscInt ncols,k,*ridx=PETSC_NULL,nonzerorow=0; 610 PetscTruth usecprow=a->compressedrow.use; 611 612 PetscFunctionBegin; 613 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 614 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 615 616 idx = a->j; 617 v = a->a; 618 if (usecprow){ 619 mbs = a->compressedrow.nrows; 620 ii = a->compressedrow.i; 621 ridx = a->compressedrow.rindex; 622 } else { 623 mbs = a->mbs; 624 ii = a->i; 625 z = zarray; 626 } 627 628 if (!a->mult_work) { 629 k = PetscMax(A->rmap->n,A->cmap->n); 630 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 631 } 632 work = a->mult_work; 633 for (i=0; i<mbs; i++) { 634 n = ii[1] - ii[0]; ii++; 635 ncols = n*bs; 636 workt = work; 637 nonzerorow += (n>0); 638 for (j=0; j<n; j++) { 639 xb = x + bs*(*idx++); 640 for (k=0; k<bs; k++) workt[k] = xb[k]; 641 workt += bs; 642 } 643 if (usecprow) z = zarray + bs*ridx[i]; 644 Kernel_w_gets_Ar_times_v(bs,ncols,work,v,z); 645 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */ 646 v += n*bs2; 647 if (!usecprow) z += bs; 648 } 649 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 650 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 651 ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*nonzerorow);CHKERRQ(ierr); 652 PetscFunctionReturn(0); 653 } 654 655 extern PetscErrorCode VecCopy_Seq(Vec,Vec); 656 #undef __FUNCT__ 657 #define __FUNCT__ "MatMultAdd_SeqBAIJ_1" 658 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 659 { 660 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 661 const PetscScalar *x; 662 PetscScalar *y,*z,sum; 663 const MatScalar *v; 664 PetscErrorCode ierr; 665 PetscInt mbs=a->mbs,i,n,*ridx=PETSC_NULL,nonzerorow=0; 666 const PetscInt *idx,*ii; 667 PetscTruth usecprow=a->compressedrow.use; 668 669 PetscFunctionBegin; 670 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 671 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 672 if (zz != yy) { 673 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 674 } else { 675 z = y; 676 } 677 678 idx = a->j; 679 v = a->a; 680 if (usecprow){ 681 if (zz != yy){ 682 ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 683 } 684 mbs = a->compressedrow.nrows; 685 ii = a->compressedrow.i; 686 ridx = a->compressedrow.rindex; 687 } else { 688 ii = a->i; 689 } 690 691 for (i=0; i<mbs; i++) { 692 n = ii[1] - ii[0]; 693 ii++; 694 if (!usecprow){ 695 nonzerorow += (n>0); 696 sum = y[i]; 697 } else { 698 sum = y[ridx[i]]; 699 } 700 PetscSparseDensePlusDot(sum,x,v,idx,n); 701 v += n; 702 idx += n; 703 if (usecprow){ 704 z[ridx[i]] = sum; 705 } else { 706 z[i] = sum; 707 } 708 } 709 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 710 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 711 if (zz != yy) { 712 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 713 } 714 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 715 PetscFunctionReturn(0); 716 } 717 718 #undef __FUNCT__ 719 #define __FUNCT__ "MatMultAdd_SeqBAIJ_2" 720 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 721 { 722 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 723 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2; 724 PetscScalar x1,x2,*yarray,*zarray; 725 MatScalar *v; 726 PetscErrorCode ierr; 727 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 728 PetscTruth usecprow=a->compressedrow.use; 729 730 PetscFunctionBegin; 731 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 732 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 733 if (zz != yy) { 734 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 735 } else { 736 zarray = yarray; 737 } 738 739 idx = a->j; 740 v = a->a; 741 if (usecprow){ 742 if (zz != yy){ 743 ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 744 } 745 mbs = a->compressedrow.nrows; 746 ii = a->compressedrow.i; 747 ridx = a->compressedrow.rindex; 748 if (zz != yy){ 749 ierr = PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 750 } 751 } else { 752 ii = a->i; 753 y = yarray; 754 z = zarray; 755 } 756 757 for (i=0; i<mbs; i++) { 758 n = ii[1] - ii[0]; ii++; 759 if (usecprow){ 760 z = zarray + 2*ridx[i]; 761 y = yarray + 2*ridx[i]; 762 } 763 sum1 = y[0]; sum2 = y[1]; 764 for (j=0; j<n; j++) { 765 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 766 sum1 += v[0]*x1 + v[2]*x2; 767 sum2 += v[1]*x1 + v[3]*x2; 768 v += 4; 769 } 770 z[0] = sum1; z[1] = sum2; 771 if (!usecprow){ 772 z += 2; y += 2; 773 } 774 } 775 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 776 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 777 if (zz != yy) { 778 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 779 } 780 ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr); 781 PetscFunctionReturn(0); 782 } 783 784 #undef __FUNCT__ 785 #define __FUNCT__ "MatMultAdd_SeqBAIJ_3" 786 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 787 { 788 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 789 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray; 790 MatScalar *v; 791 PetscErrorCode ierr; 792 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 793 PetscTruth usecprow=a->compressedrow.use; 794 795 PetscFunctionBegin; 796 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 797 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 798 if (zz != yy) { 799 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 800 } else { 801 zarray = yarray; 802 } 803 804 idx = a->j; 805 v = a->a; 806 if (usecprow){ 807 if (zz != yy){ 808 ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 809 } 810 mbs = a->compressedrow.nrows; 811 ii = a->compressedrow.i; 812 ridx = a->compressedrow.rindex; 813 } else { 814 ii = a->i; 815 y = yarray; 816 z = zarray; 817 } 818 819 for (i=0; i<mbs; i++) { 820 n = ii[1] - ii[0]; ii++; 821 if (usecprow){ 822 z = zarray + 3*ridx[i]; 823 y = yarray + 3*ridx[i]; 824 } 825 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; 826 for (j=0; j<n; j++) { 827 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 828 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 829 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 830 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 831 v += 9; 832 } 833 z[0] = sum1; z[1] = sum2; z[2] = sum3; 834 if (!usecprow){ 835 z += 3; y += 3; 836 } 837 } 838 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 839 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 840 if (zz != yy) { 841 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 842 } 843 ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr); 844 PetscFunctionReturn(0); 845 } 846 847 #undef __FUNCT__ 848 #define __FUNCT__ "MatMultAdd_SeqBAIJ_4" 849 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 850 { 851 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 852 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray; 853 MatScalar *v; 854 PetscErrorCode ierr; 855 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 856 PetscTruth usecprow=a->compressedrow.use; 857 858 PetscFunctionBegin; 859 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 860 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 861 if (zz != yy) { 862 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 863 } else { 864 zarray = yarray; 865 } 866 867 idx = a->j; 868 v = a->a; 869 if (usecprow){ 870 if (zz != yy){ 871 ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 872 } 873 mbs = a->compressedrow.nrows; 874 ii = a->compressedrow.i; 875 ridx = a->compressedrow.rindex; 876 } else { 877 ii = a->i; 878 y = yarray; 879 z = zarray; 880 } 881 882 for (i=0; i<mbs; i++) { 883 n = ii[1] - ii[0]; ii++; 884 if (usecprow){ 885 z = zarray + 4*ridx[i]; 886 y = yarray + 4*ridx[i]; 887 } 888 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; 889 for (j=0; j<n; j++) { 890 xb = x + 4*(*idx++); 891 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 892 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 893 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 894 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 895 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 896 v += 16; 897 } 898 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 899 if (!usecprow){ 900 z += 4; y += 4; 901 } 902 } 903 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 904 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 905 if (zz != yy) { 906 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 907 } 908 ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr); 909 PetscFunctionReturn(0); 910 } 911 912 #undef __FUNCT__ 913 #define __FUNCT__ "MatMultAdd_SeqBAIJ_5" 914 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 915 { 916 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 917 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5; 918 PetscScalar *yarray,*zarray; 919 MatScalar *v; 920 PetscErrorCode ierr; 921 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 922 PetscTruth usecprow=a->compressedrow.use; 923 924 PetscFunctionBegin; 925 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 926 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 927 if (zz != yy) { 928 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 929 } else { 930 zarray = yarray; 931 } 932 933 idx = a->j; 934 v = a->a; 935 if (usecprow){ 936 if (zz != yy){ 937 ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 938 } 939 mbs = a->compressedrow.nrows; 940 ii = a->compressedrow.i; 941 ridx = a->compressedrow.rindex; 942 } else { 943 ii = a->i; 944 y = yarray; 945 z = zarray; 946 } 947 948 for (i=0; i<mbs; i++) { 949 n = ii[1] - ii[0]; ii++; 950 if (usecprow){ 951 z = zarray + 5*ridx[i]; 952 y = yarray + 5*ridx[i]; 953 } 954 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; 955 for (j=0; j<n; j++) { 956 xb = x + 5*(*idx++); 957 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 958 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 959 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 960 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 961 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 962 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 963 v += 25; 964 } 965 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 966 if (!usecprow){ 967 z += 5; y += 5; 968 } 969 } 970 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 971 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 972 if (zz != yy) { 973 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 974 } 975 ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr); 976 PetscFunctionReturn(0); 977 } 978 #undef __FUNCT__ 979 #define __FUNCT__ "MatMultAdd_SeqBAIJ_6" 980 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 981 { 982 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 983 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6; 984 PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray; 985 MatScalar *v; 986 PetscErrorCode ierr; 987 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 988 PetscTruth usecprow=a->compressedrow.use; 989 990 PetscFunctionBegin; 991 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 992 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 993 if (zz != yy) { 994 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 995 } else { 996 zarray = yarray; 997 } 998 999 idx = a->j; 1000 v = a->a; 1001 if (usecprow){ 1002 if (zz != yy){ 1003 ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1004 } 1005 mbs = a->compressedrow.nrows; 1006 ii = a->compressedrow.i; 1007 ridx = a->compressedrow.rindex; 1008 } else { 1009 ii = a->i; 1010 y = yarray; 1011 z = zarray; 1012 } 1013 1014 for (i=0; i<mbs; i++) { 1015 n = ii[1] - ii[0]; ii++; 1016 if (usecprow){ 1017 z = zarray + 6*ridx[i]; 1018 y = yarray + 6*ridx[i]; 1019 } 1020 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; 1021 for (j=0; j<n; j++) { 1022 xb = x + 6*(*idx++); 1023 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 1024 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1025 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1026 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1027 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1028 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1029 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1030 v += 36; 1031 } 1032 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 1033 if (!usecprow){ 1034 z += 6; y += 6; 1035 } 1036 } 1037 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1038 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1039 if (zz != yy) { 1040 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1041 } 1042 ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr); 1043 PetscFunctionReturn(0); 1044 } 1045 1046 #undef __FUNCT__ 1047 #define __FUNCT__ "MatMultAdd_SeqBAIJ_7" 1048 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1049 { 1050 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1051 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 1052 PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray; 1053 MatScalar *v; 1054 PetscErrorCode ierr; 1055 PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL; 1056 PetscTruth usecprow=a->compressedrow.use; 1057 1058 PetscFunctionBegin; 1059 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1060 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1061 if (zz != yy) { 1062 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1063 } else { 1064 zarray = yarray; 1065 } 1066 1067 idx = a->j; 1068 v = a->a; 1069 if (usecprow){ 1070 if (zz != yy){ 1071 ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1072 } 1073 mbs = a->compressedrow.nrows; 1074 ii = a->compressedrow.i; 1075 ridx = a->compressedrow.rindex; 1076 } else { 1077 ii = a->i; 1078 y = yarray; 1079 z = zarray; 1080 } 1081 1082 for (i=0; i<mbs; i++) { 1083 n = ii[1] - ii[0]; ii++; 1084 if (usecprow){ 1085 z = zarray + 7*ridx[i]; 1086 y = yarray + 7*ridx[i]; 1087 } 1088 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 1089 for (j=0; j<n; j++) { 1090 xb = x + 7*(*idx++); 1091 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1092 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1093 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1094 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1095 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1096 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1097 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1098 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1099 v += 49; 1100 } 1101 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1102 if (!usecprow){ 1103 z += 7; y += 7; 1104 } 1105 } 1106 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1107 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1108 if (zz != yy) { 1109 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1110 } 1111 ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr); 1112 PetscFunctionReturn(0); 1113 } 1114 1115 #undef __FUNCT__ 1116 #define __FUNCT__ "MatMultAdd_SeqBAIJ_N" 1117 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1118 { 1119 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1120 PetscScalar *x,*z = 0,*xb,*work,*workt,*zarray; 1121 MatScalar *v; 1122 PetscErrorCode ierr; 1123 PetscInt mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2; 1124 PetscInt ncols,k,*ridx=PETSC_NULL; 1125 PetscTruth usecprow=a->compressedrow.use; 1126 1127 PetscFunctionBegin; 1128 ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); 1129 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1130 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1131 1132 idx = a->j; 1133 v = a->a; 1134 if (usecprow){ 1135 mbs = a->compressedrow.nrows; 1136 ii = a->compressedrow.i; 1137 ridx = a->compressedrow.rindex; 1138 } else { 1139 mbs = a->mbs; 1140 ii = a->i; 1141 z = zarray; 1142 } 1143 1144 if (!a->mult_work) { 1145 k = PetscMax(A->rmap->n,A->cmap->n); 1146 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1147 } 1148 work = a->mult_work; 1149 for (i=0; i<mbs; i++) { 1150 n = ii[1] - ii[0]; ii++; 1151 ncols = n*bs; 1152 workt = work; 1153 for (j=0; j<n; j++) { 1154 xb = x + bs*(*idx++); 1155 for (k=0; k<bs; k++) workt[k] = xb[k]; 1156 workt += bs; 1157 } 1158 if (usecprow) z = zarray + bs*ridx[i]; 1159 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1160 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */ 1161 v += n*bs2; 1162 if (!usecprow){ 1163 z += bs; 1164 } 1165 } 1166 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1167 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1168 ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr); 1169 PetscFunctionReturn(0); 1170 } 1171 1172 #undef __FUNCT__ 1173 #define __FUNCT__ "MatMultHermitianTranspose_SeqBAIJ" 1174 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1175 { 1176 PetscScalar zero = 0.0; 1177 PetscErrorCode ierr; 1178 1179 PetscFunctionBegin; 1180 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1181 ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1182 PetscFunctionReturn(0); 1183 } 1184 1185 #undef __FUNCT__ 1186 #define __FUNCT__ "MatMultTranspose_SeqBAIJ" 1187 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1188 { 1189 PetscScalar zero = 0.0; 1190 PetscErrorCode ierr; 1191 1192 PetscFunctionBegin; 1193 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1194 ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1195 PetscFunctionReturn(0); 1196 } 1197 1198 #undef __FUNCT__ 1199 #define __FUNCT__ "MatMultHermitianTransposeAdd_SeqBAIJ" 1200 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1201 1202 { 1203 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1204 PetscScalar *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5; 1205 MatScalar *v; 1206 PetscErrorCode ierr; 1207 PetscInt mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL; 1208 Mat_CompressedRow cprow = a->compressedrow; 1209 PetscTruth usecprow=cprow.use; 1210 1211 PetscFunctionBegin; 1212 if (yy != zz) { ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); } 1213 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1214 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1215 1216 idx = a->j; 1217 v = a->a; 1218 if (usecprow){ 1219 mbs = cprow.nrows; 1220 ii = cprow.i; 1221 ridx = cprow.rindex; 1222 } else { 1223 mbs=a->mbs; 1224 ii = a->i; 1225 xb = x; 1226 } 1227 1228 switch (bs) { 1229 case 1: 1230 for (i=0; i<mbs; i++) { 1231 if (usecprow) xb = x + ridx[i]; 1232 x1 = xb[0]; 1233 ib = idx + ii[0]; 1234 n = ii[1] - ii[0]; ii++; 1235 for (j=0; j<n; j++) { 1236 rval = ib[j]; 1237 z[rval] += PetscConj(*v) * x1; 1238 v++; 1239 } 1240 if (!usecprow) xb++; 1241 } 1242 break; 1243 case 2: 1244 for (i=0; i<mbs; i++) { 1245 if (usecprow) xb = x + 2*ridx[i]; 1246 x1 = xb[0]; x2 = xb[1]; 1247 ib = idx + ii[0]; 1248 n = ii[1] - ii[0]; ii++; 1249 for (j=0; j<n; j++) { 1250 rval = ib[j]*2; 1251 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2; 1252 z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2; 1253 v += 4; 1254 } 1255 if (!usecprow) xb += 2; 1256 } 1257 break; 1258 case 3: 1259 for (i=0; i<mbs; i++) { 1260 if (usecprow) xb = x + 3*ridx[i]; 1261 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1262 ib = idx + ii[0]; 1263 n = ii[1] - ii[0]; ii++; 1264 for (j=0; j<n; j++) { 1265 rval = ib[j]*3; 1266 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3; 1267 z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3; 1268 z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3; 1269 v += 9; 1270 } 1271 if (!usecprow) xb += 3; 1272 } 1273 break; 1274 case 4: 1275 for (i=0; i<mbs; i++) { 1276 if (usecprow) xb = x + 4*ridx[i]; 1277 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1278 ib = idx + ii[0]; 1279 n = ii[1] - ii[0]; ii++; 1280 for (j=0; j<n; j++) { 1281 rval = ib[j]*4; 1282 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4; 1283 z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4; 1284 z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4; 1285 z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4; 1286 v += 16; 1287 } 1288 if (!usecprow) xb += 4; 1289 } 1290 break; 1291 case 5: 1292 for (i=0; i<mbs; i++) { 1293 if (usecprow) xb = x + 5*ridx[i]; 1294 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1295 x4 = xb[3]; x5 = xb[4]; 1296 ib = idx + ii[0]; 1297 n = ii[1] - ii[0]; ii++; 1298 for (j=0; j<n; j++) { 1299 rval = ib[j]*5; 1300 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5; 1301 z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5; 1302 z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5; 1303 z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5; 1304 z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5; 1305 v += 25; 1306 } 1307 if (!usecprow) xb += 5; 1308 } 1309 break; 1310 default: { /* block sizes larger than 5 by 5 are handled by BLAS */ 1311 PetscInt ncols,k; 1312 PetscScalar *work,*workt,*xtmp; 1313 1314 SETERRQ(PETSC_ERR_SUP,"block size larger than 5 is not supported yet"); 1315 if (!a->mult_work) { 1316 k = PetscMax(A->rmap->n,A->cmap->n); 1317 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1318 } 1319 work = a->mult_work; 1320 xtmp = x; 1321 for (i=0; i<mbs; i++) { 1322 n = ii[1] - ii[0]; ii++; 1323 ncols = n*bs; 1324 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1325 if (usecprow) { 1326 xtmp = x + bs*ridx[i]; 1327 } 1328 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 1329 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 1330 v += n*bs2; 1331 if (!usecprow) xtmp += bs; 1332 workt = work; 1333 for (j=0; j<n; j++) { 1334 zb = z + bs*(*idx++); 1335 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1336 workt += bs; 1337 } 1338 } 1339 } 1340 } 1341 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1342 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1343 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 1344 PetscFunctionReturn(0); 1345 } 1346 1347 #undef __FUNCT__ 1348 #define __FUNCT__ "MatMultTransposeAdd_SeqBAIJ" 1349 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1350 { 1351 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1352 PetscScalar *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5; 1353 MatScalar *v; 1354 PetscErrorCode ierr; 1355 PetscInt mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL; 1356 Mat_CompressedRow cprow = a->compressedrow; 1357 PetscTruth usecprow=cprow.use; 1358 1359 PetscFunctionBegin; 1360 if (yy != zz) { ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); } 1361 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1362 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1363 1364 idx = a->j; 1365 v = a->a; 1366 if (usecprow){ 1367 mbs = cprow.nrows; 1368 ii = cprow.i; 1369 ridx = cprow.rindex; 1370 } else { 1371 mbs=a->mbs; 1372 ii = a->i; 1373 xb = x; 1374 } 1375 1376 switch (bs) { 1377 case 1: 1378 for (i=0; i<mbs; i++) { 1379 if (usecprow) xb = x + ridx[i]; 1380 x1 = xb[0]; 1381 ib = idx + ii[0]; 1382 n = ii[1] - ii[0]; ii++; 1383 for (j=0; j<n; j++) { 1384 rval = ib[j]; 1385 z[rval] += *v * x1; 1386 v++; 1387 } 1388 if (!usecprow) xb++; 1389 } 1390 break; 1391 case 2: 1392 for (i=0; i<mbs; i++) { 1393 if (usecprow) xb = x + 2*ridx[i]; 1394 x1 = xb[0]; x2 = xb[1]; 1395 ib = idx + ii[0]; 1396 n = ii[1] - ii[0]; ii++; 1397 for (j=0; j<n; j++) { 1398 rval = ib[j]*2; 1399 z[rval++] += v[0]*x1 + v[1]*x2; 1400 z[rval++] += v[2]*x1 + v[3]*x2; 1401 v += 4; 1402 } 1403 if (!usecprow) xb += 2; 1404 } 1405 break; 1406 case 3: 1407 for (i=0; i<mbs; i++) { 1408 if (usecprow) xb = x + 3*ridx[i]; 1409 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1410 ib = idx + ii[0]; 1411 n = ii[1] - ii[0]; ii++; 1412 for (j=0; j<n; j++) { 1413 rval = ib[j]*3; 1414 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3; 1415 z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3; 1416 z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3; 1417 v += 9; 1418 } 1419 if (!usecprow) xb += 3; 1420 } 1421 break; 1422 case 4: 1423 for (i=0; i<mbs; i++) { 1424 if (usecprow) xb = x + 4*ridx[i]; 1425 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1426 ib = idx + ii[0]; 1427 n = ii[1] - ii[0]; ii++; 1428 for (j=0; j<n; j++) { 1429 rval = ib[j]*4; 1430 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 1431 z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 1432 z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 1433 z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 1434 v += 16; 1435 } 1436 if (!usecprow) xb += 4; 1437 } 1438 break; 1439 case 5: 1440 for (i=0; i<mbs; i++) { 1441 if (usecprow) xb = x + 5*ridx[i]; 1442 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1443 x4 = xb[3]; x5 = xb[4]; 1444 ib = idx + ii[0]; 1445 n = ii[1] - ii[0]; ii++; 1446 for (j=0; j<n; j++) { 1447 rval = ib[j]*5; 1448 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1449 z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1450 z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 1451 z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 1452 z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 1453 v += 25; 1454 } 1455 if (!usecprow) xb += 5; 1456 } 1457 break; 1458 default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 1459 PetscInt ncols,k; 1460 PetscScalar *work,*workt,*xtmp; 1461 1462 if (!a->mult_work) { 1463 k = PetscMax(A->rmap->n,A->cmap->n); 1464 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1465 } 1466 work = a->mult_work; 1467 xtmp = x; 1468 for (i=0; i<mbs; i++) { 1469 n = ii[1] - ii[0]; ii++; 1470 ncols = n*bs; 1471 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1472 if (usecprow) { 1473 xtmp = x + bs*ridx[i]; 1474 } 1475 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 1476 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 1477 v += n*bs2; 1478 if (!usecprow) xtmp += bs; 1479 workt = work; 1480 for (j=0; j<n; j++) { 1481 zb = z + bs*(*idx++); 1482 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1483 workt += bs; 1484 } 1485 } 1486 } 1487 } 1488 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1489 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1490 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 1491 PetscFunctionReturn(0); 1492 } 1493 1494 #undef __FUNCT__ 1495 #define __FUNCT__ "MatScale_SeqBAIJ" 1496 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha) 1497 { 1498 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data; 1499 PetscInt totalnz = a->bs2*a->nz; 1500 PetscScalar oalpha = alpha; 1501 PetscErrorCode ierr; 1502 PetscBLASInt one = 1,tnz = PetscBLASIntCast(totalnz); 1503 1504 PetscFunctionBegin; 1505 BLASscal_(&tnz,&oalpha,a->a,&one); 1506 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 1507 PetscFunctionReturn(0); 1508 } 1509 1510 #undef __FUNCT__ 1511 #define __FUNCT__ "MatNorm_SeqBAIJ" 1512 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm) 1513 { 1514 PetscErrorCode ierr; 1515 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1516 MatScalar *v = a->a; 1517 PetscReal sum = 0.0; 1518 PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1; 1519 1520 PetscFunctionBegin; 1521 if (type == NORM_FROBENIUS) { 1522 for (i=0; i< bs2*nz; i++) { 1523 #if defined(PETSC_USE_COMPLEX) 1524 sum += PetscRealPart(PetscConj(*v)*(*v)); v++; 1525 #else 1526 sum += (*v)*(*v); v++; 1527 #endif 1528 } 1529 *norm = sqrt(sum); 1530 } else if (type == NORM_1) { /* maximum column sum */ 1531 PetscReal *tmp; 1532 PetscInt *bcol = a->j; 1533 ierr = PetscMalloc((A->cmap->n+1)*sizeof(PetscReal),&tmp);CHKERRQ(ierr); 1534 ierr = PetscMemzero(tmp,A->cmap->n*sizeof(PetscReal));CHKERRQ(ierr); 1535 for (i=0; i<nz; i++){ 1536 for (j=0; j<bs; j++){ 1537 k1 = bs*(*bcol) + j; /* column index */ 1538 for (k=0; k<bs; k++){ 1539 tmp[k1] += PetscAbsScalar(*v); v++; 1540 } 1541 } 1542 bcol++; 1543 } 1544 *norm = 0.0; 1545 for (j=0; j<A->cmap->n; j++) { 1546 if (tmp[j] > *norm) *norm = tmp[j]; 1547 } 1548 ierr = PetscFree(tmp);CHKERRQ(ierr); 1549 } else if (type == NORM_INFINITY) { /* maximum row sum */ 1550 *norm = 0.0; 1551 for (k=0; k<bs; k++) { 1552 for (j=0; j<a->mbs; j++) { 1553 v = a->a + bs2*a->i[j] + k; 1554 sum = 0.0; 1555 for (i=0; i<a->i[j+1]-a->i[j]; i++) { 1556 for (k1=0; k1<bs; k1++){ 1557 sum += PetscAbsScalar(*v); 1558 v += bs; 1559 } 1560 } 1561 if (sum > *norm) *norm = sum; 1562 } 1563 } 1564 } else { 1565 SETERRQ(PETSC_ERR_SUP,"No support for this norm yet"); 1566 } 1567 PetscFunctionReturn(0); 1568 } 1569 1570 1571 #undef __FUNCT__ 1572 #define __FUNCT__ "MatEqual_SeqBAIJ" 1573 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscTruth* flg) 1574 { 1575 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data,*b = (Mat_SeqBAIJ *)B->data; 1576 PetscErrorCode ierr; 1577 1578 PetscFunctionBegin; 1579 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1580 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1581 *flg = PETSC_FALSE; 1582 PetscFunctionReturn(0); 1583 } 1584 1585 /* if the a->i are the same */ 1586 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1587 if (!*flg) { 1588 PetscFunctionReturn(0); 1589 } 1590 1591 /* if a->j are the same */ 1592 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1593 if (!*flg) { 1594 PetscFunctionReturn(0); 1595 } 1596 /* if a->a are the same */ 1597 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1598 PetscFunctionReturn(0); 1599 1600 } 1601 1602 #undef __FUNCT__ 1603 #define __FUNCT__ "MatGetDiagonal_SeqBAIJ" 1604 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v) 1605 { 1606 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1607 PetscErrorCode ierr; 1608 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 1609 PetscScalar *x,zero = 0.0; 1610 MatScalar *aa,*aa_j; 1611 1612 PetscFunctionBegin; 1613 if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1614 bs = A->rmap->bs; 1615 aa = a->a; 1616 ai = a->i; 1617 aj = a->j; 1618 ambs = a->mbs; 1619 bs2 = a->bs2; 1620 1621 ierr = VecSet(v,zero);CHKERRQ(ierr); 1622 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1623 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1624 if (n != A->rmap->N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1625 for (i=0; i<ambs; i++) { 1626 for (j=ai[i]; j<ai[i+1]; j++) { 1627 if (aj[j] == i) { 1628 row = i*bs; 1629 aa_j = aa+j*bs2; 1630 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1631 break; 1632 } 1633 } 1634 } 1635 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1636 PetscFunctionReturn(0); 1637 } 1638 1639 #undef __FUNCT__ 1640 #define __FUNCT__ "MatDiagonalScale_SeqBAIJ" 1641 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr) 1642 { 1643 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1644 PetscScalar *l,*r,x,*li,*ri; 1645 MatScalar *aa,*v; 1646 PetscErrorCode ierr; 1647 PetscInt i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2; 1648 1649 PetscFunctionBegin; 1650 ai = a->i; 1651 aj = a->j; 1652 aa = a->a; 1653 m = A->rmap->n; 1654 n = A->cmap->n; 1655 bs = A->rmap->bs; 1656 mbs = a->mbs; 1657 bs2 = a->bs2; 1658 if (ll) { 1659 ierr = VecGetArray(ll,&l);CHKERRQ(ierr); 1660 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1661 if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 1662 for (i=0; i<mbs; i++) { /* for each block row */ 1663 M = ai[i+1] - ai[i]; 1664 li = l + i*bs; 1665 v = aa + bs2*ai[i]; 1666 for (j=0; j<M; j++) { /* for each block */ 1667 for (k=0; k<bs2; k++) { 1668 (*v++) *= li[k%bs]; 1669 } 1670 } 1671 } 1672 ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr); 1673 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 1674 } 1675 1676 if (rr) { 1677 ierr = VecGetArray(rr,&r);CHKERRQ(ierr); 1678 ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr); 1679 if (rn != n) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 1680 for (i=0; i<mbs; i++) { /* for each block row */ 1681 M = ai[i+1] - ai[i]; 1682 v = aa + bs2*ai[i]; 1683 for (j=0; j<M; j++) { /* for each block */ 1684 ri = r + bs*aj[ai[i]+j]; 1685 for (k=0; k<bs; k++) { 1686 x = ri[k]; 1687 for (tmp=0; tmp<bs; tmp++) (*v++) *= x; 1688 } 1689 } 1690 } 1691 ierr = VecRestoreArray(rr,&r);CHKERRQ(ierr); 1692 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 1693 } 1694 PetscFunctionReturn(0); 1695 } 1696 1697 1698 #undef __FUNCT__ 1699 #define __FUNCT__ "MatGetInfo_SeqBAIJ" 1700 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info) 1701 { 1702 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1703 1704 PetscFunctionBegin; 1705 info->block_size = a->bs2; 1706 info->nz_allocated = a->maxnz; 1707 info->nz_used = a->bs2*a->nz; 1708 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 1709 info->assemblies = A->num_ass; 1710 info->mallocs = a->reallocs; 1711 info->memory = ((PetscObject)A)->mem; 1712 if (A->factor) { 1713 info->fill_ratio_given = A->info.fill_ratio_given; 1714 info->fill_ratio_needed = A->info.fill_ratio_needed; 1715 info->factor_mallocs = A->info.factor_mallocs; 1716 } else { 1717 info->fill_ratio_given = 0; 1718 info->fill_ratio_needed = 0; 1719 info->factor_mallocs = 0; 1720 } 1721 PetscFunctionReturn(0); 1722 } 1723 1724 1725 #undef __FUNCT__ 1726 #define __FUNCT__ "MatZeroEntries_SeqBAIJ" 1727 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 1728 { 1729 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1730 PetscErrorCode ierr; 1731 1732 PetscFunctionBegin; 1733 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 1734 PetscFunctionReturn(0); 1735 } 1736