1 2 #include <../src/mat/impls/baij/seq/baij.h> 3 #include <petsc-private/kernels/blockinvert.h> 4 #include <petscbt.h> 5 #include <petscblaslapack.h> 6 #if defined(PETSC_THREADCOMM_ACTIVE) 7 #include <petscthreadcomm.h> 8 #endif 9 10 #undef __FUNCT__ 11 #define __FUNCT__ "MatIncreaseOverlap_SeqBAIJ" 12 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 13 { 14 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 15 PetscErrorCode ierr; 16 PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival; 17 const PetscInt *idx; 18 PetscInt start,end,*ai,*aj,bs,*nidx2; 19 PetscBT table; 20 21 PetscFunctionBegin; 22 m = a->mbs; 23 ai = a->i; 24 aj = a->j; 25 bs = A->rmap->bs; 26 27 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 28 29 ierr = PetscBTCreate(m,&table);CHKERRQ(ierr); 30 ierr = PetscMalloc1(m+1,&nidx);CHKERRQ(ierr); 31 ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr); 32 33 for (i=0; i<is_max; i++) { 34 /* Initialise the two local arrays */ 35 isz = 0; 36 ierr = PetscBTMemzero(m,table);CHKERRQ(ierr); 37 38 /* Extract the indices, assume there can be duplicate entries */ 39 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 40 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 41 42 /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 43 for (j=0; j<n; ++j) { 44 ival = idx[j]/bs; /* convert the indices into block indices */ 45 if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 46 if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival; 47 } 48 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 49 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 50 51 k = 0; 52 for (j=0; j<ov; j++) { /* for each overlap*/ 53 n = isz; 54 for (; k<n; k++) { /* do only those rows in nidx[k], which are not done yet */ 55 row = nidx[k]; 56 start = ai[row]; 57 end = ai[row+1]; 58 for (l = start; l<end; l++) { 59 val = aj[l]; 60 if (!PetscBTLookupSet(table,val)) nidx[isz++] = val; 61 } 62 } 63 } 64 /* expand the Index Set */ 65 for (j=0; j<isz; j++) { 66 for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k; 67 } 68 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 69 } 70 ierr = PetscBTDestroy(&table);CHKERRQ(ierr); 71 ierr = PetscFree(nidx);CHKERRQ(ierr); 72 ierr = PetscFree(nidx2);CHKERRQ(ierr); 73 PetscFunctionReturn(0); 74 } 75 76 #undef __FUNCT__ 77 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ_Private" 78 PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 79 { 80 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c; 81 PetscErrorCode ierr; 82 PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 83 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 84 const PetscInt *irow,*icol; 85 PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 86 PetscInt *aj = a->j,*ai = a->i; 87 MatScalar *mat_a; 88 Mat C; 89 PetscBool flag; 90 91 PetscFunctionBegin; 92 93 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 94 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 95 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 96 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 97 98 ierr = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr); 99 ssmap = smap; 100 ierr = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr); 101 for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 102 /* determine lens of each row */ 103 for (i=0; i<nrows; i++) { 104 kstart = ai[irow[i]]; 105 kend = kstart + a->ilen[irow[i]]; 106 lens[i] = 0; 107 for (k=kstart; k<kend; k++) { 108 if (ssmap[aj[k]]) lens[i]++; 109 } 110 } 111 /* Create and fill new matrix */ 112 if (scall == MAT_REUSE_MATRIX) { 113 c = (Mat_SeqBAIJ*)((*B)->data); 114 115 if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 116 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 117 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 118 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 119 C = *B; 120 } else { 121 ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr); 122 ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 123 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 124 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);CHKERRQ(ierr); 125 } 126 c = (Mat_SeqBAIJ*)(C->data); 127 for (i=0; i<nrows; i++) { 128 row = irow[i]; 129 kstart = ai[row]; 130 kend = kstart + a->ilen[row]; 131 mat_i = c->i[i]; 132 mat_j = c->j + mat_i; 133 mat_a = c->a + mat_i*bs2; 134 mat_ilen = c->ilen + i; 135 for (k=kstart; k<kend; k++) { 136 if ((tcol=ssmap[a->j[k]])) { 137 *mat_j++ = tcol - 1; 138 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 139 mat_a += bs2; 140 (*mat_ilen)++; 141 } 142 } 143 } 144 /* sort */ 145 { 146 MatScalar *work; 147 148 ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr); 149 for (i=0; i<nrows; i++) { 150 PetscInt ilen; 151 mat_i = c->i[i]; 152 mat_j = c->j + mat_i; 153 mat_a = c->a + mat_i*bs2; 154 ilen = c->ilen[i]; 155 ierr = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr); 156 } 157 ierr = PetscFree(work);CHKERRQ(ierr); 158 } 159 160 /* Free work space */ 161 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 162 ierr = PetscFree(smap);CHKERRQ(ierr); 163 ierr = PetscFree(lens);CHKERRQ(ierr); 164 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 165 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 166 167 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 168 *B = C; 169 PetscFunctionReturn(0); 170 } 171 172 #undef __FUNCT__ 173 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ" 174 PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 175 { 176 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 177 IS is1,is2; 178 PetscErrorCode ierr; 179 PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count; 180 const PetscInt *irow,*icol; 181 182 PetscFunctionBegin; 183 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 184 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 185 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 186 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 187 188 /* Verify if the indices corespond to each element in a block 189 and form the IS with compressed IS */ 190 ierr = PetscMalloc2(a->mbs,&vary,a->mbs,&iary);CHKERRQ(ierr); 191 ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr); 192 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 193 for (i=0; i<a->mbs; i++) { 194 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 195 } 196 count = 0; 197 for (i=0; i<nrows; i++) { 198 PetscInt j = irow[i] / bs; 199 if ((vary[j]--)==bs) iary[count++] = j; 200 } 201 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr); 202 203 ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr); 204 for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 205 for (i=0; i<a->mbs; i++) { 206 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc"); 207 if (vary[i]==bs) iary[count++] = i; 208 } 209 count = 0; 210 for (i=0; i<ncols; i++) { 211 PetscInt j = icol[i] / bs; 212 if ((vary[j]--)==bs) iary[count++] = j; 213 } 214 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr); 215 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 216 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 217 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 218 219 ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr); 220 ierr = ISDestroy(&is1);CHKERRQ(ierr); 221 ierr = ISDestroy(&is2);CHKERRQ(ierr); 222 PetscFunctionReturn(0); 223 } 224 225 #undef __FUNCT__ 226 #define __FUNCT__ "MatGetSubMatrices_SeqBAIJ" 227 PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 228 { 229 PetscErrorCode ierr; 230 PetscInt i; 231 232 PetscFunctionBegin; 233 if (scall == MAT_INITIAL_MATRIX) { 234 ierr = PetscMalloc1(n+1,B);CHKERRQ(ierr); 235 } 236 237 for (i=0; i<n; i++) { 238 ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 239 } 240 PetscFunctionReturn(0); 241 } 242 243 244 /* -------------------------------------------------------*/ 245 /* Should check that shapes of vectors and matrices match */ 246 /* -------------------------------------------------------*/ 247 248 #if defined(PETSC_THREADCOMM_ACTIVE) 249 PetscErrorCode MatMult_SeqBAIJ_1_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz) 250 { 251 PetscErrorCode ierr; 252 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 253 PetscScalar *z; 254 const PetscScalar *x; 255 const MatScalar *aa; 256 PetscInt *trstarts=A->rmap->trstarts; 257 PetscInt n,start,end,i; 258 const PetscInt *aj,*ai; 259 PetscScalar sum; 260 261 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 262 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 263 start = trstarts[thread_id]; 264 end = trstarts[thread_id+1]; 265 ai = a->i; 266 for (i=start; i<end; i++) { 267 n = ai[i+1] - ai[i]; 268 aj = a->j + ai[i]; 269 aa = a->a + ai[i]; 270 sum = 0.0; 271 PetscSparseDensePlusDot(sum,x,aa,aj,n); 272 z[i] = sum; 273 } 274 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 275 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 276 return 0; 277 } 278 279 #undef __FUNCT__ 280 #define __FUNCT__ "MatMult_SeqBAIJ_1" 281 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz) 282 { 283 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 284 PetscScalar *z,sum; 285 const PetscScalar *x; 286 const MatScalar *v; 287 PetscErrorCode ierr; 288 PetscInt mbs,i,n; 289 const PetscInt *idx,*ii,*ridx=NULL; 290 PetscBool usecprow=a->compressedrow.use; 291 292 PetscFunctionBegin; 293 294 if (usecprow) { 295 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 296 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 297 mbs = a->compressedrow.nrows; 298 ii = a->compressedrow.i; 299 ridx = a->compressedrow.rindex; 300 ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 301 for (i=0; i<mbs; i++) { 302 n = ii[i+1] - ii[i]; 303 v = a->a + ii[i]; 304 idx = a->j + ii[i]; 305 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 306 PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 307 sum = 0.0; 308 PetscSparseDensePlusDot(sum,x,v,idx,n); 309 z[ridx[i]] = sum; 310 } 311 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 312 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 313 } else { 314 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_1_Kernel,3,A,xx,zz);CHKERRQ(ierr); 315 } 316 ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr); 317 PetscFunctionReturn(0); 318 } 319 #else 320 #undef __FUNCT__ 321 #define __FUNCT__ "MatMult_SeqBAIJ_1" 322 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz) 323 { 324 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 325 PetscScalar *z,sum; 326 const PetscScalar *x; 327 const MatScalar *v; 328 PetscErrorCode ierr; 329 PetscInt mbs,i,n; 330 const PetscInt *idx,*ii,*ridx=NULL; 331 PetscBool usecprow=a->compressedrow.use; 332 333 PetscFunctionBegin; 334 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 335 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 336 337 if (usecprow) { 338 mbs = a->compressedrow.nrows; 339 ii = a->compressedrow.i; 340 ridx = a->compressedrow.rindex; 341 ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 342 } else { 343 mbs = a->mbs; 344 ii = a->i; 345 } 346 347 for (i=0; i<mbs; i++) { 348 n = ii[1] - ii[0]; 349 v = a->a + ii[0]; 350 idx = a->j + ii[0]; 351 ii++; 352 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 353 PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 354 sum = 0.0; 355 PetscSparseDensePlusDot(sum,x,v,idx,n); 356 if (usecprow) { 357 z[ridx[i]] = sum; 358 } else { 359 z[i] = sum; 360 } 361 } 362 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 363 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 364 ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr); 365 PetscFunctionReturn(0); 366 } 367 #endif 368 369 #if defined(PETSC_THREADCOMM_ACTIVE) 370 PetscErrorCode MatMult_SeqBAIJ_2_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz) 371 { 372 PetscErrorCode ierr; 373 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 374 PetscScalar *z,x1,x2,sum1,sum2; 375 const PetscScalar *x,*xb; 376 const MatScalar *aa; 377 PetscInt *trstarts=A->rmap->trstarts; 378 PetscInt n,start,end,i,j; 379 const PetscInt *aj,*ai; 380 381 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 382 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 383 start = trstarts[thread_id] / 2; 384 end = trstarts[thread_id+1] / 2; 385 ai = a->i; 386 for (i=start; i<end; i++) { 387 n = ai[i+1] - ai[i]; 388 aj = a->j + ai[i]; 389 aa = a->a + ai[i]*4; 390 sum1 = 0.0; sum2 = 0.0; 391 for (j=0; j<n; j++) { 392 xb = x + 2*aj[j]; x1 = xb[0]; x2 = xb[1]; 393 sum1 += aa[4*j]*x1 + aa[4*j+2]*x2; 394 sum2 += aa[4*j+1]*x1 + aa[4*j+3]*x2; 395 } 396 z[2*i] = sum1; z[2*i+1] = sum2; 397 } 398 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 399 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 400 return 0; 401 } 402 403 #undef __FUNCT__ 404 #define __FUNCT__ "MatMult_SeqBAIJ_2" 405 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) 406 { 407 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 408 PetscScalar *z,x1,x2,sum1,sum2; 409 const PetscScalar *x,*xb; 410 const MatScalar *v; 411 PetscErrorCode ierr; 412 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 413 PetscBool usecprow=a->compressedrow.use; 414 415 PetscFunctionBegin; 416 417 if (usecprow) { 418 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 419 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 420 mbs = a->compressedrow.nrows; 421 ii = a->compressedrow.i; 422 ridx = a->compressedrow.rindex; 423 for (i=0; i<mbs; i++) { 424 n = ii[i+1] - ii[i]; 425 idx = a->j + ii[i]; 426 v = a->a + ii[i]*4; 427 sum1 = 0.0; sum2 = 0.0; 428 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 429 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 430 for (j=0; j<n; j++) { 431 xb = x + 2*idx[j]; x1 = xb[0]; x2 = xb[1]; 432 sum1 += v[4*j]*x1 + v[4*j+2]*x2; 433 sum2 += v[4*j+1]*x1 + v[4*j+3]*x2; 434 } 435 z[2*ridx[i]] = sum1; z[2*ridx[i]+1] = sum2; 436 } 437 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 438 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 439 } else { 440 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_2_Kernel,3,A,xx,zz);CHKERRQ(ierr); 441 } 442 ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr); 443 PetscFunctionReturn(0); 444 } 445 #else 446 #undef __FUNCT__ 447 #define __FUNCT__ "MatMult_SeqBAIJ_2" 448 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) 449 { 450 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 451 PetscScalar *z = 0,sum1,sum2,*zarray; 452 const PetscScalar *x,*xb; 453 PetscScalar x1,x2; 454 const MatScalar *v; 455 PetscErrorCode ierr; 456 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 457 PetscBool usecprow=a->compressedrow.use; 458 459 PetscFunctionBegin; 460 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 461 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 462 463 idx = a->j; 464 v = a->a; 465 if (usecprow) { 466 mbs = a->compressedrow.nrows; 467 ii = a->compressedrow.i; 468 ridx = a->compressedrow.rindex; 469 } else { 470 mbs = a->mbs; 471 ii = a->i; 472 z = zarray; 473 } 474 475 for (i=0; i<mbs; i++) { 476 n = ii[1] - ii[0]; ii++; 477 sum1 = 0.0; sum2 = 0.0; 478 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 479 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 480 for (j=0; j<n; j++) { 481 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 482 sum1 += v[0]*x1 + v[2]*x2; 483 sum2 += v[1]*x1 + v[3]*x2; 484 v += 4; 485 } 486 if (usecprow) z = zarray + 2*ridx[i]; 487 z[0] = sum1; z[1] = sum2; 488 if (!usecprow) z += 2; 489 } 490 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 491 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 492 ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr); 493 PetscFunctionReturn(0); 494 } 495 #endif 496 497 #if defined(PETSC_THREADCOMM_ACTIVE) 498 PetscErrorCode MatMult_SeqBAIJ_3_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz) 499 { 500 PetscErrorCode ierr; 501 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 502 PetscScalar *z,x1,x2,x3,sum1,sum2,sum3; 503 const PetscScalar *x,*xb; 504 const MatScalar *aa; 505 PetscInt *trstarts=A->rmap->trstarts; 506 PetscInt n,start,end,i,j; 507 const PetscInt *aj,*ai; 508 509 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 510 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 511 start = trstarts[thread_id] / 3; 512 end = trstarts[thread_id+1] / 3; 513 ai = a->i; 514 for (i=start; i<end; i++) { 515 n = ai[i+1] - ai[i]; 516 aj = a->j + ai[i]; 517 aa = a->a + ai[i]*9; 518 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 519 for (j=0; j<n; j++) { 520 xb = x + 3*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 521 sum1 += aa[9*j]*x1 + aa[9*j+3]*x2 + aa[9*j+6]*x3; 522 sum2 += aa[9*j+1]*x1 + aa[9*j+4]*x2 + aa[9*j+7]*x3; 523 sum3 += aa[9*j+2]*x1 + aa[9*j+5]*x2 + aa[9*j+8]*x3; 524 } 525 z[3*i] = sum1; z[3*i+1] = sum2; z[3*i+2] = sum3; 526 } 527 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 528 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 529 return 0; 530 } 531 532 #undef __FUNCT__ 533 #define __FUNCT__ "MatMult_SeqBAIJ_3" 534 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) 535 { 536 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 537 PetscScalar *z,sum1,sum2,sum3,x1,x2,x3; 538 const PetscScalar *x,*xb; 539 const MatScalar *v; 540 PetscErrorCode ierr; 541 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 542 PetscBool usecprow=a->compressedrow.use; 543 544 545 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 546 #pragma disjoint(*v,*z,*xb) 547 #endif 548 549 PetscFunctionBegin; 550 551 if (usecprow) { 552 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 553 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 554 mbs = a->compressedrow.nrows; 555 ii = a->compressedrow.i; 556 ridx = a->compressedrow.rindex; 557 for (i=0; i<mbs; i++) { 558 n = ii[i+1] - ii[i]; 559 idx = a->j + ii[i]; 560 v = a->a + ii[i]*9; 561 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 562 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 563 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 564 for (j=0; j<n; j++) { 565 xb = x + 3*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 566 sum1 += v[9*j]*x1 + v[9*j+3]*x2 + v[9*j+6]*x3; 567 sum2 += v[9*j+1]*x1 + v[9*j+4]*x2 + v[9*j+7]*x3; 568 sum3 += v[9*j+2]*x1 + v[9*j+5]*x2 + v[9*j+8]*x3; 569 } 570 z[3*ridx[i]] = sum1; z[3*ridx[i]+1] = sum2; z[3*ridx[i]+2] = sum3; 571 } 572 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 573 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 574 } else { 575 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_3_Kernel,3,A,xx,zz);CHKERRQ(ierr); 576 } 577 ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr); 578 PetscFunctionReturn(0); 579 } 580 #else 581 #undef __FUNCT__ 582 #define __FUNCT__ "MatMult_SeqBAIJ_3" 583 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) 584 { 585 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 586 PetscScalar *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray; 587 const PetscScalar *x,*xb; 588 const MatScalar *v; 589 PetscErrorCode ierr; 590 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 591 PetscBool usecprow=a->compressedrow.use; 592 593 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 594 #pragma disjoint(*v,*z,*xb) 595 #endif 596 597 PetscFunctionBegin; 598 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 599 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 600 601 idx = a->j; 602 v = a->a; 603 if (usecprow) { 604 mbs = a->compressedrow.nrows; 605 ii = a->compressedrow.i; 606 ridx = a->compressedrow.rindex; 607 } else { 608 mbs = a->mbs; 609 ii = a->i; 610 z = zarray; 611 } 612 613 for (i=0; i<mbs; i++) { 614 n = ii[1] - ii[0]; ii++; 615 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 616 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 617 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 618 for (j=0; j<n; j++) { 619 xb = x + 3*(*idx++); 620 x1 = xb[0]; 621 x2 = xb[1]; 622 x3 = xb[2]; 623 624 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 625 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 626 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 627 v += 9; 628 } 629 if (usecprow) z = zarray + 3*ridx[i]; 630 z[0] = sum1; z[1] = sum2; z[2] = sum3; 631 if (!usecprow) z += 3; 632 } 633 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 634 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 635 ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr); 636 PetscFunctionReturn(0); 637 } 638 #endif 639 640 #if defined(PETSC_THREADCOMM_ACTIVE) 641 PetscErrorCode MatMult_SeqBAIJ_4_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz) 642 { 643 PetscErrorCode ierr; 644 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 645 PetscScalar *z,x1,x2,x3,x4,sum1,sum2,sum3,sum4; 646 const PetscScalar *x,*xb; 647 const MatScalar *aa; 648 PetscInt *trstarts=A->rmap->trstarts; 649 PetscInt n,start,end,i,j; 650 const PetscInt *aj,*ai; 651 652 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 653 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 654 start = trstarts[thread_id] / 4; 655 end = trstarts[thread_id+1] / 4; 656 ai = a->i; 657 for (i=start; i<end; i++) { 658 n = ai[i+1] - ai[i]; 659 aj = a->j + ai[i]; 660 aa = a->a + ai[i]*16; 661 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; 662 for (j=0; j<n; j++) { 663 xb = x + 4*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 664 sum1 += aa[16*j]*x1 + aa[16*j+4]*x2 + aa[16*j+8]*x3 + aa[16*j+12]*x4; 665 sum2 += aa[16*j+1]*x1 + aa[16*j+5]*x2 + aa[16*j+9]*x3 + aa[16*j+13]*x4; 666 sum3 += aa[16*j+2]*x1 + aa[16*j+6]*x2 + aa[16*j+10]*x3 + aa[16*j+14]*x4; 667 sum4 += aa[16*j+3]*x1 + aa[16*j+7]*x2 + aa[16*j+11]*x3 + aa[16*j+15]*x4; 668 } 669 z[4*i] = sum1; z[4*i+1] = sum2; 670 z[4*i+2] = sum3; z[4*i+3] = sum4; 671 } 672 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 673 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 674 return 0; 675 } 676 677 #undef __FUNCT__ 678 #define __FUNCT__ "MatMult_SeqBAIJ_4" 679 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) 680 { 681 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 682 PetscScalar *z,x1,x2,x3,x4,sum1,sum2,sum3,sum4; 683 const PetscScalar *x,*xb; 684 const MatScalar *v; 685 PetscErrorCode ierr; 686 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 687 PetscBool usecprow=a->compressedrow.use; 688 689 PetscFunctionBegin; 690 691 if (usecprow) { 692 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 693 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 694 mbs = a->compressedrow.nrows; 695 ii = a->compressedrow.i; 696 ridx = a->compressedrow.rindex; 697 for (i=0; i<mbs; i++) { 698 n = ii[i+1] - ii[1]; 699 idx = a->j + ii[i]; 700 v = a->a + ii[i]*16; 701 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; 702 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 703 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 704 for (j=0; j<n; j++) { 705 xb = x + 4*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 706 sum1 += v[16*j]*x1 + v[16*j+4]*x2 + v[16*j+8]*x3 + v[16*j+12]*x4; 707 sum2 += v[16*j+1]*x1 + v[16*j+5]*x2 + v[16*j+9]*x3 + v[16*j+13]*x4; 708 sum3 += v[16*j+2]*x1 + v[16*j+6]*x2 + v[16*j+10]*x3 + v[16*j+14]*x4; 709 sum4 += v[16*j+3]*x1 + v[16*j+7]*x2 + v[16*j+11]*x3 + v[16*j+15]*x4; 710 } 711 z[4*ridx[i]] = sum1; z[4*ridx[i]+1] = sum2; 712 z[4*ridx[i]+2] = sum3; z[4*ridx[i]+3] = sum4; 713 } 714 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 715 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 716 } else { 717 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_4_Kernel,3,A,xx,zz);CHKERRQ(ierr); 718 } 719 ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr); 720 PetscFunctionReturn(0); 721 } 722 #else 723 #undef __FUNCT__ 724 #define __FUNCT__ "MatMult_SeqBAIJ_4" 725 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) 726 { 727 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 728 PetscScalar *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray; 729 const PetscScalar *x,*xb; 730 const MatScalar *v; 731 PetscErrorCode ierr; 732 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 733 PetscBool usecprow=a->compressedrow.use; 734 735 PetscFunctionBegin; 736 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 737 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 738 739 idx = a->j; 740 v = a->a; 741 if (usecprow) { 742 mbs = a->compressedrow.nrows; 743 ii = a->compressedrow.i; 744 ridx = a->compressedrow.rindex; 745 } else { 746 mbs = a->mbs; 747 ii = a->i; 748 z = zarray; 749 } 750 751 for (i=0; i<mbs; i++) { 752 n = ii[1] - ii[0]; 753 ii++; 754 sum1 = 0.0; 755 sum2 = 0.0; 756 sum3 = 0.0; 757 sum4 = 0.0; 758 759 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 760 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 761 for (j=0; j<n; j++) { 762 xb = x + 4*(*idx++); 763 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 764 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 765 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 766 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 767 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 768 v += 16; 769 } 770 if (usecprow) z = zarray + 4*ridx[i]; 771 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 772 if (!usecprow) z += 4; 773 } 774 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 775 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 776 ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr); 777 PetscFunctionReturn(0); 778 } 779 #endif 780 781 #if defined(PETSC_THREADCOMM_ACTIVE) 782 PetscErrorCode MatMult_SeqBAIJ_5_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz) 783 { 784 PetscErrorCode ierr; 785 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 786 PetscScalar *z,x1,x2,x3,x4,x5,sum1,sum2,sum3,sum4,sum5; 787 const PetscScalar *x,*xb; 788 const MatScalar *aa; 789 PetscInt *trstarts=A->rmap->trstarts; 790 PetscInt n,start,end,i,j; 791 const PetscInt *aj,*ai; 792 793 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 794 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 795 start = trstarts[thread_id] / 5; 796 end = trstarts[thread_id+1] / 5; 797 ai = a->i; 798 for (i=start; i<end; i++) { 799 n = ai[i+1] - ai[i]; 800 aj = a->j + ai[i]; 801 aa = a->a + ai[i]*25; 802 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 803 for (j=0; j<n; j++) { 804 xb = x + 5*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 805 sum1 += aa[25*j]*x1 + aa[25*j+5]*x2 + aa[25*j+10]*x3 + aa[25*j+15]*x4 + aa[25*j+20]*x5; 806 sum2 += aa[25*j+1]*x1 + aa[25*j+6]*x2 + aa[25*j+11]*x3 + aa[25*j+16]*x4 + aa[25*j+21]*x5; 807 sum3 += aa[25*j+2]*x1 + aa[25*j+7]*x2 + aa[25*j+12]*x3 + aa[25*j+17]*x4 + aa[25*j+22]*x5; 808 sum4 += aa[25*j+3]*x1 + aa[25*j+8]*x2 + aa[25*j+13]*x3 + aa[25*j+18]*x4 + aa[25*j+23]*x5; 809 sum5 += aa[25*j+4]*x1 + aa[25*j+9]*x2 + aa[25*j+14]*x3 + aa[25*j+19]*x4 + aa[25*j+24]*x5; 810 } 811 z[5*i] = sum1; z[5*i+1] = sum2; z[5*i+2] = sum3; 812 z[5*i+3] = sum4; z[5*i+4] = sum5; 813 } 814 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 815 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 816 return 0; 817 } 818 819 #undef __FUNCT__ 820 #define __FUNCT__ "MatMult_SeqBAIJ_5" 821 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) 822 { 823 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 824 PetscScalar *z,x1,x2,x3,x4,x5,sum1,sum2,sum3,sum4,sum5; 825 const PetscScalar *xb,*x; 826 const MatScalar *v; 827 PetscErrorCode ierr; 828 const PetscInt *idx,*ii,*ridx=NULL; 829 PetscInt mbs,i,j,n; 830 PetscBool usecprow=a->compressedrow.use; 831 832 PetscFunctionBegin; 833 834 if (usecprow) { 835 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 836 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 837 mbs = a->compressedrow.nrows; 838 ii = a->compressedrow.i; 839 ridx = a->compressedrow.rindex; 840 for (i=0; i<mbs; i++) { 841 n = ii[i+1] - ii[i]; 842 idx = a->j + ii[i]; 843 v = a->a + ii[i]*25; 844 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 845 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 846 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 847 for (j=0; j<n; j++) { 848 xb = x + 5*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 849 sum1 += v[25*j]*x1 + v[25*j+5]*x2 + v[25*j+10]*x3 + v[25*j+15]*x4 + v[25*j+20]*x5; 850 sum2 += v[25*j+1]*x1 + v[25*j+6]*x2 + v[25*j+11]*x3 + v[25*j+16]*x4 + v[25*j+21]*x5; 851 sum3 += v[25*j+2]*x1 + v[25*j+7]*x2 + v[25*j+12]*x3 + v[25*j+17]*x4 + v[25*j+22]*x5; 852 sum4 += v[25*j+3]*x1 + v[25*j+8]*x2 + v[25*j+13]*x3 + v[25*j+18]*x4 + v[25*j+23]*x5; 853 sum5 += v[25*j+4]*x1 + v[25*j+9]*x2 + v[25*j+14]*x3 + v[25*j+19]*x4 + v[25*j+24]*x5; 854 } 855 z[5*ridx[i]] = sum1; z[5*ridx[i]+1] = sum2; z[5*ridx[i]+2] = sum3; 856 z[5*ridx[i]+3] = sum4; z[5*ridx[i]+4] = sum5; 857 } 858 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 859 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 860 } else { 861 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_5_Kernel,3,A,xx,zz);CHKERRQ(ierr); 862 } 863 ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr); 864 PetscFunctionReturn(0); 865 } 866 #else 867 #undef __FUNCT__ 868 #define __FUNCT__ "MatMult_SeqBAIJ_5" 869 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) 870 { 871 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 872 PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray; 873 const PetscScalar *xb,*x; 874 const MatScalar *v; 875 PetscErrorCode ierr; 876 const PetscInt *idx,*ii,*ridx=NULL; 877 PetscInt mbs,i,j,n; 878 PetscBool usecprow=a->compressedrow.use; 879 880 PetscFunctionBegin; 881 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 882 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 883 884 idx = a->j; 885 v = a->a; 886 if (usecprow) { 887 mbs = a->compressedrow.nrows; 888 ii = a->compressedrow.i; 889 ridx = a->compressedrow.rindex; 890 } else { 891 mbs = a->mbs; 892 ii = a->i; 893 z = zarray; 894 } 895 896 for (i=0; i<mbs; i++) { 897 n = ii[1] - ii[0]; ii++; 898 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 899 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 900 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 901 for (j=0; j<n; j++) { 902 xb = x + 5*(*idx++); 903 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 904 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 905 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 906 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 907 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 908 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 909 v += 25; 910 } 911 if (usecprow) z = zarray + 5*ridx[i]; 912 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 913 if (!usecprow) z += 5; 914 } 915 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 916 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 917 ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr); 918 PetscFunctionReturn(0); 919 } 920 #endif 921 922 923 #if defined(PETSC_THREADCOMM_ACTIVE) 924 PetscErrorCode MatMult_SeqBAIJ_6_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz) 925 { 926 PetscErrorCode ierr; 927 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 928 PetscScalar *z,x1,x2,x3,x4,x5,x6,sum1,sum2,sum3,sum4,sum5,sum6; 929 const PetscScalar *x,*xb; 930 const MatScalar *aa; 931 PetscInt *trstarts=A->rmap->trstarts; 932 PetscInt n,start,end,i,j; 933 const PetscInt *aj,*ai; 934 935 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 936 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 937 start = trstarts[thread_id] / 6; 938 end = trstarts[thread_id+1] / 6; 939 ai = a->i; 940 for (i=start; i<end; i++) { 941 n = ai[i+1] - ai[i]; 942 aj = a->j + ai[i]; 943 aa = a->a + ai[i]*36; 944 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; 945 for (j=0; j<n; j++) { 946 xb = x + 6*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 947 sum1 += aa[36*j]*x1 + aa[36*j+6]*x2 + aa[36*j+12]*x3 + aa[36*j+18]*x4 + aa[36*j+24]*x5 + aa[36*j+30]*x6; 948 sum2 += aa[36*j+1]*x1 + aa[36*j+7]*x2 + aa[36*j+13]*x3 + aa[36*j+19]*x4 + aa[36*j+25]*x5 + aa[36*j+31]*x6; 949 sum3 += aa[36*j+2]*x1 + aa[36*j+8]*x2 + aa[36*j+14]*x3 + aa[36*j+20]*x4 + aa[36*j+26]*x5 + aa[36*j+32]*x6; 950 sum4 += aa[36*j+3]*x1 + aa[36*j+9]*x2 + aa[36*j+15]*x3 + aa[36*j+21]*x4 + aa[36*j+27]*x5 + aa[36*j+33]*x6; 951 sum5 += aa[36*j+4]*x1 + aa[36*j+10]*x2 + aa[36*j+16]*x3 + aa[36*j+22]*x4 + aa[36*j+28]*x5 + aa[36*j+34]*x6; 952 sum6 += aa[36*j+5]*x1 + aa[36*j+11]*x2 + aa[36*j+17]*x3 + aa[36*j+23]*x4 + aa[36*j+29]*x5 + aa[36*j+35]*x6; 953 } 954 z[6*i] = sum1; z[6*i+1] = sum2; z[6*i+2] = sum3; 955 z[6*i+3] = sum4; z[6*i+4] = sum5; z[6*i+5] = sum6; 956 } 957 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 958 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 959 return 0; 960 } 961 962 #undef __FUNCT__ 963 #define __FUNCT__ "MatMult_SeqBAIJ_6" 964 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz) 965 { 966 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 967 PetscScalar *z,x1,x2,x3,x4,x5,x6,sum1,sum2,sum3,sum4,sum5,sum6; 968 const PetscScalar *x,*xb; 969 const MatScalar *v; 970 PetscErrorCode ierr; 971 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 972 PetscBool usecprow=a->compressedrow.use; 973 974 PetscFunctionBegin; 975 976 if (usecprow) { 977 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 978 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 979 mbs = a->compressedrow.nrows; 980 ii = a->compressedrow.i; 981 ridx = a->compressedrow.rindex; 982 for (i=0; i<mbs; i++) { 983 n = ii[i+1] - ii[i]; 984 idx = a->j + ii[i]; 985 v = a->a + ii[i]*36; 986 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; 987 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 988 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 989 for (j=0; j<n; j++) { 990 xb = x + 6*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 991 sum1 += v[36*j]*x1 + v[36*j+6]*x2 + v[36*j+12]*x3 + v[36*j+18]*x4 + v[36*j+24]*x5 + v[36*j+30]*x6; 992 sum2 += v[36*j+1]*x1 + v[36*j+7]*x2 + v[36*j+13]*x3 + v[36*j+19]*x4 + v[36*j+25]*x5 + v[36*j+31]*x6; 993 sum3 += v[36*j+2]*x1 + v[36*j+8]*x2 + v[36*j+14]*x3 + v[36*j+20]*x4 + v[36*j+26]*x5 + v[36*j+32]*x6; 994 sum4 += v[36*j+3]*x1 + v[36*j+9]*x2 + v[36*j+15]*x3 + v[36*j+21]*x4 + v[36*j+27]*x5 + v[36*j+33]*x6; 995 sum5 += v[36*j+4]*x1 + v[36*j+10]*x2 + v[36*j+16]*x3 + v[36*j+22]*x4 + v[36*j+28]*x5 + v[36*j+34]*x6; 996 sum6 += v[36*j+5]*x1 + v[36*j+11]*x2 + v[36*j+17]*x3 + v[36*j+23]*x4 + v[36*j+29]*x5 + v[36*j+35]*x6; 997 } 998 z[6*ridx[i]] = sum1; z[6*ridx[i]+1] = sum2; z[6*ridx[i]+2] = sum3; 999 z[6*ridx[i]+3] = sum4; z[6*ridx[i]+4] = sum5; z[6*ridx[i]+5] = sum6; 1000 } 1001 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1002 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1003 } else { 1004 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_6_Kernel,3,A,xx,zz);CHKERRQ(ierr); 1005 } 1006 ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr); 1007 PetscFunctionReturn(0); 1008 } 1009 #else 1010 #undef __FUNCT__ 1011 #define __FUNCT__ "MatMult_SeqBAIJ_6" 1012 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz) 1013 { 1014 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1015 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6; 1016 const PetscScalar *x,*xb; 1017 PetscScalar x1,x2,x3,x4,x5,x6,*zarray; 1018 const MatScalar *v; 1019 PetscErrorCode ierr; 1020 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 1021 PetscBool usecprow=a->compressedrow.use; 1022 1023 PetscFunctionBegin; 1024 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1025 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1026 1027 idx = a->j; 1028 v = a->a; 1029 if (usecprow) { 1030 mbs = a->compressedrow.nrows; 1031 ii = a->compressedrow.i; 1032 ridx = a->compressedrow.rindex; 1033 } else { 1034 mbs = a->mbs; 1035 ii = a->i; 1036 z = zarray; 1037 } 1038 1039 for (i=0; i<mbs; i++) { 1040 n = ii[1] - ii[0]; 1041 ii++; 1042 sum1 = 0.0; 1043 sum2 = 0.0; 1044 sum3 = 0.0; 1045 sum4 = 0.0; 1046 sum5 = 0.0; 1047 sum6 = 0.0; 1048 1049 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1050 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1051 for (j=0; j<n; j++) { 1052 xb = x + 6*(*idx++); 1053 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 1054 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1055 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1056 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1057 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1058 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1059 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1060 v += 36; 1061 } 1062 if (usecprow) z = zarray + 6*ridx[i]; 1063 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 1064 if (!usecprow) z += 6; 1065 } 1066 1067 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1068 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1069 ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr); 1070 PetscFunctionReturn(0); 1071 } 1072 #endif 1073 1074 #if defined(PETSC_THREADCOMM_ACTIVE) 1075 PetscErrorCode MatMult_SeqBAIJ_7_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz) 1076 { 1077 PetscErrorCode ierr; 1078 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1079 PetscScalar *z,x1,x2,x3,x4,x5,x6,x7,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 1080 const PetscScalar *x,*xb; 1081 const MatScalar *aa; 1082 PetscInt *trstarts=A->rmap->trstarts; 1083 PetscInt n,start,end,i,j; 1084 const PetscInt *aj,*ai; 1085 1086 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1087 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1088 start = trstarts[thread_id] / 7; 1089 end = trstarts[thread_id+1] / 7; 1090 ai = a->i; 1091 for (i=start; i<end; i++) { 1092 n = ai[i+1] - ai[i]; 1093 aj = a->j + ai[i]; 1094 aa = a->a + ai[i]*49; 1095 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1096 for (j=0; j<n; j++) { 1097 xb = x + 7*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1098 sum1 += aa[49*j]*x1 + aa[49*j+7]*x2 + aa[49*j+14]*x3 + aa[49*j+21]*x4 + aa[49*j+28]*x5 + aa[49*j+35]*x6 + aa[49*j+42]*x7; 1099 sum2 += aa[49*j+1]*x1 + aa[49*j+8]*x2 + aa[49*j+15]*x3 + aa[49*j+22]*x4 + aa[49*j+29]*x5 + aa[49*j+36]*x6 + aa[49*j+43]*x7; 1100 sum3 += aa[49*j+2]*x1 + aa[49*j+9]*x2 + aa[49*j+16]*x3 + aa[49*j+23]*x4 + aa[49*j+30]*x5 + aa[49*j+37]*x6 + aa[49*j+44]*x7; 1101 sum4 += aa[49*j+3]*x1 + aa[49*j+10]*x2 + aa[49*j+17]*x3 + aa[49*j+24]*x4 + aa[49*j+31]*x5 + aa[49*j+38]*x6 + aa[49*j+45]*x7; 1102 sum5 += aa[49*j+4]*x1 + aa[49*j+11]*x2 + aa[49*j+18]*x3 + aa[49*j+25]*x4 + aa[49*j+32]*x5 + aa[49*j+39]*x6 + aa[49*j+46]*x7; 1103 sum6 += aa[49*j+5]*x1 + aa[49*j+12]*x2 + aa[49*j+19]*x3 + aa[49*j+26]*x4 + aa[49*j+33]*x5 + aa[49*j+40]*x6 + aa[49*j+47]*x7; 1104 sum7 += aa[49*j+6]*x1 + aa[49*j+13]*x2 + aa[49*j+20]*x3 + aa[49*j+27]*x4 + aa[49*j+34]*x5 + aa[49*j+41]*x6 + aa[49*j+48]*x7; 1105 } 1106 z[7*i] = sum1; z[7*i+1] = sum2; z[7*i+2] = sum3; z[7*i+3] = sum4; 1107 z[7*i+4] = sum5; z[7*i+5] = sum6; z[7*i+6] = sum7; 1108 } 1109 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1110 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1111 return 0; 1112 } 1113 1114 #undef __FUNCT__ 1115 #define __FUNCT__ "MatMult_SeqBAIJ_7" 1116 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) 1117 { 1118 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1119 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 1120 const PetscScalar *x,*xb; 1121 PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray; 1122 const MatScalar *v; 1123 PetscErrorCode ierr; 1124 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 1125 PetscBool usecprow=a->compressedrow.use; 1126 1127 PetscFunctionBegin; 1128 1129 if (usecprow) { 1130 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1131 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1132 mbs = a->compressedrow.nrows; 1133 ii = a->compressedrow.i; 1134 ridx = a->compressedrow.rindex; 1135 for (i=0; i<mbs; i++) { 1136 n = ii[i+1] - ii[i]; 1137 idx = a->j + ii[i]; 1138 v = a->a + ii[i]*49; 1139 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1140 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1141 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1142 for (j=0; j<n; j++) { 1143 xb = x + 7*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1144 sum1 += v[49*j]*x1 + v[49*j+7]*x2 + v[49*j+14]*x3 + v[49*j+21]*x4 + v[49*j+28]*x5 + v[49*j+35]*x6 + v[49*j+42]*x7; 1145 sum2 += v[49*j+1]*x1 + v[49*j+8]*x2 + v[49*j+15]*x3 + v[49*j+22]*x4 + v[49*j+29]*x5 + v[49*j+36]*x6 + v[49*j+43]*x7; 1146 sum3 += v[49*j+2]*x1 + v[49*j+9]*x2 + v[49*j+16]*x3 + v[49*j+23]*x4 + v[49*j+30]*x5 + v[49*j+37]*x6 + v[49*j+44]*x7; 1147 sum4 += v[49*j+3]*x1 + v[49*j+10]*x2 + v[49*j+17]*x3 + v[49*j+24]*x4 + v[49*j+31]*x5 + v[49*j+38]*x6 + v[49*j+45]*x7; 1148 sum5 += v[49*j+4]*x1 + v[49*j+11]*x2 + v[49*j+18]*x3 + v[49*j+25]*x4 + v[49*j+32]*x5 + v[49*j+39]*x6 + v[49*j+46]*x7; 1149 sum6 += v[49*j+5]*x1 + v[49*j+12]*x2 + v[49*j+19]*x3 + v[49*j+26]*x4 + v[49*j+33]*x5 + v[49*j+40]*x6 + v[49*j+47]*x7; 1150 sum7 += v[49*j+6]*x1 + v[49*j+13]*x2 + v[49*j+20]*x3 + v[49*j+27]*x4 + v[49*j+34]*x5 + v[49*j+41]*x6 + v[49*j+48]*x7; 1151 } 1152 z[7*ridx[i]] = sum1; z[7*ridx[i]+1] = sum2; z[7*ridx[i]+2] = sum3; z[7*ridx[i]+3] = sum4; 1153 z[7*ridx[i]+4] = sum5; z[7*ridx[i]+5] = sum6; z[7*ridx[i]+6] = sum7; 1154 } 1155 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1156 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1157 } else { 1158 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_7_Kernel,3,A,xx,zz);CHKERRQ(ierr); 1159 } 1160 ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr); 1161 PetscFunctionReturn(0); 1162 } 1163 #else 1164 #undef __FUNCT__ 1165 #define __FUNCT__ "MatMult_SeqBAIJ_7" 1166 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) 1167 { 1168 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1169 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 1170 const PetscScalar *x,*xb; 1171 PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray; 1172 const MatScalar *v; 1173 PetscErrorCode ierr; 1174 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 1175 PetscBool usecprow=a->compressedrow.use; 1176 1177 PetscFunctionBegin; 1178 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1179 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1180 1181 idx = a->j; 1182 v = a->a; 1183 if (usecprow) { 1184 mbs = a->compressedrow.nrows; 1185 ii = a->compressedrow.i; 1186 ridx = a->compressedrow.rindex; 1187 } else { 1188 mbs = a->mbs; 1189 ii = a->i; 1190 z = zarray; 1191 } 1192 1193 for (i=0; i<mbs; i++) { 1194 n = ii[1] - ii[0]; 1195 ii++; 1196 sum1 = 0.0; 1197 sum2 = 0.0; 1198 sum3 = 0.0; 1199 sum4 = 0.0; 1200 sum5 = 0.0; 1201 sum6 = 0.0; 1202 sum7 = 0.0; 1203 1204 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1205 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1206 for (j=0; j<n; j++) { 1207 xb = x + 7*(*idx++); 1208 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1209 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1210 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1211 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1212 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1213 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1214 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1215 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1216 v += 49; 1217 } 1218 if (usecprow) z = zarray + 7*ridx[i]; 1219 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1220 if (!usecprow) z += 7; 1221 } 1222 1223 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1224 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1225 ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr); 1226 PetscFunctionReturn(0); 1227 } 1228 #endif 1229 1230 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 1231 /* Default MatMult for block size 15 */ 1232 1233 #undef __FUNCT__ 1234 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver1" 1235 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz) 1236 { 1237 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1238 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1239 const PetscScalar *x,*xb; 1240 PetscScalar *zarray,xv; 1241 const MatScalar *v; 1242 PetscErrorCode ierr; 1243 const PetscInt *ii,*ij=a->j,*idx; 1244 PetscInt mbs,i,j,k,n,*ridx=NULL; 1245 PetscBool usecprow=a->compressedrow.use; 1246 1247 PetscFunctionBegin; 1248 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1249 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1250 1251 v = a->a; 1252 if (usecprow) { 1253 mbs = a->compressedrow.nrows; 1254 ii = a->compressedrow.i; 1255 ridx = a->compressedrow.rindex; 1256 } else { 1257 mbs = a->mbs; 1258 ii = a->i; 1259 z = zarray; 1260 } 1261 1262 for (i=0; i<mbs; i++) { 1263 n = ii[i+1] - ii[i]; 1264 idx = ij + ii[i]; 1265 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1266 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1267 1268 for (j=0; j<n; j++) { 1269 xb = x + 15*(idx[j]); 1270 1271 for (k=0; k<15; k++) { 1272 xv = xb[k]; 1273 sum1 += v[0]*xv; 1274 sum2 += v[1]*xv; 1275 sum3 += v[2]*xv; 1276 sum4 += v[3]*xv; 1277 sum5 += v[4]*xv; 1278 sum6 += v[5]*xv; 1279 sum7 += v[6]*xv; 1280 sum8 += v[7]*xv; 1281 sum9 += v[8]*xv; 1282 sum10 += v[9]*xv; 1283 sum11 += v[10]*xv; 1284 sum12 += v[11]*xv; 1285 sum13 += v[12]*xv; 1286 sum14 += v[13]*xv; 1287 sum15 += v[14]*xv; 1288 v += 15; 1289 } 1290 } 1291 if (usecprow) z = zarray + 15*ridx[i]; 1292 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1293 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1294 1295 if (!usecprow) z += 15; 1296 } 1297 1298 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1299 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1300 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1301 PetscFunctionReturn(0); 1302 } 1303 1304 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 1305 #undef __FUNCT__ 1306 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver2" 1307 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz) 1308 { 1309 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1310 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1311 const PetscScalar *x,*xb; 1312 PetscScalar x1,x2,x3,x4,*zarray; 1313 const MatScalar *v; 1314 PetscErrorCode ierr; 1315 const PetscInt *ii,*ij=a->j,*idx; 1316 PetscInt mbs,i,j,n,*ridx=NULL; 1317 PetscBool usecprow=a->compressedrow.use; 1318 1319 PetscFunctionBegin; 1320 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1321 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1322 1323 v = a->a; 1324 if (usecprow) { 1325 mbs = a->compressedrow.nrows; 1326 ii = a->compressedrow.i; 1327 ridx = a->compressedrow.rindex; 1328 } else { 1329 mbs = a->mbs; 1330 ii = a->i; 1331 z = zarray; 1332 } 1333 1334 for (i=0; i<mbs; i++) { 1335 n = ii[i+1] - ii[i]; 1336 idx = ij + ii[i]; 1337 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1338 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1339 1340 for (j=0; j<n; j++) { 1341 xb = x + 15*(idx[j]); 1342 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1343 1344 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 1345 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 1346 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 1347 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 1348 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 1349 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 1350 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 1351 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 1352 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 1353 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 1354 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 1355 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 1356 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 1357 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 1358 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 1359 1360 v += 60; 1361 1362 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 1363 1364 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 1365 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 1366 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 1367 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 1368 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 1369 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 1370 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 1371 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 1372 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 1373 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 1374 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 1375 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 1376 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 1377 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 1378 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 1379 v += 60; 1380 1381 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 1382 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 1383 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 1384 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 1385 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 1386 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 1387 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 1388 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 1389 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 1390 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 1391 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 1392 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 1393 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 1394 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 1395 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 1396 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 1397 v += 60; 1398 1399 x1 = xb[12]; x2 = xb[13]; x3 = xb[14]; 1400 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3; 1401 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3; 1402 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3; 1403 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3; 1404 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3; 1405 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3; 1406 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3; 1407 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3; 1408 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3; 1409 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3; 1410 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3; 1411 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3; 1412 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3; 1413 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3; 1414 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3; 1415 v += 45; 1416 } 1417 if (usecprow) z = zarray + 15*ridx[i]; 1418 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1419 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1420 1421 if (!usecprow) z += 15; 1422 } 1423 1424 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1425 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1426 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1427 PetscFunctionReturn(0); 1428 } 1429 1430 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 1431 #undef __FUNCT__ 1432 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver3" 1433 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz) 1434 { 1435 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1436 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1437 const PetscScalar *x,*xb; 1438 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray; 1439 const MatScalar *v; 1440 PetscErrorCode ierr; 1441 const PetscInt *ii,*ij=a->j,*idx; 1442 PetscInt mbs,i,j,n,*ridx=NULL; 1443 PetscBool usecprow=a->compressedrow.use; 1444 1445 PetscFunctionBegin; 1446 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1447 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1448 1449 v = a->a; 1450 if (usecprow) { 1451 mbs = a->compressedrow.nrows; 1452 ii = a->compressedrow.i; 1453 ridx = a->compressedrow.rindex; 1454 } else { 1455 mbs = a->mbs; 1456 ii = a->i; 1457 z = zarray; 1458 } 1459 1460 for (i=0; i<mbs; i++) { 1461 n = ii[i+1] - ii[i]; 1462 idx = ij + ii[i]; 1463 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1464 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1465 1466 for (j=0; j<n; j++) { 1467 xb = x + 15*(idx[j]); 1468 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1469 x8 = xb[7]; 1470 1471 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8; 1472 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8; 1473 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8; 1474 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8; 1475 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8; 1476 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8; 1477 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8; 1478 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8; 1479 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8; 1480 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8; 1481 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8; 1482 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8; 1483 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8; 1484 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8; 1485 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8; 1486 v += 120; 1487 1488 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14]; 1489 1490 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7; 1491 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7; 1492 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7; 1493 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7; 1494 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7; 1495 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7; 1496 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7; 1497 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7; 1498 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7; 1499 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7; 1500 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7; 1501 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7; 1502 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7; 1503 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7; 1504 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7; 1505 v += 105; 1506 } 1507 if (usecprow) z = zarray + 15*ridx[i]; 1508 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1509 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1510 1511 if (!usecprow) z += 15; 1512 } 1513 1514 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1515 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1516 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1517 PetscFunctionReturn(0); 1518 } 1519 1520 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ 1521 1522 #undef __FUNCT__ 1523 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver4" 1524 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz) 1525 { 1526 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1527 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1528 const PetscScalar *x,*xb; 1529 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray; 1530 const MatScalar *v; 1531 PetscErrorCode ierr; 1532 const PetscInt *ii,*ij=a->j,*idx; 1533 PetscInt mbs,i,j,n,*ridx=NULL; 1534 PetscBool usecprow=a->compressedrow.use; 1535 1536 PetscFunctionBegin; 1537 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1538 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1539 1540 v = a->a; 1541 if (usecprow) { 1542 mbs = a->compressedrow.nrows; 1543 ii = a->compressedrow.i; 1544 ridx = a->compressedrow.rindex; 1545 } else { 1546 mbs = a->mbs; 1547 ii = a->i; 1548 z = zarray; 1549 } 1550 1551 for (i=0; i<mbs; i++) { 1552 n = ii[i+1] - ii[i]; 1553 idx = ij + ii[i]; 1554 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1555 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1556 1557 for (j=0; j<n; j++) { 1558 xb = x + 15*(idx[j]); 1559 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1560 x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14]; 1561 1562 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 1563 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 1564 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 1565 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 1566 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 1567 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 1568 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 1569 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 1570 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 1571 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 1572 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 1573 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 1574 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 1575 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 1576 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 1577 v += 225; 1578 } 1579 if (usecprow) z = zarray + 15*ridx[i]; 1580 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1581 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1582 1583 if (!usecprow) z += 15; 1584 } 1585 1586 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1587 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1588 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1589 PetscFunctionReturn(0); 1590 } 1591 1592 1593 /* 1594 This will not work with MatScalar == float because it calls the BLAS 1595 */ 1596 #undef __FUNCT__ 1597 #define __FUNCT__ "MatMult_SeqBAIJ_N" 1598 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz) 1599 { 1600 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1601 PetscScalar *z = 0,*work,*workt,*zarray; 1602 const PetscScalar *x,*xb; 1603 const MatScalar *v; 1604 PetscErrorCode ierr; 1605 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 1606 const PetscInt *idx,*ii,*ridx=NULL; 1607 PetscInt ncols,k; 1608 PetscBool usecprow=a->compressedrow.use; 1609 1610 PetscFunctionBegin; 1611 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1612 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1613 1614 idx = a->j; 1615 v = a->a; 1616 if (usecprow) { 1617 mbs = a->compressedrow.nrows; 1618 ii = a->compressedrow.i; 1619 ridx = a->compressedrow.rindex; 1620 } else { 1621 mbs = a->mbs; 1622 ii = a->i; 1623 z = zarray; 1624 } 1625 1626 if (!a->mult_work) { 1627 k = PetscMax(A->rmap->n,A->cmap->n); 1628 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1629 } 1630 work = a->mult_work; 1631 for (i=0; i<mbs; i++) { 1632 n = ii[1] - ii[0]; ii++; 1633 ncols = n*bs; 1634 workt = work; 1635 for (j=0; j<n; j++) { 1636 xb = x + bs*(*idx++); 1637 for (k=0; k<bs; k++) workt[k] = xb[k]; 1638 workt += bs; 1639 } 1640 if (usecprow) z = zarray + bs*ridx[i]; 1641 PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z); 1642 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */ 1643 v += n*bs2; 1644 if (!usecprow) z += bs; 1645 } 1646 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1647 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1648 ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr); 1649 PetscFunctionReturn(0); 1650 } 1651 1652 #undef __FUNCT__ 1653 #define __FUNCT__ "MatMultAdd_SeqBAIJ_1" 1654 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 1655 { 1656 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1657 const PetscScalar *x; 1658 PetscScalar *y,*z,sum; 1659 const MatScalar *v; 1660 PetscErrorCode ierr; 1661 PetscInt mbs=a->mbs,i,n,*ridx=NULL; 1662 const PetscInt *idx,*ii; 1663 PetscBool usecprow=a->compressedrow.use; 1664 1665 PetscFunctionBegin; 1666 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1667 ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 1668 1669 idx = a->j; 1670 v = a->a; 1671 if (usecprow) { 1672 if (zz != yy) { 1673 ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1674 } 1675 mbs = a->compressedrow.nrows; 1676 ii = a->compressedrow.i; 1677 ridx = a->compressedrow.rindex; 1678 } else { 1679 ii = a->i; 1680 } 1681 1682 for (i=0; i<mbs; i++) { 1683 n = ii[1] - ii[0]; 1684 ii++; 1685 if (!usecprow) { 1686 sum = y[i]; 1687 } else { 1688 sum = y[ridx[i]]; 1689 } 1690 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1691 PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1692 PetscSparseDensePlusDot(sum,x,v,idx,n); 1693 v += n; 1694 idx += n; 1695 if (usecprow) { 1696 z[ridx[i]] = sum; 1697 } else { 1698 z[i] = sum; 1699 } 1700 } 1701 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1702 ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 1703 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1704 PetscFunctionReturn(0); 1705 } 1706 1707 #undef __FUNCT__ 1708 #define __FUNCT__ "MatMultAdd_SeqBAIJ_2" 1709 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 1710 { 1711 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1712 PetscScalar *y = 0,*z = 0,sum1,sum2; 1713 const PetscScalar *x,*xb; 1714 PetscScalar x1,x2,*yarray,*zarray; 1715 const MatScalar *v; 1716 PetscErrorCode ierr; 1717 PetscInt mbs = a->mbs,i,n,j; 1718 const PetscInt *idx,*ii,*ridx = NULL; 1719 PetscBool usecprow = a->compressedrow.use; 1720 1721 PetscFunctionBegin; 1722 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1723 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1724 1725 idx = a->j; 1726 v = a->a; 1727 if (usecprow) { 1728 if (zz != yy) { 1729 ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1730 } 1731 mbs = a->compressedrow.nrows; 1732 ii = a->compressedrow.i; 1733 ridx = a->compressedrow.rindex; 1734 if (zz != yy) { 1735 ierr = PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1736 } 1737 } else { 1738 ii = a->i; 1739 y = yarray; 1740 z = zarray; 1741 } 1742 1743 for (i=0; i<mbs; i++) { 1744 n = ii[1] - ii[0]; ii++; 1745 if (usecprow) { 1746 z = zarray + 2*ridx[i]; 1747 y = yarray + 2*ridx[i]; 1748 } 1749 sum1 = y[0]; sum2 = y[1]; 1750 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1751 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1752 for (j=0; j<n; j++) { 1753 xb = x + 2*(*idx++); 1754 x1 = xb[0]; 1755 x2 = xb[1]; 1756 1757 sum1 += v[0]*x1 + v[2]*x2; 1758 sum2 += v[1]*x1 + v[3]*x2; 1759 v += 4; 1760 } 1761 z[0] = sum1; z[1] = sum2; 1762 if (!usecprow) { 1763 z += 2; y += 2; 1764 } 1765 } 1766 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1767 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1768 ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr); 1769 PetscFunctionReturn(0); 1770 } 1771 1772 #undef __FUNCT__ 1773 #define __FUNCT__ "MatMultAdd_SeqBAIJ_3" 1774 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 1775 { 1776 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1777 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray; 1778 const PetscScalar *x,*xb; 1779 const MatScalar *v; 1780 PetscErrorCode ierr; 1781 PetscInt mbs = a->mbs,i,j,n; 1782 const PetscInt *idx,*ii,*ridx = NULL; 1783 PetscBool usecprow = a->compressedrow.use; 1784 1785 PetscFunctionBegin; 1786 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1787 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1788 1789 idx = a->j; 1790 v = a->a; 1791 if (usecprow) { 1792 if (zz != yy) { 1793 ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1794 } 1795 mbs = a->compressedrow.nrows; 1796 ii = a->compressedrow.i; 1797 ridx = a->compressedrow.rindex; 1798 } else { 1799 ii = a->i; 1800 y = yarray; 1801 z = zarray; 1802 } 1803 1804 for (i=0; i<mbs; i++) { 1805 n = ii[1] - ii[0]; ii++; 1806 if (usecprow) { 1807 z = zarray + 3*ridx[i]; 1808 y = yarray + 3*ridx[i]; 1809 } 1810 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; 1811 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1812 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1813 for (j=0; j<n; j++) { 1814 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1815 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 1816 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 1817 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 1818 v += 9; 1819 } 1820 z[0] = sum1; z[1] = sum2; z[2] = sum3; 1821 if (!usecprow) { 1822 z += 3; y += 3; 1823 } 1824 } 1825 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1826 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1827 ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr); 1828 PetscFunctionReturn(0); 1829 } 1830 1831 #undef __FUNCT__ 1832 #define __FUNCT__ "MatMultAdd_SeqBAIJ_4" 1833 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 1834 { 1835 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1836 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray; 1837 const PetscScalar *x,*xb; 1838 const MatScalar *v; 1839 PetscErrorCode ierr; 1840 PetscInt mbs = a->mbs,i,j,n; 1841 const PetscInt *idx,*ii,*ridx=NULL; 1842 PetscBool usecprow=a->compressedrow.use; 1843 1844 PetscFunctionBegin; 1845 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1846 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1847 1848 idx = a->j; 1849 v = a->a; 1850 if (usecprow) { 1851 if (zz != yy) { 1852 ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1853 } 1854 mbs = a->compressedrow.nrows; 1855 ii = a->compressedrow.i; 1856 ridx = a->compressedrow.rindex; 1857 } else { 1858 ii = a->i; 1859 y = yarray; 1860 z = zarray; 1861 } 1862 1863 for (i=0; i<mbs; i++) { 1864 n = ii[1] - ii[0]; ii++; 1865 if (usecprow) { 1866 z = zarray + 4*ridx[i]; 1867 y = yarray + 4*ridx[i]; 1868 } 1869 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; 1870 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1871 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1872 for (j=0; j<n; j++) { 1873 xb = x + 4*(*idx++); 1874 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1875 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1876 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1877 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1878 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1879 v += 16; 1880 } 1881 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 1882 if (!usecprow) { 1883 z += 4; y += 4; 1884 } 1885 } 1886 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1887 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1888 ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr); 1889 PetscFunctionReturn(0); 1890 } 1891 1892 #undef __FUNCT__ 1893 #define __FUNCT__ "MatMultAdd_SeqBAIJ_5" 1894 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 1895 { 1896 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1897 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5; 1898 const PetscScalar *x,*xb; 1899 PetscScalar *yarray,*zarray; 1900 const MatScalar *v; 1901 PetscErrorCode ierr; 1902 PetscInt mbs = a->mbs,i,j,n; 1903 const PetscInt *idx,*ii,*ridx = NULL; 1904 PetscBool usecprow=a->compressedrow.use; 1905 1906 PetscFunctionBegin; 1907 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1908 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1909 1910 idx = a->j; 1911 v = a->a; 1912 if (usecprow) { 1913 if (zz != yy) { 1914 ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1915 } 1916 mbs = a->compressedrow.nrows; 1917 ii = a->compressedrow.i; 1918 ridx = a->compressedrow.rindex; 1919 } else { 1920 ii = a->i; 1921 y = yarray; 1922 z = zarray; 1923 } 1924 1925 for (i=0; i<mbs; i++) { 1926 n = ii[1] - ii[0]; ii++; 1927 if (usecprow) { 1928 z = zarray + 5*ridx[i]; 1929 y = yarray + 5*ridx[i]; 1930 } 1931 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; 1932 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1933 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1934 for (j=0; j<n; j++) { 1935 xb = x + 5*(*idx++); 1936 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 1937 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1938 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1939 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1940 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1941 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1942 v += 25; 1943 } 1944 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 1945 if (!usecprow) { 1946 z += 5; y += 5; 1947 } 1948 } 1949 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1950 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1951 ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr); 1952 PetscFunctionReturn(0); 1953 } 1954 #undef __FUNCT__ 1955 #define __FUNCT__ "MatMultAdd_SeqBAIJ_6" 1956 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 1957 { 1958 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1959 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6; 1960 const PetscScalar *x,*xb; 1961 PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray; 1962 const MatScalar *v; 1963 PetscErrorCode ierr; 1964 PetscInt mbs = a->mbs,i,j,n; 1965 const PetscInt *idx,*ii,*ridx=NULL; 1966 PetscBool usecprow=a->compressedrow.use; 1967 1968 PetscFunctionBegin; 1969 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1970 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1971 1972 idx = a->j; 1973 v = a->a; 1974 if (usecprow) { 1975 if (zz != yy) { 1976 ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1977 } 1978 mbs = a->compressedrow.nrows; 1979 ii = a->compressedrow.i; 1980 ridx = a->compressedrow.rindex; 1981 } else { 1982 ii = a->i; 1983 y = yarray; 1984 z = zarray; 1985 } 1986 1987 for (i=0; i<mbs; i++) { 1988 n = ii[1] - ii[0]; ii++; 1989 if (usecprow) { 1990 z = zarray + 6*ridx[i]; 1991 y = yarray + 6*ridx[i]; 1992 } 1993 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; 1994 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1995 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1996 for (j=0; j<n; j++) { 1997 xb = x + 6*(*idx++); 1998 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 1999 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2000 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2001 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2002 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2003 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2004 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2005 v += 36; 2006 } 2007 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 2008 if (!usecprow) { 2009 z += 6; y += 6; 2010 } 2011 } 2012 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2013 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 2014 ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr); 2015 PetscFunctionReturn(0); 2016 } 2017 2018 #undef __FUNCT__ 2019 #define __FUNCT__ "MatMultAdd_SeqBAIJ_7" 2020 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 2021 { 2022 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2023 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 2024 const PetscScalar *x,*xb; 2025 PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray; 2026 const MatScalar *v; 2027 PetscErrorCode ierr; 2028 PetscInt mbs = a->mbs,i,j,n; 2029 const PetscInt *idx,*ii,*ridx = NULL; 2030 PetscBool usecprow=a->compressedrow.use; 2031 2032 PetscFunctionBegin; 2033 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2034 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 2035 2036 idx = a->j; 2037 v = a->a; 2038 if (usecprow) { 2039 if (zz != yy) { 2040 ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 2041 } 2042 mbs = a->compressedrow.nrows; 2043 ii = a->compressedrow.i; 2044 ridx = a->compressedrow.rindex; 2045 } else { 2046 ii = a->i; 2047 y = yarray; 2048 z = zarray; 2049 } 2050 2051 for (i=0; i<mbs; i++) { 2052 n = ii[1] - ii[0]; ii++; 2053 if (usecprow) { 2054 z = zarray + 7*ridx[i]; 2055 y = yarray + 7*ridx[i]; 2056 } 2057 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 2058 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2059 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2060 for (j=0; j<n; j++) { 2061 xb = x + 7*(*idx++); 2062 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 2063 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2064 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2065 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2066 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2067 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2068 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2069 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2070 v += 49; 2071 } 2072 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 2073 if (!usecprow) { 2074 z += 7; y += 7; 2075 } 2076 } 2077 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2078 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 2079 ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr); 2080 PetscFunctionReturn(0); 2081 } 2082 2083 #undef __FUNCT__ 2084 #define __FUNCT__ "MatMultAdd_SeqBAIJ_N" 2085 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 2086 { 2087 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2088 PetscScalar *z = 0,*work,*workt,*zarray; 2089 const PetscScalar *x,*xb; 2090 const MatScalar *v; 2091 PetscErrorCode ierr; 2092 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 2093 PetscInt ncols,k; 2094 const PetscInt *ridx = NULL,*idx,*ii; 2095 PetscBool usecprow = a->compressedrow.use; 2096 2097 PetscFunctionBegin; 2098 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 2099 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2100 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 2101 2102 idx = a->j; 2103 v = a->a; 2104 if (usecprow) { 2105 mbs = a->compressedrow.nrows; 2106 ii = a->compressedrow.i; 2107 ridx = a->compressedrow.rindex; 2108 } else { 2109 mbs = a->mbs; 2110 ii = a->i; 2111 z = zarray; 2112 } 2113 2114 if (!a->mult_work) { 2115 k = PetscMax(A->rmap->n,A->cmap->n); 2116 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 2117 } 2118 work = a->mult_work; 2119 for (i=0; i<mbs; i++) { 2120 n = ii[1] - ii[0]; ii++; 2121 ncols = n*bs; 2122 workt = work; 2123 for (j=0; j<n; j++) { 2124 xb = x + bs*(*idx++); 2125 for (k=0; k<bs; k++) workt[k] = xb[k]; 2126 workt += bs; 2127 } 2128 if (usecprow) z = zarray + bs*ridx[i]; 2129 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 2130 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */ 2131 v += n*bs2; 2132 if (!usecprow) z += bs; 2133 } 2134 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2135 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 2136 ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr); 2137 PetscFunctionReturn(0); 2138 } 2139 2140 #undef __FUNCT__ 2141 #define __FUNCT__ "MatMultHermitianTranspose_SeqBAIJ" 2142 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 2143 { 2144 PetscScalar zero = 0.0; 2145 PetscErrorCode ierr; 2146 2147 PetscFunctionBegin; 2148 ierr = VecSet(zz,zero);CHKERRQ(ierr); 2149 ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 2150 PetscFunctionReturn(0); 2151 } 2152 2153 #undef __FUNCT__ 2154 #define __FUNCT__ "MatMultTranspose_SeqBAIJ" 2155 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 2156 { 2157 PetscScalar zero = 0.0; 2158 PetscErrorCode ierr; 2159 2160 PetscFunctionBegin; 2161 ierr = VecSet(zz,zero);CHKERRQ(ierr); 2162 ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 2163 PetscFunctionReturn(0); 2164 } 2165 2166 #undef __FUNCT__ 2167 #define __FUNCT__ "MatMultHermitianTransposeAdd_SeqBAIJ" 2168 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 2169 { 2170 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2171 PetscScalar *zb,*z,x1,x2,x3,x4,x5; 2172 const PetscScalar *x,*xb = NULL; 2173 const MatScalar *v; 2174 PetscErrorCode ierr; 2175 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2; 2176 const PetscInt *idx,*ii,*ib,*ridx = NULL; 2177 Mat_CompressedRow cprow = a->compressedrow; 2178 PetscBool usecprow = cprow.use; 2179 2180 PetscFunctionBegin; 2181 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 2182 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2183 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 2184 2185 idx = a->j; 2186 v = a->a; 2187 if (usecprow) { 2188 mbs = cprow.nrows; 2189 ii = cprow.i; 2190 ridx = cprow.rindex; 2191 } else { 2192 mbs=a->mbs; 2193 ii = a->i; 2194 xb = x; 2195 } 2196 2197 switch (bs) { 2198 case 1: 2199 for (i=0; i<mbs; i++) { 2200 if (usecprow) xb = x + ridx[i]; 2201 x1 = xb[0]; 2202 ib = idx + ii[0]; 2203 n = ii[1] - ii[0]; ii++; 2204 for (j=0; j<n; j++) { 2205 rval = ib[j]; 2206 z[rval] += PetscConj(*v) * x1; 2207 v++; 2208 } 2209 if (!usecprow) xb++; 2210 } 2211 break; 2212 case 2: 2213 for (i=0; i<mbs; i++) { 2214 if (usecprow) xb = x + 2*ridx[i]; 2215 x1 = xb[0]; x2 = xb[1]; 2216 ib = idx + ii[0]; 2217 n = ii[1] - ii[0]; ii++; 2218 for (j=0; j<n; j++) { 2219 rval = ib[j]*2; 2220 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2; 2221 z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2; 2222 v += 4; 2223 } 2224 if (!usecprow) xb += 2; 2225 } 2226 break; 2227 case 3: 2228 for (i=0; i<mbs; i++) { 2229 if (usecprow) xb = x + 3*ridx[i]; 2230 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2231 ib = idx + ii[0]; 2232 n = ii[1] - ii[0]; ii++; 2233 for (j=0; j<n; j++) { 2234 rval = ib[j]*3; 2235 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3; 2236 z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3; 2237 z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3; 2238 v += 9; 2239 } 2240 if (!usecprow) xb += 3; 2241 } 2242 break; 2243 case 4: 2244 for (i=0; i<mbs; i++) { 2245 if (usecprow) xb = x + 4*ridx[i]; 2246 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 2247 ib = idx + ii[0]; 2248 n = ii[1] - ii[0]; ii++; 2249 for (j=0; j<n; j++) { 2250 rval = ib[j]*4; 2251 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4; 2252 z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4; 2253 z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4; 2254 z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4; 2255 v += 16; 2256 } 2257 if (!usecprow) xb += 4; 2258 } 2259 break; 2260 case 5: 2261 for (i=0; i<mbs; i++) { 2262 if (usecprow) xb = x + 5*ridx[i]; 2263 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2264 x4 = xb[3]; x5 = xb[4]; 2265 ib = idx + ii[0]; 2266 n = ii[1] - ii[0]; ii++; 2267 for (j=0; j<n; j++) { 2268 rval = ib[j]*5; 2269 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5; 2270 z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5; 2271 z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5; 2272 z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5; 2273 z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5; 2274 v += 25; 2275 } 2276 if (!usecprow) xb += 5; 2277 } 2278 break; 2279 default: { /* block sizes larger than 5 by 5 are handled by BLAS */ 2280 PetscInt ncols,k; 2281 PetscScalar *work,*workt; 2282 const PetscScalar *xtmp; 2283 2284 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet"); 2285 if (!a->mult_work) { 2286 k = PetscMax(A->rmap->n,A->cmap->n); 2287 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 2288 } 2289 work = a->mult_work; 2290 xtmp = x; 2291 for (i=0; i<mbs; i++) { 2292 n = ii[1] - ii[0]; ii++; 2293 ncols = n*bs; 2294 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 2295 if (usecprow) xtmp = x + bs*ridx[i]; 2296 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 2297 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 2298 v += n*bs2; 2299 if (!usecprow) xtmp += bs; 2300 workt = work; 2301 for (j=0; j<n; j++) { 2302 zb = z + bs*(*idx++); 2303 for (k=0; k<bs; k++) zb[k] += workt[k] ; 2304 workt += bs; 2305 } 2306 } 2307 } 2308 } 2309 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2310 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 2311 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 2312 PetscFunctionReturn(0); 2313 } 2314 2315 #undef __FUNCT__ 2316 #define __FUNCT__ "MatMultTransposeAdd_SeqBAIJ" 2317 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 2318 { 2319 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2320 PetscScalar *zb,*z,x1,x2,x3,x4,x5; 2321 const PetscScalar *x,*xb = 0; 2322 const MatScalar *v; 2323 PetscErrorCode ierr; 2324 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2; 2325 const PetscInt *idx,*ii,*ib,*ridx = NULL; 2326 Mat_CompressedRow cprow = a->compressedrow; 2327 PetscBool usecprow=cprow.use; 2328 2329 PetscFunctionBegin; 2330 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 2331 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2332 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 2333 2334 idx = a->j; 2335 v = a->a; 2336 if (usecprow) { 2337 mbs = cprow.nrows; 2338 ii = cprow.i; 2339 ridx = cprow.rindex; 2340 } else { 2341 mbs=a->mbs; 2342 ii = a->i; 2343 xb = x; 2344 } 2345 2346 switch (bs) { 2347 case 1: 2348 for (i=0; i<mbs; i++) { 2349 if (usecprow) xb = x + ridx[i]; 2350 x1 = xb[0]; 2351 ib = idx + ii[0]; 2352 n = ii[1] - ii[0]; ii++; 2353 for (j=0; j<n; j++) { 2354 rval = ib[j]; 2355 z[rval] += *v * x1; 2356 v++; 2357 } 2358 if (!usecprow) xb++; 2359 } 2360 break; 2361 case 2: 2362 for (i=0; i<mbs; i++) { 2363 if (usecprow) xb = x + 2*ridx[i]; 2364 x1 = xb[0]; x2 = xb[1]; 2365 ib = idx + ii[0]; 2366 n = ii[1] - ii[0]; ii++; 2367 for (j=0; j<n; j++) { 2368 rval = ib[j]*2; 2369 z[rval++] += v[0]*x1 + v[1]*x2; 2370 z[rval++] += v[2]*x1 + v[3]*x2; 2371 v += 4; 2372 } 2373 if (!usecprow) xb += 2; 2374 } 2375 break; 2376 case 3: 2377 for (i=0; i<mbs; i++) { 2378 if (usecprow) xb = x + 3*ridx[i]; 2379 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2380 ib = idx + ii[0]; 2381 n = ii[1] - ii[0]; ii++; 2382 for (j=0; j<n; j++) { 2383 rval = ib[j]*3; 2384 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3; 2385 z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3; 2386 z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3; 2387 v += 9; 2388 } 2389 if (!usecprow) xb += 3; 2390 } 2391 break; 2392 case 4: 2393 for (i=0; i<mbs; i++) { 2394 if (usecprow) xb = x + 4*ridx[i]; 2395 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 2396 ib = idx + ii[0]; 2397 n = ii[1] - ii[0]; ii++; 2398 for (j=0; j<n; j++) { 2399 rval = ib[j]*4; 2400 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 2401 z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 2402 z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 2403 z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 2404 v += 16; 2405 } 2406 if (!usecprow) xb += 4; 2407 } 2408 break; 2409 case 5: 2410 for (i=0; i<mbs; i++) { 2411 if (usecprow) xb = x + 5*ridx[i]; 2412 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2413 x4 = xb[3]; x5 = xb[4]; 2414 ib = idx + ii[0]; 2415 n = ii[1] - ii[0]; ii++; 2416 for (j=0; j<n; j++) { 2417 rval = ib[j]*5; 2418 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 2419 z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 2420 z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 2421 z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 2422 z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 2423 v += 25; 2424 } 2425 if (!usecprow) xb += 5; 2426 } 2427 break; 2428 default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 2429 PetscInt ncols,k; 2430 PetscScalar *work,*workt; 2431 const PetscScalar *xtmp; 2432 if (!a->mult_work) { 2433 k = PetscMax(A->rmap->n,A->cmap->n); 2434 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 2435 } 2436 work = a->mult_work; 2437 xtmp = x; 2438 for (i=0; i<mbs; i++) { 2439 n = ii[1] - ii[0]; ii++; 2440 ncols = n*bs; 2441 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 2442 if (usecprow) xtmp = x + bs*ridx[i]; 2443 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 2444 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 2445 v += n*bs2; 2446 if (!usecprow) xtmp += bs; 2447 workt = work; 2448 for (j=0; j<n; j++) { 2449 zb = z + bs*(*idx++); 2450 for (k=0; k<bs; k++) zb[k] += workt[k]; 2451 workt += bs; 2452 } 2453 } 2454 } 2455 } 2456 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2457 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 2458 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 2459 PetscFunctionReturn(0); 2460 } 2461 2462 #undef __FUNCT__ 2463 #define __FUNCT__ "MatScale_SeqBAIJ" 2464 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha) 2465 { 2466 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data; 2467 PetscInt totalnz = a->bs2*a->nz; 2468 PetscScalar oalpha = alpha; 2469 PetscErrorCode ierr; 2470 PetscBLASInt one = 1,tnz; 2471 2472 PetscFunctionBegin; 2473 ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr); 2474 PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one)); 2475 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 2476 PetscFunctionReturn(0); 2477 } 2478 2479 #undef __FUNCT__ 2480 #define __FUNCT__ "MatNorm_SeqBAIJ" 2481 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm) 2482 { 2483 PetscErrorCode ierr; 2484 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2485 MatScalar *v = a->a; 2486 PetscReal sum = 0.0; 2487 PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1; 2488 2489 PetscFunctionBegin; 2490 if (type == NORM_FROBENIUS) { 2491 for (i=0; i< bs2*nz; i++) { 2492 sum += PetscRealPart(PetscConj(*v)*(*v)); v++; 2493 } 2494 *norm = PetscSqrtReal(sum); 2495 } else if (type == NORM_1) { /* maximum column sum */ 2496 PetscReal *tmp; 2497 PetscInt *bcol = a->j; 2498 ierr = PetscCalloc1(A->cmap->n+1,&tmp);CHKERRQ(ierr); 2499 for (i=0; i<nz; i++) { 2500 for (j=0; j<bs; j++) { 2501 k1 = bs*(*bcol) + j; /* column index */ 2502 for (k=0; k<bs; k++) { 2503 tmp[k1] += PetscAbsScalar(*v); v++; 2504 } 2505 } 2506 bcol++; 2507 } 2508 *norm = 0.0; 2509 for (j=0; j<A->cmap->n; j++) { 2510 if (tmp[j] > *norm) *norm = tmp[j]; 2511 } 2512 ierr = PetscFree(tmp);CHKERRQ(ierr); 2513 } else if (type == NORM_INFINITY) { /* maximum row sum */ 2514 *norm = 0.0; 2515 for (k=0; k<bs; k++) { 2516 for (j=0; j<a->mbs; j++) { 2517 v = a->a + bs2*a->i[j] + k; 2518 sum = 0.0; 2519 for (i=0; i<a->i[j+1]-a->i[j]; i++) { 2520 for (k1=0; k1<bs; k1++) { 2521 sum += PetscAbsScalar(*v); 2522 v += bs; 2523 } 2524 } 2525 if (sum > *norm) *norm = sum; 2526 } 2527 } 2528 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 2529 PetscFunctionReturn(0); 2530 } 2531 2532 2533 #undef __FUNCT__ 2534 #define __FUNCT__ "MatEqual_SeqBAIJ" 2535 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg) 2536 { 2537 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 2538 PetscErrorCode ierr; 2539 2540 PetscFunctionBegin; 2541 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 2542 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 2543 *flg = PETSC_FALSE; 2544 PetscFunctionReturn(0); 2545 } 2546 2547 /* if the a->i are the same */ 2548 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 2549 if (!*flg) PetscFunctionReturn(0); 2550 2551 /* if a->j are the same */ 2552 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 2553 if (!*flg) PetscFunctionReturn(0); 2554 2555 /* if a->a are the same */ 2556 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 2557 PetscFunctionReturn(0); 2558 2559 } 2560 2561 #undef __FUNCT__ 2562 #define __FUNCT__ "MatGetDiagonal_SeqBAIJ" 2563 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v) 2564 { 2565 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2566 PetscErrorCode ierr; 2567 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 2568 PetscScalar *x,zero = 0.0; 2569 MatScalar *aa,*aa_j; 2570 2571 PetscFunctionBegin; 2572 if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 2573 bs = A->rmap->bs; 2574 aa = a->a; 2575 ai = a->i; 2576 aj = a->j; 2577 ambs = a->mbs; 2578 bs2 = a->bs2; 2579 2580 ierr = VecSet(v,zero);CHKERRQ(ierr); 2581 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 2582 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 2583 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 2584 for (i=0; i<ambs; i++) { 2585 for (j=ai[i]; j<ai[i+1]; j++) { 2586 if (aj[j] == i) { 2587 row = i*bs; 2588 aa_j = aa+j*bs2; 2589 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 2590 break; 2591 } 2592 } 2593 } 2594 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 2595 PetscFunctionReturn(0); 2596 } 2597 2598 #undef __FUNCT__ 2599 #define __FUNCT__ "MatDiagonalScale_SeqBAIJ" 2600 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr) 2601 { 2602 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2603 const PetscScalar *l,*r,*li,*ri; 2604 PetscScalar x; 2605 MatScalar *aa, *v; 2606 PetscErrorCode ierr; 2607 PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai; 2608 const PetscInt *ai,*aj; 2609 2610 PetscFunctionBegin; 2611 ai = a->i; 2612 aj = a->j; 2613 aa = a->a; 2614 m = A->rmap->n; 2615 n = A->cmap->n; 2616 bs = A->rmap->bs; 2617 mbs = a->mbs; 2618 bs2 = a->bs2; 2619 if (ll) { 2620 ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr); 2621 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 2622 if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 2623 for (i=0; i<mbs; i++) { /* for each block row */ 2624 M = ai[i+1] - ai[i]; 2625 li = l + i*bs; 2626 v = aa + bs2*ai[i]; 2627 for (j=0; j<M; j++) { /* for each block */ 2628 for (k=0; k<bs2; k++) { 2629 (*v++) *= li[k%bs]; 2630 } 2631 } 2632 } 2633 ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr); 2634 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2635 } 2636 2637 if (rr) { 2638 ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr); 2639 ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr); 2640 if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 2641 for (i=0; i<mbs; i++) { /* for each block row */ 2642 iai = ai[i]; 2643 M = ai[i+1] - iai; 2644 v = aa + bs2*iai; 2645 for (j=0; j<M; j++) { /* for each block */ 2646 ri = r + bs*aj[iai+j]; 2647 for (k=0; k<bs; k++) { 2648 x = ri[k]; 2649 for (tmp=0; tmp<bs; tmp++) v[tmp] *= x; 2650 v += bs; 2651 } 2652 } 2653 } 2654 ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr); 2655 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2656 } 2657 PetscFunctionReturn(0); 2658 } 2659 2660 2661 #undef __FUNCT__ 2662 #define __FUNCT__ "MatGetInfo_SeqBAIJ" 2663 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info) 2664 { 2665 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2666 2667 PetscFunctionBegin; 2668 info->block_size = a->bs2; 2669 info->nz_allocated = a->bs2*a->maxnz; 2670 info->nz_used = a->bs2*a->nz; 2671 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 2672 info->assemblies = A->num_ass; 2673 info->mallocs = A->info.mallocs; 2674 info->memory = ((PetscObject)A)->mem; 2675 if (A->factortype) { 2676 info->fill_ratio_given = A->info.fill_ratio_given; 2677 info->fill_ratio_needed = A->info.fill_ratio_needed; 2678 info->factor_mallocs = A->info.factor_mallocs; 2679 } else { 2680 info->fill_ratio_given = 0; 2681 info->fill_ratio_needed = 0; 2682 info->factor_mallocs = 0; 2683 } 2684 PetscFunctionReturn(0); 2685 } 2686 2687 2688 #if defined(PETSC_THREADCOMM_ACTIVE) 2689 PetscErrorCode MatZeroEntries_SeqBAIJ_Kernel(PetscInt thread_id,Mat A) 2690 { 2691 PetscErrorCode ierr; 2692 PetscInt *trstarts=A->rmap->trstarts; 2693 PetscInt n,start,end; 2694 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2695 2696 start = trstarts[thread_id]/A->rmap->bs; 2697 end = trstarts[thread_id+1]/A->rmap->bs; 2698 n = a->i[end] - a->i[start]; 2699 ierr = PetscMemzero(a->a+a->bs2*a->i[start],a->bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 2700 return 0; 2701 } 2702 2703 #undef __FUNCT__ 2704 #define __FUNCT__ "MatZeroEntries_SeqBAIJ" 2705 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 2706 { 2707 PetscErrorCode ierr; 2708 2709 2710 PetscFunctionBegin; 2711 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatZeroEntries_SeqBAIJ_Kernel,1,A);CHKERRQ(ierr); 2712 PetscFunctionReturn(0); 2713 } 2714 #else 2715 #undef __FUNCT__ 2716 #define __FUNCT__ "MatZeroEntries_SeqBAIJ" 2717 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 2718 { 2719 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2720 PetscErrorCode ierr; 2721 2722 PetscFunctionBegin; 2723 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 2724 PetscFunctionReturn(0); 2725 } 2726 #endif 2727