1 2 #include <../src/mat/impls/baij/seq/baij.h> 3 #include <petsc-private/kernels/blockinvert.h> 4 #include <petscbt.h> 5 #include <petscblaslapack.h> 6 #if defined(PETSC_THREADCOMM_ACTIVE) 7 #include <petscthreadcomm.h> 8 #endif 9 10 #undef __FUNCT__ 11 #define __FUNCT__ "MatIncreaseOverlap_SeqBAIJ" 12 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 13 { 14 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 15 PetscErrorCode ierr; 16 PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival; 17 const PetscInt *idx; 18 PetscInt start,end,*ai,*aj,bs,*nidx2; 19 PetscBT table; 20 21 PetscFunctionBegin; 22 m = a->mbs; 23 ai = a->i; 24 aj = a->j; 25 bs = A->rmap->bs; 26 27 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 28 29 ierr = PetscBTCreate(m,&table);CHKERRQ(ierr); 30 ierr = PetscMalloc1(m+1,&nidx);CHKERRQ(ierr); 31 ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr); 32 33 for (i=0; i<is_max; i++) { 34 /* Initialise the two local arrays */ 35 isz = 0; 36 ierr = PetscBTMemzero(m,table);CHKERRQ(ierr); 37 38 /* Extract the indices, assume there can be duplicate entries */ 39 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 40 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 41 42 /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 43 for (j=0; j<n; ++j) { 44 ival = idx[j]/bs; /* convert the indices into block indices */ 45 if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 46 if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival; 47 } 48 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 49 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 50 51 k = 0; 52 for (j=0; j<ov; j++) { /* for each overlap*/ 53 n = isz; 54 for (; k<n; k++) { /* do only those rows in nidx[k], which are not done yet */ 55 row = nidx[k]; 56 start = ai[row]; 57 end = ai[row+1]; 58 for (l = start; l<end; l++) { 59 val = aj[l]; 60 if (!PetscBTLookupSet(table,val)) nidx[isz++] = val; 61 } 62 } 63 } 64 /* expand the Index Set */ 65 for (j=0; j<isz; j++) { 66 for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k; 67 } 68 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 69 } 70 ierr = PetscBTDestroy(&table);CHKERRQ(ierr); 71 ierr = PetscFree(nidx);CHKERRQ(ierr); 72 ierr = PetscFree(nidx2);CHKERRQ(ierr); 73 PetscFunctionReturn(0); 74 } 75 76 #undef __FUNCT__ 77 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ_Private" 78 PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 79 { 80 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c; 81 PetscErrorCode ierr; 82 PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 83 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 84 const PetscInt *irow,*icol; 85 PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 86 PetscInt *aj = a->j,*ai = a->i; 87 MatScalar *mat_a; 88 Mat C; 89 PetscBool flag,sorted; 90 91 PetscFunctionBegin; 92 ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr); 93 if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 94 95 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 96 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 97 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 98 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 99 100 ierr = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr); 101 ssmap = smap; 102 ierr = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr); 103 for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 104 /* determine lens of each row */ 105 for (i=0; i<nrows; i++) { 106 kstart = ai[irow[i]]; 107 kend = kstart + a->ilen[irow[i]]; 108 lens[i] = 0; 109 for (k=kstart; k<kend; k++) { 110 if (ssmap[aj[k]]) lens[i]++; 111 } 112 } 113 /* Create and fill new matrix */ 114 if (scall == MAT_REUSE_MATRIX) { 115 c = (Mat_SeqBAIJ*)((*B)->data); 116 117 if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 118 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 119 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 120 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 121 C = *B; 122 } else { 123 ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr); 124 ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 125 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 126 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);CHKERRQ(ierr); 127 } 128 c = (Mat_SeqBAIJ*)(C->data); 129 for (i=0; i<nrows; i++) { 130 row = irow[i]; 131 kstart = ai[row]; 132 kend = kstart + a->ilen[row]; 133 mat_i = c->i[i]; 134 mat_j = c->j + mat_i; 135 mat_a = c->a + mat_i*bs2; 136 mat_ilen = c->ilen + i; 137 for (k=kstart; k<kend; k++) { 138 if ((tcol=ssmap[a->j[k]])) { 139 *mat_j++ = tcol - 1; 140 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 141 mat_a += bs2; 142 (*mat_ilen)++; 143 } 144 } 145 } 146 /* sort */ 147 { 148 MatScalar *work; 149 150 ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr); 151 for (i=0; i<nrows; i++) { 152 PetscInt ilen; 153 mat_i = c->i[i]; 154 mat_j = c->j + mat_i; 155 mat_a = c->a + mat_i*bs2; 156 ilen = c->ilen[i]; 157 ierr = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr); 158 } 159 ierr = PetscFree(work);CHKERRQ(ierr); 160 } 161 162 /* Free work space */ 163 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 164 ierr = PetscFree(smap);CHKERRQ(ierr); 165 ierr = PetscFree(lens);CHKERRQ(ierr); 166 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 167 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 168 169 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 170 *B = C; 171 PetscFunctionReturn(0); 172 } 173 174 #undef __FUNCT__ 175 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ" 176 PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 177 { 178 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 179 IS is1,is2; 180 PetscErrorCode ierr; 181 PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count; 182 const PetscInt *irow,*icol; 183 184 PetscFunctionBegin; 185 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 186 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 187 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 188 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 189 190 /* Verify if the indices corespond to each element in a block 191 and form the IS with compressed IS */ 192 ierr = PetscMalloc2(a->mbs,&vary,a->mbs,&iary);CHKERRQ(ierr); 193 ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr); 194 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 195 count = 0; 196 for (i=0; i<a->mbs; i++) { 197 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 198 if (vary[i]==bs) iary[count++] = i; 199 } 200 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr); 201 202 ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr); 203 for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 204 count = 0; 205 for (i=0; i<a->mbs; i++) { 206 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc"); 207 if (vary[i]==bs) iary[count++] = i; 208 } 209 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr); 210 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 211 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 212 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 213 214 ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr); 215 ierr = ISDestroy(&is1);CHKERRQ(ierr); 216 ierr = ISDestroy(&is2);CHKERRQ(ierr); 217 PetscFunctionReturn(0); 218 } 219 220 #undef __FUNCT__ 221 #define __FUNCT__ "MatGetSubMatrices_SeqBAIJ" 222 PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 223 { 224 PetscErrorCode ierr; 225 PetscInt i; 226 227 PetscFunctionBegin; 228 if (scall == MAT_INITIAL_MATRIX) { 229 ierr = PetscMalloc1(n+1,B);CHKERRQ(ierr); 230 } 231 232 for (i=0; i<n; i++) { 233 ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 234 } 235 PetscFunctionReturn(0); 236 } 237 238 239 /* -------------------------------------------------------*/ 240 /* Should check that shapes of vectors and matrices match */ 241 /* -------------------------------------------------------*/ 242 243 #if defined(PETSC_THREADCOMM_ACTIVE) 244 PetscErrorCode MatMult_SeqBAIJ_1_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz) 245 { 246 PetscErrorCode ierr; 247 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 248 PetscScalar *z; 249 const PetscScalar *x; 250 const MatScalar *aa; 251 PetscInt *trstarts=A->rmap->trstarts; 252 PetscInt n,start,end,i; 253 const PetscInt *aj,*ai; 254 PetscScalar sum; 255 256 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 257 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 258 start = trstarts[thread_id]; 259 end = trstarts[thread_id+1]; 260 ai = a->i; 261 for (i=start; i<end; i++) { 262 n = ai[i+1] - ai[i]; 263 aj = a->j + ai[i]; 264 aa = a->a + ai[i]; 265 sum = 0.0; 266 PetscSparseDensePlusDot(sum,x,aa,aj,n); 267 z[i] = sum; 268 } 269 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 270 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 271 return 0; 272 } 273 274 #undef __FUNCT__ 275 #define __FUNCT__ "MatMult_SeqBAIJ_1" 276 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz) 277 { 278 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 279 PetscScalar *z,sum; 280 const PetscScalar *x; 281 const MatScalar *v; 282 PetscErrorCode ierr; 283 PetscInt mbs,i,n; 284 const PetscInt *idx,*ii,*ridx=NULL; 285 PetscBool usecprow=a->compressedrow.use; 286 287 PetscFunctionBegin; 288 289 if (usecprow) { 290 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 291 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 292 mbs = a->compressedrow.nrows; 293 ii = a->compressedrow.i; 294 ridx = a->compressedrow.rindex; 295 ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 296 for (i=0; i<mbs; i++) { 297 n = ii[i+1] - ii[i]; 298 v = a->a + ii[i]; 299 idx = a->j + ii[i]; 300 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 301 PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 302 sum = 0.0; 303 PetscSparseDensePlusDot(sum,x,v,idx,n); 304 z[ridx[i]] = sum; 305 } 306 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 307 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 308 } else { 309 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_1_Kernel,3,A,xx,zz);CHKERRQ(ierr); 310 } 311 ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr); 312 PetscFunctionReturn(0); 313 } 314 #else 315 #undef __FUNCT__ 316 #define __FUNCT__ "MatMult_SeqBAIJ_1" 317 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz) 318 { 319 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 320 PetscScalar *z,sum; 321 const PetscScalar *x; 322 const MatScalar *v; 323 PetscErrorCode ierr; 324 PetscInt mbs,i,n; 325 const PetscInt *idx,*ii,*ridx=NULL; 326 PetscBool usecprow=a->compressedrow.use; 327 328 PetscFunctionBegin; 329 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 330 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 331 332 if (usecprow) { 333 mbs = a->compressedrow.nrows; 334 ii = a->compressedrow.i; 335 ridx = a->compressedrow.rindex; 336 ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 337 } else { 338 mbs = a->mbs; 339 ii = a->i; 340 } 341 342 for (i=0; i<mbs; i++) { 343 n = ii[1] - ii[0]; 344 v = a->a + ii[0]; 345 idx = a->j + ii[0]; 346 ii++; 347 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 348 PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 349 sum = 0.0; 350 PetscSparseDensePlusDot(sum,x,v,idx,n); 351 if (usecprow) { 352 z[ridx[i]] = sum; 353 } else { 354 z[i] = sum; 355 } 356 } 357 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 358 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 359 ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr); 360 PetscFunctionReturn(0); 361 } 362 #endif 363 364 #if defined(PETSC_THREADCOMM_ACTIVE) 365 PetscErrorCode MatMult_SeqBAIJ_2_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz) 366 { 367 PetscErrorCode ierr; 368 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 369 PetscScalar *z,x1,x2,sum1,sum2; 370 const PetscScalar *x,*xb; 371 const MatScalar *aa; 372 PetscInt *trstarts=A->rmap->trstarts; 373 PetscInt n,start,end,i,j; 374 const PetscInt *aj,*ai; 375 376 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 377 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 378 start = trstarts[thread_id] / 2; 379 end = trstarts[thread_id+1] / 2; 380 ai = a->i; 381 for (i=start; i<end; i++) { 382 n = ai[i+1] - ai[i]; 383 aj = a->j + ai[i]; 384 aa = a->a + ai[i]*4; 385 sum1 = 0.0; sum2 = 0.0; 386 for (j=0; j<n; j++) { 387 xb = x + 2*aj[j]; x1 = xb[0]; x2 = xb[1]; 388 sum1 += aa[4*j]*x1 + aa[4*j+2]*x2; 389 sum2 += aa[4*j+1]*x1 + aa[4*j+3]*x2; 390 } 391 z[2*i] = sum1; z[2*i+1] = sum2; 392 } 393 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 394 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 395 return 0; 396 } 397 398 #undef __FUNCT__ 399 #define __FUNCT__ "MatMult_SeqBAIJ_2" 400 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) 401 { 402 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 403 PetscScalar *z,x1,x2,sum1,sum2; 404 const PetscScalar *x,*xb; 405 const MatScalar *v; 406 PetscErrorCode ierr; 407 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 408 PetscBool usecprow=a->compressedrow.use; 409 410 PetscFunctionBegin; 411 412 if (usecprow) { 413 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 414 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 415 mbs = a->compressedrow.nrows; 416 ii = a->compressedrow.i; 417 ridx = a->compressedrow.rindex; 418 for (i=0; i<mbs; i++) { 419 n = ii[i+1] - ii[i]; 420 idx = a->j + ii[i]; 421 v = a->a + ii[i]*4; 422 sum1 = 0.0; sum2 = 0.0; 423 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 424 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 425 for (j=0; j<n; j++) { 426 xb = x + 2*idx[j]; x1 = xb[0]; x2 = xb[1]; 427 sum1 += v[4*j]*x1 + v[4*j+2]*x2; 428 sum2 += v[4*j+1]*x1 + v[4*j+3]*x2; 429 } 430 z[2*ridx[i]] = sum1; z[2*ridx[i]+1] = sum2; 431 } 432 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 433 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 434 } else { 435 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_2_Kernel,3,A,xx,zz);CHKERRQ(ierr); 436 } 437 ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr); 438 PetscFunctionReturn(0); 439 } 440 #else 441 #undef __FUNCT__ 442 #define __FUNCT__ "MatMult_SeqBAIJ_2" 443 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) 444 { 445 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 446 PetscScalar *z = 0,sum1,sum2,*zarray; 447 const PetscScalar *x,*xb; 448 PetscScalar x1,x2; 449 const MatScalar *v; 450 PetscErrorCode ierr; 451 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 452 PetscBool usecprow=a->compressedrow.use; 453 454 PetscFunctionBegin; 455 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 456 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 457 458 idx = a->j; 459 v = a->a; 460 if (usecprow) { 461 mbs = a->compressedrow.nrows; 462 ii = a->compressedrow.i; 463 ridx = a->compressedrow.rindex; 464 } else { 465 mbs = a->mbs; 466 ii = a->i; 467 z = zarray; 468 } 469 470 for (i=0; i<mbs; i++) { 471 n = ii[1] - ii[0]; ii++; 472 sum1 = 0.0; sum2 = 0.0; 473 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 474 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 475 for (j=0; j<n; j++) { 476 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 477 sum1 += v[0]*x1 + v[2]*x2; 478 sum2 += v[1]*x1 + v[3]*x2; 479 v += 4; 480 } 481 if (usecprow) z = zarray + 2*ridx[i]; 482 z[0] = sum1; z[1] = sum2; 483 if (!usecprow) z += 2; 484 } 485 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 486 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 487 ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr); 488 PetscFunctionReturn(0); 489 } 490 #endif 491 492 #if defined(PETSC_THREADCOMM_ACTIVE) 493 PetscErrorCode MatMult_SeqBAIJ_3_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz) 494 { 495 PetscErrorCode ierr; 496 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 497 PetscScalar *z,x1,x2,x3,sum1,sum2,sum3; 498 const PetscScalar *x,*xb; 499 const MatScalar *aa; 500 PetscInt *trstarts=A->rmap->trstarts; 501 PetscInt n,start,end,i,j; 502 const PetscInt *aj,*ai; 503 504 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 505 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 506 start = trstarts[thread_id] / 3; 507 end = trstarts[thread_id+1] / 3; 508 ai = a->i; 509 for (i=start; i<end; i++) { 510 n = ai[i+1] - ai[i]; 511 aj = a->j + ai[i]; 512 aa = a->a + ai[i]*9; 513 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 514 for (j=0; j<n; j++) { 515 xb = x + 3*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 516 sum1 += aa[9*j]*x1 + aa[9*j+3]*x2 + aa[9*j+6]*x3; 517 sum2 += aa[9*j+1]*x1 + aa[9*j+4]*x2 + aa[9*j+7]*x3; 518 sum3 += aa[9*j+2]*x1 + aa[9*j+5]*x2 + aa[9*j+8]*x3; 519 } 520 z[3*i] = sum1; z[3*i+1] = sum2; z[3*i+2] = sum3; 521 } 522 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 523 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 524 return 0; 525 } 526 527 #undef __FUNCT__ 528 #define __FUNCT__ "MatMult_SeqBAIJ_3" 529 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) 530 { 531 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 532 PetscScalar *z,sum1,sum2,sum3,x1,x2,x3; 533 const PetscScalar *x,*xb; 534 const MatScalar *v; 535 PetscErrorCode ierr; 536 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 537 PetscBool usecprow=a->compressedrow.use; 538 539 540 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 541 #pragma disjoint(*v,*z,*xb) 542 #endif 543 544 PetscFunctionBegin; 545 546 if (usecprow) { 547 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 548 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 549 mbs = a->compressedrow.nrows; 550 ii = a->compressedrow.i; 551 ridx = a->compressedrow.rindex; 552 for (i=0; i<mbs; i++) { 553 n = ii[i+1] - ii[i]; 554 idx = a->j + ii[i]; 555 v = a->a + ii[i]*9; 556 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 557 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 558 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 559 for (j=0; j<n; j++) { 560 xb = x + 3*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 561 sum1 += v[9*j]*x1 + v[9*j+3]*x2 + v[9*j+6]*x3; 562 sum2 += v[9*j+1]*x1 + v[9*j+4]*x2 + v[9*j+7]*x3; 563 sum3 += v[9*j+2]*x1 + v[9*j+5]*x2 + v[9*j+8]*x3; 564 } 565 z[3*ridx[i]] = sum1; z[3*ridx[i]+1] = sum2; z[3*ridx[i]+2] = sum3; 566 } 567 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 568 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 569 } else { 570 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_3_Kernel,3,A,xx,zz);CHKERRQ(ierr); 571 } 572 ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr); 573 PetscFunctionReturn(0); 574 } 575 #else 576 #undef __FUNCT__ 577 #define __FUNCT__ "MatMult_SeqBAIJ_3" 578 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) 579 { 580 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 581 PetscScalar *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray; 582 const PetscScalar *x,*xb; 583 const MatScalar *v; 584 PetscErrorCode ierr; 585 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 586 PetscBool usecprow=a->compressedrow.use; 587 588 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 589 #pragma disjoint(*v,*z,*xb) 590 #endif 591 592 PetscFunctionBegin; 593 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 594 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 595 596 idx = a->j; 597 v = a->a; 598 if (usecprow) { 599 mbs = a->compressedrow.nrows; 600 ii = a->compressedrow.i; 601 ridx = a->compressedrow.rindex; 602 } else { 603 mbs = a->mbs; 604 ii = a->i; 605 z = zarray; 606 } 607 608 for (i=0; i<mbs; i++) { 609 n = ii[1] - ii[0]; ii++; 610 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 611 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 612 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 613 for (j=0; j<n; j++) { 614 xb = x + 3*(*idx++); 615 x1 = xb[0]; 616 x2 = xb[1]; 617 x3 = xb[2]; 618 619 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 620 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 621 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 622 v += 9; 623 } 624 if (usecprow) z = zarray + 3*ridx[i]; 625 z[0] = sum1; z[1] = sum2; z[2] = sum3; 626 if (!usecprow) z += 3; 627 } 628 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 629 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 630 ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr); 631 PetscFunctionReturn(0); 632 } 633 #endif 634 635 #if defined(PETSC_THREADCOMM_ACTIVE) 636 PetscErrorCode MatMult_SeqBAIJ_4_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz) 637 { 638 PetscErrorCode ierr; 639 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 640 PetscScalar *z,x1,x2,x3,x4,sum1,sum2,sum3,sum4; 641 const PetscScalar *x,*xb; 642 const MatScalar *aa; 643 PetscInt *trstarts=A->rmap->trstarts; 644 PetscInt n,start,end,i,j; 645 const PetscInt *aj,*ai; 646 647 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 648 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 649 start = trstarts[thread_id] / 4; 650 end = trstarts[thread_id+1] / 4; 651 ai = a->i; 652 for (i=start; i<end; i++) { 653 n = ai[i+1] - ai[i]; 654 aj = a->j + ai[i]; 655 aa = a->a + ai[i]*16; 656 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; 657 for (j=0; j<n; j++) { 658 xb = x + 4*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 659 sum1 += aa[16*j]*x1 + aa[16*j+4]*x2 + aa[16*j+8]*x3 + aa[16*j+12]*x4; 660 sum2 += aa[16*j+1]*x1 + aa[16*j+5]*x2 + aa[16*j+9]*x3 + aa[16*j+13]*x4; 661 sum3 += aa[16*j+2]*x1 + aa[16*j+6]*x2 + aa[16*j+10]*x3 + aa[16*j+14]*x4; 662 sum4 += aa[16*j+3]*x1 + aa[16*j+7]*x2 + aa[16*j+11]*x3 + aa[16*j+15]*x4; 663 } 664 z[4*i] = sum1; z[4*i+1] = sum2; 665 z[4*i+2] = sum3; z[4*i+3] = sum4; 666 } 667 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 668 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 669 return 0; 670 } 671 672 #undef __FUNCT__ 673 #define __FUNCT__ "MatMult_SeqBAIJ_4" 674 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) 675 { 676 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 677 PetscScalar *z,x1,x2,x3,x4,sum1,sum2,sum3,sum4; 678 const PetscScalar *x,*xb; 679 const MatScalar *v; 680 PetscErrorCode ierr; 681 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 682 PetscBool usecprow=a->compressedrow.use; 683 684 PetscFunctionBegin; 685 686 if (usecprow) { 687 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 688 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 689 mbs = a->compressedrow.nrows; 690 ii = a->compressedrow.i; 691 ridx = a->compressedrow.rindex; 692 for (i=0; i<mbs; i++) { 693 n = ii[i+1] - ii[1]; 694 idx = a->j + ii[i]; 695 v = a->a + ii[i]*16; 696 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; 697 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 698 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 699 for (j=0; j<n; j++) { 700 xb = x + 4*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 701 sum1 += v[16*j]*x1 + v[16*j+4]*x2 + v[16*j+8]*x3 + v[16*j+12]*x4; 702 sum2 += v[16*j+1]*x1 + v[16*j+5]*x2 + v[16*j+9]*x3 + v[16*j+13]*x4; 703 sum3 += v[16*j+2]*x1 + v[16*j+6]*x2 + v[16*j+10]*x3 + v[16*j+14]*x4; 704 sum4 += v[16*j+3]*x1 + v[16*j+7]*x2 + v[16*j+11]*x3 + v[16*j+15]*x4; 705 } 706 z[4*ridx[i]] = sum1; z[4*ridx[i]+1] = sum2; 707 z[4*ridx[i]+2] = sum3; z[4*ridx[i]+3] = sum4; 708 } 709 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 710 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 711 } else { 712 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_4_Kernel,3,A,xx,zz);CHKERRQ(ierr); 713 } 714 ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr); 715 PetscFunctionReturn(0); 716 } 717 #else 718 #undef __FUNCT__ 719 #define __FUNCT__ "MatMult_SeqBAIJ_4" 720 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) 721 { 722 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 723 PetscScalar *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray; 724 const PetscScalar *x,*xb; 725 const MatScalar *v; 726 PetscErrorCode ierr; 727 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 728 PetscBool usecprow=a->compressedrow.use; 729 730 PetscFunctionBegin; 731 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 732 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 733 734 idx = a->j; 735 v = a->a; 736 if (usecprow) { 737 mbs = a->compressedrow.nrows; 738 ii = a->compressedrow.i; 739 ridx = a->compressedrow.rindex; 740 } else { 741 mbs = a->mbs; 742 ii = a->i; 743 z = zarray; 744 } 745 746 for (i=0; i<mbs; i++) { 747 n = ii[1] - ii[0]; 748 ii++; 749 sum1 = 0.0; 750 sum2 = 0.0; 751 sum3 = 0.0; 752 sum4 = 0.0; 753 754 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 755 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 756 for (j=0; j<n; j++) { 757 xb = x + 4*(*idx++); 758 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 759 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 760 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 761 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 762 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 763 v += 16; 764 } 765 if (usecprow) z = zarray + 4*ridx[i]; 766 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 767 if (!usecprow) z += 4; 768 } 769 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 770 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 771 ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr); 772 PetscFunctionReturn(0); 773 } 774 #endif 775 776 #if defined(PETSC_THREADCOMM_ACTIVE) 777 PetscErrorCode MatMult_SeqBAIJ_5_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz) 778 { 779 PetscErrorCode ierr; 780 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 781 PetscScalar *z,x1,x2,x3,x4,x5,sum1,sum2,sum3,sum4,sum5; 782 const PetscScalar *x,*xb; 783 const MatScalar *aa; 784 PetscInt *trstarts=A->rmap->trstarts; 785 PetscInt n,start,end,i,j; 786 const PetscInt *aj,*ai; 787 788 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 789 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 790 start = trstarts[thread_id] / 5; 791 end = trstarts[thread_id+1] / 5; 792 ai = a->i; 793 for (i=start; i<end; i++) { 794 n = ai[i+1] - ai[i]; 795 aj = a->j + ai[i]; 796 aa = a->a + ai[i]*25; 797 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 798 for (j=0; j<n; j++) { 799 xb = x + 5*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 800 sum1 += aa[25*j]*x1 + aa[25*j+5]*x2 + aa[25*j+10]*x3 + aa[25*j+15]*x4 + aa[25*j+20]*x5; 801 sum2 += aa[25*j+1]*x1 + aa[25*j+6]*x2 + aa[25*j+11]*x3 + aa[25*j+16]*x4 + aa[25*j+21]*x5; 802 sum3 += aa[25*j+2]*x1 + aa[25*j+7]*x2 + aa[25*j+12]*x3 + aa[25*j+17]*x4 + aa[25*j+22]*x5; 803 sum4 += aa[25*j+3]*x1 + aa[25*j+8]*x2 + aa[25*j+13]*x3 + aa[25*j+18]*x4 + aa[25*j+23]*x5; 804 sum5 += aa[25*j+4]*x1 + aa[25*j+9]*x2 + aa[25*j+14]*x3 + aa[25*j+19]*x4 + aa[25*j+24]*x5; 805 } 806 z[5*i] = sum1; z[5*i+1] = sum2; z[5*i+2] = sum3; 807 z[5*i+3] = sum4; z[5*i+4] = sum5; 808 } 809 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 810 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 811 return 0; 812 } 813 814 #undef __FUNCT__ 815 #define __FUNCT__ "MatMult_SeqBAIJ_5" 816 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) 817 { 818 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 819 PetscScalar *z,x1,x2,x3,x4,x5,sum1,sum2,sum3,sum4,sum5; 820 const PetscScalar *xb,*x; 821 const MatScalar *v; 822 PetscErrorCode ierr; 823 const PetscInt *idx,*ii,*ridx=NULL; 824 PetscInt mbs,i,j,n; 825 PetscBool usecprow=a->compressedrow.use; 826 827 PetscFunctionBegin; 828 829 if (usecprow) { 830 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 831 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 832 mbs = a->compressedrow.nrows; 833 ii = a->compressedrow.i; 834 ridx = a->compressedrow.rindex; 835 for (i=0; i<mbs; i++) { 836 n = ii[i+1] - ii[i]; 837 idx = a->j + ii[i]; 838 v = a->a + ii[i]*25; 839 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 840 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 841 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 842 for (j=0; j<n; j++) { 843 xb = x + 5*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 844 sum1 += v[25*j]*x1 + v[25*j+5]*x2 + v[25*j+10]*x3 + v[25*j+15]*x4 + v[25*j+20]*x5; 845 sum2 += v[25*j+1]*x1 + v[25*j+6]*x2 + v[25*j+11]*x3 + v[25*j+16]*x4 + v[25*j+21]*x5; 846 sum3 += v[25*j+2]*x1 + v[25*j+7]*x2 + v[25*j+12]*x3 + v[25*j+17]*x4 + v[25*j+22]*x5; 847 sum4 += v[25*j+3]*x1 + v[25*j+8]*x2 + v[25*j+13]*x3 + v[25*j+18]*x4 + v[25*j+23]*x5; 848 sum5 += v[25*j+4]*x1 + v[25*j+9]*x2 + v[25*j+14]*x3 + v[25*j+19]*x4 + v[25*j+24]*x5; 849 } 850 z[5*ridx[i]] = sum1; z[5*ridx[i]+1] = sum2; z[5*ridx[i]+2] = sum3; 851 z[5*ridx[i]+3] = sum4; z[5*ridx[i]+4] = sum5; 852 } 853 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 854 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 855 } else { 856 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_5_Kernel,3,A,xx,zz);CHKERRQ(ierr); 857 } 858 ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr); 859 PetscFunctionReturn(0); 860 } 861 #else 862 #undef __FUNCT__ 863 #define __FUNCT__ "MatMult_SeqBAIJ_5" 864 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) 865 { 866 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 867 PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray; 868 const PetscScalar *xb,*x; 869 const MatScalar *v; 870 PetscErrorCode ierr; 871 const PetscInt *idx,*ii,*ridx=NULL; 872 PetscInt mbs,i,j,n; 873 PetscBool usecprow=a->compressedrow.use; 874 875 PetscFunctionBegin; 876 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 877 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 878 879 idx = a->j; 880 v = a->a; 881 if (usecprow) { 882 mbs = a->compressedrow.nrows; 883 ii = a->compressedrow.i; 884 ridx = a->compressedrow.rindex; 885 } else { 886 mbs = a->mbs; 887 ii = a->i; 888 z = zarray; 889 } 890 891 for (i=0; i<mbs; i++) { 892 n = ii[1] - ii[0]; ii++; 893 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 894 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 895 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 896 for (j=0; j<n; j++) { 897 xb = x + 5*(*idx++); 898 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 899 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 900 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 901 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 902 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 903 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 904 v += 25; 905 } 906 if (usecprow) z = zarray + 5*ridx[i]; 907 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 908 if (!usecprow) z += 5; 909 } 910 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 911 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 912 ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr); 913 PetscFunctionReturn(0); 914 } 915 #endif 916 917 918 #if defined(PETSC_THREADCOMM_ACTIVE) 919 PetscErrorCode MatMult_SeqBAIJ_6_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz) 920 { 921 PetscErrorCode ierr; 922 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 923 PetscScalar *z,x1,x2,x3,x4,x5,x6,sum1,sum2,sum3,sum4,sum5,sum6; 924 const PetscScalar *x,*xb; 925 const MatScalar *aa; 926 PetscInt *trstarts=A->rmap->trstarts; 927 PetscInt n,start,end,i,j; 928 const PetscInt *aj,*ai; 929 930 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 931 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 932 start = trstarts[thread_id] / 6; 933 end = trstarts[thread_id+1] / 6; 934 ai = a->i; 935 for (i=start; i<end; i++) { 936 n = ai[i+1] - ai[i]; 937 aj = a->j + ai[i]; 938 aa = a->a + ai[i]*36; 939 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; 940 for (j=0; j<n; j++) { 941 xb = x + 6*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 942 sum1 += aa[36*j]*x1 + aa[36*j+6]*x2 + aa[36*j+12]*x3 + aa[36*j+18]*x4 + aa[36*j+24]*x5 + aa[36*j+30]*x6; 943 sum2 += aa[36*j+1]*x1 + aa[36*j+7]*x2 + aa[36*j+13]*x3 + aa[36*j+19]*x4 + aa[36*j+25]*x5 + aa[36*j+31]*x6; 944 sum3 += aa[36*j+2]*x1 + aa[36*j+8]*x2 + aa[36*j+14]*x3 + aa[36*j+20]*x4 + aa[36*j+26]*x5 + aa[36*j+32]*x6; 945 sum4 += aa[36*j+3]*x1 + aa[36*j+9]*x2 + aa[36*j+15]*x3 + aa[36*j+21]*x4 + aa[36*j+27]*x5 + aa[36*j+33]*x6; 946 sum5 += aa[36*j+4]*x1 + aa[36*j+10]*x2 + aa[36*j+16]*x3 + aa[36*j+22]*x4 + aa[36*j+28]*x5 + aa[36*j+34]*x6; 947 sum6 += aa[36*j+5]*x1 + aa[36*j+11]*x2 + aa[36*j+17]*x3 + aa[36*j+23]*x4 + aa[36*j+29]*x5 + aa[36*j+35]*x6; 948 } 949 z[6*i] = sum1; z[6*i+1] = sum2; z[6*i+2] = sum3; 950 z[6*i+3] = sum4; z[6*i+4] = sum5; z[6*i+5] = sum6; 951 } 952 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 953 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 954 return 0; 955 } 956 957 #undef __FUNCT__ 958 #define __FUNCT__ "MatMult_SeqBAIJ_6" 959 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz) 960 { 961 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 962 PetscScalar *z,x1,x2,x3,x4,x5,x6,sum1,sum2,sum3,sum4,sum5,sum6; 963 const PetscScalar *x,*xb; 964 const MatScalar *v; 965 PetscErrorCode ierr; 966 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 967 PetscBool usecprow=a->compressedrow.use; 968 969 PetscFunctionBegin; 970 971 if (usecprow) { 972 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 973 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 974 mbs = a->compressedrow.nrows; 975 ii = a->compressedrow.i; 976 ridx = a->compressedrow.rindex; 977 for (i=0; i<mbs; i++) { 978 n = ii[i+1] - ii[i]; 979 idx = a->j + ii[i]; 980 v = a->a + ii[i]*36; 981 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; 982 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 983 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 984 for (j=0; j<n; j++) { 985 xb = x + 6*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 986 sum1 += v[36*j]*x1 + v[36*j+6]*x2 + v[36*j+12]*x3 + v[36*j+18]*x4 + v[36*j+24]*x5 + v[36*j+30]*x6; 987 sum2 += v[36*j+1]*x1 + v[36*j+7]*x2 + v[36*j+13]*x3 + v[36*j+19]*x4 + v[36*j+25]*x5 + v[36*j+31]*x6; 988 sum3 += v[36*j+2]*x1 + v[36*j+8]*x2 + v[36*j+14]*x3 + v[36*j+20]*x4 + v[36*j+26]*x5 + v[36*j+32]*x6; 989 sum4 += v[36*j+3]*x1 + v[36*j+9]*x2 + v[36*j+15]*x3 + v[36*j+21]*x4 + v[36*j+27]*x5 + v[36*j+33]*x6; 990 sum5 += v[36*j+4]*x1 + v[36*j+10]*x2 + v[36*j+16]*x3 + v[36*j+22]*x4 + v[36*j+28]*x5 + v[36*j+34]*x6; 991 sum6 += v[36*j+5]*x1 + v[36*j+11]*x2 + v[36*j+17]*x3 + v[36*j+23]*x4 + v[36*j+29]*x5 + v[36*j+35]*x6; 992 } 993 z[6*ridx[i]] = sum1; z[6*ridx[i]+1] = sum2; z[6*ridx[i]+2] = sum3; 994 z[6*ridx[i]+3] = sum4; z[6*ridx[i]+4] = sum5; z[6*ridx[i]+5] = sum6; 995 } 996 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 997 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 998 } else { 999 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_6_Kernel,3,A,xx,zz);CHKERRQ(ierr); 1000 } 1001 ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr); 1002 PetscFunctionReturn(0); 1003 } 1004 #else 1005 #undef __FUNCT__ 1006 #define __FUNCT__ "MatMult_SeqBAIJ_6" 1007 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz) 1008 { 1009 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1010 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6; 1011 const PetscScalar *x,*xb; 1012 PetscScalar x1,x2,x3,x4,x5,x6,*zarray; 1013 const MatScalar *v; 1014 PetscErrorCode ierr; 1015 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 1016 PetscBool usecprow=a->compressedrow.use; 1017 1018 PetscFunctionBegin; 1019 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1020 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1021 1022 idx = a->j; 1023 v = a->a; 1024 if (usecprow) { 1025 mbs = a->compressedrow.nrows; 1026 ii = a->compressedrow.i; 1027 ridx = a->compressedrow.rindex; 1028 } else { 1029 mbs = a->mbs; 1030 ii = a->i; 1031 z = zarray; 1032 } 1033 1034 for (i=0; i<mbs; i++) { 1035 n = ii[1] - ii[0]; 1036 ii++; 1037 sum1 = 0.0; 1038 sum2 = 0.0; 1039 sum3 = 0.0; 1040 sum4 = 0.0; 1041 sum5 = 0.0; 1042 sum6 = 0.0; 1043 1044 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1045 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1046 for (j=0; j<n; j++) { 1047 xb = x + 6*(*idx++); 1048 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 1049 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1050 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1051 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1052 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1053 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1054 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1055 v += 36; 1056 } 1057 if (usecprow) z = zarray + 6*ridx[i]; 1058 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 1059 if (!usecprow) z += 6; 1060 } 1061 1062 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1063 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1064 ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr); 1065 PetscFunctionReturn(0); 1066 } 1067 #endif 1068 1069 #if defined(PETSC_THREADCOMM_ACTIVE) 1070 PetscErrorCode MatMult_SeqBAIJ_7_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz) 1071 { 1072 PetscErrorCode ierr; 1073 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1074 PetscScalar *z,x1,x2,x3,x4,x5,x6,x7,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 1075 const PetscScalar *x,*xb; 1076 const MatScalar *aa; 1077 PetscInt *trstarts=A->rmap->trstarts; 1078 PetscInt n,start,end,i,j; 1079 const PetscInt *aj,*ai; 1080 1081 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1082 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1083 start = trstarts[thread_id] / 7; 1084 end = trstarts[thread_id+1] / 7; 1085 ai = a->i; 1086 for (i=start; i<end; i++) { 1087 n = ai[i+1] - ai[i]; 1088 aj = a->j + ai[i]; 1089 aa = a->a + ai[i]*49; 1090 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1091 for (j=0; j<n; j++) { 1092 xb = x + 7*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1093 sum1 += aa[49*j]*x1 + aa[49*j+7]*x2 + aa[49*j+14]*x3 + aa[49*j+21]*x4 + aa[49*j+28]*x5 + aa[49*j+35]*x6 + aa[49*j+42]*x7; 1094 sum2 += aa[49*j+1]*x1 + aa[49*j+8]*x2 + aa[49*j+15]*x3 + aa[49*j+22]*x4 + aa[49*j+29]*x5 + aa[49*j+36]*x6 + aa[49*j+43]*x7; 1095 sum3 += aa[49*j+2]*x1 + aa[49*j+9]*x2 + aa[49*j+16]*x3 + aa[49*j+23]*x4 + aa[49*j+30]*x5 + aa[49*j+37]*x6 + aa[49*j+44]*x7; 1096 sum4 += aa[49*j+3]*x1 + aa[49*j+10]*x2 + aa[49*j+17]*x3 + aa[49*j+24]*x4 + aa[49*j+31]*x5 + aa[49*j+38]*x6 + aa[49*j+45]*x7; 1097 sum5 += aa[49*j+4]*x1 + aa[49*j+11]*x2 + aa[49*j+18]*x3 + aa[49*j+25]*x4 + aa[49*j+32]*x5 + aa[49*j+39]*x6 + aa[49*j+46]*x7; 1098 sum6 += aa[49*j+5]*x1 + aa[49*j+12]*x2 + aa[49*j+19]*x3 + aa[49*j+26]*x4 + aa[49*j+33]*x5 + aa[49*j+40]*x6 + aa[49*j+47]*x7; 1099 sum7 += aa[49*j+6]*x1 + aa[49*j+13]*x2 + aa[49*j+20]*x3 + aa[49*j+27]*x4 + aa[49*j+34]*x5 + aa[49*j+41]*x6 + aa[49*j+48]*x7; 1100 } 1101 z[7*i] = sum1; z[7*i+1] = sum2; z[7*i+2] = sum3; z[7*i+3] = sum4; 1102 z[7*i+4] = sum5; z[7*i+5] = sum6; z[7*i+6] = sum7; 1103 } 1104 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1105 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1106 return 0; 1107 } 1108 1109 #undef __FUNCT__ 1110 #define __FUNCT__ "MatMult_SeqBAIJ_7" 1111 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) 1112 { 1113 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1114 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 1115 const PetscScalar *x,*xb; 1116 PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray; 1117 const MatScalar *v; 1118 PetscErrorCode ierr; 1119 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 1120 PetscBool usecprow=a->compressedrow.use; 1121 1122 PetscFunctionBegin; 1123 1124 if (usecprow) { 1125 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1126 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1127 mbs = a->compressedrow.nrows; 1128 ii = a->compressedrow.i; 1129 ridx = a->compressedrow.rindex; 1130 for (i=0; i<mbs; i++) { 1131 n = ii[i+1] - ii[i]; 1132 idx = a->j + ii[i]; 1133 v = a->a + ii[i]*49; 1134 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1135 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1136 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1137 for (j=0; j<n; j++) { 1138 xb = x + 7*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1139 sum1 += v[49*j]*x1 + v[49*j+7]*x2 + v[49*j+14]*x3 + v[49*j+21]*x4 + v[49*j+28]*x5 + v[49*j+35]*x6 + v[49*j+42]*x7; 1140 sum2 += v[49*j+1]*x1 + v[49*j+8]*x2 + v[49*j+15]*x3 + v[49*j+22]*x4 + v[49*j+29]*x5 + v[49*j+36]*x6 + v[49*j+43]*x7; 1141 sum3 += v[49*j+2]*x1 + v[49*j+9]*x2 + v[49*j+16]*x3 + v[49*j+23]*x4 + v[49*j+30]*x5 + v[49*j+37]*x6 + v[49*j+44]*x7; 1142 sum4 += v[49*j+3]*x1 + v[49*j+10]*x2 + v[49*j+17]*x3 + v[49*j+24]*x4 + v[49*j+31]*x5 + v[49*j+38]*x6 + v[49*j+45]*x7; 1143 sum5 += v[49*j+4]*x1 + v[49*j+11]*x2 + v[49*j+18]*x3 + v[49*j+25]*x4 + v[49*j+32]*x5 + v[49*j+39]*x6 + v[49*j+46]*x7; 1144 sum6 += v[49*j+5]*x1 + v[49*j+12]*x2 + v[49*j+19]*x3 + v[49*j+26]*x4 + v[49*j+33]*x5 + v[49*j+40]*x6 + v[49*j+47]*x7; 1145 sum7 += v[49*j+6]*x1 + v[49*j+13]*x2 + v[49*j+20]*x3 + v[49*j+27]*x4 + v[49*j+34]*x5 + v[49*j+41]*x6 + v[49*j+48]*x7; 1146 } 1147 z[7*ridx[i]] = sum1; z[7*ridx[i]+1] = sum2; z[7*ridx[i]+2] = sum3; z[7*ridx[i]+3] = sum4; 1148 z[7*ridx[i]+4] = sum5; z[7*ridx[i]+5] = sum6; z[7*ridx[i]+6] = sum7; 1149 } 1150 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1151 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1152 } else { 1153 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_7_Kernel,3,A,xx,zz);CHKERRQ(ierr); 1154 } 1155 ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr); 1156 PetscFunctionReturn(0); 1157 } 1158 #else 1159 #undef __FUNCT__ 1160 #define __FUNCT__ "MatMult_SeqBAIJ_7" 1161 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) 1162 { 1163 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1164 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 1165 const PetscScalar *x,*xb; 1166 PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray; 1167 const MatScalar *v; 1168 PetscErrorCode ierr; 1169 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 1170 PetscBool usecprow=a->compressedrow.use; 1171 1172 PetscFunctionBegin; 1173 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1174 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1175 1176 idx = a->j; 1177 v = a->a; 1178 if (usecprow) { 1179 mbs = a->compressedrow.nrows; 1180 ii = a->compressedrow.i; 1181 ridx = a->compressedrow.rindex; 1182 } else { 1183 mbs = a->mbs; 1184 ii = a->i; 1185 z = zarray; 1186 } 1187 1188 for (i=0; i<mbs; i++) { 1189 n = ii[1] - ii[0]; 1190 ii++; 1191 sum1 = 0.0; 1192 sum2 = 0.0; 1193 sum3 = 0.0; 1194 sum4 = 0.0; 1195 sum5 = 0.0; 1196 sum6 = 0.0; 1197 sum7 = 0.0; 1198 1199 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1200 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1201 for (j=0; j<n; j++) { 1202 xb = x + 7*(*idx++); 1203 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1204 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1205 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1206 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1207 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1208 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1209 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1210 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1211 v += 49; 1212 } 1213 if (usecprow) z = zarray + 7*ridx[i]; 1214 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1215 if (!usecprow) z += 7; 1216 } 1217 1218 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1219 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1220 ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr); 1221 PetscFunctionReturn(0); 1222 } 1223 #endif 1224 1225 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 1226 /* Default MatMult for block size 15 */ 1227 1228 #undef __FUNCT__ 1229 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver1" 1230 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz) 1231 { 1232 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1233 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1234 const PetscScalar *x,*xb; 1235 PetscScalar *zarray,xv; 1236 const MatScalar *v; 1237 PetscErrorCode ierr; 1238 const PetscInt *ii,*ij=a->j,*idx; 1239 PetscInt mbs,i,j,k,n,*ridx=NULL; 1240 PetscBool usecprow=a->compressedrow.use; 1241 1242 PetscFunctionBegin; 1243 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1244 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1245 1246 v = a->a; 1247 if (usecprow) { 1248 mbs = a->compressedrow.nrows; 1249 ii = a->compressedrow.i; 1250 ridx = a->compressedrow.rindex; 1251 } else { 1252 mbs = a->mbs; 1253 ii = a->i; 1254 z = zarray; 1255 } 1256 1257 for (i=0; i<mbs; i++) { 1258 n = ii[i+1] - ii[i]; 1259 idx = ij + ii[i]; 1260 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1261 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1262 1263 for (j=0; j<n; j++) { 1264 xb = x + 15*(idx[j]); 1265 1266 for (k=0; k<15; k++) { 1267 xv = xb[k]; 1268 sum1 += v[0]*xv; 1269 sum2 += v[1]*xv; 1270 sum3 += v[2]*xv; 1271 sum4 += v[3]*xv; 1272 sum5 += v[4]*xv; 1273 sum6 += v[5]*xv; 1274 sum7 += v[6]*xv; 1275 sum8 += v[7]*xv; 1276 sum9 += v[8]*xv; 1277 sum10 += v[9]*xv; 1278 sum11 += v[10]*xv; 1279 sum12 += v[11]*xv; 1280 sum13 += v[12]*xv; 1281 sum14 += v[13]*xv; 1282 sum15 += v[14]*xv; 1283 v += 15; 1284 } 1285 } 1286 if (usecprow) z = zarray + 15*ridx[i]; 1287 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1288 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1289 1290 if (!usecprow) z += 15; 1291 } 1292 1293 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1294 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1295 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1296 PetscFunctionReturn(0); 1297 } 1298 1299 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 1300 #undef __FUNCT__ 1301 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver2" 1302 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz) 1303 { 1304 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1305 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1306 const PetscScalar *x,*xb; 1307 PetscScalar x1,x2,x3,x4,*zarray; 1308 const MatScalar *v; 1309 PetscErrorCode ierr; 1310 const PetscInt *ii,*ij=a->j,*idx; 1311 PetscInt mbs,i,j,n,*ridx=NULL; 1312 PetscBool usecprow=a->compressedrow.use; 1313 1314 PetscFunctionBegin; 1315 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1316 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1317 1318 v = a->a; 1319 if (usecprow) { 1320 mbs = a->compressedrow.nrows; 1321 ii = a->compressedrow.i; 1322 ridx = a->compressedrow.rindex; 1323 } else { 1324 mbs = a->mbs; 1325 ii = a->i; 1326 z = zarray; 1327 } 1328 1329 for (i=0; i<mbs; i++) { 1330 n = ii[i+1] - ii[i]; 1331 idx = ij + ii[i]; 1332 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1333 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1334 1335 for (j=0; j<n; j++) { 1336 xb = x + 15*(idx[j]); 1337 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1338 1339 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 1340 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 1341 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 1342 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 1343 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 1344 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 1345 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 1346 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 1347 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 1348 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 1349 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 1350 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 1351 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 1352 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 1353 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 1354 1355 v += 60; 1356 1357 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 1358 1359 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 1360 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 1361 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 1362 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 1363 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 1364 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 1365 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 1366 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 1367 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 1368 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 1369 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 1370 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 1371 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 1372 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 1373 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 1374 v += 60; 1375 1376 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 1377 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 1378 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 1379 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 1380 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 1381 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 1382 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 1383 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 1384 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 1385 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 1386 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 1387 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 1388 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 1389 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 1390 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 1391 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 1392 v += 60; 1393 1394 x1 = xb[12]; x2 = xb[13]; x3 = xb[14]; 1395 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3; 1396 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3; 1397 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3; 1398 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3; 1399 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3; 1400 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3; 1401 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3; 1402 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3; 1403 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3; 1404 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3; 1405 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3; 1406 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3; 1407 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3; 1408 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3; 1409 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3; 1410 v += 45; 1411 } 1412 if (usecprow) z = zarray + 15*ridx[i]; 1413 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1414 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1415 1416 if (!usecprow) z += 15; 1417 } 1418 1419 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1420 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1421 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1422 PetscFunctionReturn(0); 1423 } 1424 1425 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 1426 #undef __FUNCT__ 1427 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver3" 1428 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz) 1429 { 1430 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1431 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1432 const PetscScalar *x,*xb; 1433 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray; 1434 const MatScalar *v; 1435 PetscErrorCode ierr; 1436 const PetscInt *ii,*ij=a->j,*idx; 1437 PetscInt mbs,i,j,n,*ridx=NULL; 1438 PetscBool usecprow=a->compressedrow.use; 1439 1440 PetscFunctionBegin; 1441 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1442 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1443 1444 v = a->a; 1445 if (usecprow) { 1446 mbs = a->compressedrow.nrows; 1447 ii = a->compressedrow.i; 1448 ridx = a->compressedrow.rindex; 1449 } else { 1450 mbs = a->mbs; 1451 ii = a->i; 1452 z = zarray; 1453 } 1454 1455 for (i=0; i<mbs; i++) { 1456 n = ii[i+1] - ii[i]; 1457 idx = ij + ii[i]; 1458 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1459 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1460 1461 for (j=0; j<n; j++) { 1462 xb = x + 15*(idx[j]); 1463 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1464 x8 = xb[7]; 1465 1466 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8; 1467 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8; 1468 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8; 1469 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8; 1470 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8; 1471 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8; 1472 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8; 1473 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8; 1474 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8; 1475 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8; 1476 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8; 1477 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8; 1478 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8; 1479 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8; 1480 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8; 1481 v += 120; 1482 1483 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14]; 1484 1485 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7; 1486 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7; 1487 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7; 1488 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7; 1489 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7; 1490 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7; 1491 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7; 1492 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7; 1493 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7; 1494 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7; 1495 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7; 1496 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7; 1497 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7; 1498 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7; 1499 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7; 1500 v += 105; 1501 } 1502 if (usecprow) z = zarray + 15*ridx[i]; 1503 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1504 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1505 1506 if (!usecprow) z += 15; 1507 } 1508 1509 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1510 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1511 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1512 PetscFunctionReturn(0); 1513 } 1514 1515 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ 1516 1517 #undef __FUNCT__ 1518 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver4" 1519 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz) 1520 { 1521 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1522 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1523 const PetscScalar *x,*xb; 1524 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray; 1525 const MatScalar *v; 1526 PetscErrorCode ierr; 1527 const PetscInt *ii,*ij=a->j,*idx; 1528 PetscInt mbs,i,j,n,*ridx=NULL; 1529 PetscBool usecprow=a->compressedrow.use; 1530 1531 PetscFunctionBegin; 1532 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1533 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1534 1535 v = a->a; 1536 if (usecprow) { 1537 mbs = a->compressedrow.nrows; 1538 ii = a->compressedrow.i; 1539 ridx = a->compressedrow.rindex; 1540 } else { 1541 mbs = a->mbs; 1542 ii = a->i; 1543 z = zarray; 1544 } 1545 1546 for (i=0; i<mbs; i++) { 1547 n = ii[i+1] - ii[i]; 1548 idx = ij + ii[i]; 1549 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1550 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1551 1552 for (j=0; j<n; j++) { 1553 xb = x + 15*(idx[j]); 1554 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1555 x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14]; 1556 1557 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 1558 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 1559 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 1560 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 1561 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 1562 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 1563 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 1564 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 1565 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 1566 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 1567 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 1568 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 1569 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 1570 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 1571 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 1572 v += 225; 1573 } 1574 if (usecprow) z = zarray + 15*ridx[i]; 1575 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1576 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1577 1578 if (!usecprow) z += 15; 1579 } 1580 1581 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1582 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1583 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1584 PetscFunctionReturn(0); 1585 } 1586 1587 1588 /* 1589 This will not work with MatScalar == float because it calls the BLAS 1590 */ 1591 #undef __FUNCT__ 1592 #define __FUNCT__ "MatMult_SeqBAIJ_N" 1593 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz) 1594 { 1595 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1596 PetscScalar *z = 0,*work,*workt,*zarray; 1597 const PetscScalar *x,*xb; 1598 const MatScalar *v; 1599 PetscErrorCode ierr; 1600 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 1601 const PetscInt *idx,*ii,*ridx=NULL; 1602 PetscInt ncols,k; 1603 PetscBool usecprow=a->compressedrow.use; 1604 1605 PetscFunctionBegin; 1606 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1607 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1608 1609 idx = a->j; 1610 v = a->a; 1611 if (usecprow) { 1612 mbs = a->compressedrow.nrows; 1613 ii = a->compressedrow.i; 1614 ridx = a->compressedrow.rindex; 1615 } else { 1616 mbs = a->mbs; 1617 ii = a->i; 1618 z = zarray; 1619 } 1620 1621 if (!a->mult_work) { 1622 k = PetscMax(A->rmap->n,A->cmap->n); 1623 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1624 } 1625 work = a->mult_work; 1626 for (i=0; i<mbs; i++) { 1627 n = ii[1] - ii[0]; ii++; 1628 ncols = n*bs; 1629 workt = work; 1630 for (j=0; j<n; j++) { 1631 xb = x + bs*(*idx++); 1632 for (k=0; k<bs; k++) workt[k] = xb[k]; 1633 workt += bs; 1634 } 1635 if (usecprow) z = zarray + bs*ridx[i]; 1636 PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z); 1637 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */ 1638 v += n*bs2; 1639 if (!usecprow) z += bs; 1640 } 1641 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1642 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1643 ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr); 1644 PetscFunctionReturn(0); 1645 } 1646 1647 #undef __FUNCT__ 1648 #define __FUNCT__ "MatMultAdd_SeqBAIJ_1" 1649 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 1650 { 1651 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1652 const PetscScalar *x; 1653 PetscScalar *y,*z,sum; 1654 const MatScalar *v; 1655 PetscErrorCode ierr; 1656 PetscInt mbs=a->mbs,i,n,*ridx=NULL; 1657 const PetscInt *idx,*ii; 1658 PetscBool usecprow=a->compressedrow.use; 1659 1660 PetscFunctionBegin; 1661 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1662 ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 1663 1664 idx = a->j; 1665 v = a->a; 1666 if (usecprow) { 1667 if (zz != yy) { 1668 ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1669 } 1670 mbs = a->compressedrow.nrows; 1671 ii = a->compressedrow.i; 1672 ridx = a->compressedrow.rindex; 1673 } else { 1674 ii = a->i; 1675 } 1676 1677 for (i=0; i<mbs; i++) { 1678 n = ii[1] - ii[0]; 1679 ii++; 1680 if (!usecprow) { 1681 sum = y[i]; 1682 } else { 1683 sum = y[ridx[i]]; 1684 } 1685 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1686 PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1687 PetscSparseDensePlusDot(sum,x,v,idx,n); 1688 v += n; 1689 idx += n; 1690 if (usecprow) { 1691 z[ridx[i]] = sum; 1692 } else { 1693 z[i] = sum; 1694 } 1695 } 1696 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1697 ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 1698 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1699 PetscFunctionReturn(0); 1700 } 1701 1702 #undef __FUNCT__ 1703 #define __FUNCT__ "MatMultAdd_SeqBAIJ_2" 1704 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 1705 { 1706 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1707 PetscScalar *y = 0,*z = 0,sum1,sum2; 1708 const PetscScalar *x,*xb; 1709 PetscScalar x1,x2,*yarray,*zarray; 1710 const MatScalar *v; 1711 PetscErrorCode ierr; 1712 PetscInt mbs = a->mbs,i,n,j; 1713 const PetscInt *idx,*ii,*ridx = NULL; 1714 PetscBool usecprow = a->compressedrow.use; 1715 1716 PetscFunctionBegin; 1717 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1718 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1719 1720 idx = a->j; 1721 v = a->a; 1722 if (usecprow) { 1723 if (zz != yy) { 1724 ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1725 } 1726 mbs = a->compressedrow.nrows; 1727 ii = a->compressedrow.i; 1728 ridx = a->compressedrow.rindex; 1729 if (zz != yy) { 1730 ierr = PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1731 } 1732 } else { 1733 ii = a->i; 1734 y = yarray; 1735 z = zarray; 1736 } 1737 1738 for (i=0; i<mbs; i++) { 1739 n = ii[1] - ii[0]; ii++; 1740 if (usecprow) { 1741 z = zarray + 2*ridx[i]; 1742 y = yarray + 2*ridx[i]; 1743 } 1744 sum1 = y[0]; sum2 = y[1]; 1745 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1746 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1747 for (j=0; j<n; j++) { 1748 xb = x + 2*(*idx++); 1749 x1 = xb[0]; 1750 x2 = xb[1]; 1751 1752 sum1 += v[0]*x1 + v[2]*x2; 1753 sum2 += v[1]*x1 + v[3]*x2; 1754 v += 4; 1755 } 1756 z[0] = sum1; z[1] = sum2; 1757 if (!usecprow) { 1758 z += 2; y += 2; 1759 } 1760 } 1761 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1762 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1763 ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr); 1764 PetscFunctionReturn(0); 1765 } 1766 1767 #undef __FUNCT__ 1768 #define __FUNCT__ "MatMultAdd_SeqBAIJ_3" 1769 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 1770 { 1771 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1772 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray; 1773 const PetscScalar *x,*xb; 1774 const MatScalar *v; 1775 PetscErrorCode ierr; 1776 PetscInt mbs = a->mbs,i,j,n; 1777 const PetscInt *idx,*ii,*ridx = NULL; 1778 PetscBool usecprow = a->compressedrow.use; 1779 1780 PetscFunctionBegin; 1781 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1782 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1783 1784 idx = a->j; 1785 v = a->a; 1786 if (usecprow) { 1787 if (zz != yy) { 1788 ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1789 } 1790 mbs = a->compressedrow.nrows; 1791 ii = a->compressedrow.i; 1792 ridx = a->compressedrow.rindex; 1793 } else { 1794 ii = a->i; 1795 y = yarray; 1796 z = zarray; 1797 } 1798 1799 for (i=0; i<mbs; i++) { 1800 n = ii[1] - ii[0]; ii++; 1801 if (usecprow) { 1802 z = zarray + 3*ridx[i]; 1803 y = yarray + 3*ridx[i]; 1804 } 1805 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; 1806 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1807 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1808 for (j=0; j<n; j++) { 1809 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1810 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 1811 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 1812 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 1813 v += 9; 1814 } 1815 z[0] = sum1; z[1] = sum2; z[2] = sum3; 1816 if (!usecprow) { 1817 z += 3; y += 3; 1818 } 1819 } 1820 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1821 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1822 ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr); 1823 PetscFunctionReturn(0); 1824 } 1825 1826 #undef __FUNCT__ 1827 #define __FUNCT__ "MatMultAdd_SeqBAIJ_4" 1828 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 1829 { 1830 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1831 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray; 1832 const PetscScalar *x,*xb; 1833 const MatScalar *v; 1834 PetscErrorCode ierr; 1835 PetscInt mbs = a->mbs,i,j,n; 1836 const PetscInt *idx,*ii,*ridx=NULL; 1837 PetscBool usecprow=a->compressedrow.use; 1838 1839 PetscFunctionBegin; 1840 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1841 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1842 1843 idx = a->j; 1844 v = a->a; 1845 if (usecprow) { 1846 if (zz != yy) { 1847 ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1848 } 1849 mbs = a->compressedrow.nrows; 1850 ii = a->compressedrow.i; 1851 ridx = a->compressedrow.rindex; 1852 } else { 1853 ii = a->i; 1854 y = yarray; 1855 z = zarray; 1856 } 1857 1858 for (i=0; i<mbs; i++) { 1859 n = ii[1] - ii[0]; ii++; 1860 if (usecprow) { 1861 z = zarray + 4*ridx[i]; 1862 y = yarray + 4*ridx[i]; 1863 } 1864 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; 1865 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1866 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1867 for (j=0; j<n; j++) { 1868 xb = x + 4*(*idx++); 1869 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1870 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1871 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1872 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1873 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1874 v += 16; 1875 } 1876 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 1877 if (!usecprow) { 1878 z += 4; y += 4; 1879 } 1880 } 1881 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1882 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1883 ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr); 1884 PetscFunctionReturn(0); 1885 } 1886 1887 #undef __FUNCT__ 1888 #define __FUNCT__ "MatMultAdd_SeqBAIJ_5" 1889 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 1890 { 1891 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1892 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5; 1893 const PetscScalar *x,*xb; 1894 PetscScalar *yarray,*zarray; 1895 const MatScalar *v; 1896 PetscErrorCode ierr; 1897 PetscInt mbs = a->mbs,i,j,n; 1898 const PetscInt *idx,*ii,*ridx = NULL; 1899 PetscBool usecprow=a->compressedrow.use; 1900 1901 PetscFunctionBegin; 1902 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1903 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1904 1905 idx = a->j; 1906 v = a->a; 1907 if (usecprow) { 1908 if (zz != yy) { 1909 ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1910 } 1911 mbs = a->compressedrow.nrows; 1912 ii = a->compressedrow.i; 1913 ridx = a->compressedrow.rindex; 1914 } else { 1915 ii = a->i; 1916 y = yarray; 1917 z = zarray; 1918 } 1919 1920 for (i=0; i<mbs; i++) { 1921 n = ii[1] - ii[0]; ii++; 1922 if (usecprow) { 1923 z = zarray + 5*ridx[i]; 1924 y = yarray + 5*ridx[i]; 1925 } 1926 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; 1927 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1928 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1929 for (j=0; j<n; j++) { 1930 xb = x + 5*(*idx++); 1931 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 1932 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1933 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1934 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1935 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1936 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1937 v += 25; 1938 } 1939 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 1940 if (!usecprow) { 1941 z += 5; y += 5; 1942 } 1943 } 1944 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1945 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1946 ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr); 1947 PetscFunctionReturn(0); 1948 } 1949 #undef __FUNCT__ 1950 #define __FUNCT__ "MatMultAdd_SeqBAIJ_6" 1951 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 1952 { 1953 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1954 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6; 1955 const PetscScalar *x,*xb; 1956 PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray; 1957 const MatScalar *v; 1958 PetscErrorCode ierr; 1959 PetscInt mbs = a->mbs,i,j,n; 1960 const PetscInt *idx,*ii,*ridx=NULL; 1961 PetscBool usecprow=a->compressedrow.use; 1962 1963 PetscFunctionBegin; 1964 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1965 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1966 1967 idx = a->j; 1968 v = a->a; 1969 if (usecprow) { 1970 if (zz != yy) { 1971 ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1972 } 1973 mbs = a->compressedrow.nrows; 1974 ii = a->compressedrow.i; 1975 ridx = a->compressedrow.rindex; 1976 } else { 1977 ii = a->i; 1978 y = yarray; 1979 z = zarray; 1980 } 1981 1982 for (i=0; i<mbs; i++) { 1983 n = ii[1] - ii[0]; ii++; 1984 if (usecprow) { 1985 z = zarray + 6*ridx[i]; 1986 y = yarray + 6*ridx[i]; 1987 } 1988 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; 1989 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1990 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1991 for (j=0; j<n; j++) { 1992 xb = x + 6*(*idx++); 1993 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 1994 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1995 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1996 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1997 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1998 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1999 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2000 v += 36; 2001 } 2002 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 2003 if (!usecprow) { 2004 z += 6; y += 6; 2005 } 2006 } 2007 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2008 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 2009 ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr); 2010 PetscFunctionReturn(0); 2011 } 2012 2013 #undef __FUNCT__ 2014 #define __FUNCT__ "MatMultAdd_SeqBAIJ_7" 2015 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 2016 { 2017 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2018 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 2019 const PetscScalar *x,*xb; 2020 PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray; 2021 const MatScalar *v; 2022 PetscErrorCode ierr; 2023 PetscInt mbs = a->mbs,i,j,n; 2024 const PetscInt *idx,*ii,*ridx = NULL; 2025 PetscBool usecprow=a->compressedrow.use; 2026 2027 PetscFunctionBegin; 2028 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2029 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 2030 2031 idx = a->j; 2032 v = a->a; 2033 if (usecprow) { 2034 if (zz != yy) { 2035 ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 2036 } 2037 mbs = a->compressedrow.nrows; 2038 ii = a->compressedrow.i; 2039 ridx = a->compressedrow.rindex; 2040 } else { 2041 ii = a->i; 2042 y = yarray; 2043 z = zarray; 2044 } 2045 2046 for (i=0; i<mbs; i++) { 2047 n = ii[1] - ii[0]; ii++; 2048 if (usecprow) { 2049 z = zarray + 7*ridx[i]; 2050 y = yarray + 7*ridx[i]; 2051 } 2052 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 2053 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2054 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2055 for (j=0; j<n; j++) { 2056 xb = x + 7*(*idx++); 2057 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 2058 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2059 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2060 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2061 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2062 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2063 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2064 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2065 v += 49; 2066 } 2067 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 2068 if (!usecprow) { 2069 z += 7; y += 7; 2070 } 2071 } 2072 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2073 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 2074 ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr); 2075 PetscFunctionReturn(0); 2076 } 2077 2078 #undef __FUNCT__ 2079 #define __FUNCT__ "MatMultAdd_SeqBAIJ_N" 2080 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 2081 { 2082 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2083 PetscScalar *z = 0,*work,*workt,*zarray; 2084 const PetscScalar *x,*xb; 2085 const MatScalar *v; 2086 PetscErrorCode ierr; 2087 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 2088 PetscInt ncols,k; 2089 const PetscInt *ridx = NULL,*idx,*ii; 2090 PetscBool usecprow = a->compressedrow.use; 2091 2092 PetscFunctionBegin; 2093 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 2094 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2095 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 2096 2097 idx = a->j; 2098 v = a->a; 2099 if (usecprow) { 2100 mbs = a->compressedrow.nrows; 2101 ii = a->compressedrow.i; 2102 ridx = a->compressedrow.rindex; 2103 } else { 2104 mbs = a->mbs; 2105 ii = a->i; 2106 z = zarray; 2107 } 2108 2109 if (!a->mult_work) { 2110 k = PetscMax(A->rmap->n,A->cmap->n); 2111 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 2112 } 2113 work = a->mult_work; 2114 for (i=0; i<mbs; i++) { 2115 n = ii[1] - ii[0]; ii++; 2116 ncols = n*bs; 2117 workt = work; 2118 for (j=0; j<n; j++) { 2119 xb = x + bs*(*idx++); 2120 for (k=0; k<bs; k++) workt[k] = xb[k]; 2121 workt += bs; 2122 } 2123 if (usecprow) z = zarray + bs*ridx[i]; 2124 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 2125 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */ 2126 v += n*bs2; 2127 if (!usecprow) z += bs; 2128 } 2129 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2130 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 2131 ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr); 2132 PetscFunctionReturn(0); 2133 } 2134 2135 #undef __FUNCT__ 2136 #define __FUNCT__ "MatMultHermitianTranspose_SeqBAIJ" 2137 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 2138 { 2139 PetscScalar zero = 0.0; 2140 PetscErrorCode ierr; 2141 2142 PetscFunctionBegin; 2143 ierr = VecSet(zz,zero);CHKERRQ(ierr); 2144 ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 2145 PetscFunctionReturn(0); 2146 } 2147 2148 #undef __FUNCT__ 2149 #define __FUNCT__ "MatMultTranspose_SeqBAIJ" 2150 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 2151 { 2152 PetscScalar zero = 0.0; 2153 PetscErrorCode ierr; 2154 2155 PetscFunctionBegin; 2156 ierr = VecSet(zz,zero);CHKERRQ(ierr); 2157 ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 2158 PetscFunctionReturn(0); 2159 } 2160 2161 #undef __FUNCT__ 2162 #define __FUNCT__ "MatMultHermitianTransposeAdd_SeqBAIJ" 2163 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 2164 { 2165 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2166 PetscScalar *zb,*z,x1,x2,x3,x4,x5; 2167 const PetscScalar *x,*xb = NULL; 2168 const MatScalar *v; 2169 PetscErrorCode ierr; 2170 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2; 2171 const PetscInt *idx,*ii,*ib,*ridx = NULL; 2172 Mat_CompressedRow cprow = a->compressedrow; 2173 PetscBool usecprow = cprow.use; 2174 2175 PetscFunctionBegin; 2176 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 2177 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2178 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 2179 2180 idx = a->j; 2181 v = a->a; 2182 if (usecprow) { 2183 mbs = cprow.nrows; 2184 ii = cprow.i; 2185 ridx = cprow.rindex; 2186 } else { 2187 mbs=a->mbs; 2188 ii = a->i; 2189 xb = x; 2190 } 2191 2192 switch (bs) { 2193 case 1: 2194 for (i=0; i<mbs; i++) { 2195 if (usecprow) xb = x + ridx[i]; 2196 x1 = xb[0]; 2197 ib = idx + ii[0]; 2198 n = ii[1] - ii[0]; ii++; 2199 for (j=0; j<n; j++) { 2200 rval = ib[j]; 2201 z[rval] += PetscConj(*v) * x1; 2202 v++; 2203 } 2204 if (!usecprow) xb++; 2205 } 2206 break; 2207 case 2: 2208 for (i=0; i<mbs; i++) { 2209 if (usecprow) xb = x + 2*ridx[i]; 2210 x1 = xb[0]; x2 = xb[1]; 2211 ib = idx + ii[0]; 2212 n = ii[1] - ii[0]; ii++; 2213 for (j=0; j<n; j++) { 2214 rval = ib[j]*2; 2215 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2; 2216 z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2; 2217 v += 4; 2218 } 2219 if (!usecprow) xb += 2; 2220 } 2221 break; 2222 case 3: 2223 for (i=0; i<mbs; i++) { 2224 if (usecprow) xb = x + 3*ridx[i]; 2225 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2226 ib = idx + ii[0]; 2227 n = ii[1] - ii[0]; ii++; 2228 for (j=0; j<n; j++) { 2229 rval = ib[j]*3; 2230 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3; 2231 z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3; 2232 z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3; 2233 v += 9; 2234 } 2235 if (!usecprow) xb += 3; 2236 } 2237 break; 2238 case 4: 2239 for (i=0; i<mbs; i++) { 2240 if (usecprow) xb = x + 4*ridx[i]; 2241 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 2242 ib = idx + ii[0]; 2243 n = ii[1] - ii[0]; ii++; 2244 for (j=0; j<n; j++) { 2245 rval = ib[j]*4; 2246 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4; 2247 z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4; 2248 z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4; 2249 z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4; 2250 v += 16; 2251 } 2252 if (!usecprow) xb += 4; 2253 } 2254 break; 2255 case 5: 2256 for (i=0; i<mbs; i++) { 2257 if (usecprow) xb = x + 5*ridx[i]; 2258 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2259 x4 = xb[3]; x5 = xb[4]; 2260 ib = idx + ii[0]; 2261 n = ii[1] - ii[0]; ii++; 2262 for (j=0; j<n; j++) { 2263 rval = ib[j]*5; 2264 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5; 2265 z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5; 2266 z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5; 2267 z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5; 2268 z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5; 2269 v += 25; 2270 } 2271 if (!usecprow) xb += 5; 2272 } 2273 break; 2274 default: { /* block sizes larger than 5 by 5 are handled by BLAS */ 2275 PetscInt ncols,k; 2276 PetscScalar *work,*workt; 2277 const PetscScalar *xtmp; 2278 2279 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet"); 2280 if (!a->mult_work) { 2281 k = PetscMax(A->rmap->n,A->cmap->n); 2282 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 2283 } 2284 work = a->mult_work; 2285 xtmp = x; 2286 for (i=0; i<mbs; i++) { 2287 n = ii[1] - ii[0]; ii++; 2288 ncols = n*bs; 2289 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 2290 if (usecprow) xtmp = x + bs*ridx[i]; 2291 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 2292 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 2293 v += n*bs2; 2294 if (!usecprow) xtmp += bs; 2295 workt = work; 2296 for (j=0; j<n; j++) { 2297 zb = z + bs*(*idx++); 2298 for (k=0; k<bs; k++) zb[k] += workt[k] ; 2299 workt += bs; 2300 } 2301 } 2302 } 2303 } 2304 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2305 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 2306 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 2307 PetscFunctionReturn(0); 2308 } 2309 2310 #undef __FUNCT__ 2311 #define __FUNCT__ "MatMultTransposeAdd_SeqBAIJ" 2312 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 2313 { 2314 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2315 PetscScalar *zb,*z,x1,x2,x3,x4,x5; 2316 const PetscScalar *x,*xb = 0; 2317 const MatScalar *v; 2318 PetscErrorCode ierr; 2319 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2; 2320 const PetscInt *idx,*ii,*ib,*ridx = NULL; 2321 Mat_CompressedRow cprow = a->compressedrow; 2322 PetscBool usecprow=cprow.use; 2323 2324 PetscFunctionBegin; 2325 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 2326 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2327 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 2328 2329 idx = a->j; 2330 v = a->a; 2331 if (usecprow) { 2332 mbs = cprow.nrows; 2333 ii = cprow.i; 2334 ridx = cprow.rindex; 2335 } else { 2336 mbs=a->mbs; 2337 ii = a->i; 2338 xb = x; 2339 } 2340 2341 switch (bs) { 2342 case 1: 2343 for (i=0; i<mbs; i++) { 2344 if (usecprow) xb = x + ridx[i]; 2345 x1 = xb[0]; 2346 ib = idx + ii[0]; 2347 n = ii[1] - ii[0]; ii++; 2348 for (j=0; j<n; j++) { 2349 rval = ib[j]; 2350 z[rval] += *v * x1; 2351 v++; 2352 } 2353 if (!usecprow) xb++; 2354 } 2355 break; 2356 case 2: 2357 for (i=0; i<mbs; i++) { 2358 if (usecprow) xb = x + 2*ridx[i]; 2359 x1 = xb[0]; x2 = xb[1]; 2360 ib = idx + ii[0]; 2361 n = ii[1] - ii[0]; ii++; 2362 for (j=0; j<n; j++) { 2363 rval = ib[j]*2; 2364 z[rval++] += v[0]*x1 + v[1]*x2; 2365 z[rval++] += v[2]*x1 + v[3]*x2; 2366 v += 4; 2367 } 2368 if (!usecprow) xb += 2; 2369 } 2370 break; 2371 case 3: 2372 for (i=0; i<mbs; i++) { 2373 if (usecprow) xb = x + 3*ridx[i]; 2374 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2375 ib = idx + ii[0]; 2376 n = ii[1] - ii[0]; ii++; 2377 for (j=0; j<n; j++) { 2378 rval = ib[j]*3; 2379 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3; 2380 z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3; 2381 z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3; 2382 v += 9; 2383 } 2384 if (!usecprow) xb += 3; 2385 } 2386 break; 2387 case 4: 2388 for (i=0; i<mbs; i++) { 2389 if (usecprow) xb = x + 4*ridx[i]; 2390 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 2391 ib = idx + ii[0]; 2392 n = ii[1] - ii[0]; ii++; 2393 for (j=0; j<n; j++) { 2394 rval = ib[j]*4; 2395 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 2396 z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 2397 z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 2398 z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 2399 v += 16; 2400 } 2401 if (!usecprow) xb += 4; 2402 } 2403 break; 2404 case 5: 2405 for (i=0; i<mbs; i++) { 2406 if (usecprow) xb = x + 5*ridx[i]; 2407 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2408 x4 = xb[3]; x5 = xb[4]; 2409 ib = idx + ii[0]; 2410 n = ii[1] - ii[0]; ii++; 2411 for (j=0; j<n; j++) { 2412 rval = ib[j]*5; 2413 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 2414 z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 2415 z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 2416 z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 2417 z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 2418 v += 25; 2419 } 2420 if (!usecprow) xb += 5; 2421 } 2422 break; 2423 default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 2424 PetscInt ncols,k; 2425 PetscScalar *work,*workt; 2426 const PetscScalar *xtmp; 2427 if (!a->mult_work) { 2428 k = PetscMax(A->rmap->n,A->cmap->n); 2429 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 2430 } 2431 work = a->mult_work; 2432 xtmp = x; 2433 for (i=0; i<mbs; i++) { 2434 n = ii[1] - ii[0]; ii++; 2435 ncols = n*bs; 2436 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 2437 if (usecprow) xtmp = x + bs*ridx[i]; 2438 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 2439 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 2440 v += n*bs2; 2441 if (!usecprow) xtmp += bs; 2442 workt = work; 2443 for (j=0; j<n; j++) { 2444 zb = z + bs*(*idx++); 2445 for (k=0; k<bs; k++) zb[k] += workt[k]; 2446 workt += bs; 2447 } 2448 } 2449 } 2450 } 2451 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2452 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 2453 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 2454 PetscFunctionReturn(0); 2455 } 2456 2457 #undef __FUNCT__ 2458 #define __FUNCT__ "MatScale_SeqBAIJ" 2459 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha) 2460 { 2461 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data; 2462 PetscInt totalnz = a->bs2*a->nz; 2463 PetscScalar oalpha = alpha; 2464 PetscErrorCode ierr; 2465 PetscBLASInt one = 1,tnz; 2466 2467 PetscFunctionBegin; 2468 ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr); 2469 PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one)); 2470 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 2471 PetscFunctionReturn(0); 2472 } 2473 2474 #undef __FUNCT__ 2475 #define __FUNCT__ "MatNorm_SeqBAIJ" 2476 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm) 2477 { 2478 PetscErrorCode ierr; 2479 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2480 MatScalar *v = a->a; 2481 PetscReal sum = 0.0; 2482 PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1; 2483 2484 PetscFunctionBegin; 2485 if (type == NORM_FROBENIUS) { 2486 for (i=0; i< bs2*nz; i++) { 2487 sum += PetscRealPart(PetscConj(*v)*(*v)); v++; 2488 } 2489 *norm = PetscSqrtReal(sum); 2490 } else if (type == NORM_1) { /* maximum column sum */ 2491 PetscReal *tmp; 2492 PetscInt *bcol = a->j; 2493 ierr = PetscCalloc1(A->cmap->n+1,&tmp);CHKERRQ(ierr); 2494 for (i=0; i<nz; i++) { 2495 for (j=0; j<bs; j++) { 2496 k1 = bs*(*bcol) + j; /* column index */ 2497 for (k=0; k<bs; k++) { 2498 tmp[k1] += PetscAbsScalar(*v); v++; 2499 } 2500 } 2501 bcol++; 2502 } 2503 *norm = 0.0; 2504 for (j=0; j<A->cmap->n; j++) { 2505 if (tmp[j] > *norm) *norm = tmp[j]; 2506 } 2507 ierr = PetscFree(tmp);CHKERRQ(ierr); 2508 } else if (type == NORM_INFINITY) { /* maximum row sum */ 2509 *norm = 0.0; 2510 for (k=0; k<bs; k++) { 2511 for (j=0; j<a->mbs; j++) { 2512 v = a->a + bs2*a->i[j] + k; 2513 sum = 0.0; 2514 for (i=0; i<a->i[j+1]-a->i[j]; i++) { 2515 for (k1=0; k1<bs; k1++) { 2516 sum += PetscAbsScalar(*v); 2517 v += bs; 2518 } 2519 } 2520 if (sum > *norm) *norm = sum; 2521 } 2522 } 2523 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 2524 PetscFunctionReturn(0); 2525 } 2526 2527 2528 #undef __FUNCT__ 2529 #define __FUNCT__ "MatEqual_SeqBAIJ" 2530 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg) 2531 { 2532 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 2533 PetscErrorCode ierr; 2534 2535 PetscFunctionBegin; 2536 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 2537 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 2538 *flg = PETSC_FALSE; 2539 PetscFunctionReturn(0); 2540 } 2541 2542 /* if the a->i are the same */ 2543 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 2544 if (!*flg) PetscFunctionReturn(0); 2545 2546 /* if a->j are the same */ 2547 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 2548 if (!*flg) PetscFunctionReturn(0); 2549 2550 /* if a->a are the same */ 2551 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 2552 PetscFunctionReturn(0); 2553 2554 } 2555 2556 #undef __FUNCT__ 2557 #define __FUNCT__ "MatGetDiagonal_SeqBAIJ" 2558 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v) 2559 { 2560 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2561 PetscErrorCode ierr; 2562 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 2563 PetscScalar *x,zero = 0.0; 2564 MatScalar *aa,*aa_j; 2565 2566 PetscFunctionBegin; 2567 if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 2568 bs = A->rmap->bs; 2569 aa = a->a; 2570 ai = a->i; 2571 aj = a->j; 2572 ambs = a->mbs; 2573 bs2 = a->bs2; 2574 2575 ierr = VecSet(v,zero);CHKERRQ(ierr); 2576 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 2577 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 2578 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 2579 for (i=0; i<ambs; i++) { 2580 for (j=ai[i]; j<ai[i+1]; j++) { 2581 if (aj[j] == i) { 2582 row = i*bs; 2583 aa_j = aa+j*bs2; 2584 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 2585 break; 2586 } 2587 } 2588 } 2589 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 2590 PetscFunctionReturn(0); 2591 } 2592 2593 #undef __FUNCT__ 2594 #define __FUNCT__ "MatDiagonalScale_SeqBAIJ" 2595 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr) 2596 { 2597 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2598 const PetscScalar *l,*r,*li,*ri; 2599 PetscScalar x; 2600 MatScalar *aa, *v; 2601 PetscErrorCode ierr; 2602 PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai; 2603 const PetscInt *ai,*aj; 2604 2605 PetscFunctionBegin; 2606 ai = a->i; 2607 aj = a->j; 2608 aa = a->a; 2609 m = A->rmap->n; 2610 n = A->cmap->n; 2611 bs = A->rmap->bs; 2612 mbs = a->mbs; 2613 bs2 = a->bs2; 2614 if (ll) { 2615 ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr); 2616 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 2617 if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 2618 for (i=0; i<mbs; i++) { /* for each block row */ 2619 M = ai[i+1] - ai[i]; 2620 li = l + i*bs; 2621 v = aa + bs2*ai[i]; 2622 for (j=0; j<M; j++) { /* for each block */ 2623 for (k=0; k<bs2; k++) { 2624 (*v++) *= li[k%bs]; 2625 } 2626 } 2627 } 2628 ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr); 2629 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2630 } 2631 2632 if (rr) { 2633 ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr); 2634 ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr); 2635 if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 2636 for (i=0; i<mbs; i++) { /* for each block row */ 2637 iai = ai[i]; 2638 M = ai[i+1] - iai; 2639 v = aa + bs2*iai; 2640 for (j=0; j<M; j++) { /* for each block */ 2641 ri = r + bs*aj[iai+j]; 2642 for (k=0; k<bs; k++) { 2643 x = ri[k]; 2644 for (tmp=0; tmp<bs; tmp++) v[tmp] *= x; 2645 v += bs; 2646 } 2647 } 2648 } 2649 ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr); 2650 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2651 } 2652 PetscFunctionReturn(0); 2653 } 2654 2655 2656 #undef __FUNCT__ 2657 #define __FUNCT__ "MatGetInfo_SeqBAIJ" 2658 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info) 2659 { 2660 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2661 2662 PetscFunctionBegin; 2663 info->block_size = a->bs2; 2664 info->nz_allocated = a->bs2*a->maxnz; 2665 info->nz_used = a->bs2*a->nz; 2666 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 2667 info->assemblies = A->num_ass; 2668 info->mallocs = A->info.mallocs; 2669 info->memory = ((PetscObject)A)->mem; 2670 if (A->factortype) { 2671 info->fill_ratio_given = A->info.fill_ratio_given; 2672 info->fill_ratio_needed = A->info.fill_ratio_needed; 2673 info->factor_mallocs = A->info.factor_mallocs; 2674 } else { 2675 info->fill_ratio_given = 0; 2676 info->fill_ratio_needed = 0; 2677 info->factor_mallocs = 0; 2678 } 2679 PetscFunctionReturn(0); 2680 } 2681 2682 2683 #if defined(PETSC_THREADCOMM_ACTIVE) 2684 PetscErrorCode MatZeroEntries_SeqBAIJ_Kernel(PetscInt thread_id,Mat A) 2685 { 2686 PetscErrorCode ierr; 2687 PetscInt *trstarts=A->rmap->trstarts; 2688 PetscInt n,start,end; 2689 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2690 2691 start = trstarts[thread_id]/A->rmap->bs; 2692 end = trstarts[thread_id+1]/A->rmap->bs; 2693 n = a->i[end] - a->i[start]; 2694 ierr = PetscMemzero(a->a+a->bs2*a->i[start],a->bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 2695 return 0; 2696 } 2697 2698 #undef __FUNCT__ 2699 #define __FUNCT__ "MatZeroEntries_SeqBAIJ" 2700 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 2701 { 2702 PetscErrorCode ierr; 2703 2704 2705 PetscFunctionBegin; 2706 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatZeroEntries_SeqBAIJ_Kernel,1,A);CHKERRQ(ierr); 2707 PetscFunctionReturn(0); 2708 } 2709 #else 2710 #undef __FUNCT__ 2711 #define __FUNCT__ "MatZeroEntries_SeqBAIJ" 2712 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 2713 { 2714 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2715 PetscErrorCode ierr; 2716 2717 PetscFunctionBegin; 2718 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 2719 PetscFunctionReturn(0); 2720 } 2721 #endif 2722