1 2 #include <../src/mat/impls/baij/seq/baij.h> 3 #include <../src/mat/blockinvert.h> 4 #include <petscbt.h> 5 #include <../src/mat/impls/sbaij/seq/sbaij.h> 6 #include <petscblaslapack.h> 7 8 #undef __FUNCT__ 9 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ" 10 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 11 { 12 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 13 PetscErrorCode ierr; 14 PetscInt brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2; 15 const PetscInt *idx; 16 PetscBT table_out,table_in; 17 18 PetscFunctionBegin; 19 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 20 mbs = a->mbs; 21 ai = a->i; 22 aj = a->j; 23 bs = A->rmap->bs; 24 ierr = PetscBTCreate(mbs,table_out);CHKERRQ(ierr); 25 ierr = PetscMalloc((mbs+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr); 26 ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr); 27 ierr = PetscBTCreate(mbs,table_in);CHKERRQ(ierr); 28 29 for (i=0; i<is_max; i++) { /* for each is */ 30 isz = 0; 31 ierr = PetscBTMemzero(mbs,table_out);CHKERRQ(ierr); 32 33 /* Extract the indices, assume there can be duplicate entries */ 34 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 35 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 36 37 /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */ 38 bcol_max = 0; 39 for (j=0; j<n ; ++j){ 40 brow = idx[j]/bs; /* convert the indices into block indices */ 41 if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 42 if(!PetscBTLookupSet(table_out,brow)) { 43 nidx[isz++] = brow; 44 if (bcol_max < brow) bcol_max = brow; 45 } 46 } 47 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 48 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 49 50 k = 0; 51 for (j=0; j<ov; j++){ /* for each overlap */ 52 /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 53 ierr = PetscBTMemzero(mbs,table_in);CHKERRQ(ierr); 54 for (l=k; l<isz; l++) { ierr = PetscBTSet(table_in,nidx[l]);CHKERRQ(ierr); } 55 56 n = isz; /* length of the updated is[i] */ 57 for (brow=0; brow<mbs; brow++){ 58 start = ai[brow]; end = ai[brow+1]; 59 if (PetscBTLookup(table_in,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */ 60 for (l = start; l<end ; l++){ 61 bcol = aj[l]; 62 if (!PetscBTLookupSet(table_out,bcol)) {nidx[isz++] = bcol;} 63 } 64 k++; 65 if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 66 } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 67 for (l = start; l<end ; l++){ 68 bcol = aj[l]; 69 if (bcol > bcol_max) break; 70 if (PetscBTLookup(table_in,bcol)){ 71 if (!PetscBTLookupSet(table_out,brow)) {nidx[isz++] = brow;} 72 break; /* for l = start; l<end ; l++) */ 73 } 74 } 75 } 76 } 77 } /* for each overlap */ 78 79 /* expand the Index Set */ 80 for (j=0; j<isz; j++) { 81 for (k=0; k<bs; k++) 82 nidx2[j*bs+k] = nidx[j]*bs+k; 83 } 84 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 85 } 86 ierr = PetscBTDestroy(table_out);CHKERRQ(ierr); 87 ierr = PetscFree(nidx);CHKERRQ(ierr); 88 ierr = PetscFree(nidx2);CHKERRQ(ierr); 89 ierr = PetscBTDestroy(table_in);CHKERRQ(ierr); 90 PetscFunctionReturn(0); 91 } 92 93 #undef __FUNCT__ 94 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private" 95 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 96 { 97 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c; 98 PetscErrorCode ierr; 99 PetscInt *smap,i,k,kstart,kend,oldcols = a->mbs,*lens; 100 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 101 PetscInt nrows,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 102 const PetscInt *irow,*aj = a->j,*ai = a->i; 103 MatScalar *mat_a; 104 Mat C; 105 PetscBool flag,sorted; 106 107 PetscFunctionBegin; 108 if (isrow != iscol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro"); 109 ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr); 110 if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 111 112 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 113 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 114 115 ierr = PetscMalloc(oldcols*sizeof(PetscInt),&smap);CHKERRQ(ierr); 116 ierr = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr); 117 ssmap = smap; 118 ierr = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr); 119 for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */ 120 /* determine lens of each row */ 121 for (i=0; i<nrows; i++) { 122 kstart = ai[irow[i]]; 123 kend = kstart + a->ilen[irow[i]]; 124 lens[i] = 0; 125 for (k=kstart; k<kend; k++) { 126 if (ssmap[aj[k]]) { 127 lens[i]++; 128 } 129 } 130 } 131 /* Create and fill new matrix */ 132 if (scall == MAT_REUSE_MATRIX) { 133 c = (Mat_SeqSBAIJ *)((*B)->data); 134 135 if (c->mbs!=nrows || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 136 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 137 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 138 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 139 C = *B; 140 } else { 141 ierr = MatCreate(((PetscObject)A)->comm,&C);CHKERRQ(ierr); 142 ierr = MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 143 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 144 ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr); 145 } 146 c = (Mat_SeqSBAIJ *)(C->data); 147 for (i=0; i<nrows; i++) { 148 row = irow[i]; 149 kstart = ai[row]; 150 kend = kstart + a->ilen[row]; 151 mat_i = c->i[i]; 152 mat_j = c->j + mat_i; 153 mat_a = c->a + mat_i*bs2; 154 mat_ilen = c->ilen + i; 155 for (k=kstart; k<kend; k++) { 156 if ((tcol=ssmap[a->j[k]])) { 157 *mat_j++ = tcol - 1; 158 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 159 mat_a += bs2; 160 (*mat_ilen)++; 161 } 162 } 163 } 164 165 /* Free work space */ 166 ierr = PetscFree(smap);CHKERRQ(ierr); 167 ierr = PetscFree(lens);CHKERRQ(ierr); 168 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 169 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 170 171 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 172 *B = C; 173 PetscFunctionReturn(0); 174 } 175 176 #undef __FUNCT__ 177 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ" 178 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 179 { 180 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 181 IS is1; 182 PetscErrorCode ierr; 183 PetscInt *vary,*iary,nrows,i,bs=A->rmap->bs,count; 184 const PetscInt *irow; 185 186 PetscFunctionBegin; 187 if (isrow != iscol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro"); 188 189 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 190 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 191 192 /* Verify if the indices corespond to each element in a block 193 and form the IS with compressed IS */ 194 ierr = PetscMalloc2(a->mbs,PetscInt,&vary,a->mbs,PetscInt,&iary);CHKERRQ(ierr); 195 ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr); 196 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 197 198 count = 0; 199 for (i=0; i<a->mbs; i++) { 200 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Index set does not match blocks"); 201 if (vary[i]==bs) iary[count++] = i; 202 } 203 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 204 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr); 205 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 206 207 ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,scall,B);CHKERRQ(ierr); 208 ierr = ISDestroy(&is1);CHKERRQ(ierr); 209 PetscFunctionReturn(0); 210 } 211 212 #undef __FUNCT__ 213 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ" 214 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 215 { 216 PetscErrorCode ierr; 217 PetscInt i; 218 219 PetscFunctionBegin; 220 if (scall == MAT_INITIAL_MATRIX) { 221 ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr); 222 } 223 224 for (i=0; i<n; i++) { 225 ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 226 } 227 PetscFunctionReturn(0); 228 } 229 230 /* -------------------------------------------------------*/ 231 /* Should check that shapes of vectors and matrices match */ 232 /* -------------------------------------------------------*/ 233 234 #undef __FUNCT__ 235 #define __FUNCT__ "MatMult_SeqSBAIJ_2" 236 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz) 237 { 238 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 239 PetscScalar *x,*z,*xb,x1,x2,zero=0.0; 240 MatScalar *v; 241 PetscErrorCode ierr; 242 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 243 PetscInt nonzerorow=0; 244 245 PetscFunctionBegin; 246 ierr = VecSet(zz,zero);CHKERRQ(ierr); 247 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 248 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 249 250 v = a->a; 251 xb = x; 252 253 for (i=0; i<mbs; i++) { 254 n = ai[1] - ai[0]; /* length of i_th block row of A */ 255 x1 = xb[0]; x2 = xb[1]; 256 ib = aj + *ai; 257 jmin = 0; 258 nonzerorow += (n>0); 259 if (*ib == i){ /* (diag of A)*x */ 260 z[2*i] += v[0]*x1 + v[2]*x2; 261 z[2*i+1] += v[2]*x1 + v[3]*x2; 262 v += 4; jmin++; 263 } 264 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 265 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 266 for (j=jmin; j<n; j++) { 267 /* (strict lower triangular part of A)*x */ 268 cval = ib[j]*2; 269 z[cval] += v[0]*x1 + v[1]*x2; 270 z[cval+1] += v[2]*x1 + v[3]*x2; 271 /* (strict upper triangular part of A)*x */ 272 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 273 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 274 v += 4; 275 } 276 xb +=2; ai++; 277 } 278 279 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 280 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 281 ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 282 PetscFunctionReturn(0); 283 } 284 285 #undef __FUNCT__ 286 #define __FUNCT__ "MatMult_SeqSBAIJ_3" 287 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz) 288 { 289 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 290 PetscScalar *x,*z,*xb,x1,x2,x3,zero=0.0; 291 MatScalar *v; 292 PetscErrorCode ierr; 293 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 294 PetscInt nonzerorow=0; 295 296 PetscFunctionBegin; 297 ierr = VecSet(zz,zero);CHKERRQ(ierr); 298 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 299 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 300 301 v = a->a; 302 xb = x; 303 304 for (i=0; i<mbs; i++) { 305 n = ai[1] - ai[0]; /* length of i_th block row of A */ 306 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 307 ib = aj + *ai; 308 jmin = 0; 309 nonzerorow += (n>0); 310 if (*ib == i){ /* (diag of A)*x */ 311 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 312 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 313 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 314 v += 9; jmin++; 315 } 316 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 317 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 318 for (j=jmin; j<n; j++) { 319 /* (strict lower triangular part of A)*x */ 320 cval = ib[j]*3; 321 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 322 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 323 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 324 /* (strict upper triangular part of A)*x */ 325 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 326 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 327 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 328 v += 9; 329 } 330 xb +=3; ai++; 331 } 332 333 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 334 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 335 ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 336 PetscFunctionReturn(0); 337 } 338 339 #undef __FUNCT__ 340 #define __FUNCT__ "MatMult_SeqSBAIJ_4" 341 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz) 342 { 343 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 344 PetscScalar *x,*z,*xb,x1,x2,x3,x4,zero=0.0; 345 MatScalar *v; 346 PetscErrorCode ierr; 347 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 348 PetscInt nonzerorow=0; 349 350 PetscFunctionBegin; 351 ierr = VecSet(zz,zero);CHKERRQ(ierr); 352 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 353 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 354 355 v = a->a; 356 xb = x; 357 358 for (i=0; i<mbs; i++) { 359 n = ai[1] - ai[0]; /* length of i_th block row of A */ 360 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 361 ib = aj + *ai; 362 jmin = 0; 363 nonzerorow += (n>0); 364 if (*ib == i){ /* (diag of A)*x */ 365 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 366 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 367 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 368 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 369 v += 16; jmin++; 370 } 371 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 372 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 373 for (j=jmin; j<n; j++) { 374 /* (strict lower triangular part of A)*x */ 375 cval = ib[j]*4; 376 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 377 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 378 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 379 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 380 /* (strict upper triangular part of A)*x */ 381 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 382 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 383 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 384 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 385 v += 16; 386 } 387 xb +=4; ai++; 388 } 389 390 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 391 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 392 ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 393 PetscFunctionReturn(0); 394 } 395 396 #undef __FUNCT__ 397 #define __FUNCT__ "MatMult_SeqSBAIJ_5" 398 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz) 399 { 400 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 401 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0; 402 MatScalar *v; 403 PetscErrorCode ierr; 404 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 405 PetscInt nonzerorow=0; 406 407 PetscFunctionBegin; 408 ierr = VecSet(zz,zero);CHKERRQ(ierr); 409 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 410 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 411 412 v = a->a; 413 xb = x; 414 415 for (i=0; i<mbs; i++) { 416 n = ai[1] - ai[0]; /* length of i_th block row of A */ 417 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 418 ib = aj + *ai; 419 jmin = 0; 420 nonzerorow += (n>0); 421 if (*ib == i){ /* (diag of A)*x */ 422 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 423 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 424 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 425 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 426 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 427 v += 25; jmin++; 428 } 429 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 430 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 431 for (j=jmin; j<n; j++) { 432 /* (strict lower triangular part of A)*x */ 433 cval = ib[j]*5; 434 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 435 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 436 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 437 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 438 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 439 /* (strict upper triangular part of A)*x */ 440 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 441 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 442 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 443 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 444 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 445 v += 25; 446 } 447 xb +=5; ai++; 448 } 449 450 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 451 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 452 ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 453 PetscFunctionReturn(0); 454 } 455 456 457 #undef __FUNCT__ 458 #define __FUNCT__ "MatMult_SeqSBAIJ_6" 459 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz) 460 { 461 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 462 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0; 463 MatScalar *v; 464 PetscErrorCode ierr; 465 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 466 PetscInt nonzerorow=0; 467 468 PetscFunctionBegin; 469 ierr = VecSet(zz,zero);CHKERRQ(ierr); 470 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 471 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 472 473 v = a->a; 474 xb = x; 475 476 for (i=0; i<mbs; i++) { 477 n = ai[1] - ai[0]; /* length of i_th block row of A */ 478 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 479 ib = aj + *ai; 480 jmin = 0; 481 nonzerorow += (n>0); 482 if (*ib == i){ /* (diag of A)*x */ 483 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 484 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 485 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 486 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 487 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 488 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 489 v += 36; jmin++; 490 } 491 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 492 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 493 for (j=jmin; j<n; j++) { 494 /* (strict lower triangular part of A)*x */ 495 cval = ib[j]*6; 496 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 497 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 498 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 499 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 500 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 501 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 502 /* (strict upper triangular part of A)*x */ 503 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 504 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 505 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 506 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 507 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 508 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 509 v += 36; 510 } 511 xb +=6; ai++; 512 } 513 514 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 515 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 516 ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 517 PetscFunctionReturn(0); 518 } 519 #undef __FUNCT__ 520 #define __FUNCT__ "MatMult_SeqSBAIJ_7" 521 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz) 522 { 523 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 524 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0; 525 MatScalar *v; 526 PetscErrorCode ierr; 527 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 528 PetscInt nonzerorow=0; 529 530 PetscFunctionBegin; 531 ierr = VecSet(zz,zero);CHKERRQ(ierr); 532 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 533 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 534 535 v = a->a; 536 xb = x; 537 538 for (i=0; i<mbs; i++) { 539 n = ai[1] - ai[0]; /* length of i_th block row of A */ 540 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 541 ib = aj + *ai; 542 jmin = 0; 543 nonzerorow += (n>0); 544 if (*ib == i){ /* (diag of A)*x */ 545 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 546 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 547 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 548 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 549 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 550 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 551 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 552 v += 49; jmin++; 553 } 554 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 555 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 556 for (j=jmin; j<n; j++) { 557 /* (strict lower triangular part of A)*x */ 558 cval = ib[j]*7; 559 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 560 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 561 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 562 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 563 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 564 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 565 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 566 /* (strict upper triangular part of A)*x */ 567 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 568 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 569 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 570 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 571 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 572 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 573 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 574 v += 49; 575 } 576 xb +=7; ai++; 577 } 578 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 579 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 580 ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr); 581 PetscFunctionReturn(0); 582 } 583 584 /* 585 This will not work with MatScalar == float because it calls the BLAS 586 */ 587 #undef __FUNCT__ 588 #define __FUNCT__ "MatMult_SeqSBAIJ_N" 589 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz) 590 { 591 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 592 PetscScalar *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0; 593 MatScalar *v; 594 PetscErrorCode ierr; 595 PetscInt mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 596 PetscInt nonzerorow=0; 597 598 PetscFunctionBegin; 599 ierr = VecSet(zz,zero);CHKERRQ(ierr); 600 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 601 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 602 603 aj = a->j; 604 v = a->a; 605 ii = a->i; 606 607 if (!a->mult_work) { 608 ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 609 } 610 work = a->mult_work; 611 612 for (i=0; i<mbs; i++) { 613 n = ii[1] - ii[0]; ncols = n*bs; 614 workt = work; idx=aj+ii[0]; 615 nonzerorow += (n>0); 616 617 /* upper triangular part */ 618 for (j=0; j<n; j++) { 619 xb = x_ptr + bs*(*idx++); 620 for (k=0; k<bs; k++) workt[k] = xb[k]; 621 workt += bs; 622 } 623 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 624 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 625 626 /* strict lower triangular part */ 627 idx = aj+ii[0]; 628 if (*idx == i){ 629 ncols -= bs; v += bs2; idx++; n--; 630 } 631 632 if (ncols > 0){ 633 workt = work; 634 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 635 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 636 for (j=0; j<n; j++) { 637 zb = z_ptr + bs*(*idx++); 638 for (k=0; k<bs; k++) zb[k] += workt[k] ; 639 workt += bs; 640 } 641 } 642 x += bs; v += n*bs2; z += bs; ii++; 643 } 644 645 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 646 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 647 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr); 648 PetscFunctionReturn(0); 649 } 650 651 #undef __FUNCT__ 652 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1" 653 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 654 { 655 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 656 PetscScalar *x,*z,*xb,x1; 657 MatScalar *v; 658 PetscErrorCode ierr; 659 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 660 PetscInt nonzerorow=0; 661 662 PetscFunctionBegin; 663 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 664 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 665 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 666 v = a->a; 667 xb = x; 668 669 for (i=0; i<mbs; i++) { 670 n = ai[1] - ai[0]; /* length of i_th row of A */ 671 x1 = xb[0]; 672 ib = aj + *ai; 673 jmin = 0; 674 nonzerorow += (n>0); 675 if (*ib == i) { /* (diag of A)*x */ 676 z[i] += *v++ * x[*ib++]; jmin++; 677 } 678 for (j=jmin; j<n; j++) { 679 cval = *ib; 680 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 681 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 682 } 683 xb++; ai++; 684 } 685 686 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 687 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 688 689 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 690 PetscFunctionReturn(0); 691 } 692 693 #undef __FUNCT__ 694 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2" 695 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 696 { 697 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 698 PetscScalar *x,*z,*xb,x1,x2; 699 MatScalar *v; 700 PetscErrorCode ierr; 701 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 702 PetscInt nonzerorow=0; 703 704 PetscFunctionBegin; 705 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 706 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 707 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 708 709 v = a->a; 710 xb = x; 711 712 for (i=0; i<mbs; i++) { 713 n = ai[1] - ai[0]; /* length of i_th block row of A */ 714 x1 = xb[0]; x2 = xb[1]; 715 ib = aj + *ai; 716 jmin = 0; 717 nonzerorow += (n>0); 718 if (*ib == i){ /* (diag of A)*x */ 719 z[2*i] += v[0]*x1 + v[2]*x2; 720 z[2*i+1] += v[2]*x1 + v[3]*x2; 721 v += 4; jmin++; 722 } 723 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 724 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 725 for (j=jmin; j<n; j++) { 726 /* (strict lower triangular part of A)*x */ 727 cval = ib[j]*2; 728 z[cval] += v[0]*x1 + v[1]*x2; 729 z[cval+1] += v[2]*x1 + v[3]*x2; 730 /* (strict upper triangular part of A)*x */ 731 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 732 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 733 v += 4; 734 } 735 xb +=2; ai++; 736 } 737 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 738 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 739 740 ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 741 PetscFunctionReturn(0); 742 } 743 744 #undef __FUNCT__ 745 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3" 746 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 747 { 748 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 749 PetscScalar *x,*z,*xb,x1,x2,x3; 750 MatScalar *v; 751 PetscErrorCode ierr; 752 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 753 PetscInt nonzerorow=0; 754 755 PetscFunctionBegin; 756 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 757 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 758 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 759 760 v = a->a; 761 xb = x; 762 763 for (i=0; i<mbs; i++) { 764 n = ai[1] - ai[0]; /* length of i_th block row of A */ 765 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 766 ib = aj + *ai; 767 jmin = 0; 768 nonzerorow += (n>0); 769 if (*ib == i){ /* (diag of A)*x */ 770 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 771 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 772 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 773 v += 9; jmin++; 774 } 775 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 776 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 777 for (j=jmin; j<n; j++) { 778 /* (strict lower triangular part of A)*x */ 779 cval = ib[j]*3; 780 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 781 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 782 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 783 /* (strict upper triangular part of A)*x */ 784 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 785 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 786 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 787 v += 9; 788 } 789 xb +=3; ai++; 790 } 791 792 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 793 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 794 795 ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 796 PetscFunctionReturn(0); 797 } 798 799 #undef __FUNCT__ 800 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4" 801 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 802 { 803 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 804 PetscScalar *x,*z,*xb,x1,x2,x3,x4; 805 MatScalar *v; 806 PetscErrorCode ierr; 807 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 808 PetscInt nonzerorow=0; 809 810 PetscFunctionBegin; 811 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 812 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 813 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 814 815 v = a->a; 816 xb = x; 817 818 for (i=0; i<mbs; i++) { 819 n = ai[1] - ai[0]; /* length of i_th block row of A */ 820 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 821 ib = aj + *ai; 822 jmin = 0; 823 nonzerorow += (n>0); 824 if (*ib == i){ /* (diag of A)*x */ 825 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 826 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 827 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 828 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 829 v += 16; jmin++; 830 } 831 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 832 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 833 for (j=jmin; j<n; j++) { 834 /* (strict lower triangular part of A)*x */ 835 cval = ib[j]*4; 836 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 837 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 838 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 839 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 840 /* (strict upper triangular part of A)*x */ 841 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 842 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 843 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 844 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 845 v += 16; 846 } 847 xb +=4; ai++; 848 } 849 850 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 851 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 852 853 ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 854 PetscFunctionReturn(0); 855 } 856 857 #undef __FUNCT__ 858 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5" 859 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 860 { 861 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 862 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5; 863 MatScalar *v; 864 PetscErrorCode ierr; 865 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 866 PetscInt nonzerorow=0; 867 868 PetscFunctionBegin; 869 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 870 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 871 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 872 873 v = a->a; 874 xb = x; 875 876 for (i=0; i<mbs; i++) { 877 n = ai[1] - ai[0]; /* length of i_th block row of A */ 878 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 879 ib = aj + *ai; 880 jmin = 0; 881 nonzerorow += (n>0); 882 if (*ib == i){ /* (diag of A)*x */ 883 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 884 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 885 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 886 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 887 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 888 v += 25; jmin++; 889 } 890 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 891 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 892 for (j=jmin; j<n; j++) { 893 /* (strict lower triangular part of A)*x */ 894 cval = ib[j]*5; 895 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 896 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 897 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 898 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 899 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 900 /* (strict upper triangular part of A)*x */ 901 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 902 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 903 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 904 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 905 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 906 v += 25; 907 } 908 xb +=5; ai++; 909 } 910 911 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 912 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 913 914 ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 915 PetscFunctionReturn(0); 916 } 917 #undef __FUNCT__ 918 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6" 919 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 920 { 921 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 922 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6; 923 MatScalar *v; 924 PetscErrorCode ierr; 925 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 926 PetscInt nonzerorow=0; 927 928 PetscFunctionBegin; 929 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 930 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 931 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 932 933 v = a->a; 934 xb = x; 935 936 for (i=0; i<mbs; i++) { 937 n = ai[1] - ai[0]; /* length of i_th block row of A */ 938 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 939 ib = aj + *ai; 940 jmin = 0; 941 nonzerorow += (n>0); 942 if (*ib == i){ /* (diag of A)*x */ 943 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 944 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 945 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 946 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 947 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 948 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 949 v += 36; jmin++; 950 } 951 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 952 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 953 for (j=jmin; j<n; j++) { 954 /* (strict lower triangular part of A)*x */ 955 cval = ib[j]*6; 956 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 957 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 958 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 959 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 960 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 961 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 962 /* (strict upper triangular part of A)*x */ 963 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 964 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 965 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 966 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 967 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 968 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 969 v += 36; 970 } 971 xb +=6; ai++; 972 } 973 974 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 975 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 976 977 ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 978 PetscFunctionReturn(0); 979 } 980 981 #undef __FUNCT__ 982 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7" 983 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 984 { 985 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 986 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7; 987 MatScalar *v; 988 PetscErrorCode ierr; 989 PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin; 990 PetscInt nonzerorow=0; 991 992 PetscFunctionBegin; 993 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 994 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 995 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 996 997 v = a->a; 998 xb = x; 999 1000 for (i=0; i<mbs; i++) { 1001 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1002 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 1003 ib = aj + *ai; 1004 jmin = 0; 1005 nonzerorow += (n>0); 1006 if (*ib == i){ /* (diag of A)*x */ 1007 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 1008 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 1009 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 1010 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 1011 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 1012 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 1013 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1014 v += 49; jmin++; 1015 } 1016 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1017 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1018 for (j=jmin; j<n; j++) { 1019 /* (strict lower triangular part of A)*x */ 1020 cval = ib[j]*7; 1021 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 1022 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 1023 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 1024 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 1025 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 1026 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 1027 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1028 /* (strict upper triangular part of A)*x */ 1029 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 1030 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 1031 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 1032 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 1033 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 1034 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 1035 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 1036 v += 49; 1037 } 1038 xb +=7; ai++; 1039 } 1040 1041 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1042 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1043 1044 ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1045 PetscFunctionReturn(0); 1046 } 1047 1048 #undef __FUNCT__ 1049 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N" 1050 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1051 { 1052 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1053 PetscScalar *x,*x_ptr,*z,*z_ptr=0,*xb,*zb,*work,*workt; 1054 MatScalar *v; 1055 PetscErrorCode ierr; 1056 PetscInt mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k; 1057 PetscInt nonzerorow=0; 1058 1059 PetscFunctionBegin; 1060 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1061 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 1062 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 1063 1064 aj = a->j; 1065 v = a->a; 1066 ii = a->i; 1067 1068 if (!a->mult_work) { 1069 ierr = PetscMalloc((A->rmap->n+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1070 } 1071 work = a->mult_work; 1072 1073 1074 for (i=0; i<mbs; i++) { 1075 n = ii[1] - ii[0]; ncols = n*bs; 1076 workt = work; idx=aj+ii[0]; 1077 nonzerorow += (n>0); 1078 1079 /* upper triangular part */ 1080 for (j=0; j<n; j++) { 1081 xb = x_ptr + bs*(*idx++); 1082 for (k=0; k<bs; k++) workt[k] = xb[k]; 1083 workt += bs; 1084 } 1085 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1086 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1087 1088 /* strict lower triangular part */ 1089 idx = aj+ii[0]; 1090 if (*idx == i){ 1091 ncols -= bs; v += bs2; idx++; n--; 1092 } 1093 if (ncols > 0){ 1094 workt = work; 1095 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1096 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 1097 for (j=0; j<n; j++) { 1098 zb = z_ptr + bs*(*idx++); 1099 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1100 workt += bs; 1101 } 1102 } 1103 1104 x += bs; v += n*bs2; z += bs; ii++; 1105 } 1106 1107 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1108 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1109 1110 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr); 1111 PetscFunctionReturn(0); 1112 } 1113 1114 #undef __FUNCT__ 1115 #define __FUNCT__ "MatScale_SeqSBAIJ" 1116 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha) 1117 { 1118 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data; 1119 PetscScalar oalpha = alpha; 1120 PetscErrorCode ierr; 1121 PetscBLASInt one = 1,totalnz = PetscBLASIntCast(a->bs2*a->nz); 1122 1123 PetscFunctionBegin; 1124 BLASscal_(&totalnz,&oalpha,a->a,&one); 1125 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 1126 PetscFunctionReturn(0); 1127 } 1128 1129 #undef __FUNCT__ 1130 #define __FUNCT__ "MatNorm_SeqSBAIJ" 1131 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm) 1132 { 1133 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1134 MatScalar *v = a->a; 1135 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1136 PetscInt i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j; 1137 PetscErrorCode ierr; 1138 PetscInt *jl,*il,jmin,jmax,nexti,ik,*col; 1139 1140 PetscFunctionBegin; 1141 if (type == NORM_FROBENIUS) { 1142 for (k=0; k<mbs; k++){ 1143 jmin = a->i[k]; jmax = a->i[k+1]; 1144 col = aj + jmin; 1145 if (*col == k){ /* diagonal block */ 1146 for (i=0; i<bs2; i++){ 1147 #if defined(PETSC_USE_COMPLEX) 1148 sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++; 1149 #else 1150 sum_diag += (*v)*(*v); v++; 1151 #endif 1152 } 1153 jmin++; 1154 } 1155 for (j=jmin; j<jmax; j++){ /* off-diagonal blocks */ 1156 for (i=0; i<bs2; i++){ 1157 #if defined(PETSC_USE_COMPLEX) 1158 sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++; 1159 #else 1160 sum_off += (*v)*(*v); v++; 1161 #endif 1162 } 1163 } 1164 } 1165 *norm = sqrt(sum_diag + 2*sum_off); 1166 } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */ 1167 ierr = PetscMalloc3(bs,PetscReal,&sum,mbs,PetscInt,&il,mbs,PetscInt,&jl);CHKERRQ(ierr); 1168 for (i=0; i<mbs; i++) jl[i] = mbs; 1169 il[0] = 0; 1170 1171 *norm = 0.0; 1172 for (k=0; k<mbs; k++) { /* k_th block row */ 1173 for (j=0; j<bs; j++) sum[j]=0.0; 1174 /*-- col sum --*/ 1175 i = jl[k]; /* first |A(i,k)| to be added */ 1176 /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1177 at step k */ 1178 while (i<mbs){ 1179 nexti = jl[i]; /* next block row to be added */ 1180 ik = il[i]; /* block index of A(i,k) in the array a */ 1181 for (j=0; j<bs; j++){ 1182 v = a->a + ik*bs2 + j*bs; 1183 for (k1=0; k1<bs; k1++) { 1184 sum[j] += PetscAbsScalar(*v); v++; 1185 } 1186 } 1187 /* update il, jl */ 1188 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1189 jmax = a->i[i+1]; 1190 if (jmin < jmax){ 1191 il[i] = jmin; 1192 j = a->j[jmin]; 1193 jl[i] = jl[j]; jl[j]=i; 1194 } 1195 i = nexti; 1196 } 1197 /*-- row sum --*/ 1198 jmin = a->i[k]; jmax = a->i[k+1]; 1199 for (i=jmin; i<jmax; i++) { 1200 for (j=0; j<bs; j++){ 1201 v = a->a + i*bs2 + j; 1202 for (k1=0; k1<bs; k1++){ 1203 sum[j] += PetscAbsScalar(*v); v += bs; 1204 } 1205 } 1206 } 1207 /* add k_th block row to il, jl */ 1208 col = aj+jmin; 1209 if (*col == k) jmin++; 1210 if (jmin < jmax){ 1211 il[k] = jmin; 1212 j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k; 1213 } 1214 for (j=0; j<bs; j++){ 1215 if (sum[j] > *norm) *norm = sum[j]; 1216 } 1217 } 1218 ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr); 1219 } else { 1220 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 1221 } 1222 PetscFunctionReturn(0); 1223 } 1224 1225 #undef __FUNCT__ 1226 #define __FUNCT__ "MatEqual_SeqSBAIJ" 1227 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg) 1228 { 1229 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data; 1230 PetscErrorCode ierr; 1231 1232 PetscFunctionBegin; 1233 1234 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1235 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 1236 *flg = PETSC_FALSE; 1237 PetscFunctionReturn(0); 1238 } 1239 1240 /* if the a->i are the same */ 1241 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1242 if (!*flg) { 1243 PetscFunctionReturn(0); 1244 } 1245 1246 /* if a->j are the same */ 1247 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 1248 if (!*flg) { 1249 PetscFunctionReturn(0); 1250 } 1251 /* if a->a are the same */ 1252 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1253 PetscFunctionReturn(0); 1254 } 1255 1256 #undef __FUNCT__ 1257 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ" 1258 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v) 1259 { 1260 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1261 PetscErrorCode ierr; 1262 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 1263 PetscScalar *x,zero = 0.0; 1264 MatScalar *aa,*aa_j; 1265 1266 PetscFunctionBegin; 1267 bs = A->rmap->bs; 1268 if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1"); 1269 1270 aa = a->a; 1271 ai = a->i; 1272 aj = a->j; 1273 ambs = a->mbs; 1274 bs2 = a->bs2; 1275 1276 ierr = VecSet(v,zero);CHKERRQ(ierr); 1277 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1278 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1279 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1280 for (i=0; i<ambs; i++) { 1281 j=ai[i]; 1282 if (aj[j] == i) { /* if this is a diagonal element */ 1283 row = i*bs; 1284 aa_j = aa + j*bs2; 1285 if (A->factortype && bs==1){ 1286 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k]; 1287 } else { 1288 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1289 } 1290 } 1291 } 1292 1293 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1294 PetscFunctionReturn(0); 1295 } 1296 1297 #undef __FUNCT__ 1298 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ" 1299 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr) 1300 { 1301 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1302 PetscScalar *l,x,*li,*ri; 1303 MatScalar *aa,*v; 1304 PetscErrorCode ierr; 1305 PetscInt i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2; 1306 PetscBool flg; 1307 1308 PetscFunctionBegin; 1309 if (ll != rr){ 1310 ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr); 1311 if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n"); 1312 } 1313 if (!ll) PetscFunctionReturn(0); 1314 ai = a->i; 1315 aj = a->j; 1316 aa = a->a; 1317 m = A->rmap->N; 1318 bs = A->rmap->bs; 1319 mbs = a->mbs; 1320 bs2 = a->bs2; 1321 1322 ierr = VecGetArray(ll,&l);CHKERRQ(ierr); 1323 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1324 if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 1325 for (i=0; i<mbs; i++) { /* for each block row */ 1326 M = ai[i+1] - ai[i]; 1327 li = l + i*bs; 1328 v = aa + bs2*ai[i]; 1329 for (j=0; j<M; j++) { /* for each block */ 1330 ri = l + bs*aj[ai[i]+j]; 1331 for (k=0; k<bs; k++) { 1332 x = ri[k]; 1333 for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x; 1334 } 1335 } 1336 } 1337 ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr); 1338 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1339 PetscFunctionReturn(0); 1340 } 1341 1342 #undef __FUNCT__ 1343 #define __FUNCT__ "MatGetInfo_SeqSBAIJ" 1344 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info) 1345 { 1346 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1347 1348 PetscFunctionBegin; 1349 info->block_size = a->bs2; 1350 info->nz_allocated = a->bs2*a->maxnz; /*num. of nonzeros in upper triangular part */ 1351 info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */ 1352 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 1353 info->assemblies = A->num_ass; 1354 info->mallocs = A->info.mallocs; 1355 info->memory = ((PetscObject)A)->mem; 1356 if (A->factortype) { 1357 info->fill_ratio_given = A->info.fill_ratio_given; 1358 info->fill_ratio_needed = A->info.fill_ratio_needed; 1359 info->factor_mallocs = A->info.factor_mallocs; 1360 } else { 1361 info->fill_ratio_given = 0; 1362 info->fill_ratio_needed = 0; 1363 info->factor_mallocs = 0; 1364 } 1365 PetscFunctionReturn(0); 1366 } 1367 1368 1369 #undef __FUNCT__ 1370 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ" 1371 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A) 1372 { 1373 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1374 PetscErrorCode ierr; 1375 1376 PetscFunctionBegin; 1377 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 1378 PetscFunctionReturn(0); 1379 } 1380 1381 #undef __FUNCT__ 1382 #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ" 1383 /* 1384 This code does not work since it only checks the upper triangular part of 1385 the matrix. Hence it is not listed in the function table. 1386 */ 1387 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[]) 1388 { 1389 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1390 PetscErrorCode ierr; 1391 PetscInt i,j,n,row,col,bs,*ai,*aj,mbs; 1392 PetscReal atmp; 1393 MatScalar *aa; 1394 PetscScalar *x; 1395 PetscInt ncols,brow,bcol,krow,kcol; 1396 1397 PetscFunctionBegin; 1398 if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov"); 1399 if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1400 bs = A->rmap->bs; 1401 aa = a->a; 1402 ai = a->i; 1403 aj = a->j; 1404 mbs = a->mbs; 1405 1406 ierr = VecSet(v,0.0);CHKERRQ(ierr); 1407 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1408 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1409 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1410 for (i=0; i<mbs; i++) { 1411 ncols = ai[1] - ai[0]; ai++; 1412 brow = bs*i; 1413 for (j=0; j<ncols; j++){ 1414 bcol = bs*(*aj); 1415 for (kcol=0; kcol<bs; kcol++){ 1416 col = bcol + kcol; /* col index */ 1417 for (krow=0; krow<bs; krow++){ 1418 atmp = PetscAbsScalar(*aa); aa++; 1419 row = brow + krow; /* row index */ 1420 if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1421 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 1422 } 1423 } 1424 aj++; 1425 } 1426 } 1427 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1428 PetscFunctionReturn(0); 1429 } 1430