1 /*$Id: sbaij2.c,v 1.32 2001/08/07 03:03:01 balay Exp $*/ 2 3 #include "src/mat/impls/baij/seq/baij.h" 4 #include "src/inline/spops.h" 5 #include "src/inline/ilu.h" 6 #include "petscbt.h" 7 #include "src/mat/impls/sbaij/seq/sbaij.h" 8 9 #undef __FUNCT__ 10 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ" 11 int MatIncreaseOverlap_SeqSBAIJ(Mat A,int is_max,IS is[],int ov) 12 { 13 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 14 int brow,i,j,k,l,mbs,n,*idx,ierr,*nidx,isz,bcol, 15 start,end,*ai,*aj,bs,*nidx2; 16 PetscBT table; 17 PetscBT table0; 18 19 PetscFunctionBegin; 20 mbs = a->mbs; 21 ai = a->i; 22 aj = a->j; 23 bs = a->bs; 24 25 if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 26 27 ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr); 28 ierr = PetscMalloc((mbs+1)*sizeof(int),&nidx);CHKERRQ(ierr); 29 ierr = PetscMalloc((A->m+1)*sizeof(int),&nidx2);CHKERRQ(ierr); 30 ierr = PetscBTCreate(mbs,table0);CHKERRQ(ierr); 31 32 for (i=0; i<is_max; i++) { /* for each is */ 33 isz = 0; 34 ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr); 35 36 /* Extract the indices, assume there can be duplicate entries */ 37 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 38 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 39 40 /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */ 41 for (j=0; j<n ; ++j){ 42 brow = idx[j]/bs; /* convert the indices into block indices */ 43 if (brow >= mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 44 if(!PetscBTLookupSet(table,brow)) { nidx[isz++] = brow;} 45 } 46 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 47 ierr = ISDestroy(is[i]);CHKERRQ(ierr); 48 49 k = 0; 50 for (j=0; j<ov; j++){ /* for each overlap */ 51 /* set table0 for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */ 52 ierr = PetscBTMemzero(mbs,table0);CHKERRQ(ierr); 53 for (l=k; l<isz; l++) PetscBTSet(table0,nidx[l]); 54 55 n = isz; /* length of the updated is[i] */ 56 for (brow=0; brow<mbs; brow++){ 57 start = ai[brow]; end = ai[brow+1]; 58 if (PetscBTLookup(table0,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */ 59 for (l = start; l<end ; l++){ 60 bcol = aj[l]; 61 if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;} 62 } 63 k++; 64 if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */ 65 } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */ 66 for (l = start; l<end ; l++){ 67 bcol = aj[l]; 68 if (PetscBTLookup(table0,bcol)){ 69 if (!PetscBTLookupSet(table,brow)) {nidx[isz++] = brow;} 70 break; /* for l = start; l<end ; l++) */ 71 } 72 } 73 } 74 } 75 } /* for each overlap */ 76 77 /* expand the Index Set */ 78 for (j=0; j<isz; j++) { 79 for (k=0; k<bs; k++) 80 nidx2[j*bs+k] = nidx[j]*bs+k; 81 } 82 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr); 83 } 84 ierr = PetscBTDestroy(table);CHKERRQ(ierr); 85 ierr = PetscFree(nidx);CHKERRQ(ierr); 86 ierr = PetscFree(nidx2);CHKERRQ(ierr); 87 ierr = PetscBTDestroy(table0);CHKERRQ(ierr); 88 PetscFunctionReturn(0); 89 } 90 91 #undef __FUNCT__ 92 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private" 93 int MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B) 94 { 95 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c; 96 int *smap,i,k,kstart,kend,ierr,oldcols = a->mbs,*lens; 97 int row,mat_i,*mat_j,tcol,*mat_ilen; 98 int *irow,nrows,*ssmap,bs=a->bs,bs2=a->bs2; 99 int *aj = a->j,*ai = a->i; 100 MatScalar *mat_a; 101 Mat C; 102 PetscTruth flag; 103 104 PetscFunctionBegin; 105 106 if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro"); 107 ierr = ISSorted(iscol,(PetscTruth*)&i);CHKERRQ(ierr); 108 if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 109 110 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 111 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 112 113 ierr = PetscMalloc((1+oldcols)*sizeof(int),&smap);CHKERRQ(ierr); 114 ssmap = smap; 115 ierr = PetscMalloc((1+nrows)*sizeof(int),&lens);CHKERRQ(ierr); 116 ierr = PetscMemzero(smap,oldcols*sizeof(int));CHKERRQ(ierr); 117 for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */ 118 /* determine lens of each row */ 119 for (i=0; i<nrows; i++) { 120 kstart = ai[irow[i]]; 121 kend = kstart + a->ilen[irow[i]]; 122 lens[i] = 0; 123 for (k=kstart; k<kend; k++) { 124 if (ssmap[aj[k]]) { 125 lens[i]++; 126 } 127 } 128 } 129 /* Create and fill new matrix */ 130 if (scall == MAT_REUSE_MATRIX) { 131 c = (Mat_SeqSBAIJ *)((*B)->data); 132 133 if (c->mbs!=nrows || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 134 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(int),&flag);CHKERRQ(ierr); 135 if (flag == PETSC_FALSE) { 136 SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 137 } 138 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(int));CHKERRQ(ierr); 139 C = *B; 140 } else { 141 ierr = MatCreate(A->comm,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE,&C);CHKERRQ(ierr); 142 ierr = MatSetType(C,A->type_name);CHKERRQ(ierr); 143 ierr = MatSeqSBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr); 144 } 145 c = (Mat_SeqSBAIJ *)(C->data); 146 for (i=0; i<nrows; i++) { 147 row = irow[i]; 148 kstart = ai[row]; 149 kend = kstart + a->ilen[row]; 150 mat_i = c->i[i]; 151 mat_j = c->j + mat_i; 152 mat_a = c->a + mat_i*bs2; 153 mat_ilen = c->ilen + i; 154 for (k=kstart; k<kend; k++) { 155 if ((tcol=ssmap[a->j[k]])) { 156 *mat_j++ = tcol - 1; 157 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 158 mat_a += bs2; 159 (*mat_ilen)++; 160 } 161 } 162 } 163 164 /* Free work space */ 165 ierr = PetscFree(smap);CHKERRQ(ierr); 166 ierr = PetscFree(lens);CHKERRQ(ierr); 167 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 168 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 169 170 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 171 *B = C; 172 PetscFunctionReturn(0); 173 } 174 175 #undef __FUNCT__ 176 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ" 177 int MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B) 178 { 179 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 180 IS is1; 181 int *vary,*iary,*irow,nrows,i,ierr,bs=a->bs,count; 182 183 PetscFunctionBegin; 184 if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro"); 185 186 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 187 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 188 189 /* Verify if the indices corespond to each element in a block 190 and form the IS with compressed IS */ 191 ierr = PetscMalloc(2*(a->mbs+1)*sizeof(int),&vary);CHKERRQ(ierr); 192 iary = vary + a->mbs; 193 ierr = PetscMemzero(vary,(a->mbs)*sizeof(int));CHKERRQ(ierr); 194 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 195 196 count = 0; 197 for (i=0; i<a->mbs; i++) { 198 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(1,"Index set does not match blocks"); 199 if (vary[i]==bs) iary[count++] = i; 200 } 201 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr); 202 203 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 204 ierr = PetscFree(vary);CHKERRQ(ierr); 205 206 ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);CHKERRQ(ierr); 207 ISDestroy(is1); 208 PetscFunctionReturn(0); 209 } 210 211 #undef __FUNCT__ 212 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ" 213 int MatGetSubMatrices_SeqSBAIJ(Mat A,int n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 214 { 215 int ierr,i; 216 217 PetscFunctionBegin; 218 if (scall == MAT_INITIAL_MATRIX) { 219 ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr); 220 } 221 222 for (i=0; i<n; i++) { 223 ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr); 224 } 225 PetscFunctionReturn(0); 226 } 227 228 /* -------------------------------------------------------*/ 229 /* Should check that shapes of vectors and matrices match */ 230 /* -------------------------------------------------------*/ 231 #include "petscblaslapack.h" 232 233 #undef __FUNCT__ 234 #define __FUNCT__ "MatMult_SeqSBAIJ_1" 235 int MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz) 236 { 237 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 238 PetscScalar *x,*z,*xb,x1,zero=0.0; 239 MatScalar *v; 240 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 241 242 PetscFunctionBegin; 243 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 244 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 245 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 246 247 v = a->a; 248 xb = x; 249 250 for (i=0; i<mbs; i++) { 251 n = ai[1] - ai[0]; /* length of i_th row of A */ 252 x1 = xb[0]; 253 ib = aj + *ai; 254 jmin = 0; 255 if (*ib == i) { /* (diag of A)*x */ 256 z[i] += *v++ * x[*ib++]; 257 jmin++; 258 } 259 for (j=jmin; j<n; j++) { 260 cval = *ib; 261 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 262 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 263 } 264 xb++; ai++; 265 } 266 267 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 268 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 269 PetscLogFlops(2*(a->nz*2 - A->m) - A->m); /* nz = (nz+m)/2 */ 270 PetscFunctionReturn(0); 271 } 272 273 #undef __FUNCT__ 274 #define __FUNCT__ "MatMult_SeqSBAIJ_2" 275 int MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz) 276 { 277 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 278 PetscScalar *x,*z,*xb,x1,x2,zero=0.0; 279 MatScalar *v; 280 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 281 282 283 PetscFunctionBegin; 284 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 285 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 286 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 287 288 v = a->a; 289 xb = x; 290 291 for (i=0; i<mbs; i++) { 292 n = ai[1] - ai[0]; /* length of i_th block row of A */ 293 x1 = xb[0]; x2 = xb[1]; 294 ib = aj + *ai; 295 jmin = 0; 296 if (*ib == i){ /* (diag of A)*x */ 297 z[2*i] += v[0]*x1 + v[2]*x2; 298 z[2*i+1] += v[2]*x1 + v[3]*x2; 299 v += 4; jmin++; 300 } 301 for (j=jmin; j<n; j++) { 302 /* (strict lower triangular part of A)*x */ 303 cval = ib[j]*2; 304 z[cval] += v[0]*x1 + v[1]*x2; 305 z[cval+1] += v[2]*x1 + v[3]*x2; 306 /* (strict upper triangular part of A)*x */ 307 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 308 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 309 v += 4; 310 } 311 xb +=2; ai++; 312 } 313 314 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 315 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 316 PetscLogFlops(8*(a->nz*2 - A->m) - A->m); 317 PetscFunctionReturn(0); 318 } 319 320 #undef __FUNCT__ 321 #define __FUNCT__ "MatMult_SeqSBAIJ_3" 322 int MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz) 323 { 324 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 325 PetscScalar *x,*z,*xb,x1,x2,x3,zero=0.0; 326 MatScalar *v; 327 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 328 329 330 PetscFunctionBegin; 331 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 332 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 333 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 334 335 v = a->a; 336 xb = x; 337 338 for (i=0; i<mbs; i++) { 339 n = ai[1] - ai[0]; /* length of i_th block row of A */ 340 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 341 ib = aj + *ai; 342 jmin = 0; 343 if (*ib == i){ /* (diag of A)*x */ 344 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 345 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 346 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 347 v += 9; jmin++; 348 } 349 for (j=jmin; j<n; j++) { 350 /* (strict lower triangular part of A)*x */ 351 cval = ib[j]*3; 352 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 353 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 354 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 355 /* (strict upper triangular part of A)*x */ 356 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 357 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 358 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 359 v += 9; 360 } 361 xb +=3; ai++; 362 } 363 364 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 365 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 366 PetscLogFlops(18*(a->nz*2 - A->m) - A->m); 367 PetscFunctionReturn(0); 368 } 369 370 #undef __FUNCT__ 371 #define __FUNCT__ "MatMult_SeqSBAIJ_4" 372 int MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz) 373 { 374 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 375 PetscScalar *x,*z,*xb,x1,x2,x3,x4,zero=0.0; 376 MatScalar *v; 377 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 378 379 PetscFunctionBegin; 380 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 381 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 382 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 383 384 v = a->a; 385 xb = x; 386 387 for (i=0; i<mbs; i++) { 388 n = ai[1] - ai[0]; /* length of i_th block row of A */ 389 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 390 ib = aj + *ai; 391 jmin = 0; 392 if (*ib == i){ /* (diag of A)*x */ 393 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 394 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 395 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 396 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 397 v += 16; jmin++; 398 } 399 for (j=jmin; j<n; j++) { 400 /* (strict lower triangular part of A)*x */ 401 cval = ib[j]*4; 402 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 403 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 404 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 405 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 406 /* (strict upper triangular part of A)*x */ 407 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 408 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 409 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 410 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 411 v += 16; 412 } 413 xb +=4; ai++; 414 } 415 416 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 417 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 418 PetscLogFlops(32*(a->nz*2 - A->m) - A->m); 419 PetscFunctionReturn(0); 420 } 421 422 #undef __FUNCT__ 423 #define __FUNCT__ "MatMult_SeqSBAIJ_5" 424 int MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz) 425 { 426 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 427 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0; 428 MatScalar *v; 429 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 430 431 PetscFunctionBegin; 432 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 433 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 434 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 435 436 v = a->a; 437 xb = x; 438 439 for (i=0; i<mbs; i++) { 440 n = ai[1] - ai[0]; /* length of i_th block row of A */ 441 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 442 ib = aj + *ai; 443 jmin = 0; 444 if (*ib == i){ /* (diag of A)*x */ 445 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 446 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 447 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 448 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 449 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 450 v += 25; jmin++; 451 } 452 for (j=jmin; j<n; j++) { 453 /* (strict lower triangular part of A)*x */ 454 cval = ib[j]*5; 455 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 456 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 457 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 458 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 459 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 460 /* (strict upper triangular part of A)*x */ 461 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 462 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 463 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 464 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 465 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 466 v += 25; 467 } 468 xb +=5; ai++; 469 } 470 471 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 472 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 473 PetscLogFlops(50*(a->nz*2 - A->m) - A->m); 474 PetscFunctionReturn(0); 475 } 476 477 478 #undef __FUNCT__ 479 #define __FUNCT__ "MatMult_SeqSBAIJ_6" 480 int MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz) 481 { 482 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 483 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0; 484 MatScalar *v; 485 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 486 487 PetscFunctionBegin; 488 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 489 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 490 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 491 492 v = a->a; 493 xb = x; 494 495 for (i=0; i<mbs; i++) { 496 n = ai[1] - ai[0]; /* length of i_th block row of A */ 497 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 498 ib = aj + *ai; 499 jmin = 0; 500 if (*ib == i){ /* (diag of A)*x */ 501 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 502 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 503 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 504 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 505 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 506 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 507 v += 36; jmin++; 508 } 509 for (j=jmin; j<n; j++) { 510 /* (strict lower triangular part of A)*x */ 511 cval = ib[j]*6; 512 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 513 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 514 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 515 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 516 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 517 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 518 /* (strict upper triangular part of A)*x */ 519 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 520 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 521 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 522 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 523 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 524 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 525 v += 36; 526 } 527 xb +=6; ai++; 528 } 529 530 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 531 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 532 PetscLogFlops(72*(a->nz*2 - A->m) - A->m); 533 PetscFunctionReturn(0); 534 } 535 #undef __FUNCT__ 536 #define __FUNCT__ "MatMult_SeqSBAIJ_7" 537 int MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz) 538 { 539 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 540 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0; 541 MatScalar *v; 542 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 543 544 PetscFunctionBegin; 545 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 546 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 547 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 548 549 v = a->a; 550 xb = x; 551 552 for (i=0; i<mbs; i++) { 553 n = ai[1] - ai[0]; /* length of i_th block row of A */ 554 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 555 ib = aj + *ai; 556 jmin = 0; 557 if (*ib == i){ /* (diag of A)*x */ 558 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 559 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 560 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 561 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 562 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 563 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 564 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 565 v += 49; jmin++; 566 } 567 for (j=jmin; j<n; j++) { 568 /* (strict lower triangular part of A)*x */ 569 cval = ib[j]*7; 570 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 571 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 572 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 573 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 574 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 575 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 576 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 577 /* (strict upper triangular part of A)*x */ 578 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 579 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 580 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 581 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 582 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 583 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 584 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 585 v += 49; 586 } 587 xb +=7; ai++; 588 } 589 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 590 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 591 PetscLogFlops(98*(a->nz*2 - A->m) - A->m); 592 PetscFunctionReturn(0); 593 } 594 595 /* 596 This will not work with MatScalar == float because it calls the BLAS 597 */ 598 #undef __FUNCT__ 599 #define __FUNCT__ "MatMult_SeqSBAIJ_N" 600 int MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz) 601 { 602 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 603 PetscScalar *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0; 604 MatScalar *v; 605 int ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2; 606 int ncols,k; 607 608 PetscFunctionBegin; 609 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 610 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 611 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 612 613 aj = a->j; 614 v = a->a; 615 ii = a->i; 616 617 if (!a->mult_work) { 618 ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 619 } 620 work = a->mult_work; 621 622 for (i=0; i<mbs; i++) { 623 n = ii[1] - ii[0]; ncols = n*bs; 624 workt = work; idx=aj+ii[0]; 625 626 /* upper triangular part */ 627 for (j=0; j<n; j++) { 628 xb = x_ptr + bs*(*idx++); 629 for (k=0; k<bs; k++) workt[k] = xb[k]; 630 workt += bs; 631 } 632 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 633 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 634 635 /* strict lower triangular part */ 636 idx = aj+ii[0]; 637 if (*idx == i){ 638 ncols -= bs; v += bs2; idx++; n--; 639 } 640 641 if (ncols > 0){ 642 workt = work; 643 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 644 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 645 for (j=0; j<n; j++) { 646 zb = z_ptr + bs*(*idx++); 647 for (k=0; k<bs; k++) zb[k] += workt[k] ; 648 workt += bs; 649 } 650 } 651 x += bs; v += n*bs2; z += bs; ii++; 652 } 653 654 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 655 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 656 PetscLogFlops(2*(a->nz*2 - A->m)*bs2 - A->m); 657 PetscFunctionReturn(0); 658 } 659 660 #undef __FUNCT__ 661 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1" 662 int MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 663 { 664 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 665 PetscScalar *x,*y,*z,*xb,x1; 666 MatScalar *v; 667 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 668 669 PetscFunctionBegin; 670 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 671 if (yy != xx) { 672 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 673 } else { 674 y = x; 675 } 676 if (zz != yy) { 677 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 678 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 679 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 680 } else { 681 z = y; 682 } 683 684 v = a->a; 685 xb = x; 686 687 for (i=0; i<mbs; i++) { 688 n = ai[1] - ai[0]; /* length of i_th row of A */ 689 x1 = xb[0]; 690 ib = aj + *ai; 691 jmin = 0; 692 if (*ib == i) { /* (diag of A)*x */ 693 z[i] += *v++ * x[*ib++]; jmin++; 694 } 695 for (j=jmin; j<n; j++) { 696 cval = *ib; 697 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 698 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 699 } 700 xb++; ai++; 701 } 702 703 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 704 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 705 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 706 707 PetscLogFlops(2*(a->nz*2 - A->m)); 708 PetscFunctionReturn(0); 709 } 710 711 #undef __FUNCT__ 712 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2" 713 int MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 714 { 715 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 716 PetscScalar *x,*y,*z,*xb,x1,x2; 717 MatScalar *v; 718 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 719 720 PetscFunctionBegin; 721 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 722 if (yy != xx) { 723 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 724 } else { 725 y = x; 726 } 727 if (zz != yy) { 728 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 729 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 730 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 731 } else { 732 z = y; 733 } 734 735 v = a->a; 736 xb = x; 737 738 for (i=0; i<mbs; i++) { 739 n = ai[1] - ai[0]; /* length of i_th block row of A */ 740 x1 = xb[0]; x2 = xb[1]; 741 ib = aj + *ai; 742 jmin = 0; 743 if (*ib == i){ /* (diag of A)*x */ 744 z[2*i] += v[0]*x1 + v[2]*x2; 745 z[2*i+1] += v[2]*x1 + v[3]*x2; 746 v += 4; jmin++; 747 } 748 for (j=jmin; j<n; j++) { 749 /* (strict lower triangular part of A)*x */ 750 cval = ib[j]*2; 751 z[cval] += v[0]*x1 + v[1]*x2; 752 z[cval+1] += v[2]*x1 + v[3]*x2; 753 /* (strict upper triangular part of A)*x */ 754 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 755 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 756 v += 4; 757 } 758 xb +=2; ai++; 759 } 760 761 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 762 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 763 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 764 765 PetscLogFlops(4*(a->nz*2 - A->m)); 766 PetscFunctionReturn(0); 767 } 768 769 #undef __FUNCT__ 770 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3" 771 int MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 772 { 773 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 774 PetscScalar *x,*y,*z,*xb,x1,x2,x3; 775 MatScalar *v; 776 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 777 778 PetscFunctionBegin; 779 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 780 if (yy != xx) { 781 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 782 } else { 783 y = x; 784 } 785 if (zz != yy) { 786 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 787 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 788 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 789 } else { 790 z = y; 791 } 792 793 v = a->a; 794 xb = x; 795 796 for (i=0; i<mbs; i++) { 797 n = ai[1] - ai[0]; /* length of i_th block row of A */ 798 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 799 ib = aj + *ai; 800 jmin = 0; 801 if (*ib == i){ /* (diag of A)*x */ 802 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 803 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 804 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 805 v += 9; jmin++; 806 } 807 for (j=jmin; j<n; j++) { 808 /* (strict lower triangular part of A)*x */ 809 cval = ib[j]*3; 810 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 811 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 812 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 813 /* (strict upper triangular part of A)*x */ 814 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 815 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 816 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 817 v += 9; 818 } 819 xb +=3; ai++; 820 } 821 822 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 823 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 824 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 825 826 PetscLogFlops(18*(a->nz*2 - A->m)); 827 PetscFunctionReturn(0); 828 } 829 830 #undef __FUNCT__ 831 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4" 832 int MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 833 { 834 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 835 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4; 836 MatScalar *v; 837 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 838 839 PetscFunctionBegin; 840 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 841 if (yy != xx) { 842 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 843 } else { 844 y = x; 845 } 846 if (zz != yy) { 847 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 848 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 849 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 850 } else { 851 z = y; 852 } 853 854 v = a->a; 855 xb = x; 856 857 for (i=0; i<mbs; i++) { 858 n = ai[1] - ai[0]; /* length of i_th block row of A */ 859 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 860 ib = aj + *ai; 861 jmin = 0; 862 if (*ib == i){ /* (diag of A)*x */ 863 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 864 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 865 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 866 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 867 v += 16; jmin++; 868 } 869 for (j=jmin; j<n; j++) { 870 /* (strict lower triangular part of A)*x */ 871 cval = ib[j]*4; 872 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 873 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 874 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 875 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 876 /* (strict upper triangular part of A)*x */ 877 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 878 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 879 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 880 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 881 v += 16; 882 } 883 xb +=4; ai++; 884 } 885 886 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 887 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 888 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 889 890 PetscLogFlops(32*(a->nz*2 - A->m)); 891 PetscFunctionReturn(0); 892 } 893 894 #undef __FUNCT__ 895 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5" 896 int MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 897 { 898 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 899 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4,x5; 900 MatScalar *v; 901 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 902 903 PetscFunctionBegin; 904 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 905 if (yy != xx) { 906 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 907 } else { 908 y = x; 909 } 910 if (zz != yy) { 911 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 912 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 913 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 914 } else { 915 z = y; 916 } 917 918 v = a->a; 919 xb = x; 920 921 for (i=0; i<mbs; i++) { 922 n = ai[1] - ai[0]; /* length of i_th block row of A */ 923 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 924 ib = aj + *ai; 925 jmin = 0; 926 if (*ib == i){ /* (diag of A)*x */ 927 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 928 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 929 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 930 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 931 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 932 v += 25; jmin++; 933 } 934 for (j=jmin; j<n; j++) { 935 /* (strict lower triangular part of A)*x */ 936 cval = ib[j]*5; 937 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 938 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 939 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 940 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 941 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 942 /* (strict upper triangular part of A)*x */ 943 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 944 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 945 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 946 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 947 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 948 v += 25; 949 } 950 xb +=5; ai++; 951 } 952 953 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 954 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 955 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 956 957 PetscLogFlops(50*(a->nz*2 - A->m)); 958 PetscFunctionReturn(0); 959 } 960 #undef __FUNCT__ 961 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6" 962 int MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 963 { 964 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 965 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6; 966 MatScalar *v; 967 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 968 969 PetscFunctionBegin; 970 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 971 if (yy != xx) { 972 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 973 } else { 974 y = x; 975 } 976 if (zz != yy) { 977 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 978 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 979 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 980 } else { 981 z = y; 982 } 983 984 v = a->a; 985 xb = x; 986 987 for (i=0; i<mbs; i++) { 988 n = ai[1] - ai[0]; /* length of i_th block row of A */ 989 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 990 ib = aj + *ai; 991 jmin = 0; 992 if (*ib == i){ /* (diag of A)*x */ 993 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 994 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 995 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 996 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 997 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 998 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 999 v += 36; jmin++; 1000 } 1001 for (j=jmin; j<n; j++) { 1002 /* (strict lower triangular part of A)*x */ 1003 cval = ib[j]*6; 1004 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 1005 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 1006 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 1007 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 1008 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 1009 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 1010 /* (strict upper triangular part of A)*x */ 1011 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 1012 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 1013 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 1014 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 1015 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 1016 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 1017 v += 36; 1018 } 1019 xb +=6; ai++; 1020 } 1021 1022 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1023 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 1024 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1025 1026 PetscLogFlops(72*(a->nz*2 - A->m)); 1027 PetscFunctionReturn(0); 1028 } 1029 1030 #undef __FUNCT__ 1031 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7" 1032 int MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1033 { 1034 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1035 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6,x7; 1036 MatScalar *v; 1037 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 1038 1039 PetscFunctionBegin; 1040 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1041 if (yy != xx) { 1042 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 1043 } else { 1044 y = x; 1045 } 1046 if (zz != yy) { 1047 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 1048 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1049 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 1050 } else { 1051 z = y; 1052 } 1053 1054 v = a->a; 1055 xb = x; 1056 1057 for (i=0; i<mbs; i++) { 1058 n = ai[1] - ai[0]; /* length of i_th block row of A */ 1059 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 1060 ib = aj + *ai; 1061 jmin = 0; 1062 if (*ib == i){ /* (diag of A)*x */ 1063 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 1064 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 1065 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 1066 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 1067 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 1068 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 1069 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1070 v += 49; jmin++; 1071 } 1072 for (j=jmin; j<n; j++) { 1073 /* (strict lower triangular part of A)*x */ 1074 cval = ib[j]*7; 1075 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 1076 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 1077 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 1078 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 1079 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 1080 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 1081 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1082 /* (strict upper triangular part of A)*x */ 1083 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 1084 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 1085 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 1086 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 1087 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 1088 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 1089 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 1090 v += 49; 1091 } 1092 xb +=7; ai++; 1093 } 1094 1095 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1096 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 1097 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1098 1099 PetscLogFlops(98*(a->nz*2 - A->m)); 1100 PetscFunctionReturn(0); 1101 } 1102 1103 #undef __FUNCT__ 1104 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N" 1105 int MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1106 { 1107 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1108 PetscScalar *x,*x_ptr,*y,*z,*z_ptr=0,*xb,*zb,*work,*workt; 1109 MatScalar *v; 1110 int ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2; 1111 int ncols,k; 1112 1113 PetscFunctionBegin; 1114 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x; 1115 if (yy != xx) { 1116 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 1117 } else { 1118 y = x; 1119 } 1120 if (zz != yy) { 1121 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 1122 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z; 1123 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 1124 } else { 1125 z = y; 1126 } 1127 1128 aj = a->j; 1129 v = a->a; 1130 ii = a->i; 1131 1132 if (!a->mult_work) { 1133 ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1134 } 1135 work = a->mult_work; 1136 1137 1138 for (i=0; i<mbs; i++) { 1139 n = ii[1] - ii[0]; ncols = n*bs; 1140 workt = work; idx=aj+ii[0]; 1141 1142 /* upper triangular part */ 1143 for (j=0; j<n; j++) { 1144 xb = x_ptr + bs*(*idx++); 1145 for (k=0; k<bs; k++) workt[k] = xb[k]; 1146 workt += bs; 1147 } 1148 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1149 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1150 1151 /* strict lower triangular part */ 1152 idx = aj+ii[0]; 1153 if (*idx == i){ 1154 ncols -= bs; v += bs2; idx++; n--; 1155 } 1156 if (ncols > 0){ 1157 workt = work; 1158 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1159 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 1160 for (j=0; j<n; j++) { 1161 zb = z_ptr + bs*(*idx++); 1162 /* idx++; */ 1163 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1164 workt += bs; 1165 } 1166 } 1167 1168 x += bs; v += n*bs2; z += bs; ii++; 1169 } 1170 1171 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1172 if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 1173 if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1174 1175 PetscLogFlops(2*(a->nz*2 - A->m)); 1176 PetscFunctionReturn(0); 1177 } 1178 1179 #undef __FUNCT__ 1180 #define __FUNCT__ "MatMultTranspose_SeqSBAIJ" 1181 int MatMultTranspose_SeqSBAIJ(Mat A,Vec xx,Vec zz) 1182 { 1183 int ierr; 1184 1185 PetscFunctionBegin; 1186 ierr = MatMult(A,xx,zz);CHKERRQ(ierr); 1187 PetscFunctionReturn(0); 1188 } 1189 1190 #undef __FUNCT__ 1191 #define __FUNCT__ "MatMultTransposeAdd_SeqSBAIJ" 1192 int MatMultTransposeAdd_SeqSBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1193 1194 { 1195 int ierr; 1196 1197 PetscFunctionBegin; 1198 ierr = MatMultAdd(A,xx,yy,zz);CHKERRQ(ierr); 1199 PetscFunctionReturn(0); 1200 } 1201 1202 #undef __FUNCT__ 1203 #define __FUNCT__ "MatScale_SeqSBAIJ" 1204 int MatScale_SeqSBAIJ(const PetscScalar *alpha,Mat inA) 1205 { 1206 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data; 1207 int one = 1,totalnz = a->bs2*a->nz; 1208 1209 PetscFunctionBegin; 1210 BLscal_(&totalnz,(PetscScalar*)alpha,a->a,&one); 1211 PetscLogFlops(totalnz); 1212 PetscFunctionReturn(0); 1213 } 1214 1215 #undef __FUNCT__ 1216 #define __FUNCT__ "MatNorm_SeqSBAIJ" 1217 int MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm) 1218 { 1219 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1220 MatScalar *v = a->a; 1221 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1222 int i,j,k,bs = a->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j; 1223 int *jl,*il,jmin,jmax,ierr,nexti,ik,*col; 1224 1225 PetscFunctionBegin; 1226 if (type == NORM_FROBENIUS) { 1227 for (k=0; k<mbs; k++){ 1228 jmin = a->i[k]; jmax = a->i[k+1]; 1229 col = aj + jmin; 1230 if (*col == k){ /* diagonal block */ 1231 for (i=0; i<bs2; i++){ 1232 #if defined(PETSC_USE_COMPLEX) 1233 sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++; 1234 #else 1235 sum_diag += (*v)*(*v); v++; 1236 #endif 1237 } 1238 jmin++; 1239 } 1240 for (j=jmin; j<jmax; j++){ /* off-diagonal blocks */ 1241 for (i=0; i<bs2; i++){ 1242 #if defined(PETSC_USE_COMPLEX) 1243 sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++; 1244 #else 1245 sum_off += (*v)*(*v); v++; 1246 #endif 1247 } 1248 } 1249 } 1250 *norm = sqrt(sum_diag + 2*sum_off); 1251 1252 } else if (type == NORM_INFINITY) { /* maximum row sum */ 1253 ierr = PetscMalloc(mbs*sizeof(int),&il);CHKERRQ(ierr); 1254 ierr = PetscMalloc(mbs*sizeof(int),&jl);CHKERRQ(ierr); 1255 ierr = PetscMalloc(bs*sizeof(PetscReal),&sum);CHKERRQ(ierr); 1256 for (i=0; i<mbs; i++) { 1257 jl[i] = mbs; il[0] = 0; 1258 } 1259 1260 *norm = 0.0; 1261 for (k=0; k<mbs; k++) { /* k_th block row */ 1262 for (j=0; j<bs; j++) sum[j]=0.0; 1263 1264 /*-- col sum --*/ 1265 i = jl[k]; /* first |A(i,k)| to be added */ 1266 /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1267 at step k */ 1268 while (i<mbs){ 1269 nexti = jl[i]; /* next block row to be added */ 1270 ik = il[i]; /* block index of A(i,k) in the array a */ 1271 for (j=0; j<bs; j++){ 1272 v = a->a + ik*bs2 + j*bs; 1273 for (k1=0; k1<bs; k1++) { 1274 sum[j] += PetscAbsScalar(*v); v++; 1275 } 1276 } 1277 /* update il, jl */ 1278 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1279 jmax = a->i[i+1]; 1280 if (jmin < jmax){ 1281 il[i] = jmin; 1282 j = a->j[jmin]; 1283 jl[i] = jl[j]; jl[j]=i; 1284 } 1285 i = nexti; 1286 } 1287 1288 /*-- row sum --*/ 1289 jmin = a->i[k]; jmax = a->i[k+1]; 1290 for (i=jmin; i<jmax; i++) { 1291 for (j=0; j<bs; j++){ 1292 v = a->a + i*bs2 + j; 1293 for (k1=0; k1<bs; k1++){ 1294 sum[j] += PetscAbsScalar(*v); 1295 v += bs; 1296 } 1297 } 1298 } 1299 /* add k_th block row to il, jl */ 1300 col = aj+jmin; 1301 if (*col == k) jmin++; 1302 if (jmin < jmax){ 1303 il[k] = jmin; 1304 j = a->j[jmin]; 1305 jl[k] = jl[j]; jl[j] = k; 1306 } 1307 for (j=0; j<bs; j++){ 1308 if (sum[j] > *norm) *norm = sum[j]; 1309 } 1310 } 1311 ierr = PetscFree(il);CHKERRQ(ierr); 1312 ierr = PetscFree(jl);CHKERRQ(ierr); 1313 ierr = PetscFree(sum);CHKERRQ(ierr); 1314 } else { 1315 SETERRQ(PETSC_ERR_SUP,"No support for this norm yet"); 1316 } 1317 PetscFunctionReturn(0); 1318 } 1319 1320 #undef __FUNCT__ 1321 #define __FUNCT__ "MatEqual_SeqSBAIJ" 1322 int MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg) 1323 { 1324 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data; 1325 int ierr; 1326 1327 PetscFunctionBegin; 1328 1329 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1330 if ((A->m != B->m) || (A->n != B->n) || (a->bs != b->bs)|| (a->nz != b->nz)) { 1331 *flg = PETSC_FALSE; 1332 PetscFunctionReturn(0); 1333 } 1334 1335 /* if the a->i are the same */ 1336 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(int),flg);CHKERRQ(ierr); 1337 if (*flg == PETSC_FALSE) { 1338 PetscFunctionReturn(0); 1339 } 1340 1341 /* if a->j are the same */ 1342 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int),flg);CHKERRQ(ierr); 1343 if (*flg == PETSC_FALSE) { 1344 PetscFunctionReturn(0); 1345 } 1346 /* if a->a are the same */ 1347 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1348 1349 PetscFunctionReturn(0); 1350 } 1351 1352 #undef __FUNCT__ 1353 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ" 1354 int MatGetDiagonal_SeqSBAIJ(Mat A,Vec v) 1355 { 1356 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1357 int ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 1358 PetscScalar *x,zero = 0.0; 1359 MatScalar *aa,*aa_j; 1360 1361 PetscFunctionBegin; 1362 bs = a->bs; 1363 if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1"); 1364 1365 aa = a->a; 1366 ai = a->i; 1367 aj = a->j; 1368 ambs = a->mbs; 1369 bs2 = a->bs2; 1370 1371 ierr = VecSet(&zero,v);CHKERRQ(ierr); 1372 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1373 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1374 if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1375 for (i=0; i<ambs; i++) { 1376 j=ai[i]; 1377 if (aj[j] == i) { /* if this is a diagonal element */ 1378 row = i*bs; 1379 aa_j = aa + j*bs2; 1380 if (A->factor && bs==1){ 1381 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k]; 1382 } else { 1383 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1384 } 1385 } 1386 } 1387 1388 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1389 PetscFunctionReturn(0); 1390 } 1391 1392 #undef __FUNCT__ 1393 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ" 1394 int MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr) 1395 { 1396 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1397 PetscScalar *l,*r,x,*li,*ri; 1398 MatScalar *aa,*v; 1399 int ierr,i,j,k,lm,rn,M,m,*ai,*aj,mbs,tmp,bs,bs2; 1400 1401 PetscFunctionBegin; 1402 ai = a->i; 1403 aj = a->j; 1404 aa = a->a; 1405 m = A->m; 1406 bs = a->bs; 1407 mbs = a->mbs; 1408 bs2 = a->bs2; 1409 1410 if (ll != rr) { 1411 SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n"); 1412 } 1413 if (ll) { 1414 ierr = VecGetArray(ll,&l);CHKERRQ(ierr); 1415 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1416 if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 1417 for (i=0; i<mbs; i++) { /* for each block row */ 1418 M = ai[i+1] - ai[i]; 1419 li = l + i*bs; 1420 v = aa + bs2*ai[i]; 1421 for (j=0; j<M; j++) { /* for each block */ 1422 for (k=0; k<bs2; k++) { 1423 (*v++) *= li[k%bs]; 1424 } 1425 #ifdef CONT 1426 /* will be used to replace the above loop */ 1427 ri = l + bs*aj[ai[i]+j]; 1428 for (k=0; k<bs; k++) { /* column value */ 1429 x = ri[k]; 1430 for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x; 1431 } 1432 #endif 1433 1434 } 1435 } 1436 ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr); 1437 PetscLogFlops(2*a->nz); 1438 } 1439 /* will be deleted */ 1440 if (rr) { 1441 ierr = VecGetArray(rr,&r);CHKERRQ(ierr); 1442 ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr); 1443 if (rn != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 1444 for (i=0; i<mbs; i++) { /* for each block row */ 1445 M = ai[i+1] - ai[i]; 1446 v = aa + bs2*ai[i]; 1447 for (j=0; j<M; j++) { /* for each block */ 1448 ri = r + bs*aj[ai[i]+j]; 1449 for (k=0; k<bs; k++) { 1450 x = ri[k]; 1451 for (tmp=0; tmp<bs; tmp++) (*v++) *= x; 1452 } 1453 } 1454 } 1455 ierr = VecRestoreArray(rr,&r);CHKERRQ(ierr); 1456 PetscLogFlops(a->nz); 1457 } 1458 PetscFunctionReturn(0); 1459 } 1460 1461 #undef __FUNCT__ 1462 #define __FUNCT__ "MatGetInfo_SeqSBAIJ" 1463 int MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info) 1464 { 1465 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1466 1467 PetscFunctionBegin; 1468 info->rows_global = (double)A->m; 1469 info->columns_global = (double)A->m; 1470 info->rows_local = (double)A->m; 1471 info->columns_local = (double)A->m; 1472 info->block_size = a->bs2; 1473 info->nz_allocated = a->maxnz; /*num. of nonzeros in upper triangular part */ 1474 info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */ 1475 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 1476 info->assemblies = A->num_ass; 1477 info->mallocs = a->reallocs; 1478 info->memory = A->mem; 1479 if (A->factor) { 1480 info->fill_ratio_given = A->info.fill_ratio_given; 1481 info->fill_ratio_needed = A->info.fill_ratio_needed; 1482 info->factor_mallocs = A->info.factor_mallocs; 1483 } else { 1484 info->fill_ratio_given = 0; 1485 info->fill_ratio_needed = 0; 1486 info->factor_mallocs = 0; 1487 } 1488 PetscFunctionReturn(0); 1489 } 1490 1491 1492 #undef __FUNCT__ 1493 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ" 1494 int MatZeroEntries_SeqSBAIJ(Mat A) 1495 { 1496 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1497 int ierr; 1498 1499 PetscFunctionBegin; 1500 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 1501 PetscFunctionReturn(0); 1502 } 1503 1504 #undef __FUNCT__ 1505 #define __FUNCT__ "MatGetRowMax_SeqSBAIJ" 1506 int MatGetRowMax_SeqSBAIJ(Mat A,Vec v) 1507 { 1508 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1509 int ierr,i,j,n,row,col,bs,*ai,*aj,mbs; 1510 PetscReal atmp; 1511 MatScalar *aa; 1512 PetscScalar zero = 0.0,*x; 1513 int ncols,brow,bcol,krow,kcol; 1514 1515 PetscFunctionBegin; 1516 if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1517 bs = a->bs; 1518 aa = a->a; 1519 ai = a->i; 1520 aj = a->j; 1521 mbs = a->mbs; 1522 1523 ierr = VecSet(&zero,v);CHKERRQ(ierr); 1524 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 1525 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1526 if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1527 for (i=0; i<mbs; i++) { 1528 ncols = ai[1] - ai[0]; ai++; 1529 brow = bs*i; 1530 for (j=0; j<ncols; j++){ 1531 bcol = bs*(*aj); 1532 for (kcol=0; kcol<bs; kcol++){ 1533 col = bcol + kcol; /* col index */ 1534 for (krow=0; krow<bs; krow++){ 1535 atmp = PetscAbsScalar(*aa); aa++; 1536 row = brow + krow; /* row index */ 1537 /* printf("val[%d,%d]: %g\n",row,col,atmp); */ 1538 if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1539 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 1540 } 1541 } 1542 aj++; 1543 } 1544 } 1545 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 1546 PetscFunctionReturn(0); 1547 } 1548