1 /*$Id: sbaij2.c,v 1.32 2001/08/07 03:03:01 balay Exp $*/ 2 3 #include "src/mat/impls/baij/seq/baij.h" 4 #include "src/inline/spops.h" 5 #include "src/inline/ilu.h" 6 #include "petscbt.h" 7 #include "src/mat/impls/sbaij/seq/sbaij.h" 8 9 #undef __FUNCT__ 10 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ" 11 int MatIncreaseOverlap_SeqSBAIJ(Mat A,int is_max,IS is[],int ov) 12 { 13 PetscFunctionBegin; 14 SETERRQ(1,"Function not yet written for SBAIJ format"); 15 /* PetscFunctionReturn(0); */ 16 } 17 18 #undef __FUNCT__ 19 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private" 20 int MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B) 21 { 22 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c; 23 int *smap,i,k,kstart,kend,ierr,oldcols = a->mbs,*lens; 24 int row,mat_i,*mat_j,tcol,*mat_ilen; 25 int *irow,nrows,*ssmap,bs=a->bs,bs2=a->bs2; 26 int *aj = a->j,*ai = a->i; 27 MatScalar *mat_a; 28 Mat C; 29 PetscTruth flag; 30 31 PetscFunctionBegin; 32 33 if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro"); 34 ierr = ISSorted(iscol,(PetscTruth*)&i);CHKERRQ(ierr); 35 if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 36 37 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 38 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 39 40 ierr = PetscMalloc((1+oldcols)*sizeof(int),&smap);CHKERRQ(ierr); 41 ssmap = smap; 42 ierr = PetscMalloc((1+nrows)*sizeof(int),&lens);CHKERRQ(ierr); 43 ierr = PetscMemzero(smap,oldcols*sizeof(int));CHKERRQ(ierr); 44 for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */ 45 /* determine lens of each row */ 46 for (i=0; i<nrows; i++) { 47 kstart = ai[irow[i]]; 48 kend = kstart + a->ilen[irow[i]]; 49 lens[i] = 0; 50 for (k=kstart; k<kend; k++) { 51 if (ssmap[aj[k]]) { 52 lens[i]++; 53 } 54 } 55 } 56 /* Create and fill new matrix */ 57 if (scall == MAT_REUSE_MATRIX) { 58 c = (Mat_SeqSBAIJ *)((*B)->data); 59 60 if (c->mbs!=nrows || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 61 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(int),&flag);CHKERRQ(ierr); 62 if (flag == PETSC_FALSE) { 63 SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 64 } 65 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(int));CHKERRQ(ierr); 66 C = *B; 67 } else { 68 ierr = MatCreateSeqSBAIJ(A->comm,bs,nrows*bs,nrows*bs,0,lens,&C);CHKERRQ(ierr); 69 } 70 c = (Mat_SeqSBAIJ *)(C->data); 71 for (i=0; i<nrows; i++) { 72 row = irow[i]; 73 kstart = ai[row]; 74 kend = kstart + a->ilen[row]; 75 mat_i = c->i[i]; 76 mat_j = c->j + mat_i; 77 mat_a = c->a + mat_i*bs2; 78 mat_ilen = c->ilen + i; 79 for (k=kstart; k<kend; k++) { 80 if ((tcol=ssmap[a->j[k]])) { 81 *mat_j++ = tcol - 1; 82 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 83 mat_a += bs2; 84 (*mat_ilen)++; 85 } 86 } 87 } 88 89 /* Free work space */ 90 ierr = PetscFree(smap);CHKERRQ(ierr); 91 ierr = PetscFree(lens);CHKERRQ(ierr); 92 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 93 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 94 95 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 96 *B = C; 97 PetscFunctionReturn(0); 98 } 99 100 #undef __FUNCT__ 101 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ" 102 int MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B) 103 { 104 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 105 IS is1; 106 int *vary,*iary,*irow,nrows,i,ierr,bs=a->bs,count; 107 108 PetscFunctionBegin; 109 if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro"); 110 111 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 112 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 113 114 /* Verify if the indices corespond to each element in a block 115 and form the IS with compressed IS */ 116 ierr = PetscMalloc(2*(a->mbs+1)*sizeof(int),&vary);CHKERRQ(ierr); 117 iary = vary + a->mbs; 118 ierr = PetscMemzero(vary,(a->mbs)*sizeof(int));CHKERRQ(ierr); 119 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 120 121 count = 0; 122 for (i=0; i<a->mbs; i++) { 123 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(1,"Index set does not match blocks"); 124 if (vary[i]==bs) iary[count++] = i; 125 } 126 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr); 127 128 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 129 ierr = PetscFree(vary);CHKERRQ(ierr); 130 131 ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);CHKERRQ(ierr); 132 ISDestroy(is1); 133 PetscFunctionReturn(0); 134 } 135 136 #undef __FUNCT__ 137 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ" 138 int MatGetSubMatrices_SeqSBAIJ(Mat A,int n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 139 { 140 int ierr,i; 141 142 PetscFunctionBegin; 143 if (scall == MAT_INITIAL_MATRIX) { 144 ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr); 145 } 146 147 for (i=0; i<n; i++) { 148 ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr); 149 } 150 PetscFunctionReturn(0); 151 } 152 153 /* -------------------------------------------------------*/ 154 /* Should check that shapes of vectors and matrices match */ 155 /* -------------------------------------------------------*/ 156 #include "petscblaslapack.h" 157 158 #undef __FUNCT__ 159 #define __FUNCT__ "MatMult_SeqSBAIJ_1" 160 int MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz) 161 { 162 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 163 PetscScalar *x,*z,*xb,x1,zero=0.0; 164 MatScalar *v; 165 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 166 167 PetscFunctionBegin; 168 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 169 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 170 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 171 172 v = a->a; 173 xb = x; 174 175 for (i=0; i<mbs; i++) { 176 n = ai[1] - ai[0]; /* length of i_th row of A */ 177 x1 = xb[0]; 178 ib = aj + *ai; 179 jmin = 0; 180 if (*ib == i) { /* (diag of A)*x */ 181 z[i] += *v++ * x[*ib++]; 182 jmin++; 183 } 184 for (j=jmin; j<n; j++) { 185 cval = *ib; 186 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 187 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 188 } 189 xb++; ai++; 190 } 191 192 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 193 ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 194 PetscLogFlops(2*(a->nz*2 - A->m) - A->m); /* nz = (nz+m)/2 */ 195 PetscFunctionReturn(0); 196 } 197 198 #undef __FUNCT__ 199 #define __FUNCT__ "MatMult_SeqSBAIJ_2" 200 int MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz) 201 { 202 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 203 PetscScalar *x,*z,*xb,x1,x2,zero=0.0; 204 MatScalar *v; 205 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 206 207 208 PetscFunctionBegin; 209 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 210 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 211 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 212 213 v = a->a; 214 xb = x; 215 216 for (i=0; i<mbs; i++) { 217 n = ai[1] - ai[0]; /* length of i_th block row of A */ 218 x1 = xb[0]; x2 = xb[1]; 219 ib = aj + *ai; 220 jmin = 0; 221 if (*ib == i){ /* (diag of A)*x */ 222 z[2*i] += v[0]*x1 + v[2]*x2; 223 z[2*i+1] += v[2]*x1 + v[3]*x2; 224 v += 4; jmin++; 225 } 226 for (j=jmin; j<n; j++) { 227 /* (strict lower triangular part of A)*x */ 228 cval = ib[j]*2; 229 z[cval] += v[0]*x1 + v[1]*x2; 230 z[cval+1] += v[2]*x1 + v[3]*x2; 231 /* (strict upper triangular part of A)*x */ 232 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 233 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 234 v += 4; 235 } 236 xb +=2; ai++; 237 } 238 239 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 240 ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 241 PetscLogFlops(8*(a->nz*2 - A->m) - A->m); 242 PetscFunctionReturn(0); 243 } 244 245 #undef __FUNCT__ 246 #define __FUNCT__ "MatMult_SeqSBAIJ_3" 247 int MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz) 248 { 249 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 250 PetscScalar *x,*z,*xb,x1,x2,x3,zero=0.0; 251 MatScalar *v; 252 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 253 254 255 PetscFunctionBegin; 256 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 257 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 258 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 259 260 v = a->a; 261 xb = x; 262 263 for (i=0; i<mbs; i++) { 264 n = ai[1] - ai[0]; /* length of i_th block row of A */ 265 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 266 ib = aj + *ai; 267 jmin = 0; 268 if (*ib == i){ /* (diag of A)*x */ 269 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 270 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 271 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 272 v += 9; jmin++; 273 } 274 for (j=jmin; j<n; j++) { 275 /* (strict lower triangular part of A)*x */ 276 cval = ib[j]*3; 277 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 278 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 279 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 280 /* (strict upper triangular part of A)*x */ 281 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 282 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 283 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 284 v += 9; 285 } 286 xb +=3; ai++; 287 } 288 289 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 290 ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 291 PetscLogFlops(18*(a->nz*2 - A->m) - A->m); 292 PetscFunctionReturn(0); 293 } 294 295 #undef __FUNCT__ 296 #define __FUNCT__ "MatMult_SeqSBAIJ_4" 297 int MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz) 298 { 299 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 300 PetscScalar *x,*z,*xb,x1,x2,x3,x4,zero=0.0; 301 MatScalar *v; 302 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 303 304 PetscFunctionBegin; 305 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 306 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 307 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 308 309 v = a->a; 310 xb = x; 311 312 for (i=0; i<mbs; i++) { 313 n = ai[1] - ai[0]; /* length of i_th block row of A */ 314 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 315 ib = aj + *ai; 316 jmin = 0; 317 if (*ib == i){ /* (diag of A)*x */ 318 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 319 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 320 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 321 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 322 v += 16; jmin++; 323 } 324 for (j=jmin; j<n; j++) { 325 /* (strict lower triangular part of A)*x */ 326 cval = ib[j]*4; 327 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 328 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 329 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 330 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 331 /* (strict upper triangular part of A)*x */ 332 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 333 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 334 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 335 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 336 v += 16; 337 } 338 xb +=4; ai++; 339 } 340 341 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 342 ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 343 PetscLogFlops(32*(a->nz*2 - A->m) - A->m); 344 PetscFunctionReturn(0); 345 } 346 347 #undef __FUNCT__ 348 #define __FUNCT__ "MatMult_SeqSBAIJ_5" 349 int MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz) 350 { 351 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 352 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0; 353 MatScalar *v; 354 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 355 356 PetscFunctionBegin; 357 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 358 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 359 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 360 361 v = a->a; 362 xb = x; 363 364 for (i=0; i<mbs; i++) { 365 n = ai[1] - ai[0]; /* length of i_th block row of A */ 366 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 367 ib = aj + *ai; 368 jmin = 0; 369 if (*ib == i){ /* (diag of A)*x */ 370 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 371 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 372 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 373 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 374 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 375 v += 25; jmin++; 376 } 377 for (j=jmin; j<n; j++) { 378 /* (strict lower triangular part of A)*x */ 379 cval = ib[j]*5; 380 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 381 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 382 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 383 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 384 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 385 /* (strict upper triangular part of A)*x */ 386 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 387 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 388 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 389 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 390 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 391 v += 25; 392 } 393 xb +=5; ai++; 394 } 395 396 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 397 ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 398 PetscLogFlops(50*(a->nz*2 - A->m) - A->m); 399 PetscFunctionReturn(0); 400 } 401 402 403 #undef __FUNCT__ 404 #define __FUNCT__ "MatMult_SeqSBAIJ_6" 405 int MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz) 406 { 407 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 408 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0; 409 MatScalar *v; 410 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 411 412 PetscFunctionBegin; 413 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 414 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 415 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 416 417 v = a->a; 418 xb = x; 419 420 for (i=0; i<mbs; i++) { 421 n = ai[1] - ai[0]; /* length of i_th block row of A */ 422 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 423 ib = aj + *ai; 424 jmin = 0; 425 if (*ib == i){ /* (diag of A)*x */ 426 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 427 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 428 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 429 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 430 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 431 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 432 v += 36; jmin++; 433 } 434 for (j=jmin; j<n; j++) { 435 /* (strict lower triangular part of A)*x */ 436 cval = ib[j]*6; 437 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 438 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 439 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 440 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 441 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 442 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 443 /* (strict upper triangular part of A)*x */ 444 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 445 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 446 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 447 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 448 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 449 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 450 v += 36; 451 } 452 xb +=6; ai++; 453 } 454 455 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 456 ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 457 PetscLogFlops(72*(a->nz*2 - A->m) - A->m); 458 PetscFunctionReturn(0); 459 } 460 #undef __FUNCT__ 461 #define __FUNCT__ "MatMult_SeqSBAIJ_7" 462 int MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz) 463 { 464 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 465 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0; 466 MatScalar *v; 467 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 468 469 PetscFunctionBegin; 470 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 471 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 472 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 473 474 v = a->a; 475 xb = x; 476 477 for (i=0; i<mbs; i++) { 478 n = ai[1] - ai[0]; /* length of i_th block row of A */ 479 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 480 ib = aj + *ai; 481 jmin = 0; 482 if (*ib == i){ /* (diag of A)*x */ 483 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 484 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 485 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 486 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 487 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 488 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 489 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 490 v += 49; jmin++; 491 } 492 for (j=jmin; j<n; j++) { 493 /* (strict lower triangular part of A)*x */ 494 cval = ib[j]*7; 495 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 496 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 497 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 498 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 499 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 500 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 501 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 502 /* (strict upper triangular part of A)*x */ 503 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 504 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 505 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 506 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 507 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 508 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 509 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 510 v += 49; 511 } 512 xb +=7; ai++; 513 } 514 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 515 ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 516 PetscLogFlops(98*(a->nz*2 - A->m) - A->m); 517 PetscFunctionReturn(0); 518 } 519 520 /* 521 This will not work with MatScalar == float because it calls the BLAS 522 */ 523 #undef __FUNCT__ 524 #define __FUNCT__ "MatMult_SeqSBAIJ_N" 525 int MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz) 526 { 527 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 528 PetscScalar *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0; 529 MatScalar *v; 530 int ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2; 531 int ncols,k; 532 533 PetscFunctionBegin; 534 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 535 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); x_ptr=x; 536 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); z_ptr=z; 537 538 aj = a->j; 539 v = a->a; 540 ii = a->i; 541 542 if (!a->mult_work) { 543 ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 544 } 545 work = a->mult_work; 546 547 for (i=0; i<mbs; i++) { 548 n = ii[1] - ii[0]; ncols = n*bs; 549 workt = work; idx=aj+ii[0]; 550 551 /* upper triangular part */ 552 for (j=0; j<n; j++) { 553 xb = x_ptr + bs*(*idx++); 554 for (k=0; k<bs; k++) workt[k] = xb[k]; 555 workt += bs; 556 } 557 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 558 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 559 560 /* strict lower triangular part */ 561 idx = aj+ii[0]; 562 if (*idx == i){ 563 ncols -= bs; v += bs2; idx++; n--; 564 } 565 566 if (ncols > 0){ 567 workt = work; 568 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 569 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 570 for (j=0; j<n; j++) { 571 zb = z_ptr + bs*(*idx++); 572 for (k=0; k<bs; k++) zb[k] += workt[k] ; 573 workt += bs; 574 } 575 } 576 x += bs; v += n*bs2; z += bs; ii++; 577 } 578 579 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 580 ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 581 PetscLogFlops(2*(a->nz*2 - A->m)*bs2 - A->m); 582 PetscFunctionReturn(0); 583 } 584 585 #undef __FUNCT__ 586 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1" 587 int MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 588 { 589 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 590 PetscScalar *x,*y,*z,*xb,x1; 591 MatScalar *v; 592 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 593 594 PetscFunctionBegin; 595 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 596 if (yy != xx) { 597 ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr); 598 } else { 599 y = x; 600 } 601 if (zz != yy) { 602 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 603 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 604 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 605 } else { 606 z = y; 607 } 608 609 v = a->a; 610 xb = x; 611 612 for (i=0; i<mbs; i++) { 613 n = ai[1] - ai[0]; /* length of i_th row of A */ 614 x1 = xb[0]; 615 ib = aj + *ai; 616 jmin = 0; 617 if (*ib == i) { /* (diag of A)*x */ 618 z[i] += *v++ * x[*ib++]; jmin++; 619 } 620 for (j=jmin; j<n; j++) { 621 cval = *ib; 622 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 623 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 624 } 625 xb++; ai++; 626 } 627 628 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 629 if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr); 630 if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 631 632 PetscLogFlops(2*(a->nz*2 - A->m)); 633 PetscFunctionReturn(0); 634 } 635 636 #undef __FUNCT__ 637 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2" 638 int MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 639 { 640 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 641 PetscScalar *x,*y,*z,*xb,x1,x2; 642 MatScalar *v; 643 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 644 645 PetscFunctionBegin; 646 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 647 if (yy != xx) { 648 ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr); 649 } else { 650 y = x; 651 } 652 if (zz != yy) { 653 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 654 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 655 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 656 } else { 657 z = y; 658 } 659 660 v = a->a; 661 xb = x; 662 663 for (i=0; i<mbs; i++) { 664 n = ai[1] - ai[0]; /* length of i_th block row of A */ 665 x1 = xb[0]; x2 = xb[1]; 666 ib = aj + *ai; 667 jmin = 0; 668 if (*ib == i){ /* (diag of A)*x */ 669 z[2*i] += v[0]*x1 + v[2]*x2; 670 z[2*i+1] += v[2]*x1 + v[3]*x2; 671 v += 4; jmin++; 672 } 673 for (j=jmin; j<n; j++) { 674 /* (strict lower triangular part of A)*x */ 675 cval = ib[j]*2; 676 z[cval] += v[0]*x1 + v[1]*x2; 677 z[cval+1] += v[2]*x1 + v[3]*x2; 678 /* (strict upper triangular part of A)*x */ 679 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 680 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 681 v += 4; 682 } 683 xb +=2; ai++; 684 } 685 686 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 687 if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr); 688 if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 689 690 PetscLogFlops(4*(a->nz*2 - A->m)); 691 PetscFunctionReturn(0); 692 } 693 694 #undef __FUNCT__ 695 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3" 696 int MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 697 { 698 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 699 PetscScalar *x,*y,*z,*xb,x1,x2,x3; 700 MatScalar *v; 701 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 702 703 PetscFunctionBegin; 704 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 705 if (yy != xx) { 706 ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr); 707 } else { 708 y = x; 709 } 710 if (zz != yy) { 711 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 712 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 713 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 714 } else { 715 z = y; 716 } 717 718 v = a->a; 719 xb = x; 720 721 for (i=0; i<mbs; i++) { 722 n = ai[1] - ai[0]; /* length of i_th block row of A */ 723 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 724 ib = aj + *ai; 725 jmin = 0; 726 if (*ib == i){ /* (diag of A)*x */ 727 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 728 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 729 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 730 v += 9; jmin++; 731 } 732 for (j=jmin; j<n; j++) { 733 /* (strict lower triangular part of A)*x */ 734 cval = ib[j]*3; 735 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 736 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 737 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 738 /* (strict upper triangular part of A)*x */ 739 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 740 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 741 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 742 v += 9; 743 } 744 xb +=3; ai++; 745 } 746 747 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 748 if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr); 749 if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 750 751 PetscLogFlops(18*(a->nz*2 - A->m)); 752 PetscFunctionReturn(0); 753 } 754 755 #undef __FUNCT__ 756 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4" 757 int MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 758 { 759 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 760 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4; 761 MatScalar *v; 762 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 763 764 PetscFunctionBegin; 765 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 766 if (yy != xx) { 767 ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr); 768 } else { 769 y = x; 770 } 771 if (zz != yy) { 772 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 773 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 774 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 775 } else { 776 z = y; 777 } 778 779 v = a->a; 780 xb = x; 781 782 for (i=0; i<mbs; i++) { 783 n = ai[1] - ai[0]; /* length of i_th block row of A */ 784 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 785 ib = aj + *ai; 786 jmin = 0; 787 if (*ib == i){ /* (diag of A)*x */ 788 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 789 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 790 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 791 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 792 v += 16; jmin++; 793 } 794 for (j=jmin; j<n; j++) { 795 /* (strict lower triangular part of A)*x */ 796 cval = ib[j]*4; 797 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 798 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 799 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 800 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 801 /* (strict upper triangular part of A)*x */ 802 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 803 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 804 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 805 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 806 v += 16; 807 } 808 xb +=4; ai++; 809 } 810 811 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 812 if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr); 813 if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 814 815 PetscLogFlops(32*(a->nz*2 - A->m)); 816 PetscFunctionReturn(0); 817 } 818 819 #undef __FUNCT__ 820 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5" 821 int MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 822 { 823 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 824 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4,x5; 825 MatScalar *v; 826 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 827 828 PetscFunctionBegin; 829 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 830 if (yy != xx) { 831 ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr); 832 } else { 833 y = x; 834 } 835 if (zz != yy) { 836 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 837 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 838 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 839 } else { 840 z = y; 841 } 842 843 v = a->a; 844 xb = x; 845 846 for (i=0; i<mbs; i++) { 847 n = ai[1] - ai[0]; /* length of i_th block row of A */ 848 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 849 ib = aj + *ai; 850 jmin = 0; 851 if (*ib == i){ /* (diag of A)*x */ 852 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 853 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 854 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 855 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 856 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 857 v += 25; jmin++; 858 } 859 for (j=jmin; j<n; j++) { 860 /* (strict lower triangular part of A)*x */ 861 cval = ib[j]*5; 862 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 863 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 864 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 865 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 866 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 867 /* (strict upper triangular part of A)*x */ 868 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 869 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 870 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 871 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 872 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 873 v += 25; 874 } 875 xb +=5; ai++; 876 } 877 878 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 879 if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr); 880 if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 881 882 PetscLogFlops(50*(a->nz*2 - A->m)); 883 PetscFunctionReturn(0); 884 } 885 #undef __FUNCT__ 886 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6" 887 int MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 888 { 889 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 890 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6; 891 MatScalar *v; 892 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 893 894 PetscFunctionBegin; 895 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 896 if (yy != xx) { 897 ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr); 898 } else { 899 y = x; 900 } 901 if (zz != yy) { 902 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 903 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 904 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 905 } else { 906 z = y; 907 } 908 909 v = a->a; 910 xb = x; 911 912 for (i=0; i<mbs; i++) { 913 n = ai[1] - ai[0]; /* length of i_th block row of A */ 914 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 915 ib = aj + *ai; 916 jmin = 0; 917 if (*ib == i){ /* (diag of A)*x */ 918 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 919 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 920 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 921 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 922 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 923 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 924 v += 36; jmin++; 925 } 926 for (j=jmin; j<n; j++) { 927 /* (strict lower triangular part of A)*x */ 928 cval = ib[j]*6; 929 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 930 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 931 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 932 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 933 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 934 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 935 /* (strict upper triangular part of A)*x */ 936 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 937 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 938 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 939 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 940 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 941 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 942 v += 36; 943 } 944 xb +=6; ai++; 945 } 946 947 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 948 if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr); 949 if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 950 951 PetscLogFlops(72*(a->nz*2 - A->m)); 952 PetscFunctionReturn(0); 953 } 954 955 #undef __FUNCT__ 956 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7" 957 int MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 958 { 959 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 960 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6,x7; 961 MatScalar *v; 962 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 963 964 PetscFunctionBegin; 965 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 966 if (yy != xx) { 967 ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr); 968 } else { 969 y = x; 970 } 971 if (zz != yy) { 972 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 973 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 974 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 975 } else { 976 z = y; 977 } 978 979 v = a->a; 980 xb = x; 981 982 for (i=0; i<mbs; i++) { 983 n = ai[1] - ai[0]; /* length of i_th block row of A */ 984 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 985 ib = aj + *ai; 986 jmin = 0; 987 if (*ib == i){ /* (diag of A)*x */ 988 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 989 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 990 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 991 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 992 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 993 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 994 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 995 v += 49; jmin++; 996 } 997 for (j=jmin; j<n; j++) { 998 /* (strict lower triangular part of A)*x */ 999 cval = ib[j]*7; 1000 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 1001 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 1002 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 1003 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 1004 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 1005 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 1006 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1007 /* (strict upper triangular part of A)*x */ 1008 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 1009 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 1010 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 1011 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 1012 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 1013 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 1014 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 1015 v += 49; 1016 } 1017 xb +=7; ai++; 1018 } 1019 1020 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 1021 if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr); 1022 if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 1023 1024 PetscLogFlops(98*(a->nz*2 - A->m)); 1025 PetscFunctionReturn(0); 1026 } 1027 1028 #undef __FUNCT__ 1029 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N" 1030 int MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1031 { 1032 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1033 PetscScalar *x,*x_ptr,*y,*z,*z_ptr=0,*xb,*zb,*work,*workt; 1034 MatScalar *v; 1035 int ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2; 1036 int ncols,k; 1037 1038 PetscFunctionBegin; 1039 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); x_ptr=x; 1040 if (yy != xx) { 1041 ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr); 1042 } else { 1043 y = x; 1044 } 1045 if (zz != yy) { 1046 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 1047 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); z_ptr=z; 1048 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 1049 } else { 1050 z = y; 1051 } 1052 1053 aj = a->j; 1054 v = a->a; 1055 ii = a->i; 1056 1057 if (!a->mult_work) { 1058 ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1059 } 1060 work = a->mult_work; 1061 1062 1063 for (i=0; i<mbs; i++) { 1064 n = ii[1] - ii[0]; ncols = n*bs; 1065 workt = work; idx=aj+ii[0]; 1066 1067 /* upper triangular part */ 1068 for (j=0; j<n; j++) { 1069 xb = x_ptr + bs*(*idx++); 1070 for (k=0; k<bs; k++) workt[k] = xb[k]; 1071 workt += bs; 1072 } 1073 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1074 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1075 1076 /* strict lower triangular part */ 1077 idx = aj+ii[0]; 1078 if (*idx == i){ 1079 ncols -= bs; v += bs2; idx++; n--; 1080 } 1081 if (ncols > 0){ 1082 workt = work; 1083 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1084 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 1085 for (j=0; j<n; j++) { 1086 zb = z_ptr + bs*(*idx++); 1087 /* idx++; */ 1088 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1089 workt += bs; 1090 } 1091 } 1092 1093 x += bs; v += n*bs2; z += bs; ii++; 1094 } 1095 1096 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 1097 if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr); 1098 if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 1099 1100 PetscLogFlops(2*(a->nz*2 - A->m)); 1101 PetscFunctionReturn(0); 1102 } 1103 1104 #undef __FUNCT__ 1105 #define __FUNCT__ "MatMultTranspose_SeqSBAIJ" 1106 int MatMultTranspose_SeqSBAIJ(Mat A,Vec xx,Vec zz) 1107 { 1108 int ierr; 1109 1110 PetscFunctionBegin; 1111 ierr = MatMult(A,xx,zz);CHKERRQ(ierr); 1112 PetscFunctionReturn(0); 1113 } 1114 1115 #undef __FUNCT__ 1116 #define __FUNCT__ "MatMultTransposeAdd_SeqSBAIJ" 1117 int MatMultTransposeAdd_SeqSBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1118 1119 { 1120 int ierr; 1121 1122 PetscFunctionBegin; 1123 ierr = MatMultAdd(A,xx,yy,zz);CHKERRQ(ierr); 1124 PetscFunctionReturn(0); 1125 } 1126 1127 #undef __FUNCT__ 1128 #define __FUNCT__ "MatScale_SeqSBAIJ" 1129 int MatScale_SeqSBAIJ(const PetscScalar *alpha,Mat inA) 1130 { 1131 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data; 1132 int one = 1,totalnz = a->bs2*a->nz; 1133 1134 PetscFunctionBegin; 1135 BLscal_(&totalnz,(PetscScalar*)alpha,a->a,&one); 1136 PetscLogFlops(totalnz); 1137 PetscFunctionReturn(0); 1138 } 1139 1140 #undef __FUNCT__ 1141 #define __FUNCT__ "MatNorm_SeqSBAIJ" 1142 int MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm) 1143 { 1144 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1145 MatScalar *v = a->a; 1146 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1147 int i,j,k,bs = a->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j; 1148 int *jl,*il,jmin,jmax,ierr,nexti,ik,*col; 1149 1150 PetscFunctionBegin; 1151 if (type == NORM_FROBENIUS) { 1152 for (k=0; k<mbs; k++){ 1153 jmin = a->i[k]; jmax = a->i[k+1]; 1154 col = aj + jmin; 1155 if (*col == k){ /* diagonal block */ 1156 for (i=0; i<bs2; i++){ 1157 #if defined(PETSC_USE_COMPLEX) 1158 sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++; 1159 #else 1160 sum_diag += (*v)*(*v); v++; 1161 #endif 1162 } 1163 jmin++; 1164 } 1165 for (j=jmin; j<jmax; j++){ /* off-diagonal blocks */ 1166 for (i=0; i<bs2; i++){ 1167 #if defined(PETSC_USE_COMPLEX) 1168 sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++; 1169 #else 1170 sum_off += (*v)*(*v); v++; 1171 #endif 1172 } 1173 } 1174 } 1175 *norm = sqrt(sum_diag + 2*sum_off); 1176 1177 } else if (type == NORM_INFINITY) { /* maximum row sum */ 1178 ierr = PetscMalloc(mbs*sizeof(int),&il);CHKERRQ(ierr); 1179 ierr = PetscMalloc(mbs*sizeof(int),&jl);CHKERRQ(ierr); 1180 ierr = PetscMalloc(bs*sizeof(PetscReal),&sum);CHKERRQ(ierr); 1181 for (i=0; i<mbs; i++) { 1182 jl[i] = mbs; il[0] = 0; 1183 } 1184 1185 *norm = 0.0; 1186 for (k=0; k<mbs; k++) { /* k_th block row */ 1187 for (j=0; j<bs; j++) sum[j]=0.0; 1188 1189 /*-- col sum --*/ 1190 i = jl[k]; /* first |A(i,k)| to be added */ 1191 /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1192 at step k */ 1193 while (i<mbs){ 1194 nexti = jl[i]; /* next block row to be added */ 1195 ik = il[i]; /* block index of A(i,k) in the array a */ 1196 for (j=0; j<bs; j++){ 1197 v = a->a + ik*bs2 + j*bs; 1198 for (k1=0; k1<bs; k1++) { 1199 sum[j] += PetscAbsScalar(*v); v++; 1200 } 1201 } 1202 /* update il, jl */ 1203 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1204 jmax = a->i[i+1]; 1205 if (jmin < jmax){ 1206 il[i] = jmin; 1207 j = a->j[jmin]; 1208 jl[i] = jl[j]; jl[j]=i; 1209 } 1210 i = nexti; 1211 } 1212 1213 /*-- row sum --*/ 1214 jmin = a->i[k]; jmax = a->i[k+1]; 1215 for (i=jmin; i<jmax; i++) { 1216 for (j=0; j<bs; j++){ 1217 v = a->a + i*bs2 + j; 1218 for (k1=0; k1<bs; k1++){ 1219 sum[j] += PetscAbsScalar(*v); 1220 v += bs; 1221 } 1222 } 1223 } 1224 /* add k_th block row to il, jl */ 1225 col = aj+jmin; 1226 if (*col == k) jmin++; 1227 if (jmin < jmax){ 1228 il[k] = jmin; 1229 j = a->j[jmin]; 1230 jl[k] = jl[j]; jl[j] = k; 1231 } 1232 for (j=0; j<bs; j++){ 1233 if (sum[j] > *norm) *norm = sum[j]; 1234 } 1235 } 1236 ierr = PetscFree(il);CHKERRQ(ierr); 1237 ierr = PetscFree(jl);CHKERRQ(ierr); 1238 ierr = PetscFree(sum);CHKERRQ(ierr); 1239 } else { 1240 SETERRQ(PETSC_ERR_SUP,"No support for this norm yet"); 1241 } 1242 PetscFunctionReturn(0); 1243 } 1244 1245 #undef __FUNCT__ 1246 #define __FUNCT__ "MatEqual_SeqSBAIJ" 1247 int MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg) 1248 { 1249 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data; 1250 int ierr; 1251 1252 PetscFunctionBegin; 1253 1254 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1255 if ((A->m != B->m) || (A->n != B->n) || (a->bs != b->bs)|| (a->nz != b->nz)) { 1256 *flg = PETSC_FALSE; 1257 PetscFunctionReturn(0); 1258 } 1259 1260 /* if the a->i are the same */ 1261 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(int),flg);CHKERRQ(ierr); 1262 if (*flg == PETSC_FALSE) { 1263 PetscFunctionReturn(0); 1264 } 1265 1266 /* if a->j are the same */ 1267 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int),flg);CHKERRQ(ierr); 1268 if (*flg == PETSC_FALSE) { 1269 PetscFunctionReturn(0); 1270 } 1271 /* if a->a are the same */ 1272 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1273 1274 PetscFunctionReturn(0); 1275 } 1276 1277 #undef __FUNCT__ 1278 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ" 1279 int MatGetDiagonal_SeqSBAIJ(Mat A,Vec v) 1280 { 1281 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1282 int ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 1283 PetscScalar *x,zero = 0.0; 1284 MatScalar *aa,*aa_j; 1285 1286 PetscFunctionBegin; 1287 bs = a->bs; 1288 if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1"); 1289 1290 aa = a->a; 1291 ai = a->i; 1292 aj = a->j; 1293 ambs = a->mbs; 1294 bs2 = a->bs2; 1295 1296 ierr = VecSet(&zero,v);CHKERRQ(ierr); 1297 ierr = VecGetArrayFast(v,&x);CHKERRQ(ierr); 1298 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1299 if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1300 for (i=0; i<ambs; i++) { 1301 j=ai[i]; 1302 if (aj[j] == i) { /* if this is a diagonal element */ 1303 row = i*bs; 1304 aa_j = aa + j*bs2; 1305 if (A->factor && bs==1){ 1306 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k]; 1307 } else { 1308 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1309 } 1310 } 1311 } 1312 1313 ierr = VecRestoreArrayFast(v,&x);CHKERRQ(ierr); 1314 PetscFunctionReturn(0); 1315 } 1316 1317 #undef __FUNCT__ 1318 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ" 1319 int MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr) 1320 { 1321 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1322 PetscScalar *l,*r,x,*li,*ri; 1323 MatScalar *aa,*v; 1324 int ierr,i,j,k,lm,rn,M,m,*ai,*aj,mbs,tmp,bs,bs2; 1325 1326 PetscFunctionBegin; 1327 ai = a->i; 1328 aj = a->j; 1329 aa = a->a; 1330 m = A->m; 1331 bs = a->bs; 1332 mbs = a->mbs; 1333 bs2 = a->bs2; 1334 1335 if (ll != rr) { 1336 SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n"); 1337 } 1338 if (ll) { 1339 ierr = VecGetArrayFast(ll,&l);CHKERRQ(ierr); 1340 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1341 if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 1342 for (i=0; i<mbs; i++) { /* for each block row */ 1343 M = ai[i+1] - ai[i]; 1344 li = l + i*bs; 1345 v = aa + bs2*ai[i]; 1346 for (j=0; j<M; j++) { /* for each block */ 1347 for (k=0; k<bs2; k++) { 1348 (*v++) *= li[k%bs]; 1349 } 1350 #ifdef CONT 1351 /* will be used to replace the above loop */ 1352 ri = l + bs*aj[ai[i]+j]; 1353 for (k=0; k<bs; k++) { /* column value */ 1354 x = ri[k]; 1355 for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x; 1356 } 1357 #endif 1358 1359 } 1360 } 1361 ierr = VecRestoreArrayFast(ll,&l);CHKERRQ(ierr); 1362 PetscLogFlops(2*a->nz); 1363 } 1364 /* will be deleted */ 1365 if (rr) { 1366 ierr = VecGetArrayFast(rr,&r);CHKERRQ(ierr); 1367 ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr); 1368 if (rn != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 1369 for (i=0; i<mbs; i++) { /* for each block row */ 1370 M = ai[i+1] - ai[i]; 1371 v = aa + bs2*ai[i]; 1372 for (j=0; j<M; j++) { /* for each block */ 1373 ri = r + bs*aj[ai[i]+j]; 1374 for (k=0; k<bs; k++) { 1375 x = ri[k]; 1376 for (tmp=0; tmp<bs; tmp++) (*v++) *= x; 1377 } 1378 } 1379 } 1380 ierr = VecRestoreArrayFast(rr,&r);CHKERRQ(ierr); 1381 PetscLogFlops(a->nz); 1382 } 1383 PetscFunctionReturn(0); 1384 } 1385 1386 #undef __FUNCT__ 1387 #define __FUNCT__ "MatGetInfo_SeqSBAIJ" 1388 int MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info) 1389 { 1390 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1391 1392 PetscFunctionBegin; 1393 info->rows_global = (double)A->m; 1394 info->columns_global = (double)A->m; 1395 info->rows_local = (double)A->m; 1396 info->columns_local = (double)A->m; 1397 info->block_size = a->bs2; 1398 info->nz_allocated = a->maxnz; /*num. of nonzeros in upper triangular part */ 1399 info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */ 1400 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 1401 info->assemblies = A->num_ass; 1402 info->mallocs = a->reallocs; 1403 info->memory = A->mem; 1404 if (A->factor) { 1405 info->fill_ratio_given = A->info.fill_ratio_given; 1406 info->fill_ratio_needed = A->info.fill_ratio_needed; 1407 info->factor_mallocs = A->info.factor_mallocs; 1408 } else { 1409 info->fill_ratio_given = 0; 1410 info->fill_ratio_needed = 0; 1411 info->factor_mallocs = 0; 1412 } 1413 PetscFunctionReturn(0); 1414 } 1415 1416 1417 #undef __FUNCT__ 1418 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ" 1419 int MatZeroEntries_SeqSBAIJ(Mat A) 1420 { 1421 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1422 int ierr; 1423 1424 PetscFunctionBegin; 1425 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 1426 PetscFunctionReturn(0); 1427 } 1428 1429 #undef __FUNCT__ 1430 #define __FUNCT__ "MatGetRowMax_SeqSBAIJ" 1431 int MatGetRowMax_SeqSBAIJ(Mat A,Vec v) 1432 { 1433 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1434 int ierr,i,j,n,row,col,bs,*ai,*aj,mbs; 1435 PetscReal atmp; 1436 MatScalar *aa; 1437 PetscScalar zero = 0.0,*x; 1438 int ncols,brow,bcol,krow,kcol; 1439 1440 PetscFunctionBegin; 1441 if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1442 bs = a->bs; 1443 aa = a->a; 1444 ai = a->i; 1445 aj = a->j; 1446 mbs = a->mbs; 1447 1448 ierr = VecSet(&zero,v);CHKERRQ(ierr); 1449 ierr = VecGetArrayFast(v,&x);CHKERRQ(ierr); 1450 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1451 if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1452 for (i=0; i<mbs; i++) { 1453 ncols = ai[1] - ai[0]; ai++; 1454 brow = bs*i; 1455 for (j=0; j<ncols; j++){ 1456 bcol = bs*(*aj); 1457 for (kcol=0; kcol<bs; kcol++){ 1458 col = bcol + kcol; /* col index */ 1459 for (krow=0; krow<bs; krow++){ 1460 atmp = PetscAbsScalar(*aa); aa++; 1461 row = brow + krow; /* row index */ 1462 /* printf("val[%d,%d]: %g\n",row,col,atmp); */ 1463 if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1464 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 1465 } 1466 } 1467 aj++; 1468 } 1469 } 1470 ierr = VecRestoreArrayFast(v,&x);CHKERRQ(ierr); 1471 PetscFunctionReturn(0); 1472 } 1473