1 /*$Id: sbaij2.c,v 1.32 2001/08/07 03:03:01 balay Exp $*/ 2 3 #include "src/mat/impls/baij/seq/baij.h" 4 #include "src/inline/spops.h" 5 #include "src/inline/ilu.h" 6 #include "petscbt.h" 7 #include "src/mat/impls/sbaij/seq/sbaij.h" 8 9 #undef __FUNCT__ 10 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ" 11 int MatIncreaseOverlap_SeqSBAIJ(Mat A,int is_max,IS is[],int ov) 12 { 13 PetscFunctionBegin; 14 SETERRQ(1,"Function not yet written for SBAIJ format"); 15 /* PetscFunctionReturn(0); */ 16 } 17 18 #undef __FUNCT__ 19 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private" 20 int MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B) 21 { 22 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c; 23 int *smap,i,k,kstart,kend,ierr,oldcols = a->mbs,*lens; 24 int row,mat_i,*mat_j,tcol,*mat_ilen; 25 int *irow,nrows,*ssmap,bs=a->bs,bs2=a->bs2; 26 int *aj = a->j,*ai = a->i; 27 MatScalar *mat_a; 28 Mat C; 29 PetscTruth flag; 30 31 PetscFunctionBegin; 32 33 if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro"); 34 ierr = ISSorted(iscol,(PetscTruth*)&i);CHKERRQ(ierr); 35 if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 36 37 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 38 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 39 40 ierr = PetscMalloc((1+oldcols)*sizeof(int),&smap);CHKERRQ(ierr); 41 ssmap = smap; 42 ierr = PetscMalloc((1+nrows)*sizeof(int),&lens);CHKERRQ(ierr); 43 ierr = PetscMemzero(smap,oldcols*sizeof(int));CHKERRQ(ierr); 44 for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */ 45 /* determine lens of each row */ 46 for (i=0; i<nrows; i++) { 47 kstart = ai[irow[i]]; 48 kend = kstart + a->ilen[irow[i]]; 49 lens[i] = 0; 50 for (k=kstart; k<kend; k++) { 51 if (ssmap[aj[k]]) { 52 lens[i]++; 53 } 54 } 55 } 56 /* Create and fill new matrix */ 57 if (scall == MAT_REUSE_MATRIX) { 58 c = (Mat_SeqSBAIJ *)((*B)->data); 59 60 if (c->mbs!=nrows || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 61 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(int),&flag);CHKERRQ(ierr); 62 if (flag == PETSC_FALSE) { 63 SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 64 } 65 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(int));CHKERRQ(ierr); 66 C = *B; 67 } else { 68 ierr = MatCreate(A->comm,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE,&C);CHKERRQ(ierr); 69 ierr = MatSetType(C,A->type_name);CHKERRQ(ierr); 70 ierr = MatSeqSBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr); 71 } 72 c = (Mat_SeqSBAIJ *)(C->data); 73 for (i=0; i<nrows; i++) { 74 row = irow[i]; 75 kstart = ai[row]; 76 kend = kstart + a->ilen[row]; 77 mat_i = c->i[i]; 78 mat_j = c->j + mat_i; 79 mat_a = c->a + mat_i*bs2; 80 mat_ilen = c->ilen + i; 81 for (k=kstart; k<kend; k++) { 82 if ((tcol=ssmap[a->j[k]])) { 83 *mat_j++ = tcol - 1; 84 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 85 mat_a += bs2; 86 (*mat_ilen)++; 87 } 88 } 89 } 90 91 /* Free work space */ 92 ierr = PetscFree(smap);CHKERRQ(ierr); 93 ierr = PetscFree(lens);CHKERRQ(ierr); 94 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 95 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 96 97 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 98 *B = C; 99 PetscFunctionReturn(0); 100 } 101 102 #undef __FUNCT__ 103 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ" 104 int MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B) 105 { 106 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 107 IS is1; 108 int *vary,*iary,*irow,nrows,i,ierr,bs=a->bs,count; 109 110 PetscFunctionBegin; 111 if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro"); 112 113 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 114 ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr); 115 116 /* Verify if the indices corespond to each element in a block 117 and form the IS with compressed IS */ 118 ierr = PetscMalloc(2*(a->mbs+1)*sizeof(int),&vary);CHKERRQ(ierr); 119 iary = vary + a->mbs; 120 ierr = PetscMemzero(vary,(a->mbs)*sizeof(int));CHKERRQ(ierr); 121 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 122 123 count = 0; 124 for (i=0; i<a->mbs; i++) { 125 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(1,"Index set does not match blocks"); 126 if (vary[i]==bs) iary[count++] = i; 127 } 128 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr); 129 130 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 131 ierr = PetscFree(vary);CHKERRQ(ierr); 132 133 ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);CHKERRQ(ierr); 134 ISDestroy(is1); 135 PetscFunctionReturn(0); 136 } 137 138 #undef __FUNCT__ 139 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ" 140 int MatGetSubMatrices_SeqSBAIJ(Mat A,int n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 141 { 142 int ierr,i; 143 144 PetscFunctionBegin; 145 if (scall == MAT_INITIAL_MATRIX) { 146 ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr); 147 } 148 149 for (i=0; i<n; i++) { 150 ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr); 151 } 152 PetscFunctionReturn(0); 153 } 154 155 /* -------------------------------------------------------*/ 156 /* Should check that shapes of vectors and matrices match */ 157 /* -------------------------------------------------------*/ 158 #include "petscblaslapack.h" 159 160 #undef __FUNCT__ 161 #define __FUNCT__ "MatMult_SeqSBAIJ_1" 162 int MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz) 163 { 164 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 165 PetscScalar *x,*z,*xb,x1,zero=0.0; 166 MatScalar *v; 167 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 168 169 PetscFunctionBegin; 170 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 171 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 172 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 173 174 v = a->a; 175 xb = x; 176 177 for (i=0; i<mbs; i++) { 178 n = ai[1] - ai[0]; /* length of i_th row of A */ 179 x1 = xb[0]; 180 ib = aj + *ai; 181 jmin = 0; 182 if (*ib == i) { /* (diag of A)*x */ 183 z[i] += *v++ * x[*ib++]; 184 jmin++; 185 } 186 for (j=jmin; j<n; j++) { 187 cval = *ib; 188 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 189 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 190 } 191 xb++; ai++; 192 } 193 194 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 195 ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 196 PetscLogFlops(2*(a->nz*2 - A->m) - A->m); /* nz = (nz+m)/2 */ 197 PetscFunctionReturn(0); 198 } 199 200 #undef __FUNCT__ 201 #define __FUNCT__ "MatMult_SeqSBAIJ_2" 202 int MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz) 203 { 204 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 205 PetscScalar *x,*z,*xb,x1,x2,zero=0.0; 206 MatScalar *v; 207 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 208 209 210 PetscFunctionBegin; 211 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 212 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 213 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 214 215 v = a->a; 216 xb = x; 217 218 for (i=0; i<mbs; i++) { 219 n = ai[1] - ai[0]; /* length of i_th block row of A */ 220 x1 = xb[0]; x2 = xb[1]; 221 ib = aj + *ai; 222 jmin = 0; 223 if (*ib == i){ /* (diag of A)*x */ 224 z[2*i] += v[0]*x1 + v[2]*x2; 225 z[2*i+1] += v[2]*x1 + v[3]*x2; 226 v += 4; jmin++; 227 } 228 for (j=jmin; j<n; j++) { 229 /* (strict lower triangular part of A)*x */ 230 cval = ib[j]*2; 231 z[cval] += v[0]*x1 + v[1]*x2; 232 z[cval+1] += v[2]*x1 + v[3]*x2; 233 /* (strict upper triangular part of A)*x */ 234 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 235 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 236 v += 4; 237 } 238 xb +=2; ai++; 239 } 240 241 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 242 ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 243 PetscLogFlops(8*(a->nz*2 - A->m) - A->m); 244 PetscFunctionReturn(0); 245 } 246 247 #undef __FUNCT__ 248 #define __FUNCT__ "MatMult_SeqSBAIJ_3" 249 int MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz) 250 { 251 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 252 PetscScalar *x,*z,*xb,x1,x2,x3,zero=0.0; 253 MatScalar *v; 254 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 255 256 257 PetscFunctionBegin; 258 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 259 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 260 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 261 262 v = a->a; 263 xb = x; 264 265 for (i=0; i<mbs; i++) { 266 n = ai[1] - ai[0]; /* length of i_th block row of A */ 267 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 268 ib = aj + *ai; 269 jmin = 0; 270 if (*ib == i){ /* (diag of A)*x */ 271 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 272 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 273 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 274 v += 9; jmin++; 275 } 276 for (j=jmin; j<n; j++) { 277 /* (strict lower triangular part of A)*x */ 278 cval = ib[j]*3; 279 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 280 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 281 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 282 /* (strict upper triangular part of A)*x */ 283 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 284 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 285 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 286 v += 9; 287 } 288 xb +=3; ai++; 289 } 290 291 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 292 ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 293 PetscLogFlops(18*(a->nz*2 - A->m) - A->m); 294 PetscFunctionReturn(0); 295 } 296 297 #undef __FUNCT__ 298 #define __FUNCT__ "MatMult_SeqSBAIJ_4" 299 int MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz) 300 { 301 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 302 PetscScalar *x,*z,*xb,x1,x2,x3,x4,zero=0.0; 303 MatScalar *v; 304 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 305 306 PetscFunctionBegin; 307 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 308 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 309 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 310 311 v = a->a; 312 xb = x; 313 314 for (i=0; i<mbs; i++) { 315 n = ai[1] - ai[0]; /* length of i_th block row of A */ 316 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 317 ib = aj + *ai; 318 jmin = 0; 319 if (*ib == i){ /* (diag of A)*x */ 320 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 321 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 322 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 323 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 324 v += 16; jmin++; 325 } 326 for (j=jmin; j<n; j++) { 327 /* (strict lower triangular part of A)*x */ 328 cval = ib[j]*4; 329 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 330 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 331 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 332 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 333 /* (strict upper triangular part of A)*x */ 334 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 335 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 336 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 337 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 338 v += 16; 339 } 340 xb +=4; ai++; 341 } 342 343 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 344 ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 345 PetscLogFlops(32*(a->nz*2 - A->m) - A->m); 346 PetscFunctionReturn(0); 347 } 348 349 #undef __FUNCT__ 350 #define __FUNCT__ "MatMult_SeqSBAIJ_5" 351 int MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz) 352 { 353 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 354 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0; 355 MatScalar *v; 356 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 357 358 PetscFunctionBegin; 359 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 360 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 361 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 362 363 v = a->a; 364 xb = x; 365 366 for (i=0; i<mbs; i++) { 367 n = ai[1] - ai[0]; /* length of i_th block row of A */ 368 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 369 ib = aj + *ai; 370 jmin = 0; 371 if (*ib == i){ /* (diag of A)*x */ 372 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 373 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 374 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 375 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 376 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 377 v += 25; jmin++; 378 } 379 for (j=jmin; j<n; j++) { 380 /* (strict lower triangular part of A)*x */ 381 cval = ib[j]*5; 382 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 383 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 384 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 385 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 386 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 387 /* (strict upper triangular part of A)*x */ 388 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 389 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 390 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 391 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 392 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 393 v += 25; 394 } 395 xb +=5; ai++; 396 } 397 398 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 399 ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 400 PetscLogFlops(50*(a->nz*2 - A->m) - A->m); 401 PetscFunctionReturn(0); 402 } 403 404 405 #undef __FUNCT__ 406 #define __FUNCT__ "MatMult_SeqSBAIJ_6" 407 int MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz) 408 { 409 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 410 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0; 411 MatScalar *v; 412 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 413 414 PetscFunctionBegin; 415 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 416 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 417 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 418 419 v = a->a; 420 xb = x; 421 422 for (i=0; i<mbs; i++) { 423 n = ai[1] - ai[0]; /* length of i_th block row of A */ 424 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 425 ib = aj + *ai; 426 jmin = 0; 427 if (*ib == i){ /* (diag of A)*x */ 428 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 429 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 430 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 431 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 432 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 433 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 434 v += 36; jmin++; 435 } 436 for (j=jmin; j<n; j++) { 437 /* (strict lower triangular part of A)*x */ 438 cval = ib[j]*6; 439 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 440 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 441 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 442 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 443 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 444 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 445 /* (strict upper triangular part of A)*x */ 446 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 447 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 448 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 449 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 450 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 451 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 452 v += 36; 453 } 454 xb +=6; ai++; 455 } 456 457 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 458 ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 459 PetscLogFlops(72*(a->nz*2 - A->m) - A->m); 460 PetscFunctionReturn(0); 461 } 462 #undef __FUNCT__ 463 #define __FUNCT__ "MatMult_SeqSBAIJ_7" 464 int MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz) 465 { 466 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 467 PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0; 468 MatScalar *v; 469 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 470 471 PetscFunctionBegin; 472 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 473 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 474 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 475 476 v = a->a; 477 xb = x; 478 479 for (i=0; i<mbs; i++) { 480 n = ai[1] - ai[0]; /* length of i_th block row of A */ 481 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 482 ib = aj + *ai; 483 jmin = 0; 484 if (*ib == i){ /* (diag of A)*x */ 485 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 486 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 487 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 488 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 489 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 490 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 491 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 492 v += 49; jmin++; 493 } 494 for (j=jmin; j<n; j++) { 495 /* (strict lower triangular part of A)*x */ 496 cval = ib[j]*7; 497 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 498 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 499 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 500 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 501 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 502 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 503 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 504 /* (strict upper triangular part of A)*x */ 505 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 506 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 507 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 508 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 509 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 510 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 511 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 512 v += 49; 513 } 514 xb +=7; ai++; 515 } 516 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 517 ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 518 PetscLogFlops(98*(a->nz*2 - A->m) - A->m); 519 PetscFunctionReturn(0); 520 } 521 522 /* 523 This will not work with MatScalar == float because it calls the BLAS 524 */ 525 #undef __FUNCT__ 526 #define __FUNCT__ "MatMult_SeqSBAIJ_N" 527 int MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz) 528 { 529 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 530 PetscScalar *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0; 531 MatScalar *v; 532 int ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2; 533 int ncols,k; 534 535 PetscFunctionBegin; 536 ierr = VecSet(&zero,zz);CHKERRQ(ierr); 537 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); x_ptr=x; 538 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); z_ptr=z; 539 540 aj = a->j; 541 v = a->a; 542 ii = a->i; 543 544 if (!a->mult_work) { 545 ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 546 } 547 work = a->mult_work; 548 549 for (i=0; i<mbs; i++) { 550 n = ii[1] - ii[0]; ncols = n*bs; 551 workt = work; idx=aj+ii[0]; 552 553 /* upper triangular part */ 554 for (j=0; j<n; j++) { 555 xb = x_ptr + bs*(*idx++); 556 for (k=0; k<bs; k++) workt[k] = xb[k]; 557 workt += bs; 558 } 559 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 560 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 561 562 /* strict lower triangular part */ 563 idx = aj+ii[0]; 564 if (*idx == i){ 565 ncols -= bs; v += bs2; idx++; n--; 566 } 567 568 if (ncols > 0){ 569 workt = work; 570 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 571 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 572 for (j=0; j<n; j++) { 573 zb = z_ptr + bs*(*idx++); 574 for (k=0; k<bs; k++) zb[k] += workt[k] ; 575 workt += bs; 576 } 577 } 578 x += bs; v += n*bs2; z += bs; ii++; 579 } 580 581 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 582 ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 583 PetscLogFlops(2*(a->nz*2 - A->m)*bs2 - A->m); 584 PetscFunctionReturn(0); 585 } 586 587 #undef __FUNCT__ 588 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1" 589 int MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 590 { 591 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 592 PetscScalar *x,*y,*z,*xb,x1; 593 MatScalar *v; 594 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 595 596 PetscFunctionBegin; 597 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 598 if (yy != xx) { 599 ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr); 600 } else { 601 y = x; 602 } 603 if (zz != yy) { 604 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 605 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 606 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 607 } else { 608 z = y; 609 } 610 611 v = a->a; 612 xb = x; 613 614 for (i=0; i<mbs; i++) { 615 n = ai[1] - ai[0]; /* length of i_th row of A */ 616 x1 = xb[0]; 617 ib = aj + *ai; 618 jmin = 0; 619 if (*ib == i) { /* (diag of A)*x */ 620 z[i] += *v++ * x[*ib++]; jmin++; 621 } 622 for (j=jmin; j<n; j++) { 623 cval = *ib; 624 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */ 625 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */ 626 } 627 xb++; ai++; 628 } 629 630 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 631 if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr); 632 if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 633 634 PetscLogFlops(2*(a->nz*2 - A->m)); 635 PetscFunctionReturn(0); 636 } 637 638 #undef __FUNCT__ 639 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2" 640 int MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 641 { 642 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 643 PetscScalar *x,*y,*z,*xb,x1,x2; 644 MatScalar *v; 645 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 646 647 PetscFunctionBegin; 648 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 649 if (yy != xx) { 650 ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr); 651 } else { 652 y = x; 653 } 654 if (zz != yy) { 655 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 656 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 657 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 658 } else { 659 z = y; 660 } 661 662 v = a->a; 663 xb = x; 664 665 for (i=0; i<mbs; i++) { 666 n = ai[1] - ai[0]; /* length of i_th block row of A */ 667 x1 = xb[0]; x2 = xb[1]; 668 ib = aj + *ai; 669 jmin = 0; 670 if (*ib == i){ /* (diag of A)*x */ 671 z[2*i] += v[0]*x1 + v[2]*x2; 672 z[2*i+1] += v[2]*x1 + v[3]*x2; 673 v += 4; jmin++; 674 } 675 for (j=jmin; j<n; j++) { 676 /* (strict lower triangular part of A)*x */ 677 cval = ib[j]*2; 678 z[cval] += v[0]*x1 + v[1]*x2; 679 z[cval+1] += v[2]*x1 + v[3]*x2; 680 /* (strict upper triangular part of A)*x */ 681 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1]; 682 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1]; 683 v += 4; 684 } 685 xb +=2; ai++; 686 } 687 688 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 689 if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr); 690 if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 691 692 PetscLogFlops(4*(a->nz*2 - A->m)); 693 PetscFunctionReturn(0); 694 } 695 696 #undef __FUNCT__ 697 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3" 698 int MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 699 { 700 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 701 PetscScalar *x,*y,*z,*xb,x1,x2,x3; 702 MatScalar *v; 703 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 704 705 PetscFunctionBegin; 706 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 707 if (yy != xx) { 708 ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr); 709 } else { 710 y = x; 711 } 712 if (zz != yy) { 713 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 714 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 715 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 716 } else { 717 z = y; 718 } 719 720 v = a->a; 721 xb = x; 722 723 for (i=0; i<mbs; i++) { 724 n = ai[1] - ai[0]; /* length of i_th block row of A */ 725 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 726 ib = aj + *ai; 727 jmin = 0; 728 if (*ib == i){ /* (diag of A)*x */ 729 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3; 730 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3; 731 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 732 v += 9; jmin++; 733 } 734 for (j=jmin; j<n; j++) { 735 /* (strict lower triangular part of A)*x */ 736 cval = ib[j]*3; 737 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3; 738 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3; 739 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3; 740 /* (strict upper triangular part of A)*x */ 741 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2]; 742 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2]; 743 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2]; 744 v += 9; 745 } 746 xb +=3; ai++; 747 } 748 749 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 750 if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr); 751 if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 752 753 PetscLogFlops(18*(a->nz*2 - A->m)); 754 PetscFunctionReturn(0); 755 } 756 757 #undef __FUNCT__ 758 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4" 759 int MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 760 { 761 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 762 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4; 763 MatScalar *v; 764 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 765 766 PetscFunctionBegin; 767 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 768 if (yy != xx) { 769 ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr); 770 } else { 771 y = x; 772 } 773 if (zz != yy) { 774 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 775 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 776 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 777 } else { 778 z = y; 779 } 780 781 v = a->a; 782 xb = x; 783 784 for (i=0; i<mbs; i++) { 785 n = ai[1] - ai[0]; /* length of i_th block row of A */ 786 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 787 ib = aj + *ai; 788 jmin = 0; 789 if (*ib == i){ /* (diag of A)*x */ 790 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 791 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 792 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4; 793 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4; 794 v += 16; jmin++; 795 } 796 for (j=jmin; j<n; j++) { 797 /* (strict lower triangular part of A)*x */ 798 cval = ib[j]*4; 799 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 800 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 801 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 802 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 803 /* (strict upper triangular part of A)*x */ 804 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3]; 805 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3]; 806 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3]; 807 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3]; 808 v += 16; 809 } 810 xb +=4; ai++; 811 } 812 813 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 814 if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr); 815 if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 816 817 PetscLogFlops(32*(a->nz*2 - A->m)); 818 PetscFunctionReturn(0); 819 } 820 821 #undef __FUNCT__ 822 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5" 823 int MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 824 { 825 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 826 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4,x5; 827 MatScalar *v; 828 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 829 830 PetscFunctionBegin; 831 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 832 if (yy != xx) { 833 ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr); 834 } else { 835 y = x; 836 } 837 if (zz != yy) { 838 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 839 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 840 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 841 } else { 842 z = y; 843 } 844 845 v = a->a; 846 xb = x; 847 848 for (i=0; i<mbs; i++) { 849 n = ai[1] - ai[0]; /* length of i_th block row of A */ 850 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; 851 ib = aj + *ai; 852 jmin = 0; 853 if (*ib == i){ /* (diag of A)*x */ 854 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5; 855 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5; 856 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5; 857 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5; 858 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 859 v += 25; jmin++; 860 } 861 for (j=jmin; j<n; j++) { 862 /* (strict lower triangular part of A)*x */ 863 cval = ib[j]*5; 864 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 865 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 866 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5; 867 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5; 868 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5; 869 /* (strict upper triangular part of A)*x */ 870 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4]; 871 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4]; 872 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4]; 873 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4]; 874 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4]; 875 v += 25; 876 } 877 xb +=5; ai++; 878 } 879 880 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 881 if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr); 882 if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 883 884 PetscLogFlops(50*(a->nz*2 - A->m)); 885 PetscFunctionReturn(0); 886 } 887 #undef __FUNCT__ 888 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6" 889 int MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 890 { 891 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 892 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6; 893 MatScalar *v; 894 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 895 896 PetscFunctionBegin; 897 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 898 if (yy != xx) { 899 ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr); 900 } else { 901 y = x; 902 } 903 if (zz != yy) { 904 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 905 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 906 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 907 } else { 908 z = y; 909 } 910 911 v = a->a; 912 xb = x; 913 914 for (i=0; i<mbs; i++) { 915 n = ai[1] - ai[0]; /* length of i_th block row of A */ 916 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; 917 ib = aj + *ai; 918 jmin = 0; 919 if (*ib == i){ /* (diag of A)*x */ 920 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6; 921 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6; 922 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6; 923 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6; 924 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6; 925 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 926 v += 36; jmin++; 927 } 928 for (j=jmin; j<n; j++) { 929 /* (strict lower triangular part of A)*x */ 930 cval = ib[j]*6; 931 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6; 932 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6; 933 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6; 934 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6; 935 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6; 936 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6; 937 /* (strict upper triangular part of A)*x */ 938 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5]; 939 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5]; 940 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5]; 941 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5]; 942 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5]; 943 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5]; 944 v += 36; 945 } 946 xb +=6; ai++; 947 } 948 949 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 950 if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr); 951 if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 952 953 PetscLogFlops(72*(a->nz*2 - A->m)); 954 PetscFunctionReturn(0); 955 } 956 957 #undef __FUNCT__ 958 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7" 959 int MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 960 { 961 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 962 PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6,x7; 963 MatScalar *v; 964 int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin; 965 966 PetscFunctionBegin; 967 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 968 if (yy != xx) { 969 ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr); 970 } else { 971 y = x; 972 } 973 if (zz != yy) { 974 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 975 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); 976 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 977 } else { 978 z = y; 979 } 980 981 v = a->a; 982 xb = x; 983 984 for (i=0; i<mbs; i++) { 985 n = ai[1] - ai[0]; /* length of i_th block row of A */ 986 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6]; 987 ib = aj + *ai; 988 jmin = 0; 989 if (*ib == i){ /* (diag of A)*x */ 990 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7; 991 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7; 992 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7; 993 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7; 994 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7; 995 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7; 996 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 997 v += 49; jmin++; 998 } 999 for (j=jmin; j<n; j++) { 1000 /* (strict lower triangular part of A)*x */ 1001 cval = ib[j]*7; 1002 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7; 1003 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7; 1004 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7; 1005 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7; 1006 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7; 1007 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7; 1008 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7; 1009 /* (strict upper triangular part of A)*x */ 1010 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6]; 1011 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6]; 1012 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6]; 1013 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6]; 1014 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6]; 1015 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6]; 1016 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6]; 1017 v += 49; 1018 } 1019 xb +=7; ai++; 1020 } 1021 1022 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 1023 if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr); 1024 if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 1025 1026 PetscLogFlops(98*(a->nz*2 - A->m)); 1027 PetscFunctionReturn(0); 1028 } 1029 1030 #undef __FUNCT__ 1031 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N" 1032 int MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1033 { 1034 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1035 PetscScalar *x,*x_ptr,*y,*z,*z_ptr=0,*xb,*zb,*work,*workt; 1036 MatScalar *v; 1037 int ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2; 1038 int ncols,k; 1039 1040 PetscFunctionBegin; 1041 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); x_ptr=x; 1042 if (yy != xx) { 1043 ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr); 1044 } else { 1045 y = x; 1046 } 1047 if (zz != yy) { 1048 /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */ 1049 ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); z_ptr=z; 1050 ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr); 1051 } else { 1052 z = y; 1053 } 1054 1055 aj = a->j; 1056 v = a->a; 1057 ii = a->i; 1058 1059 if (!a->mult_work) { 1060 ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1061 } 1062 work = a->mult_work; 1063 1064 1065 for (i=0; i<mbs; i++) { 1066 n = ii[1] - ii[0]; ncols = n*bs; 1067 workt = work; idx=aj+ii[0]; 1068 1069 /* upper triangular part */ 1070 for (j=0; j<n; j++) { 1071 xb = x_ptr + bs*(*idx++); 1072 for (k=0; k<bs; k++) workt[k] = xb[k]; 1073 workt += bs; 1074 } 1075 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */ 1076 Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1077 1078 /* strict lower triangular part */ 1079 idx = aj+ii[0]; 1080 if (*idx == i){ 1081 ncols -= bs; v += bs2; idx++; n--; 1082 } 1083 if (ncols > 0){ 1084 workt = work; 1085 ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1086 Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt); 1087 for (j=0; j<n; j++) { 1088 zb = z_ptr + bs*(*idx++); 1089 /* idx++; */ 1090 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1091 workt += bs; 1092 } 1093 } 1094 1095 x += bs; v += n*bs2; z += bs; ii++; 1096 } 1097 1098 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 1099 if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr); 1100 if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr); 1101 1102 PetscLogFlops(2*(a->nz*2 - A->m)); 1103 PetscFunctionReturn(0); 1104 } 1105 1106 #undef __FUNCT__ 1107 #define __FUNCT__ "MatMultTranspose_SeqSBAIJ" 1108 int MatMultTranspose_SeqSBAIJ(Mat A,Vec xx,Vec zz) 1109 { 1110 int ierr; 1111 1112 PetscFunctionBegin; 1113 ierr = MatMult(A,xx,zz);CHKERRQ(ierr); 1114 PetscFunctionReturn(0); 1115 } 1116 1117 #undef __FUNCT__ 1118 #define __FUNCT__ "MatMultTransposeAdd_SeqSBAIJ" 1119 int MatMultTransposeAdd_SeqSBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1120 1121 { 1122 int ierr; 1123 1124 PetscFunctionBegin; 1125 ierr = MatMultAdd(A,xx,yy,zz);CHKERRQ(ierr); 1126 PetscFunctionReturn(0); 1127 } 1128 1129 #undef __FUNCT__ 1130 #define __FUNCT__ "MatScale_SeqSBAIJ" 1131 int MatScale_SeqSBAIJ(const PetscScalar *alpha,Mat inA) 1132 { 1133 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data; 1134 int one = 1,totalnz = a->bs2*a->nz; 1135 1136 PetscFunctionBegin; 1137 BLscal_(&totalnz,(PetscScalar*)alpha,a->a,&one); 1138 PetscLogFlops(totalnz); 1139 PetscFunctionReturn(0); 1140 } 1141 1142 #undef __FUNCT__ 1143 #define __FUNCT__ "MatNorm_SeqSBAIJ" 1144 int MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm) 1145 { 1146 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1147 MatScalar *v = a->a; 1148 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum; 1149 int i,j,k,bs = a->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j; 1150 int *jl,*il,jmin,jmax,ierr,nexti,ik,*col; 1151 1152 PetscFunctionBegin; 1153 if (type == NORM_FROBENIUS) { 1154 for (k=0; k<mbs; k++){ 1155 jmin = a->i[k]; jmax = a->i[k+1]; 1156 col = aj + jmin; 1157 if (*col == k){ /* diagonal block */ 1158 for (i=0; i<bs2; i++){ 1159 #if defined(PETSC_USE_COMPLEX) 1160 sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++; 1161 #else 1162 sum_diag += (*v)*(*v); v++; 1163 #endif 1164 } 1165 jmin++; 1166 } 1167 for (j=jmin; j<jmax; j++){ /* off-diagonal blocks */ 1168 for (i=0; i<bs2; i++){ 1169 #if defined(PETSC_USE_COMPLEX) 1170 sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++; 1171 #else 1172 sum_off += (*v)*(*v); v++; 1173 #endif 1174 } 1175 } 1176 } 1177 *norm = sqrt(sum_diag + 2*sum_off); 1178 1179 } else if (type == NORM_INFINITY) { /* maximum row sum */ 1180 ierr = PetscMalloc(mbs*sizeof(int),&il);CHKERRQ(ierr); 1181 ierr = PetscMalloc(mbs*sizeof(int),&jl);CHKERRQ(ierr); 1182 ierr = PetscMalloc(bs*sizeof(PetscReal),&sum);CHKERRQ(ierr); 1183 for (i=0; i<mbs; i++) { 1184 jl[i] = mbs; il[0] = 0; 1185 } 1186 1187 *norm = 0.0; 1188 for (k=0; k<mbs; k++) { /* k_th block row */ 1189 for (j=0; j<bs; j++) sum[j]=0.0; 1190 1191 /*-- col sum --*/ 1192 i = jl[k]; /* first |A(i,k)| to be added */ 1193 /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window) 1194 at step k */ 1195 while (i<mbs){ 1196 nexti = jl[i]; /* next block row to be added */ 1197 ik = il[i]; /* block index of A(i,k) in the array a */ 1198 for (j=0; j<bs; j++){ 1199 v = a->a + ik*bs2 + j*bs; 1200 for (k1=0; k1<bs; k1++) { 1201 sum[j] += PetscAbsScalar(*v); v++; 1202 } 1203 } 1204 /* update il, jl */ 1205 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */ 1206 jmax = a->i[i+1]; 1207 if (jmin < jmax){ 1208 il[i] = jmin; 1209 j = a->j[jmin]; 1210 jl[i] = jl[j]; jl[j]=i; 1211 } 1212 i = nexti; 1213 } 1214 1215 /*-- row sum --*/ 1216 jmin = a->i[k]; jmax = a->i[k+1]; 1217 for (i=jmin; i<jmax; i++) { 1218 for (j=0; j<bs; j++){ 1219 v = a->a + i*bs2 + j; 1220 for (k1=0; k1<bs; k1++){ 1221 sum[j] += PetscAbsScalar(*v); 1222 v += bs; 1223 } 1224 } 1225 } 1226 /* add k_th block row to il, jl */ 1227 col = aj+jmin; 1228 if (*col == k) jmin++; 1229 if (jmin < jmax){ 1230 il[k] = jmin; 1231 j = a->j[jmin]; 1232 jl[k] = jl[j]; jl[j] = k; 1233 } 1234 for (j=0; j<bs; j++){ 1235 if (sum[j] > *norm) *norm = sum[j]; 1236 } 1237 } 1238 ierr = PetscFree(il);CHKERRQ(ierr); 1239 ierr = PetscFree(jl);CHKERRQ(ierr); 1240 ierr = PetscFree(sum);CHKERRQ(ierr); 1241 } else { 1242 SETERRQ(PETSC_ERR_SUP,"No support for this norm yet"); 1243 } 1244 PetscFunctionReturn(0); 1245 } 1246 1247 #undef __FUNCT__ 1248 #define __FUNCT__ "MatEqual_SeqSBAIJ" 1249 int MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg) 1250 { 1251 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data; 1252 int ierr; 1253 1254 PetscFunctionBegin; 1255 1256 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 1257 if ((A->m != B->m) || (A->n != B->n) || (a->bs != b->bs)|| (a->nz != b->nz)) { 1258 *flg = PETSC_FALSE; 1259 PetscFunctionReturn(0); 1260 } 1261 1262 /* if the a->i are the same */ 1263 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(int),flg);CHKERRQ(ierr); 1264 if (*flg == PETSC_FALSE) { 1265 PetscFunctionReturn(0); 1266 } 1267 1268 /* if a->j are the same */ 1269 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int),flg);CHKERRQ(ierr); 1270 if (*flg == PETSC_FALSE) { 1271 PetscFunctionReturn(0); 1272 } 1273 /* if a->a are the same */ 1274 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 1275 1276 PetscFunctionReturn(0); 1277 } 1278 1279 #undef __FUNCT__ 1280 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ" 1281 int MatGetDiagonal_SeqSBAIJ(Mat A,Vec v) 1282 { 1283 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1284 int ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 1285 PetscScalar *x,zero = 0.0; 1286 MatScalar *aa,*aa_j; 1287 1288 PetscFunctionBegin; 1289 bs = a->bs; 1290 if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1"); 1291 1292 aa = a->a; 1293 ai = a->i; 1294 aj = a->j; 1295 ambs = a->mbs; 1296 bs2 = a->bs2; 1297 1298 ierr = VecSet(&zero,v);CHKERRQ(ierr); 1299 ierr = VecGetArrayFast(v,&x);CHKERRQ(ierr); 1300 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1301 if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1302 for (i=0; i<ambs; i++) { 1303 j=ai[i]; 1304 if (aj[j] == i) { /* if this is a diagonal element */ 1305 row = i*bs; 1306 aa_j = aa + j*bs2; 1307 if (A->factor && bs==1){ 1308 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k]; 1309 } else { 1310 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 1311 } 1312 } 1313 } 1314 1315 ierr = VecRestoreArrayFast(v,&x);CHKERRQ(ierr); 1316 PetscFunctionReturn(0); 1317 } 1318 1319 #undef __FUNCT__ 1320 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ" 1321 int MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr) 1322 { 1323 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1324 PetscScalar *l,*r,x,*li,*ri; 1325 MatScalar *aa,*v; 1326 int ierr,i,j,k,lm,rn,M,m,*ai,*aj,mbs,tmp,bs,bs2; 1327 1328 PetscFunctionBegin; 1329 ai = a->i; 1330 aj = a->j; 1331 aa = a->a; 1332 m = A->m; 1333 bs = a->bs; 1334 mbs = a->mbs; 1335 bs2 = a->bs2; 1336 1337 if (ll != rr) { 1338 SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n"); 1339 } 1340 if (ll) { 1341 ierr = VecGetArrayFast(ll,&l);CHKERRQ(ierr); 1342 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 1343 if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 1344 for (i=0; i<mbs; i++) { /* for each block row */ 1345 M = ai[i+1] - ai[i]; 1346 li = l + i*bs; 1347 v = aa + bs2*ai[i]; 1348 for (j=0; j<M; j++) { /* for each block */ 1349 for (k=0; k<bs2; k++) { 1350 (*v++) *= li[k%bs]; 1351 } 1352 #ifdef CONT 1353 /* will be used to replace the above loop */ 1354 ri = l + bs*aj[ai[i]+j]; 1355 for (k=0; k<bs; k++) { /* column value */ 1356 x = ri[k]; 1357 for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x; 1358 } 1359 #endif 1360 1361 } 1362 } 1363 ierr = VecRestoreArrayFast(ll,&l);CHKERRQ(ierr); 1364 PetscLogFlops(2*a->nz); 1365 } 1366 /* will be deleted */ 1367 if (rr) { 1368 ierr = VecGetArrayFast(rr,&r);CHKERRQ(ierr); 1369 ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr); 1370 if (rn != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 1371 for (i=0; i<mbs; i++) { /* for each block row */ 1372 M = ai[i+1] - ai[i]; 1373 v = aa + bs2*ai[i]; 1374 for (j=0; j<M; j++) { /* for each block */ 1375 ri = r + bs*aj[ai[i]+j]; 1376 for (k=0; k<bs; k++) { 1377 x = ri[k]; 1378 for (tmp=0; tmp<bs; tmp++) (*v++) *= x; 1379 } 1380 } 1381 } 1382 ierr = VecRestoreArrayFast(rr,&r);CHKERRQ(ierr); 1383 PetscLogFlops(a->nz); 1384 } 1385 PetscFunctionReturn(0); 1386 } 1387 1388 #undef __FUNCT__ 1389 #define __FUNCT__ "MatGetInfo_SeqSBAIJ" 1390 int MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info) 1391 { 1392 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1393 1394 PetscFunctionBegin; 1395 info->rows_global = (double)A->m; 1396 info->columns_global = (double)A->m; 1397 info->rows_local = (double)A->m; 1398 info->columns_local = (double)A->m; 1399 info->block_size = a->bs2; 1400 info->nz_allocated = a->maxnz; /*num. of nonzeros in upper triangular part */ 1401 info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */ 1402 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 1403 info->assemblies = A->num_ass; 1404 info->mallocs = a->reallocs; 1405 info->memory = A->mem; 1406 if (A->factor) { 1407 info->fill_ratio_given = A->info.fill_ratio_given; 1408 info->fill_ratio_needed = A->info.fill_ratio_needed; 1409 info->factor_mallocs = A->info.factor_mallocs; 1410 } else { 1411 info->fill_ratio_given = 0; 1412 info->fill_ratio_needed = 0; 1413 info->factor_mallocs = 0; 1414 } 1415 PetscFunctionReturn(0); 1416 } 1417 1418 1419 #undef __FUNCT__ 1420 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ" 1421 int MatZeroEntries_SeqSBAIJ(Mat A) 1422 { 1423 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1424 int ierr; 1425 1426 PetscFunctionBegin; 1427 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 1428 PetscFunctionReturn(0); 1429 } 1430 1431 #undef __FUNCT__ 1432 #define __FUNCT__ "MatGetRowMax_SeqSBAIJ" 1433 int MatGetRowMax_SeqSBAIJ(Mat A,Vec v) 1434 { 1435 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1436 int ierr,i,j,n,row,col,bs,*ai,*aj,mbs; 1437 PetscReal atmp; 1438 MatScalar *aa; 1439 PetscScalar zero = 0.0,*x; 1440 int ncols,brow,bcol,krow,kcol; 1441 1442 PetscFunctionBegin; 1443 if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1444 bs = a->bs; 1445 aa = a->a; 1446 ai = a->i; 1447 aj = a->j; 1448 mbs = a->mbs; 1449 1450 ierr = VecSet(&zero,v);CHKERRQ(ierr); 1451 ierr = VecGetArrayFast(v,&x);CHKERRQ(ierr); 1452 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 1453 if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 1454 for (i=0; i<mbs; i++) { 1455 ncols = ai[1] - ai[0]; ai++; 1456 brow = bs*i; 1457 for (j=0; j<ncols; j++){ 1458 bcol = bs*(*aj); 1459 for (kcol=0; kcol<bs; kcol++){ 1460 col = bcol + kcol; /* col index */ 1461 for (krow=0; krow<bs; krow++){ 1462 atmp = PetscAbsScalar(*aa); aa++; 1463 row = brow + krow; /* row index */ 1464 /* printf("val[%d,%d]: %g\n",row,col,atmp); */ 1465 if (PetscRealPart(x[row]) < atmp) x[row] = atmp; 1466 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp; 1467 } 1468 } 1469 aj++; 1470 } 1471 } 1472 ierr = VecRestoreArrayFast(v,&x);CHKERRQ(ierr); 1473 PetscFunctionReturn(0); 1474 } 1475