1 #ifndef lint 2 static char vcid[] = "$Id: baijfact.c,v 1.3 1996/02/17 05:41:36 bsmith Exp bsmith $"; 3 #endif 4 /* 5 Factorization code for BAIJ format. 6 */ 7 8 #include "baij.h" 9 10 /* 11 The symbolic factorization code is identical to that for AIJ format, 12 except for very small changes since this is now a SeqBAIJ datastructure. 13 NOT good code reuse. 14 */ 15 int MatLUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,double f,Mat *B) 16 { 17 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data, *b; 18 IS isicol; 19 int *r,*ic, ierr, i, n = a->mbs, *ai = a->i, *aj = a->j; 20 int *ainew,*ajnew, jmax,*fill, *ajtmp, nz, bs = a->bs; 21 int *idnew, idx, row,m,fm, nnz, nzi,len, realloc = 0,nzbd,*im; 22 23 if (a->m != a->n) SETERRQ(1,"MatLUFactorSymbolic_SeqBAIJ:Must be square"); 24 if (!isrow) SETERRQ(1,"MatLUFactorSymbolic_SeqBAIJ:Must have row permutation"); 25 if (!iscol) SETERRQ(1,"MatLUFactorSymbolic_SeqBAIJ:Must have col. permutation"); 26 27 ierr = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr); 28 ISGetIndices(isrow,&r); ISGetIndices(isicol,&ic); 29 30 /* get new row pointers */ 31 ainew = (int *) PetscMalloc( (n+1)*sizeof(int) ); CHKPTRQ(ainew); 32 ainew[0] = 0; 33 /* don't know how many column pointers are needed so estimate */ 34 jmax = (int) (f*ai[n] + 1); 35 ajnew = (int *) PetscMalloc( (jmax)*sizeof(int) ); CHKPTRQ(ajnew); 36 /* fill is a linked list of nonzeros in active row */ 37 fill = (int *) PetscMalloc( (2*n+1)*sizeof(int)); CHKPTRQ(fill); 38 im = fill + n + 1; 39 /* idnew is location of diagonal in factor */ 40 idnew = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(idnew); 41 idnew[0] = 0; 42 43 for ( i=0; i<n; i++ ) { 44 /* first copy previous fill into linked list */ 45 nnz = nz = ai[r[i]+1] - ai[r[i]]; 46 ajtmp = aj + ai[r[i]]; 47 fill[n] = n; 48 while (nz--) { 49 fm = n; 50 idx = ic[*ajtmp++]; 51 do { 52 m = fm; 53 fm = fill[m]; 54 } while (fm < idx); 55 fill[m] = idx; 56 fill[idx] = fm; 57 } 58 row = fill[n]; 59 while ( row < i ) { 60 ajtmp = ajnew + idnew[row] + 1; 61 nzbd = 1 + idnew[row] - ainew[row]; 62 nz = im[row] - nzbd; 63 fm = row; 64 while (nz-- > 0) { 65 idx = *ajtmp++; 66 nzbd++; 67 if (idx == i) im[row] = nzbd; 68 do { 69 m = fm; 70 fm = fill[m]; 71 } while (fm < idx); 72 if (fm != idx) { 73 fill[m] = idx; 74 fill[idx] = fm; 75 fm = idx; 76 nnz++; 77 } 78 } 79 row = fill[row]; 80 } 81 /* copy new filled row into permanent storage */ 82 ainew[i+1] = ainew[i] + nnz; 83 if (ainew[i+1] > jmax+1) { 84 /* allocate a longer ajnew */ 85 int maxadd; 86 maxadd = (int) ((f*(ai[n]+1)*(n-i+5))/n); 87 if (maxadd < nnz) maxadd = (n-i)*(nnz+1); 88 jmax += maxadd; 89 ajtmp = (int *) PetscMalloc( jmax*sizeof(int) );CHKPTRQ(ajtmp); 90 PetscMemcpy(ajtmp,ajnew,ainew[i]*sizeof(int)); 91 PetscFree(ajnew); 92 ajnew = ajtmp; 93 realloc++; /* count how many times we realloc */ 94 } 95 ajtmp = ajnew + ainew[i]; 96 fm = fill[n]; 97 nzi = 0; 98 im[i] = nnz; 99 while (nnz--) { 100 if (fm < i) nzi++; 101 *ajtmp++ = fm; 102 fm = fill[fm]; 103 } 104 idnew[i] = ainew[i] + nzi; 105 } 106 107 PLogInfo((PetscObject)A, 108 "Info:MatLUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n", 109 realloc,f,((double)ainew[n])/((double)ai[i])); 110 111 ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr); 112 ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr); 113 114 PetscFree(fill); 115 116 /* put together the new matrix */ 117 ierr = MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,B); CHKERRQ(ierr); 118 PLogObjectParent(*B,isicol); 119 ierr = ISDestroy(isicol); CHKERRQ(ierr); 120 b = (Mat_SeqBAIJ *) (*B)->data; 121 PetscFree(b->imax); 122 b->singlemalloc = 0; 123 len = ainew[n]*sizeof(Scalar); 124 /* the next line frees the default space generated by the Create() */ 125 PetscFree(b->a); PetscFree(b->ilen); 126 b->a = (Scalar *) PetscMalloc( len*bs*bs ); CHKPTRQ(b->a); 127 b->j = ajnew; 128 b->i = ainew; 129 b->diag = idnew; 130 b->ilen = 0; 131 b->imax = 0; 132 b->row = isrow; 133 b->col = iscol; 134 b->solve_work = (Scalar *) PetscMalloc( (bs*n+bs)*sizeof(Scalar)); 135 CHKPTRQ(b->solve_work); 136 /* In b structure: Free imax, ilen, old a, old j. 137 Allocate idnew, solve_work, new a, new j */ 138 PLogObjectMemory(*B,(ainew[n]-n)*(sizeof(int)+sizeof(Scalar))); 139 b->maxnz = b->nz = ainew[n]; 140 141 return 0; 142 } 143 144 #include "pinclude/plapack.h" 145 146 #define BlockZero(v,idx) {PetscMemzero(v+bs2*(idx),bs2*sizeof(Scalar));} 147 148 #define BlockCopy(v_in,v_out,idx_in,idx_out) \ 149 {PetscMemcpy(v_out + bs2*(idx_out),v_in + bs2*(idx_in),bs2*sizeof(Scalar));} 150 151 #define BlockInvert(vv,idx) \ 152 { int _i,info; Scalar *w = vv + bs2*idx; \ 153 LAgetrf_(&bs,&bs,w,&bs,v_pivots,&info); CHKERRQ(info); \ 154 PetscMemzero(v_work,bs2*sizeof(Scalar)); \ 155 for ( _i=0; _i<bs; _i++ ) { v_work[_i + bs*_i] = 1.0; } \ 156 LAgetrs_("N",&bs,&bs,w,&bs,v_pivots,v_work,&bs, &info);CHKERRQ(info);\ 157 PetscMemcpy(w,v_work,bs2*sizeof(Scalar)); \ 158 } 159 160 #define BlockMult(v1,v2,v3) \ 161 {Scalar _DOne=1.0, _DZero=0.0;\ 162 BLgemm_("N","N",&bs,&bs,&bs,&_DOne,v1,&bs,v2,&bs,&_DZero,v3,&bs);} 163 164 #define BlockMultSub(v1,v2,v3,idx2,idx3) \ 165 {Scalar _DOne=1.0, _DMOne=-1.0;\ 166 BLgemm_("N","N",&bs,&bs,&bs,&_DMOne,v1,&bs,v2+bs2*idx2,&bs,&_DOne,v3+bs2*idx3,&bs);} 167 168 /* ----------------------------------------------------------- */ 169 int MatLUFactorNumeric_SeqBAIJ_N(Mat A,Mat *B) 170 { 171 Mat C = *B; 172 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data, *b = (Mat_SeqBAIJ *)C->data; 173 IS iscol = b->col, isrow = b->row, isicol; 174 int *r,*ic, ierr, i, j, n = a->mbs, *ai = b->i, *aj = b->j; 175 int *ajtmpold, *ajtmp, nz, row, bslog; 176 int *diag_offset = b->diag,diag,bs = a->bs,bs2 = bs*bs, *v_pivots; 177 register Scalar *pv,*v,*rtmp,*multiplier,*v_work,*pc; 178 register int *pj; 179 180 ierr = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr); 181 PLogObjectParent(*B,isicol); 182 ierr = ISGetIndices(isrow,&r); CHKERRQ(ierr); 183 ierr = ISGetIndices(isicol,&ic); CHKERRQ(ierr); 184 rtmp = (Scalar *) PetscMalloc(bs2*(n+1)*sizeof(Scalar));CHKPTRQ(rtmp); 185 186 /* generate work space needed by dense LU factorization */ 187 v_work = (Scalar *) PetscMalloc((bs+2*bs2)*sizeof(Scalar));CHKPTRQ(v_work); 188 multiplier = v_work + bs2; 189 v_pivots = (int *) (multiplier + bs2); 190 191 /* flops in while loop */ 192 bslog = 4*bs*bs2 - bs; 193 194 for ( i=0; i<n; i++ ) { 195 nz = ai[i+1] - ai[i]; 196 ajtmp = aj + ai[i]; 197 for ( j=0; j<nz; j++ ) BlockZero(rtmp,ajtmp[j]); 198 /* load in initial (unfactored row) */ 199 nz = a->i[r[i]+1] - a->i[r[i]]; 200 ajtmpold = a->j + a->i[r[i]]; 201 v = a->a + bs2*a->i[r[i]]; 202 for ( j=0; j<nz; j++ ) BlockCopy(v,rtmp,j,ic[ajtmpold[j]]); 203 row = *ajtmp++; 204 while (row < i) { 205 pc = rtmp + bs2*row; 206 /* if (*pc) { */ 207 pv = b->a + bs2*diag_offset[row]; 208 pj = b->j + diag_offset[row] + 1; 209 BlockMult(pc,pv,multiplier); 210 BlockCopy(multiplier,pc,0,0); 211 nz = ai[row+1] - diag_offset[row] - 1; 212 pv += bs2; 213 for (j=0; j<nz; j++) BlockMultSub(multiplier,pv,rtmp,j,pj[j]); 214 PLogFlops(bslog*nz); 215 /* } */ 216 row = *ajtmp++; 217 } 218 /* finished row so stick it into b->a */ 219 pv = b->a + bs2*ai[i]; 220 pj = b->j + ai[i]; 221 nz = ai[i+1] - ai[i]; 222 for ( j=0; j<nz; j++ ) BlockCopy(rtmp,pv,pj[j],j); 223 diag = diag_offset[i] - ai[i]; 224 /* invert diagonal block */ 225 BlockInvert(pv,diag); 226 } 227 228 PetscFree(rtmp); PetscFree(v_work); 229 ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr); 230 ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr); 231 ierr = ISDestroy(isicol); CHKERRQ(ierr); 232 C->factor = FACTOR_LU; 233 C->assembled = PETSC_TRUE; 234 PLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */ 235 return 0; 236 } 237 238 /* ----------------------------------------------------------- */ 239 /* 240 Version for when blocks are 1 by 1. 241 */ 242 int MatLUFactorNumeric_SeqBAIJ_1(Mat A,Mat *B) 243 { 244 Mat C = *B; 245 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data, *b = (Mat_SeqBAIJ *)C->data; 246 IS iscol = b->col, isrow = b->row, isicol; 247 int *r,*ic, ierr, i, j, n = a->mbs, *ai = b->i, *aj = b->j; 248 int *ajtmpold, *ajtmp, nz, row; 249 int *diag_offset = b->diag,diag; 250 register Scalar *pv,*v,*rtmp,multiplier,*pc; 251 register int *pj; 252 253 ierr = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr); 254 PLogObjectParent(*B,isicol); 255 ierr = ISGetIndices(isrow,&r); CHKERRQ(ierr); 256 ierr = ISGetIndices(isicol,&ic); CHKERRQ(ierr); 257 rtmp = (Scalar *) PetscMalloc((n+1)*sizeof(Scalar));CHKPTRQ(rtmp); 258 259 for ( i=0; i<n; i++ ) { 260 nz = ai[i+1] - ai[i]; 261 ajtmp = aj + ai[i]; 262 for ( j=0; j<nz; j++ ) rtmp[ajtmp[j]] = 0.0; 263 264 /* load in initial (unfactored row) */ 265 nz = a->i[r[i]+1] - a->i[r[i]]; 266 ajtmpold = a->j + a->i[r[i]]; 267 v = a->a + a->i[r[i]]; 268 for ( j=0; j<nz; j++ ) rtmp[ic[ajtmpold[j]]] = v[j]; 269 270 row = *ajtmp++; 271 while (row < i) { 272 pc = rtmp + row; 273 if (*pc != 0.0) { 274 pv = b->a + diag_offset[row]; 275 pj = b->j + diag_offset[row] + 1; 276 multiplier = *pc * *pv++; 277 *pc = multiplier; 278 nz = ai[row+1] - diag_offset[row] - 1; 279 for (j=0; j<nz; j++) rtmp[pj[j]] -= multiplier * pv[j]; 280 PLogFlops(2*nz); 281 } 282 row = *ajtmp++; 283 } 284 /* finished row so stick it into b->a */ 285 pv = b->a + ai[i]; 286 pj = b->j + ai[i]; 287 nz = ai[i+1] - ai[i]; 288 for ( j=0; j<nz; j++ ) {pv[j] = rtmp[pj[j]];} 289 diag = diag_offset[i] - ai[i]; 290 /* check pivot entry for current row */ 291 if (pv[diag] == 0.0) { 292 SETERRQ(1,"MatLUFactorNumeric_SeqAIJ:Zero pivot"); 293 } 294 pv[diag] = 1.0/pv[diag]; 295 } 296 297 PetscFree(rtmp); 298 ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr); 299 ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr); 300 ierr = ISDestroy(isicol); CHKERRQ(ierr); 301 C->factor = FACTOR_LU; 302 C->assembled = PETSC_TRUE; 303 PLogFlops(b->n); 304 return 0; 305 } 306 307 /* ----------------------------------------------------------- */ 308 int MatLUFactor_SeqBAIJ(Mat A,IS row,IS col,double f) 309 { 310 Mat_SeqBAIJ *mat = (Mat_SeqBAIJ *) A->data; 311 int ierr; 312 Mat C; 313 314 ierr = MatLUFactorSymbolic_SeqBAIJ(A,row,col,f,&C); CHKERRQ(ierr); 315 ierr = MatLUFactorNumeric(A,&C); CHKERRQ(ierr); 316 317 /* free all the data structures from mat */ 318 PetscFree(mat->a); 319 if (!mat->singlemalloc) {PetscFree(mat->i); PetscFree(mat->j);} 320 if (mat->diag) PetscFree(mat->diag); 321 if (mat->ilen) PetscFree(mat->ilen); 322 if (mat->imax) PetscFree(mat->imax); 323 if (mat->solve_work) PetscFree(mat->solve_work); 324 PetscFree(mat); 325 326 PetscMemcpy(A,C,sizeof(struct _Mat)); 327 PetscHeaderDestroy(C); 328 return 0; 329 } 330 /* ----------------------------------------------------------- */ 331 int MatSolve_SeqBAIJ(Mat A,Vec bb, Vec xx) 332 { 333 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 334 IS iscol = a->col, isrow = a->row; 335 int *r,*c, ierr, i, n = a->mbs, *vi, *ai = a->i, *aj = a->j; 336 int nz,bs = a->bs,bs2 = bs*bs,idx,idt,idc, _One = 1; 337 Scalar *xa,*ba,*aa = a->a, *sum, _DOne = 1.0, _DMOne = -1.0; 338 Scalar _DZero = 0.0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5; 339 register Scalar *x, *b, *lsum, *tmp, *v; 340 341 if (A->factor != FACTOR_LU) SETERRQ(1,"MatSolve_SeqBAIJ:Not for unfactored matrix"); 342 343 ierr = VecGetArray(bb,&ba); CHKERRQ(ierr); b = ba; 344 ierr = VecGetArray(xx,&xa); CHKERRQ(ierr); x = xa; 345 tmp = a->solve_work; 346 347 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 348 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); c = c + (n-1); 349 350 switch (bs) { 351 case 1: 352 /* forward solve the lower triangular */ 353 tmp[0] = b[*r++]; 354 for ( i=1; i<n; i++ ) { 355 v = aa + ai[i]; 356 vi = aj + ai[i]; 357 nz = a->diag[i] - ai[i]; 358 sum1 = b[*r++]; 359 while (nz--) { 360 sum1 -= (*v++)*tmp[*vi++]; 361 } 362 tmp[i] = sum1; 363 } 364 /* backward solve the upper triangular */ 365 for ( i=n-1; i>=0; i-- ){ 366 v = aa + a->diag[i] + 1; 367 vi = aj + a->diag[i] + 1; 368 nz = ai[i+1] - a->diag[i] - 1; 369 sum1 = tmp[i]; 370 while (nz--) { 371 sum1 -= (*v++)*tmp[*vi++]; 372 } 373 x[*c--] = tmp[i] = aa[a->diag[i]]*sum1; 374 } 375 break; 376 case 2: 377 /* forward solve the lower triangular */ 378 idx = 2*(*r++); 379 tmp[0] = b[idx]; tmp[1] = b[1+idx]; 380 for ( i=1; i<n; i++ ) { 381 v = aa + 4*ai[i]; 382 vi = aj + ai[i]; 383 nz = a->diag[i] - ai[i]; 384 idx = 2*(*r++); 385 sum1 = b[idx]; sum2 = b[1+idx]; 386 while (nz--) { 387 idx = 2*(*vi++); 388 x1 = tmp[idx]; x2 = tmp[1+idx]; 389 sum1 -= v[0]*x1 + v[2]*x2; 390 sum2 -= v[1]*x1 + v[3]*x2; 391 v += 4; 392 } 393 idx = 2*i; 394 tmp[idx] = sum1; tmp[1+idx] = sum2; 395 } 396 /* backward solve the upper triangular */ 397 for ( i=n-1; i>=0; i-- ){ 398 v = aa + 4*a->diag[i] + 4; 399 vi = aj + a->diag[i] + 1; 400 nz = ai[i+1] - a->diag[i] - 1; 401 idt = 2*i; 402 sum1 = tmp[idt]; sum2 = tmp[1+idt]; 403 while (nz--) { 404 idx = 2*(*vi++); 405 x1 = tmp[idx]; x2 = tmp[1+idx]; 406 sum1 -= v[0]*x1 + v[2]*x2; 407 sum2 -= v[1]*x1 + v[3]*x2; 408 v += 4; 409 } 410 idc = 2*(*c--); 411 v = aa + 4*a->diag[i]; 412 x[idc] = tmp[idt] = v[0]*sum1 + v[2]*sum2; 413 x[1+idc] = tmp[1+idt] = v[1]*sum1 + v[3]*sum2; 414 } 415 break; 416 case 3: 417 /* forward solve the lower triangular */ 418 idx = 3*(*r++); 419 tmp[0] = b[idx]; tmp[1] = b[1+idx]; tmp[2] = b[2+idx]; 420 for ( i=1; i<n; i++ ) { 421 v = aa + 9*ai[i]; 422 vi = aj + ai[i]; 423 nz = a->diag[i] - ai[i]; 424 idx = 3*(*r++); 425 sum1 = b[idx]; sum2 = b[1+idx]; sum3 = b[2+idx]; 426 while (nz--) { 427 idx = 3*(*vi++); 428 x1 = tmp[idx]; x2 = tmp[1+idx]; x3 = tmp[2+idx]; 429 sum1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 430 sum2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 431 sum3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 432 v += 9; 433 } 434 idx = 3*i; 435 tmp[idx] = sum1; tmp[1+idx] = sum2; tmp[2+idx] = sum3; 436 } 437 /* backward solve the upper triangular */ 438 for ( i=n-1; i>=0; i-- ){ 439 v = aa + 9*a->diag[i] + 9; 440 vi = aj + a->diag[i] + 1; 441 nz = ai[i+1] - a->diag[i] - 1; 442 idt = 3*i; 443 sum1 = tmp[idt]; sum2 = tmp[1+idt]; sum3 = tmp[2+idt]; 444 while (nz--) { 445 idx = 3*(*vi++); 446 x1 = tmp[idx]; x2 = tmp[1+idx]; x3 = tmp[2+idx]; 447 sum1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 448 sum2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 449 sum3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 450 v += 9; 451 } 452 idc = 3*(*c--); 453 v = aa + 9*a->diag[i]; 454 x[idc] = tmp[idt] = v[0]*sum1 + v[3]*sum2 + v[6]*sum3; 455 x[1+idc] = tmp[1+idt] = v[1]*sum1 + v[4]*sum2 + v[7]*sum3; 456 x[2+idc] = tmp[2+idt] = v[2]*sum1 + v[5]*sum2 + v[8]*sum3; 457 } 458 break; 459 case 4: 460 /* forward solve the lower triangular */ 461 idx = 4*(*r++); 462 tmp[0] = b[idx]; tmp[1] = b[1+idx]; 463 tmp[2] = b[2+idx]; tmp[3] = b[3+idx]; 464 for ( i=1; i<n; i++ ) { 465 v = aa + 16*ai[i]; 466 vi = aj + ai[i]; 467 nz = a->diag[i] - ai[i]; 468 idx = 4*(*r++); 469 sum1 = b[idx];sum2 = b[1+idx];sum3 = b[2+idx];sum4 = b[3+idx]; 470 while (nz--) { 471 idx = 4*(*vi++); 472 x1 = tmp[idx];x2 = tmp[1+idx];x3 = tmp[2+idx];x4 = tmp[3+idx]; 473 sum1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 474 sum2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 475 sum3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 476 sum4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 477 v += 16; 478 } 479 idx = 4*i; 480 tmp[idx] = sum1;tmp[1+idx] = sum2; 481 tmp[2+idx] = sum3;tmp[3+idx] = sum4; 482 } 483 /* backward solve the upper triangular */ 484 for ( i=n-1; i>=0; i-- ){ 485 v = aa + 16*a->diag[i] + 16; 486 vi = aj + a->diag[i] + 1; 487 nz = ai[i+1] - a->diag[i] - 1; 488 idt = 4*i; 489 sum1 = tmp[idt]; sum2 = tmp[1+idt]; 490 sum3 = tmp[2+idt];sum4 = tmp[3+idt]; 491 while (nz--) { 492 idx = 4*(*vi++); 493 x1 = tmp[idx]; x2 = tmp[1+idx]; 494 x3 = tmp[2+idx]; x4 = tmp[3+idx]; 495 sum1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 496 sum2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 497 sum3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 498 sum4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 499 v += 16; 500 } 501 idc = 4*(*c--); 502 v = aa + 16*a->diag[i]; 503 x[idc] = tmp[idt] = v[0]*sum1+v[4]*sum2+v[8]*sum3+v[12]*sum4; 504 x[1+idc] = tmp[1+idt] = v[1]*sum1+v[5]*sum2+v[9]*sum3+v[13]*sum4; 505 x[2+idc] = tmp[2+idt] = v[2]*sum1+v[6]*sum2+v[10]*sum3+v[14]*sum4; 506 x[3+idc] = tmp[3+idt] = v[3]*sum1+v[7]*sum2+v[11]*sum3+v[15]*sum4; 507 } 508 break; 509 case 5: 510 /* forward solve the lower triangular */ 511 idx = 5*(*r++); 512 tmp[0] = b[idx]; tmp[1] = b[1+idx]; 513 tmp[2] = b[2+idx]; tmp[3] = b[3+idx]; tmp[4] = b[4+idx]; 514 for ( i=1; i<n; i++ ) { 515 v = aa + 25*ai[i]; 516 vi = aj + ai[i]; 517 nz = a->diag[i] - ai[i]; 518 idx = 5*(*r++); 519 sum1 = b[idx];sum2 = b[1+idx];sum3 = b[2+idx];sum4 = b[3+idx]; 520 sum5 = b[4+idx]; 521 while (nz--) { 522 idx = 5*(*vi++); 523 x1 = tmp[idx]; x2 = tmp[1+idx];x3 = tmp[2+idx]; 524 x4 = tmp[3+idx];x5 = tmp[4+idx]; 525 sum1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 526 sum2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 527 sum3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 528 sum4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 529 sum5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 530 v += 25; 531 } 532 idx = 5*i; 533 tmp[idx] = sum1;tmp[1+idx] = sum2; 534 tmp[2+idx] = sum3;tmp[3+idx] = sum4; tmp[4+idx] = sum5; 535 } 536 /* backward solve the upper triangular */ 537 for ( i=n-1; i>=0; i-- ){ 538 v = aa + 25*a->diag[i] + 25; 539 vi = aj + a->diag[i] + 1; 540 nz = ai[i+1] - a->diag[i] - 1; 541 idt = 5*i; 542 sum1 = tmp[idt]; sum2 = tmp[1+idt]; 543 sum3 = tmp[2+idt];sum4 = tmp[3+idt]; sum5 = tmp[4+idt]; 544 while (nz--) { 545 idx = 5*(*vi++); 546 x1 = tmp[idx]; x2 = tmp[1+idx]; 547 x3 = tmp[2+idx]; x4 = tmp[3+idx]; x5 = tmp[4+idx]; 548 sum1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 549 sum2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 550 sum3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 551 sum4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 552 sum5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 553 v += 25; 554 } 555 idc = 5*(*c--); 556 v = aa + 25*a->diag[i]; 557 x[idc] = tmp[idt] = v[0]*sum1+v[5]*sum2+v[10]*sum3+ 558 v[15]*sum4+v[20]*sum5; 559 x[1+idc] = tmp[1+idt] = v[1]*sum1+v[6]*sum2+v[11]*sum3+ 560 v[16]*sum4+v[21]*sum5; 561 x[2+idc] = tmp[2+idt] = v[2]*sum1+v[7]*sum2+v[12]*sum3+ 562 v[17]*sum4+v[22]*sum5; 563 x[3+idc] = tmp[3+idt] = v[3]*sum1+v[8]*sum2+v[13]*sum3+ 564 v[18]*sum4+v[23]*sum5; 565 x[4+idc] = tmp[4+idt] = v[4]*sum1+v[9]*sum2+v[14]*sum3+ 566 v[19]*sum4+v[24]*sum5; 567 } 568 break; 569 default: { 570 /* forward solve the lower triangular */ 571 PetscMemcpy(tmp,b + bs*(*r++), bs*sizeof(Scalar)); 572 for ( i=1; i<n; i++ ) { 573 v = aa + bs2*ai[i]; 574 vi = aj + ai[i]; 575 nz = a->diag[i] - ai[i]; 576 sum = tmp + bs*i; 577 PetscMemcpy(sum,b+bs*(*r++),bs*sizeof(Scalar)); 578 while (nz--) { 579 LAgemv_("N",&bs,&bs,&_DMOne,v,&bs,tmp+bs*(*vi++),&_One,&_DOne,sum,&_One); 580 v += bs2; 581 } 582 } 583 /* backward solve the upper triangular */ 584 lsum = a->solve_work + a->n; 585 for ( i=n-1; i>=0; i-- ){ 586 v = aa + bs2*(a->diag[i] + 1); 587 vi = aj + a->diag[i] + 1; 588 nz = ai[i+1] - a->diag[i] - 1; 589 PetscMemcpy(lsum,tmp+i*bs,bs*sizeof(Scalar)); 590 while (nz--) { 591 LAgemv_("N",&bs,&bs,&_DMOne,v,&bs,tmp+bs*(*vi++),&_One,&_DOne,lsum,&_One); 592 v += bs2; 593 } 594 LAgemv_("N",&bs,&bs,&_DOne,aa+bs2*a->diag[i],&bs,lsum,&_One,&_DZero, 595 tmp+i*bs,&_One); 596 PetscMemcpy(x + bs*(*c--),tmp+i*bs,bs*sizeof(Scalar)); 597 } 598 } 599 } 600 601 ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr); 602 ierr = ISRestoreIndices(iscol,&c); CHKERRQ(ierr); 603 PLogFlops(2*bs2*a->nz - a->n); 604 return 0; 605 } 606 607 /* ----------------------------------------------------------------*/ 608 /* 609 This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 610 except that the data structure of Mat_SeqAIJ is slightly different. 611 Not a good example of code reuse. 612 */ 613 int MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,double f,int levels, 614 Mat *fact) 615 { 616 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data, *b; 617 IS isicol; 618 int *r,*ic, ierr, prow, n = a->mbs, *ai = a->i, *aj = a->j; 619 int *ainew,*ajnew, jmax,*fill, *xi, nz, *im,*ajfill,*flev; 620 int *dloc, idx, row,m,fm, nzf, nzi,len, realloc = 0; 621 int incrlev,nnz,i,bs = a->bs; 622 623 if (a->m != a->n) SETERRQ(1,"MatILUFactorSymbolic_SeqBAIJ:Matrix must be square"); 624 if (!isrow) SETERRQ(1,"MatILUFactorSymbolic_SeqBAIJ:Must have row permutation"); 625 if (!iscol) SETERRQ(1,"MatILUFactorSymbolic_SeqBAIJ:Must have column permutation"); 626 627 /* special case that simply copies fill pattern */ 628 if (levels == 0 && ISIsIdentity(isrow) && ISIsIdentity(iscol)) { 629 ierr = MatConvertSameType_SeqBAIJ(A,fact,DO_NOT_COPY_VALUES); CHKERRQ(ierr); 630 (*fact)->factor = FACTOR_LU; 631 b = (Mat_SeqBAIJ *) (*fact)->data; 632 if (!b->diag) { 633 ierr = MatMarkDiag_SeqBAIJ(*fact); CHKERRQ(ierr); 634 } 635 b->row = isrow; 636 b->col = iscol; 637 b->solve_work = (Scalar *) PetscMalloc((b->m+1+b->bs)*sizeof(Scalar));CHKPTRQ(b->solve_work); 638 return 0; 639 } 640 641 ierr = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr); 642 ierr = ISGetIndices(isrow,&r); CHKERRQ(ierr); 643 ierr = ISGetIndices(isicol,&ic); CHKERRQ(ierr); 644 645 /* get new row pointers */ 646 ainew = (int *) PetscMalloc( (n+1)*sizeof(int) ); CHKPTRQ(ainew); 647 ainew[0] = 0; 648 /* don't know how many column pointers are needed so estimate */ 649 jmax = (int) (f*ai[n] + 1); 650 ajnew = (int *) PetscMalloc( (jmax)*sizeof(int) ); CHKPTRQ(ajnew); 651 /* ajfill is level of fill for each fill entry */ 652 ajfill = (int *) PetscMalloc( (jmax)*sizeof(int) ); CHKPTRQ(ajfill); 653 /* fill is a linked list of nonzeros in active row */ 654 fill = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(fill); 655 /* im is level for each filled value */ 656 im = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(im); 657 /* dloc is location of diagonal in factor */ 658 dloc = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(dloc); 659 dloc[0] = 0; 660 for ( prow=0; prow<n; prow++ ) { 661 /* first copy previous fill into linked list */ 662 nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 663 xi = aj + ai[r[prow]]; 664 fill[n] = n; 665 while (nz--) { 666 fm = n; 667 idx = ic[*xi++]; 668 do { 669 m = fm; 670 fm = fill[m]; 671 } while (fm < idx); 672 fill[m] = idx; 673 fill[idx] = fm; 674 im[idx] = 0; 675 } 676 nzi = 0; 677 row = fill[n]; 678 while ( row < prow ) { 679 incrlev = im[row] + 1; 680 nz = dloc[row]; 681 xi = ajnew + ainew[row] + nz; 682 flev = ajfill + ainew[row] + nz + 1; 683 nnz = ainew[row+1] - ainew[row] - nz - 1; 684 if (*xi++ != row) { 685 SETERRQ(1,"MatILUFactorSymbolic_SeqBAIJ:zero pivot"); 686 } 687 fm = row; 688 while (nnz-- > 0) { 689 idx = *xi++; 690 if (*flev + incrlev > levels) { 691 flev++; 692 continue; 693 } 694 do { 695 m = fm; 696 fm = fill[m]; 697 } while (fm < idx); 698 if (fm != idx) { 699 im[idx] = *flev + incrlev; 700 fill[m] = idx; 701 fill[idx] = fm; 702 fm = idx; 703 nzf++; 704 } 705 else { 706 if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 707 } 708 flev++; 709 } 710 row = fill[row]; 711 nzi++; 712 } 713 /* copy new filled row into permanent storage */ 714 ainew[prow+1] = ainew[prow] + nzf; 715 if (ainew[prow+1] > jmax) { 716 /* allocate a longer ajnew */ 717 int maxadd; 718 maxadd = (int) (((f*ai[n]+1)*(n-prow+5))/n); 719 if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 720 jmax += maxadd; 721 xi = (int *) PetscMalloc( jmax*sizeof(int) );CHKPTRQ(xi); 722 PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int)); 723 PetscFree(ajnew); 724 ajnew = xi; 725 /* allocate a longer ajfill */ 726 xi = (int *) PetscMalloc( jmax*sizeof(int) );CHKPTRQ(xi); 727 PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int)); 728 PetscFree(ajfill); 729 ajfill = xi; 730 realloc++; 731 } 732 xi = ajnew + ainew[prow]; 733 flev = ajfill + ainew[prow]; 734 dloc[prow] = nzi; 735 fm = fill[n]; 736 while (nzf--) { 737 *xi++ = fm; 738 *flev++ = im[fm]; 739 fm = fill[fm]; 740 } 741 } 742 PetscFree(ajfill); 743 ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr); 744 ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr); 745 ierr = ISDestroy(isicol); CHKERRQ(ierr); 746 PetscFree(fill); PetscFree(im); 747 748 PLogInfo((PetscObject)A, 749 "Info:MatILUFactorSymbolic_SeqBAIJ:Realloc %d Fill ratio:given %g needed %g\n", 750 realloc,f,((double)ainew[n])/((double)ai[prow])); 751 752 /* put together the new matrix */ 753 ierr = MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,fact);CHKERRQ(ierr); 754 b = (Mat_SeqBAIJ *) (*fact)->data; 755 PetscFree(b->imax); 756 b->singlemalloc = 0; 757 len = bs*bs*ainew[n]*sizeof(Scalar); 758 /* the next line frees the default space generated by the Create() */ 759 PetscFree(b->a); PetscFree(b->ilen); 760 b->a = (Scalar *) PetscMalloc( len ); CHKPTRQ(b->a); 761 b->j = ajnew; 762 b->i = ainew; 763 for ( i=0; i<n; i++ ) dloc[i] += ainew[i]; 764 b->diag = dloc; 765 b->ilen = 0; 766 b->imax = 0; 767 b->row = isrow; 768 b->col = iscol; 769 b->solve_work = (Scalar *) PetscMalloc( (bs*n+bs)*sizeof(Scalar)); 770 CHKPTRQ(b->solve_work); 771 /* In b structure: Free imax, ilen, old a, old j. 772 Allocate dloc, solve_work, new a, new j */ 773 PLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs*bs*ainew[n]*sizeof(Scalar)); 774 b->maxnz = b->nz = ainew[n]; 775 (*fact)->factor = FACTOR_LU; 776 return 0; 777 } 778 779 780 781 782