1 #ifndef lint 2 static char vcid[] = "$Id: baijfact.c,v 1.2 1996/02/15 04:38:54 bsmith Exp bsmith $"; 3 #endif 4 /* 5 Factorization code for BAIJ format. 6 */ 7 8 #include "baij.h" 9 10 /* 11 The symbolic factorization code is identical to that for AIJ format, 12 except for very small changes since this is now a SeqBAIJ datastructure. 13 NOT good code reuse. 14 */ 15 int MatLUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,double f,Mat *B) 16 { 17 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data, *b; 18 IS isicol; 19 int *r,*ic, ierr, i, n = a->mbs, *ai = a->i, *aj = a->j; 20 int *ainew,*ajnew, jmax,*fill, *ajtmp, nz, bs = a->bs; 21 int *idnew, idx, row,m,fm, nnz, nzi,len, realloc = 0,nzbd,*im; 22 23 if (a->m != a->n) SETERRQ(1,"MatLUFactorSymbolic_SeqBAIJ:Must be square"); 24 if (!isrow) SETERRQ(1,"MatLUFactorSymbolic_SeqBAIJ:Must have row permutation"); 25 if (!iscol) SETERRQ(1,"MatLUFactorSymbolic_SeqBAIJ:Must have col. permutation"); 26 27 ierr = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr); 28 ISGetIndices(isrow,&r); ISGetIndices(isicol,&ic); 29 30 /* get new row pointers */ 31 ainew = (int *) PetscMalloc( (n+1)*sizeof(int) ); CHKPTRQ(ainew); 32 ainew[0] = 0; 33 /* don't know how many column pointers are needed so estimate */ 34 jmax = (int) (f*ai[n] + 1); 35 ajnew = (int *) PetscMalloc( (jmax)*sizeof(int) ); CHKPTRQ(ajnew); 36 /* fill is a linked list of nonzeros in active row */ 37 fill = (int *) PetscMalloc( (2*n+1)*sizeof(int)); CHKPTRQ(fill); 38 im = fill + n + 1; 39 /* idnew is location of diagonal in factor */ 40 idnew = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(idnew); 41 idnew[0] = 0; 42 43 for ( i=0; i<n; i++ ) { 44 /* first copy previous fill into linked list */ 45 nnz = nz = ai[r[i]+1] - ai[r[i]]; 46 ajtmp = aj + ai[r[i]]; 47 fill[n] = n; 48 while (nz--) { 49 fm = n; 50 idx = ic[*ajtmp++]; 51 do { 52 m = fm; 53 fm = fill[m]; 54 } while (fm < idx); 55 fill[m] = idx; 56 fill[idx] = fm; 57 } 58 row = fill[n]; 59 while ( row < i ) { 60 ajtmp = ajnew + idnew[row] + 1; 61 nzbd = 1 + idnew[row] - ainew[row]; 62 nz = im[row] - nzbd; 63 fm = row; 64 while (nz-- > 0) { 65 idx = *ajtmp++; 66 nzbd++; 67 if (idx == i) im[row] = nzbd; 68 do { 69 m = fm; 70 fm = fill[m]; 71 } while (fm < idx); 72 if (fm != idx) { 73 fill[m] = idx; 74 fill[idx] = fm; 75 fm = idx; 76 nnz++; 77 } 78 } 79 row = fill[row]; 80 } 81 /* copy new filled row into permanent storage */ 82 ainew[i+1] = ainew[i] + nnz; 83 if (ainew[i+1] > jmax+1) { 84 /* allocate a longer ajnew */ 85 int maxadd; 86 maxadd = (int) ((f*(ai[n]+1)*(n-i+5))/n); 87 if (maxadd < nnz) maxadd = (n-i)*(nnz+1); 88 jmax += maxadd; 89 ajtmp = (int *) PetscMalloc( jmax*sizeof(int) );CHKPTRQ(ajtmp); 90 PetscMemcpy(ajtmp,ajnew,ainew[i]*sizeof(int)); 91 PetscFree(ajnew); 92 ajnew = ajtmp; 93 realloc++; /* count how many times we realloc */ 94 } 95 ajtmp = ajnew + ainew[i]; 96 fm = fill[n]; 97 nzi = 0; 98 im[i] = nnz; 99 while (nnz--) { 100 if (fm < i) nzi++; 101 *ajtmp++ = fm; 102 fm = fill[fm]; 103 } 104 idnew[i] = ainew[i] + nzi; 105 } 106 107 PLogInfo((PetscObject)A, 108 "Info:MatLUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n", 109 realloc,f,((double)ainew[n])/((double)ai[i])); 110 111 ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr); 112 ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr); 113 114 PetscFree(fill); 115 116 /* put together the new matrix */ 117 ierr = MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,B); CHKERRQ(ierr); 118 PLogObjectParent(*B,isicol); 119 ierr = ISDestroy(isicol); CHKERRQ(ierr); 120 b = (Mat_SeqBAIJ *) (*B)->data; 121 PetscFree(b->imax); 122 b->singlemalloc = 0; 123 len = ainew[n]*sizeof(Scalar); 124 /* the next line frees the default space generated by the Create() */ 125 PetscFree(b->a); PetscFree(b->ilen); 126 b->a = (Scalar *) PetscMalloc( len*bs*bs ); CHKPTRQ(b->a); 127 b->j = ajnew; 128 b->i = ainew; 129 b->diag = idnew; 130 b->ilen = 0; 131 b->imax = 0; 132 b->row = isrow; 133 b->col = iscol; 134 b->solve_work = (Scalar *) PetscMalloc( (bs*n+bs)*sizeof(Scalar)); 135 CHKPTRQ(b->solve_work); 136 /* In b structure: Free imax, ilen, old a, old j. 137 Allocate idnew, solve_work, new a, new j */ 138 PLogObjectMemory(*B,(ainew[n]-n)*(sizeof(int)+sizeof(Scalar))); 139 b->maxnz = b->nz = ainew[n]; 140 141 return 0; 142 } 143 144 #include "pinclude/plapack.h" 145 146 #define BlockZero(v,idx) {PetscMemzero(v+bs2*(idx),bs2*sizeof(Scalar));} 147 148 #define BlockCopy(v_in,v_out,idx_in,idx_out) \ 149 {PetscMemcpy(v_out + bs2*(idx_out),v_in + bs2*(idx_in),bs2*sizeof(Scalar));} 150 151 #define BlockInvert(vv,idx) \ 152 { int _i,info; Scalar *w = vv + bs2*idx; \ 153 LAgetrf_(&bs,&bs,w,&bs,v_pivots,&info); CHKERRQ(info); \ 154 PetscMemzero(v_work,bs2*sizeof(Scalar)); \ 155 for ( _i=0; _i<bs; _i++ ) { v_work[_i + bs*_i] = 1.0; } \ 156 LAgetrs_("N",&bs,&bs,w,&bs,v_pivots,v_work,&bs, &info);CHKERRQ(info);\ 157 PetscMemcpy(w,v_work,bs2*sizeof(Scalar)); \ 158 } 159 160 #define BlockMult(v1,v2,v3) \ 161 {Scalar _DOne=1.0, _DZero=0.0;\ 162 BLgemm_("N","N",&bs,&bs,&bs,&_DOne,v1,&bs,v2,&bs,&_DZero,v3,&bs);} 163 164 #define BlockMultSub(v1,v2,v3,idx2,idx3) \ 165 {Scalar _DOne=1.0, _DMOne=-1.0;\ 166 BLgemm_("N","N",&bs,&bs,&bs,&_DMOne,v1,&bs,v2+bs2*idx2,&bs,&_DOne,v3+bs2*idx3,&bs);} 167 168 /* ----------------------------------------------------------- */ 169 int MatLUFactorNumeric_SeqBAIJ(Mat A,Mat *B) 170 { 171 Mat C = *B; 172 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data, *b = (Mat_SeqBAIJ *)C->data; 173 IS iscol = b->col, isrow = b->row, isicol; 174 int *r,*ic, ierr, i, j, n = a->mbs, *ai = b->i, *aj = b->j; 175 int *ajtmpold, *ajtmp, nz, row, bslog; 176 int *diag_offset = b->diag,diag,bs = a->bs,bs2 = bs*bs, *v_pivots; 177 register Scalar *pv,*v,*rtmp,*multiplier,*v_work,*pc; 178 register int *pj; 179 180 ierr = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr); 181 PLogObjectParent(*B,isicol); 182 ierr = ISGetIndices(isrow,&r); CHKERRQ(ierr); 183 ierr = ISGetIndices(isicol,&ic); CHKERRQ(ierr); 184 rtmp = (Scalar *) PetscMalloc(bs2*(n+1)*sizeof(Scalar));CHKPTRQ(rtmp); 185 186 /* generate work space needed by dense LU factorization */ 187 v_work = (Scalar *) PetscMalloc((bs+2*bs2)*sizeof(Scalar));CHKPTRQ(v_work); 188 multiplier = v_work + bs2; 189 v_pivots = (int *) (multiplier + bs2); 190 191 /* flops in while loop */ 192 bslog = 4*bs*bs2 - bs; 193 194 for ( i=0; i<n; i++ ) { 195 nz = ai[i+1] - ai[i]; 196 ajtmp = aj + ai[i]; 197 for ( j=0; j<nz; j++ ) BlockZero(rtmp,ajtmp[j]); 198 /* load in initial (unfactored row) */ 199 nz = a->i[r[i]+1] - a->i[r[i]]; 200 ajtmpold = a->j + a->i[r[i]]; 201 v = a->a + bs2*a->i[r[i]]; 202 for ( j=0; j<nz; j++ ) BlockCopy(v,rtmp,j,ic[ajtmpold[j]]); 203 row = *ajtmp++; 204 while (row < i) { 205 pc = rtmp + bs2*row; 206 /* if (*pc) { */ 207 pv = b->a + bs2*diag_offset[row]; 208 pj = b->j + diag_offset[row] + 1; 209 BlockMult(pc,pv,multiplier); 210 BlockCopy(multiplier,pc,0,0); 211 nz = ai[row+1] - diag_offset[row] - 1; 212 pv += bs2; 213 for (j=0; j<nz; j++) BlockMultSub(multiplier,pv,rtmp,j,pj[j]); 214 PLogFlops(bslog*nz); 215 /* } */ 216 row = *ajtmp++; 217 } 218 /* finished row so stick it into b->a */ 219 pv = b->a + bs2*ai[i]; 220 pj = b->j + ai[i]; 221 nz = ai[i+1] - ai[i]; 222 for ( j=0; j<nz; j++ ) BlockCopy(rtmp,pv,pj[j],j); 223 diag = diag_offset[i] - ai[i]; 224 /* invert diagonal block */ 225 BlockInvert(pv,diag); 226 } 227 228 PetscFree(rtmp); PetscFree(v_work); 229 ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr); 230 ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr); 231 ierr = ISDestroy(isicol); CHKERRQ(ierr); 232 C->factor = FACTOR_LU; 233 C->assembled = PETSC_TRUE; 234 PLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */ 235 return 0; 236 } 237 /* ----------------------------------------------------------- */ 238 int MatLUFactor_SeqBAIJ(Mat A,IS row,IS col,double f) 239 { 240 Mat_SeqBAIJ *mat = (Mat_SeqBAIJ *) A->data; 241 int ierr; 242 Mat C; 243 244 ierr = MatLUFactorSymbolic_SeqBAIJ(A,row,col,f,&C); CHKERRQ(ierr); 245 ierr = MatLUFactorNumeric_SeqBAIJ(A,&C); CHKERRQ(ierr); 246 247 /* free all the data structures from mat */ 248 PetscFree(mat->a); 249 if (!mat->singlemalloc) {PetscFree(mat->i); PetscFree(mat->j);} 250 if (mat->diag) PetscFree(mat->diag); 251 if (mat->ilen) PetscFree(mat->ilen); 252 if (mat->imax) PetscFree(mat->imax); 253 if (mat->solve_work) PetscFree(mat->solve_work); 254 PetscFree(mat); 255 256 PetscMemcpy(A,C,sizeof(struct _Mat)); 257 PetscHeaderDestroy(C); 258 return 0; 259 } 260 /* ----------------------------------------------------------- */ 261 int MatSolve_SeqBAIJ(Mat A,Vec bb, Vec xx) 262 { 263 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 264 IS iscol = a->col, isrow = a->row; 265 int *r,*c, ierr, i, n = a->mbs, *vi, *ai = a->i, *aj = a->j; 266 int nz,bs = a->bs,bs2 = bs*bs,idx,idt,idc, _One = 1; 267 Scalar *xa,*ba,*aa = a->a, *sum, _DOne = 1.0, _DMOne = -1.0; 268 Scalar _DZero = 0.0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5; 269 register Scalar *x, *b, *lsum, *tmp, *v; 270 271 if (A->factor != FACTOR_LU) SETERRQ(1,"MatSolve_SeqBAIJ:Not for unfactored matrix"); 272 273 ierr = VecGetArray(bb,&ba); CHKERRQ(ierr); b = ba; 274 ierr = VecGetArray(xx,&xa); CHKERRQ(ierr); x = xa; 275 tmp = a->solve_work; 276 277 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 278 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); c = c + (n-1); 279 280 switch (bs) { 281 case 1: 282 /* forward solve the lower triangular */ 283 tmp[0] = b[*r++]; 284 for ( i=1; i<n; i++ ) { 285 v = aa + ai[i]; 286 vi = aj + ai[i]; 287 nz = a->diag[i] - ai[i]; 288 sum1 = b[*r++]; 289 while (nz--) { 290 sum1 -= (*v++)*tmp[*vi++]; 291 } 292 tmp[i] = sum1; 293 } 294 /* backward solve the upper triangular */ 295 for ( i=n-1; i>=0; i-- ){ 296 v = aa + a->diag[i] + 1; 297 vi = aj + a->diag[i] + 1; 298 nz = ai[i+1] - a->diag[i] - 1; 299 sum1 = tmp[i]; 300 while (nz--) { 301 sum1 -= (*v++)*tmp[*vi++]; 302 } 303 x[*c--] = tmp[i] = aa[a->diag[i]]*sum1; 304 } 305 break; 306 case 2: 307 /* forward solve the lower triangular */ 308 idx = 2*(*r++); 309 tmp[0] = b[idx]; tmp[1] = b[1+idx]; 310 for ( i=1; i<n; i++ ) { 311 v = aa + 4*ai[i]; 312 vi = aj + ai[i]; 313 nz = a->diag[i] - ai[i]; 314 idx = 2*(*r++); 315 sum1 = b[idx]; sum2 = b[1+idx]; 316 while (nz--) { 317 idx = 2*(*vi++); 318 x1 = tmp[idx]; x2 = tmp[1+idx]; 319 sum1 -= v[0]*x1 + v[2]*x2; 320 sum2 -= v[1]*x1 + v[3]*x2; 321 v += 4; 322 } 323 idx = 2*i; 324 tmp[idx] = sum1; tmp[1+idx] = sum2; 325 } 326 /* backward solve the upper triangular */ 327 for ( i=n-1; i>=0; i-- ){ 328 v = aa + 4*a->diag[i] + 4; 329 vi = aj + a->diag[i] + 1; 330 nz = ai[i+1] - a->diag[i] - 1; 331 idt = 2*i; 332 sum1 = tmp[idt]; sum2 = tmp[1+idt]; 333 while (nz--) { 334 idx = 2*(*vi++); 335 x1 = tmp[idx]; x2 = tmp[1+idx]; 336 sum1 -= v[0]*x1 + v[2]*x2; 337 sum2 -= v[1]*x1 + v[3]*x2; 338 v += 4; 339 } 340 idc = 2*(*c--); 341 v = aa + 4*a->diag[i]; 342 x[idc] = tmp[idt] = v[0]*sum1 + v[2]*sum2; 343 x[1+idc] = tmp[1+idt] = v[1]*sum1 + v[3]*sum2; 344 } 345 break; 346 case 3: 347 /* forward solve the lower triangular */ 348 idx = 3*(*r++); 349 tmp[0] = b[idx]; tmp[1] = b[1+idx]; tmp[2] = b[2+idx]; 350 for ( i=1; i<n; i++ ) { 351 v = aa + 9*ai[i]; 352 vi = aj + ai[i]; 353 nz = a->diag[i] - ai[i]; 354 idx = 3*(*r++); 355 sum1 = b[idx]; sum2 = b[1+idx]; sum3 = b[2+idx]; 356 while (nz--) { 357 idx = 3*(*vi++); 358 x1 = tmp[idx]; x2 = tmp[1+idx]; x3 = tmp[2+idx]; 359 sum1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 360 sum2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 361 sum3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 362 v += 9; 363 } 364 idx = 3*i; 365 tmp[idx] = sum1; tmp[1+idx] = sum2; tmp[2+idx] = sum3; 366 } 367 /* backward solve the upper triangular */ 368 for ( i=n-1; i>=0; i-- ){ 369 v = aa + 9*a->diag[i] + 9; 370 vi = aj + a->diag[i] + 1; 371 nz = ai[i+1] - a->diag[i] - 1; 372 idt = 3*i; 373 sum1 = tmp[idt]; sum2 = tmp[1+idt]; sum3 = tmp[2+idt]; 374 while (nz--) { 375 idx = 3*(*vi++); 376 x1 = tmp[idx]; x2 = tmp[1+idx]; x3 = tmp[2+idx]; 377 sum1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 378 sum2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 379 sum3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 380 v += 9; 381 } 382 idc = 3*(*c--); 383 v = aa + 9*a->diag[i]; 384 x[idc] = tmp[idt] = v[0]*sum1 + v[3]*sum2 + v[6]*sum3; 385 x[1+idc] = tmp[1+idt] = v[1]*sum1 + v[4]*sum2 + v[7]*sum3; 386 x[2+idc] = tmp[2+idt] = v[2]*sum1 + v[5]*sum2 + v[8]*sum3; 387 } 388 break; 389 case 4: 390 /* forward solve the lower triangular */ 391 idx = 4*(*r++); 392 tmp[0] = b[idx]; tmp[1] = b[1+idx]; 393 tmp[2] = b[2+idx]; tmp[3] = b[3+idx]; 394 for ( i=1; i<n; i++ ) { 395 v = aa + 16*ai[i]; 396 vi = aj + ai[i]; 397 nz = a->diag[i] - ai[i]; 398 idx = 4*(*r++); 399 sum1 = b[idx];sum2 = b[1+idx];sum3 = b[2+idx];sum4 = b[3+idx]; 400 while (nz--) { 401 idx = 4*(*vi++); 402 x1 = tmp[idx];x2 = tmp[1+idx];x3 = tmp[2+idx];x4 = tmp[3+idx]; 403 sum1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 404 sum2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 405 sum3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 406 sum4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 407 v += 16; 408 } 409 idx = 4*i; 410 tmp[idx] = sum1;tmp[1+idx] = sum2; 411 tmp[2+idx] = sum3;tmp[3+idx] = sum4; 412 } 413 /* backward solve the upper triangular */ 414 for ( i=n-1; i>=0; i-- ){ 415 v = aa + 16*a->diag[i] + 16; 416 vi = aj + a->diag[i] + 1; 417 nz = ai[i+1] - a->diag[i] - 1; 418 idt = 4*i; 419 sum1 = tmp[idt]; sum2 = tmp[1+idt]; 420 sum3 = tmp[2+idt];sum4 = tmp[3+idt]; 421 while (nz--) { 422 idx = 4*(*vi++); 423 x1 = tmp[idx]; x2 = tmp[1+idx]; 424 x3 = tmp[2+idx]; x4 = tmp[3+idx]; 425 sum1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 426 sum2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 427 sum3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 428 sum4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 429 v += 16; 430 } 431 idc = 4*(*c--); 432 v = aa + 16*a->diag[i]; 433 x[idc] = tmp[idt] = v[0]*sum1+v[4]*sum2+v[8]*sum3+v[12]*sum4; 434 x[1+idc] = tmp[1+idt] = v[1]*sum1+v[5]*sum2+v[9]*sum3+v[13]*sum4; 435 x[2+idc] = tmp[2+idt] = v[2]*sum1+v[6]*sum2+v[10]*sum3+v[14]*sum4; 436 x[3+idc] = tmp[3+idt] = v[3]*sum1+v[7]*sum2+v[11]*sum3+v[15]*sum4; 437 } 438 break; 439 case 5: 440 /* forward solve the lower triangular */ 441 idx = 5*(*r++); 442 tmp[0] = b[idx]; tmp[1] = b[1+idx]; 443 tmp[2] = b[2+idx]; tmp[3] = b[3+idx]; tmp[4] = b[4+idx]; 444 for ( i=1; i<n; i++ ) { 445 v = aa + 25*ai[i]; 446 vi = aj + ai[i]; 447 nz = a->diag[i] - ai[i]; 448 idx = 5*(*r++); 449 sum1 = b[idx];sum2 = b[1+idx];sum3 = b[2+idx];sum4 = b[3+idx]; 450 sum5 = b[4+idx]; 451 while (nz--) { 452 idx = 5*(*vi++); 453 x1 = tmp[idx]; x2 = tmp[1+idx];x3 = tmp[2+idx]; 454 x4 = tmp[3+idx];x5 = tmp[4+idx]; 455 sum1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 456 sum2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 457 sum3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 458 sum4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 459 sum5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 460 v += 25; 461 } 462 idx = 5*i; 463 tmp[idx] = sum1;tmp[1+idx] = sum2; 464 tmp[2+idx] = sum3;tmp[3+idx] = sum4; tmp[4+idx] = sum5; 465 } 466 /* backward solve the upper triangular */ 467 for ( i=n-1; i>=0; i-- ){ 468 v = aa + 25*a->diag[i] + 25; 469 vi = aj + a->diag[i] + 1; 470 nz = ai[i+1] - a->diag[i] - 1; 471 idt = 5*i; 472 sum1 = tmp[idt]; sum2 = tmp[1+idt]; 473 sum3 = tmp[2+idt];sum4 = tmp[3+idt]; sum5 = tmp[4+idt]; 474 while (nz--) { 475 idx = 5*(*vi++); 476 x1 = tmp[idx]; x2 = tmp[1+idx]; 477 x3 = tmp[2+idx]; x4 = tmp[3+idx]; x5 = tmp[4+idx]; 478 sum1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 479 sum2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 480 sum3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 481 sum4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 482 sum5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 483 v += 25; 484 } 485 idc = 5*(*c--); 486 v = aa + 25*a->diag[i]; 487 x[idc] = tmp[idt] = v[0]*sum1+v[5]*sum2+v[10]*sum3+ 488 v[15]*sum4+v[20]*sum5; 489 x[1+idc] = tmp[1+idt] = v[1]*sum1+v[6]*sum2+v[11]*sum3+ 490 v[16]*sum4+v[21]*sum5; 491 x[2+idc] = tmp[2+idt] = v[2]*sum1+v[7]*sum2+v[12]*sum3+ 492 v[17]*sum4+v[22]*sum5; 493 x[3+idc] = tmp[3+idt] = v[3]*sum1+v[8]*sum2+v[13]*sum3+ 494 v[18]*sum4+v[23]*sum5; 495 x[4+idc] = tmp[4+idt] = v[4]*sum1+v[9]*sum2+v[14]*sum3+ 496 v[19]*sum4+v[24]*sum5; 497 } 498 break; 499 default: { 500 /* forward solve the lower triangular */ 501 PetscMemcpy(tmp,b + bs*(*r++), bs*sizeof(Scalar)); 502 for ( i=1; i<n; i++ ) { 503 v = aa + bs2*ai[i]; 504 vi = aj + ai[i]; 505 nz = a->diag[i] - ai[i]; 506 sum = tmp + bs*i; 507 PetscMemcpy(sum,b+bs*(*r++),bs*sizeof(Scalar)); 508 while (nz--) { 509 LAgemv_("N",&bs,&bs,&_DMOne,v,&bs,tmp+bs*(*vi++),&_One,&_DOne,sum,&_One); 510 v += bs2; 511 } 512 } 513 /* backward solve the upper triangular */ 514 lsum = a->solve_work + a->n; 515 for ( i=n-1; i>=0; i-- ){ 516 v = aa + bs2*(a->diag[i] + 1); 517 vi = aj + a->diag[i] + 1; 518 nz = ai[i+1] - a->diag[i] - 1; 519 PetscMemcpy(lsum,tmp+i*bs,bs*sizeof(Scalar)); 520 while (nz--) { 521 LAgemv_("N",&bs,&bs,&_DMOne,v,&bs,tmp+bs*(*vi++),&_One,&_DOne,lsum,&_One); 522 v += bs2; 523 } 524 LAgemv_("N",&bs,&bs,&_DOne,aa+bs2*a->diag[i],&bs,lsum,&_One,&_DZero, 525 tmp+i*bs,&_One); 526 PetscMemcpy(x + bs*(*c--),tmp+i*bs,bs*sizeof(Scalar)); 527 } 528 } 529 } 530 531 ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr); 532 ierr = ISRestoreIndices(iscol,&c); CHKERRQ(ierr); 533 PLogFlops(2*bs2*a->nz - a->n); 534 return 0; 535 } 536 537 /* ----------------------------------------------------------------*/ 538 /* 539 This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 540 except that the data structure of Mat_SeqAIJ is slightly different. 541 Not a good example of code reuse. 542 */ 543 int MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,double f,int levels, 544 Mat *fact) 545 { 546 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data, *b; 547 IS isicol; 548 int *r,*ic, ierr, prow, n = a->mbs, *ai = a->i, *aj = a->j; 549 int *ainew,*ajnew, jmax,*fill, *xi, nz, *im,*ajfill,*flev; 550 int *dloc, idx, row,m,fm, nzf, nzi,len, realloc = 0; 551 int incrlev,nnz,i,bs = a->bs; 552 553 if (a->m != a->n) SETERRQ(1,"MatILUFactorSymbolic_SeqBAIJ:Matrix must be square"); 554 if (!isrow) SETERRQ(1,"MatILUFactorSymbolic_SeqBAIJ:Must have row permutation"); 555 if (!iscol) SETERRQ(1,"MatILUFactorSymbolic_SeqBAIJ:Must have column permutation"); 556 557 /* special case that simply copies fill pattern */ 558 if (levels == 0 && ISIsIdentity(isrow) && ISIsIdentity(iscol)) { 559 ierr = MatConvertSameType_SeqBAIJ(A,fact,DO_NOT_COPY_VALUES); CHKERRQ(ierr); 560 (*fact)->factor = FACTOR_LU; 561 b = (Mat_SeqBAIJ *) (*fact)->data; 562 if (!b->diag) { 563 ierr = MatMarkDiag_SeqBAIJ(*fact); CHKERRQ(ierr); 564 } 565 b->row = isrow; 566 b->col = iscol; 567 b->solve_work = (Scalar *) PetscMalloc((b->m+1+b->bs)*sizeof(Scalar));CHKPTRQ(b->solve_work); 568 return 0; 569 } 570 571 ierr = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr); 572 ierr = ISGetIndices(isrow,&r); CHKERRQ(ierr); 573 ierr = ISGetIndices(isicol,&ic); CHKERRQ(ierr); 574 575 /* get new row pointers */ 576 ainew = (int *) PetscMalloc( (n+1)*sizeof(int) ); CHKPTRQ(ainew); 577 ainew[0] = 0; 578 /* don't know how many column pointers are needed so estimate */ 579 jmax = (int) (f*ai[n] + 1); 580 ajnew = (int *) PetscMalloc( (jmax)*sizeof(int) ); CHKPTRQ(ajnew); 581 /* ajfill is level of fill for each fill entry */ 582 ajfill = (int *) PetscMalloc( (jmax)*sizeof(int) ); CHKPTRQ(ajfill); 583 /* fill is a linked list of nonzeros in active row */ 584 fill = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(fill); 585 /* im is level for each filled value */ 586 im = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(im); 587 /* dloc is location of diagonal in factor */ 588 dloc = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(dloc); 589 dloc[0] = 0; 590 for ( prow=0; prow<n; prow++ ) { 591 /* first copy previous fill into linked list */ 592 nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 593 xi = aj + ai[r[prow]]; 594 fill[n] = n; 595 while (nz--) { 596 fm = n; 597 idx = ic[*xi++]; 598 do { 599 m = fm; 600 fm = fill[m]; 601 } while (fm < idx); 602 fill[m] = idx; 603 fill[idx] = fm; 604 im[idx] = 0; 605 } 606 nzi = 0; 607 row = fill[n]; 608 while ( row < prow ) { 609 incrlev = im[row] + 1; 610 nz = dloc[row]; 611 xi = ajnew + ainew[row] + nz; 612 flev = ajfill + ainew[row] + nz + 1; 613 nnz = ainew[row+1] - ainew[row] - nz - 1; 614 if (*xi++ != row) { 615 SETERRQ(1,"MatILUFactorSymbolic_SeqBAIJ:zero pivot"); 616 } 617 fm = row; 618 while (nnz-- > 0) { 619 idx = *xi++; 620 if (*flev + incrlev > levels) { 621 flev++; 622 continue; 623 } 624 do { 625 m = fm; 626 fm = fill[m]; 627 } while (fm < idx); 628 if (fm != idx) { 629 im[idx] = *flev + incrlev; 630 fill[m] = idx; 631 fill[idx] = fm; 632 fm = idx; 633 nzf++; 634 } 635 else { 636 if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 637 } 638 flev++; 639 } 640 row = fill[row]; 641 nzi++; 642 } 643 /* copy new filled row into permanent storage */ 644 ainew[prow+1] = ainew[prow] + nzf; 645 if (ainew[prow+1] > jmax) { 646 /* allocate a longer ajnew */ 647 int maxadd; 648 maxadd = (int) (((f*ai[n]+1)*(n-prow+5))/n); 649 if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 650 jmax += maxadd; 651 xi = (int *) PetscMalloc( jmax*sizeof(int) );CHKPTRQ(xi); 652 PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int)); 653 PetscFree(ajnew); 654 ajnew = xi; 655 /* allocate a longer ajfill */ 656 xi = (int *) PetscMalloc( jmax*sizeof(int) );CHKPTRQ(xi); 657 PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int)); 658 PetscFree(ajfill); 659 ajfill = xi; 660 realloc++; 661 } 662 xi = ajnew + ainew[prow]; 663 flev = ajfill + ainew[prow]; 664 dloc[prow] = nzi; 665 fm = fill[n]; 666 while (nzf--) { 667 *xi++ = fm; 668 *flev++ = im[fm]; 669 fm = fill[fm]; 670 } 671 } 672 PetscFree(ajfill); 673 ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr); 674 ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr); 675 ierr = ISDestroy(isicol); CHKERRQ(ierr); 676 PetscFree(fill); PetscFree(im); 677 678 PLogInfo((PetscObject)A, 679 "Info:MatILUFactorSymbolic_SeqBAIJ:Realloc %d Fill ratio:given %g needed %g\n", 680 realloc,f,((double)ainew[n])/((double)ai[prow])); 681 682 /* put together the new matrix */ 683 ierr = MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,fact);CHKERRQ(ierr); 684 b = (Mat_SeqBAIJ *) (*fact)->data; 685 PetscFree(b->imax); 686 b->singlemalloc = 0; 687 len = bs*bs*ainew[n]*sizeof(Scalar); 688 /* the next line frees the default space generated by the Create() */ 689 PetscFree(b->a); PetscFree(b->ilen); 690 b->a = (Scalar *) PetscMalloc( len ); CHKPTRQ(b->a); 691 b->j = ajnew; 692 b->i = ainew; 693 for ( i=0; i<n; i++ ) dloc[i] += ainew[i]; 694 b->diag = dloc; 695 b->ilen = 0; 696 b->imax = 0; 697 b->row = isrow; 698 b->col = iscol; 699 b->solve_work = (Scalar *) PetscMalloc( (bs*n+bs)*sizeof(Scalar)); 700 CHKPTRQ(b->solve_work); 701 /* In b structure: Free imax, ilen, old a, old j. 702 Allocate dloc, solve_work, new a, new j */ 703 PLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs*bs*ainew[n]*sizeof(Scalar)); 704 b->maxnz = b->nz = ainew[n]; 705 (*fact)->factor = FACTOR_LU; 706 return 0; 707 } 708 709 710 711 712