1 #ifndef lint 2 static char vcid[] = "$Id: baijfact.c,v 1.5 1996/02/19 03:51:17 bsmith Exp curfman $"; 3 #endif 4 /* 5 Factorization code for BAIJ format. 6 */ 7 8 #include "baij.h" 9 10 /* 11 The symbolic factorization code is identical to that for AIJ format, 12 except for very small changes since this is now a SeqBAIJ datastructure. 13 NOT good code reuse. 14 */ 15 int MatLUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,double f,Mat *B) 16 { 17 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data, *b; 18 IS isicol; 19 int *r,*ic, ierr, i, n = a->mbs, *ai = a->i, *aj = a->j; 20 int *ainew,*ajnew, jmax,*fill, *ajtmp, nz, bs = a->bs; 21 int *idnew, idx, row,m,fm, nnz, nzi,len, realloc = 0,nzbd,*im; 22 23 if (a->m != a->n) SETERRQ(1,"MatLUFactorSymbolic_SeqBAIJ:Must be square"); 24 if (!isrow) SETERRQ(1,"MatLUFactorSymbolic_SeqBAIJ:Must have row permutation"); 25 if (!iscol) SETERRQ(1,"MatLUFactorSymbolic_SeqBAIJ:Must have col. permutation"); 26 27 ierr = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr); 28 ISGetIndices(isrow,&r); ISGetIndices(isicol,&ic); 29 30 /* get new row pointers */ 31 ainew = (int *) PetscMalloc( (n+1)*sizeof(int) ); CHKPTRQ(ainew); 32 ainew[0] = 0; 33 /* don't know how many column pointers are needed so estimate */ 34 jmax = (int) (f*ai[n] + 1); 35 ajnew = (int *) PetscMalloc( (jmax)*sizeof(int) ); CHKPTRQ(ajnew); 36 /* fill is a linked list of nonzeros in active row */ 37 fill = (int *) PetscMalloc( (2*n+1)*sizeof(int)); CHKPTRQ(fill); 38 im = fill + n + 1; 39 /* idnew is location of diagonal in factor */ 40 idnew = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(idnew); 41 idnew[0] = 0; 42 43 for ( i=0; i<n; i++ ) { 44 /* first copy previous fill into linked list */ 45 nnz = nz = ai[r[i]+1] - ai[r[i]]; 46 ajtmp = aj + ai[r[i]]; 47 fill[n] = n; 48 while (nz--) { 49 fm = n; 50 idx = ic[*ajtmp++]; 51 do { 52 m = fm; 53 fm = fill[m]; 54 } while (fm < idx); 55 fill[m] = idx; 56 fill[idx] = fm; 57 } 58 row = fill[n]; 59 while ( row < i ) { 60 ajtmp = ajnew + idnew[row] + 1; 61 nzbd = 1 + idnew[row] - ainew[row]; 62 nz = im[row] - nzbd; 63 fm = row; 64 while (nz-- > 0) { 65 idx = *ajtmp++; 66 nzbd++; 67 if (idx == i) im[row] = nzbd; 68 do { 69 m = fm; 70 fm = fill[m]; 71 } while (fm < idx); 72 if (fm != idx) { 73 fill[m] = idx; 74 fill[idx] = fm; 75 fm = idx; 76 nnz++; 77 } 78 } 79 row = fill[row]; 80 } 81 /* copy new filled row into permanent storage */ 82 ainew[i+1] = ainew[i] + nnz; 83 if (ainew[i+1] > jmax+1) { 84 /* allocate a longer ajnew */ 85 int maxadd; 86 maxadd = (int) ((f*(ai[n]+1)*(n-i+5))/n); 87 if (maxadd < nnz) maxadd = (n-i)*(nnz+1); 88 jmax += maxadd; 89 ajtmp = (int *) PetscMalloc( jmax*sizeof(int) );CHKPTRQ(ajtmp); 90 PetscMemcpy(ajtmp,ajnew,ainew[i]*sizeof(int)); 91 PetscFree(ajnew); 92 ajnew = ajtmp; 93 realloc++; /* count how many times we realloc */ 94 } 95 ajtmp = ajnew + ainew[i]; 96 fm = fill[n]; 97 nzi = 0; 98 im[i] = nnz; 99 while (nnz--) { 100 if (fm < i) nzi++; 101 *ajtmp++ = fm; 102 fm = fill[fm]; 103 } 104 idnew[i] = ainew[i] + nzi; 105 } 106 107 PLogInfo((PetscObject)A, 108 "Info:MatLUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n", 109 realloc,f,((double)ainew[n])/((double)ai[i])); 110 111 ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr); 112 ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr); 113 114 PetscFree(fill); 115 116 /* put together the new matrix */ 117 ierr = MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,B); CHKERRQ(ierr); 118 PLogObjectParent(*B,isicol); 119 ierr = ISDestroy(isicol); CHKERRQ(ierr); 120 b = (Mat_SeqBAIJ *) (*B)->data; 121 PetscFree(b->imax); 122 b->singlemalloc = 0; 123 len = ainew[n]*sizeof(Scalar); 124 /* the next line frees the default space generated by the Create() */ 125 PetscFree(b->a); PetscFree(b->ilen); 126 b->a = (Scalar *) PetscMalloc( len*bs*bs ); CHKPTRQ(b->a); 127 b->j = ajnew; 128 b->i = ainew; 129 b->diag = idnew; 130 b->ilen = 0; 131 b->imax = 0; 132 b->row = isrow; 133 b->col = iscol; 134 b->solve_work = (Scalar *) PetscMalloc( (bs*n+bs)*sizeof(Scalar)); 135 CHKPTRQ(b->solve_work); 136 /* In b structure: Free imax, ilen, old a, old j. 137 Allocate idnew, solve_work, new a, new j */ 138 PLogObjectMemory(*B,(ainew[n]-n)*(sizeof(int)+sizeof(Scalar))); 139 b->maxnz = b->nz = ainew[n]; 140 141 return 0; 142 } 143 144 #include "pinclude/plapack.h" 145 /* ----------------------------------------------------------- */ 146 int MatLUFactorNumeric_SeqBAIJ_N(Mat A,Mat *B) 147 { 148 Mat C = *B; 149 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data,*b = (Mat_SeqBAIJ *)C->data; 150 IS iscol = b->col, isrow = b->row, isicol; 151 int *r,*ic, ierr, i, j, n = a->mbs, *ai = b->i, *aj = b->j; 152 int *ajtmpold, *ajtmp, nz, row, bslog,info; 153 int *diag_offset=b->diag,diag,bs=a->bs,bs2 = bs*bs,*v_pivots; 154 register Scalar *pv,*v,*rtmp,*multiplier,*v_work,*pc,*w; 155 Scalar one = 1.0, zero = 0.0, mone = -1.0; 156 register int *pj; 157 158 ierr = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr); 159 PLogObjectParent(*B,isicol); 160 ierr = ISGetIndices(isrow,&r); CHKERRQ(ierr); 161 ierr = ISGetIndices(isicol,&ic); CHKERRQ(ierr); 162 rtmp = (Scalar *) PetscMalloc(bs2*(n+1)*sizeof(Scalar));CHKPTRQ(rtmp); 163 164 /* generate work space needed by dense LU factorization */ 165 v_work = (Scalar *) PetscMalloc((bs+2*bs2)*sizeof(Scalar));CHKPTRQ(v_work); 166 multiplier = v_work + bs2; 167 v_pivots = (int *) (multiplier + bs2); 168 169 /* flops in while loop */ 170 bslog = 2*bs*bs2; 171 172 for ( i=0; i<n; i++ ) { 173 nz = ai[i+1] - ai[i]; 174 ajtmp = aj + ai[i]; 175 for ( j=0; j<nz; j++ ) { 176 PetscMemzero(rtmp+bs2*ajtmp[j],bs2*sizeof(Scalar)); 177 } 178 /* load in initial (unfactored row) */ 179 nz = a->i[r[i]+1] - a->i[r[i]]; 180 ajtmpold = a->j + a->i[r[i]]; 181 v = a->a + bs2*a->i[r[i]]; 182 for ( j=0; j<nz; j++ ) { 183 PetscMemcpy(rtmp+bs2*ic[ajtmpold[j]],v+bs2*j,bs2*sizeof(Scalar)); 184 } 185 row = *ajtmp++; 186 while (row < i) { 187 pc = rtmp + bs2*row; 188 /* if (*pc) { */ 189 pv = b->a + bs2*diag_offset[row]; 190 pj = b->j + diag_offset[row] + 1; 191 BLgemm_("N","N",&bs,&bs,&bs,&one,pc,&bs,pv,&bs,&zero, 192 multiplier,&bs); 193 PetscMemcpy(pc,multiplier,bs2*sizeof(Scalar)); 194 nz = ai[row+1] - diag_offset[row] - 1; 195 pv += bs2; 196 for (j=0; j<nz; j++) { 197 BLgemm_("N","N",&bs,&bs,&bs,&mone,multiplier,&bs,pv+bs2*j,&bs, 198 &one,rtmp+bs2*pj[j],&bs); 199 } 200 PLogFlops(bslog*(nz+1)-bs); 201 /* } */ 202 row = *ajtmp++; 203 } 204 /* finished row so stick it into b->a */ 205 pv = b->a + bs2*ai[i]; 206 pj = b->j + ai[i]; 207 nz = ai[i+1] - ai[i]; 208 for ( j=0; j<nz; j++ ) { 209 PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(Scalar)); 210 } 211 diag = diag_offset[i] - ai[i]; 212 /* invert diagonal block */ 213 w = pv + bs2*diag; 214 LAgetrf_(&bs,&bs,w,&bs,v_pivots,&info); CHKERRQ(info); 215 PetscMemzero(v_work,bs2*sizeof(Scalar)); 216 for ( j=0; j<bs; j++ ) { v_work[j + bs*j] = 1.0; } 217 LAgetrs_("N",&bs,&bs,w,&bs,v_pivots,v_work,&bs, &info);CHKERRQ(info); 218 PetscMemcpy(w,v_work,bs2*sizeof(Scalar)); 219 } 220 221 PetscFree(rtmp); PetscFree(v_work); 222 ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr); 223 ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr); 224 ierr = ISDestroy(isicol); CHKERRQ(ierr); 225 C->factor = FACTOR_LU; 226 C->assembled = PETSC_TRUE; 227 PLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */ 228 return 0; 229 } 230 /* ------------------------------------------------------------*/ 231 /* 232 Version for when blocks are 2 by 2 233 */ 234 int MatLUFactorNumeric_SeqBAIJ_2(Mat A,Mat *B) 235 { 236 Mat C = *B; 237 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data,*b = (Mat_SeqBAIJ *)C->data; 238 IS iscol = b->col, isrow = b->row, isicol; 239 int *r,*ic, ierr, i, j, n = a->mbs, *ai = b->i, *aj = b->j; 240 int *ajtmpold, *ajtmp, nz, row, info; 241 int *diag_offset=b->diag,*v_pivots,bs = 2,idx; 242 register Scalar *pv,*v,*rtmp,m1,m2,m3,m4,*v_work,*pc,*w,*x,x1,x2,x3,x4; 243 Scalar p1,p2,p3,p4; 244 register int *pj; 245 246 ierr = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr); 247 PLogObjectParent(*B,isicol); 248 ierr = ISGetIndices(isrow,&r); CHKERRQ(ierr); 249 ierr = ISGetIndices(isicol,&ic); CHKERRQ(ierr); 250 rtmp = (Scalar *) PetscMalloc(4*(n+1)*sizeof(Scalar));CHKPTRQ(rtmp); 251 252 /* generate work space needed by dense LU factorization */ 253 v_work = (Scalar *) PetscMalloc(6*sizeof(Scalar));CHKPTRQ(v_work); 254 v_pivots = (int *) (v_work + 4); 255 256 for ( i=0; i<n; i++ ) { 257 nz = ai[i+1] - ai[i]; 258 ajtmp = aj + ai[i]; 259 for ( j=0; j<nz; j++ ) { 260 x = rtmp+4*ajtmp[j]; x[0] = x[1] = x[2] = x[3] = 0.0; 261 } 262 /* load in initial (unfactored row) */ 263 idx = r[i]; 264 nz = a->i[idx+1] - a->i[idx]; 265 ajtmpold = a->j + a->i[idx]; 266 v = a->a + 4*a->i[idx]; 267 for ( j=0; j<nz; j++ ) { 268 x = rtmp+4*ic[ajtmpold[j]]; 269 x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 270 v += 4; 271 } 272 row = *ajtmp++; 273 while (row < i) { 274 pc = rtmp + 4*row; 275 p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 276 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0) { 277 pv = b->a + 4*diag_offset[row]; 278 pj = b->j + diag_offset[row] + 1; 279 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 280 pc[0] = m1 = p1*x1 + p3*x2; 281 pc[1] = m2 = p2*x1 + p4*x2; 282 pc[2] = m3 = p1*x3 + p3*x4; 283 pc[3] = m4 = p2*x3 + p4*x4; 284 nz = ai[row+1] - diag_offset[row] - 1; 285 pv += 4; 286 for (j=0; j<nz; j++) { 287 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 288 x = rtmp + 4*pj[j]; 289 x[0] -= m1*x1 + m3*x2; 290 x[1] -= m2*x1 + m4*x2; 291 x[2] -= m1*x3 + m3*x4; 292 x[3] -= m2*x3 + m4*x4; 293 pv += 4; 294 } 295 PLogFlops(16*nz+12); 296 } 297 row = *ajtmp++; 298 } 299 /* finished row so stick it into b->a */ 300 pv = b->a + 4*ai[i]; 301 pj = b->j + ai[i]; 302 nz = ai[i+1] - ai[i]; 303 for ( j=0; j<nz; j++ ) { 304 x = rtmp+4*pj[j]; 305 pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 306 pv += 4; 307 } 308 /* invert diagonal block */ 309 w = b->a + 4*diag_offset[i]; 310 LAgetrf_(&bs,&bs,w,&bs,v_pivots,&info); CHKERRQ(info); 311 v_work[0] = 1.0; v_work[1] = 0.0; v_work[2] = 0.0; v_work[3] = 1.0; 312 LAgetrs_("N",&bs,&bs,w,&bs,v_pivots,v_work,&bs, &info);CHKERRQ(info); 313 w[0] = v_work[0]; w[1] = v_work[1]; w[2] = v_work[2]; w[3] = v_work[3]; 314 } 315 316 PetscFree(rtmp); PetscFree(v_work); 317 ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr); 318 ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr); 319 ierr = ISDestroy(isicol); CHKERRQ(ierr); 320 C->factor = FACTOR_LU; 321 C->assembled = PETSC_TRUE; 322 PLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */ 323 return 0; 324 } 325 326 /* ----------------------------------------------------------- */ 327 /* 328 Version for when blocks are 1 by 1. 329 */ 330 int MatLUFactorNumeric_SeqBAIJ_1(Mat A,Mat *B) 331 { 332 Mat C = *B; 333 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data, *b = (Mat_SeqBAIJ *)C->data; 334 IS iscol = b->col, isrow = b->row, isicol; 335 int *r,*ic, ierr, i, j, n = a->mbs, *ai = b->i, *aj = b->j; 336 int *ajtmpold, *ajtmp, nz, row; 337 int *diag_offset = b->diag,diag; 338 register Scalar *pv,*v,*rtmp,multiplier,*pc; 339 register int *pj; 340 341 ierr = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr); 342 PLogObjectParent(*B,isicol); 343 ierr = ISGetIndices(isrow,&r); CHKERRQ(ierr); 344 ierr = ISGetIndices(isicol,&ic); CHKERRQ(ierr); 345 rtmp = (Scalar *) PetscMalloc((n+1)*sizeof(Scalar));CHKPTRQ(rtmp); 346 347 for ( i=0; i<n; i++ ) { 348 nz = ai[i+1] - ai[i]; 349 ajtmp = aj + ai[i]; 350 for ( j=0; j<nz; j++ ) rtmp[ajtmp[j]] = 0.0; 351 352 /* load in initial (unfactored row) */ 353 nz = a->i[r[i]+1] - a->i[r[i]]; 354 ajtmpold = a->j + a->i[r[i]]; 355 v = a->a + a->i[r[i]]; 356 for ( j=0; j<nz; j++ ) rtmp[ic[ajtmpold[j]]] = v[j]; 357 358 row = *ajtmp++; 359 while (row < i) { 360 pc = rtmp + row; 361 if (*pc != 0.0) { 362 pv = b->a + diag_offset[row]; 363 pj = b->j + diag_offset[row] + 1; 364 multiplier = *pc * *pv++; 365 *pc = multiplier; 366 nz = ai[row+1] - diag_offset[row] - 1; 367 for (j=0; j<nz; j++) rtmp[pj[j]] -= multiplier * pv[j]; 368 PLogFlops(1+2*nz); 369 } 370 row = *ajtmp++; 371 } 372 /* finished row so stick it into b->a */ 373 pv = b->a + ai[i]; 374 pj = b->j + ai[i]; 375 nz = ai[i+1] - ai[i]; 376 for ( j=0; j<nz; j++ ) {pv[j] = rtmp[pj[j]];} 377 diag = diag_offset[i] - ai[i]; 378 /* check pivot entry for current row */ 379 if (pv[diag] == 0.0) { 380 SETERRQ(1,"MatLUFactorNumeric_SeqAIJ:Zero pivot"); 381 } 382 pv[diag] = 1.0/pv[diag]; 383 } 384 385 PetscFree(rtmp); 386 ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr); 387 ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr); 388 ierr = ISDestroy(isicol); CHKERRQ(ierr); 389 C->factor = FACTOR_LU; 390 C->assembled = PETSC_TRUE; 391 PLogFlops(b->n); 392 return 0; 393 } 394 395 /* ----------------------------------------------------------- */ 396 int MatLUFactor_SeqBAIJ(Mat A,IS row,IS col,double f) 397 { 398 Mat_SeqBAIJ *mat = (Mat_SeqBAIJ *) A->data; 399 int ierr; 400 Mat C; 401 402 ierr = MatLUFactorSymbolic_SeqBAIJ(A,row,col,f,&C); CHKERRQ(ierr); 403 ierr = MatLUFactorNumeric(A,&C); CHKERRQ(ierr); 404 405 /* free all the data structures from mat */ 406 PetscFree(mat->a); 407 if (!mat->singlemalloc) {PetscFree(mat->i); PetscFree(mat->j);} 408 if (mat->diag) PetscFree(mat->diag); 409 if (mat->ilen) PetscFree(mat->ilen); 410 if (mat->imax) PetscFree(mat->imax); 411 if (mat->solve_work) PetscFree(mat->solve_work); 412 PetscFree(mat); 413 414 PetscMemcpy(A,C,sizeof(struct _Mat)); 415 PetscHeaderDestroy(C); 416 return 0; 417 } 418 /* ----------------------------------------------------------- */ 419 int MatSolve_SeqBAIJ(Mat A,Vec bb, Vec xx) 420 { 421 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data; 422 IS iscol = a->col, isrow = a->row; 423 int *r,*c, ierr, i, n = a->mbs, *vi, *ai = a->i, *aj = a->j; 424 int nz,bs = a->bs,bs2 = bs*bs,idx,idt,idc, _One = 1; 425 Scalar *xa,*ba,*aa = a->a, *sum, _DOne = 1.0, _DMOne = -1.0; 426 Scalar _DZero = 0.0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5; 427 register Scalar *x, *b, *lsum, *tmp, *v; 428 429 if (A->factor != FACTOR_LU) SETERRQ(1,"MatSolve_SeqBAIJ:Not for unfactored matrix"); 430 431 ierr = VecGetArray(bb,&ba); CHKERRQ(ierr); b = ba; 432 ierr = VecGetArray(xx,&xa); CHKERRQ(ierr); x = xa; 433 tmp = a->solve_work; 434 435 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 436 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); c = c + (n-1); 437 438 switch (bs) { 439 case 1: 440 /* forward solve the lower triangular */ 441 tmp[0] = b[*r++]; 442 for ( i=1; i<n; i++ ) { 443 v = aa + ai[i]; 444 vi = aj + ai[i]; 445 nz = a->diag[i] - ai[i]; 446 sum1 = b[*r++]; 447 while (nz--) { 448 sum1 -= (*v++)*tmp[*vi++]; 449 } 450 tmp[i] = sum1; 451 } 452 /* backward solve the upper triangular */ 453 for ( i=n-1; i>=0; i-- ){ 454 v = aa + a->diag[i] + 1; 455 vi = aj + a->diag[i] + 1; 456 nz = ai[i+1] - a->diag[i] - 1; 457 sum1 = tmp[i]; 458 while (nz--) { 459 sum1 -= (*v++)*tmp[*vi++]; 460 } 461 x[*c--] = tmp[i] = aa[a->diag[i]]*sum1; 462 } 463 break; 464 case 2: 465 /* forward solve the lower triangular */ 466 idx = 2*(*r++); 467 tmp[0] = b[idx]; tmp[1] = b[1+idx]; 468 for ( i=1; i<n; i++ ) { 469 v = aa + 4*ai[i]; 470 vi = aj + ai[i]; 471 nz = a->diag[i] - ai[i]; 472 idx = 2*(*r++); 473 sum1 = b[idx]; sum2 = b[1+idx]; 474 while (nz--) { 475 idx = 2*(*vi++); 476 x1 = tmp[idx]; x2 = tmp[1+idx]; 477 sum1 -= v[0]*x1 + v[2]*x2; 478 sum2 -= v[1]*x1 + v[3]*x2; 479 v += 4; 480 } 481 idx = 2*i; 482 tmp[idx] = sum1; tmp[1+idx] = sum2; 483 } 484 /* backward solve the upper triangular */ 485 for ( i=n-1; i>=0; i-- ){ 486 v = aa + 4*a->diag[i] + 4; 487 vi = aj + a->diag[i] + 1; 488 nz = ai[i+1] - a->diag[i] - 1; 489 idt = 2*i; 490 sum1 = tmp[idt]; sum2 = tmp[1+idt]; 491 while (nz--) { 492 idx = 2*(*vi++); 493 x1 = tmp[idx]; x2 = tmp[1+idx]; 494 sum1 -= v[0]*x1 + v[2]*x2; 495 sum2 -= v[1]*x1 + v[3]*x2; 496 v += 4; 497 } 498 idc = 2*(*c--); 499 v = aa + 4*a->diag[i]; 500 x[idc] = tmp[idt] = v[0]*sum1 + v[2]*sum2; 501 x[1+idc] = tmp[1+idt] = v[1]*sum1 + v[3]*sum2; 502 } 503 break; 504 case 3: 505 /* forward solve the lower triangular */ 506 idx = 3*(*r++); 507 tmp[0] = b[idx]; tmp[1] = b[1+idx]; tmp[2] = b[2+idx]; 508 for ( i=1; i<n; i++ ) { 509 v = aa + 9*ai[i]; 510 vi = aj + ai[i]; 511 nz = a->diag[i] - ai[i]; 512 idx = 3*(*r++); 513 sum1 = b[idx]; sum2 = b[1+idx]; sum3 = b[2+idx]; 514 while (nz--) { 515 idx = 3*(*vi++); 516 x1 = tmp[idx]; x2 = tmp[1+idx]; x3 = tmp[2+idx]; 517 sum1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 518 sum2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 519 sum3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 520 v += 9; 521 } 522 idx = 3*i; 523 tmp[idx] = sum1; tmp[1+idx] = sum2; tmp[2+idx] = sum3; 524 } 525 /* backward solve the upper triangular */ 526 for ( i=n-1; i>=0; i-- ){ 527 v = aa + 9*a->diag[i] + 9; 528 vi = aj + a->diag[i] + 1; 529 nz = ai[i+1] - a->diag[i] - 1; 530 idt = 3*i; 531 sum1 = tmp[idt]; sum2 = tmp[1+idt]; sum3 = tmp[2+idt]; 532 while (nz--) { 533 idx = 3*(*vi++); 534 x1 = tmp[idx]; x2 = tmp[1+idx]; x3 = tmp[2+idx]; 535 sum1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 536 sum2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 537 sum3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 538 v += 9; 539 } 540 idc = 3*(*c--); 541 v = aa + 9*a->diag[i]; 542 x[idc] = tmp[idt] = v[0]*sum1 + v[3]*sum2 + v[6]*sum3; 543 x[1+idc] = tmp[1+idt] = v[1]*sum1 + v[4]*sum2 + v[7]*sum3; 544 x[2+idc] = tmp[2+idt] = v[2]*sum1 + v[5]*sum2 + v[8]*sum3; 545 } 546 break; 547 case 4: 548 /* forward solve the lower triangular */ 549 idx = 4*(*r++); 550 tmp[0] = b[idx]; tmp[1] = b[1+idx]; 551 tmp[2] = b[2+idx]; tmp[3] = b[3+idx]; 552 for ( i=1; i<n; i++ ) { 553 v = aa + 16*ai[i]; 554 vi = aj + ai[i]; 555 nz = a->diag[i] - ai[i]; 556 idx = 4*(*r++); 557 sum1 = b[idx];sum2 = b[1+idx];sum3 = b[2+idx];sum4 = b[3+idx]; 558 while (nz--) { 559 idx = 4*(*vi++); 560 x1 = tmp[idx];x2 = tmp[1+idx];x3 = tmp[2+idx];x4 = tmp[3+idx]; 561 sum1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 562 sum2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 563 sum3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 564 sum4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 565 v += 16; 566 } 567 idx = 4*i; 568 tmp[idx] = sum1;tmp[1+idx] = sum2; 569 tmp[2+idx] = sum3;tmp[3+idx] = sum4; 570 } 571 /* backward solve the upper triangular */ 572 for ( i=n-1; i>=0; i-- ){ 573 v = aa + 16*a->diag[i] + 16; 574 vi = aj + a->diag[i] + 1; 575 nz = ai[i+1] - a->diag[i] - 1; 576 idt = 4*i; 577 sum1 = tmp[idt]; sum2 = tmp[1+idt]; 578 sum3 = tmp[2+idt];sum4 = tmp[3+idt]; 579 while (nz--) { 580 idx = 4*(*vi++); 581 x1 = tmp[idx]; x2 = tmp[1+idx]; 582 x3 = tmp[2+idx]; x4 = tmp[3+idx]; 583 sum1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 584 sum2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 585 sum3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 586 sum4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 587 v += 16; 588 } 589 idc = 4*(*c--); 590 v = aa + 16*a->diag[i]; 591 x[idc] = tmp[idt] = v[0]*sum1+v[4]*sum2+v[8]*sum3+v[12]*sum4; 592 x[1+idc] = tmp[1+idt] = v[1]*sum1+v[5]*sum2+v[9]*sum3+v[13]*sum4; 593 x[2+idc] = tmp[2+idt] = v[2]*sum1+v[6]*sum2+v[10]*sum3+v[14]*sum4; 594 x[3+idc] = tmp[3+idt] = v[3]*sum1+v[7]*sum2+v[11]*sum3+v[15]*sum4; 595 } 596 break; 597 case 5: 598 /* forward solve the lower triangular */ 599 idx = 5*(*r++); 600 tmp[0] = b[idx]; tmp[1] = b[1+idx]; 601 tmp[2] = b[2+idx]; tmp[3] = b[3+idx]; tmp[4] = b[4+idx]; 602 for ( i=1; i<n; i++ ) { 603 v = aa + 25*ai[i]; 604 vi = aj + ai[i]; 605 nz = a->diag[i] - ai[i]; 606 idx = 5*(*r++); 607 sum1 = b[idx];sum2 = b[1+idx];sum3 = b[2+idx];sum4 = b[3+idx]; 608 sum5 = b[4+idx]; 609 while (nz--) { 610 idx = 5*(*vi++); 611 x1 = tmp[idx]; x2 = tmp[1+idx];x3 = tmp[2+idx]; 612 x4 = tmp[3+idx];x5 = tmp[4+idx]; 613 sum1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 614 sum2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 615 sum3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 616 sum4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 617 sum5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 618 v += 25; 619 } 620 idx = 5*i; 621 tmp[idx] = sum1;tmp[1+idx] = sum2; 622 tmp[2+idx] = sum3;tmp[3+idx] = sum4; tmp[4+idx] = sum5; 623 } 624 /* backward solve the upper triangular */ 625 for ( i=n-1; i>=0; i-- ){ 626 v = aa + 25*a->diag[i] + 25; 627 vi = aj + a->diag[i] + 1; 628 nz = ai[i+1] - a->diag[i] - 1; 629 idt = 5*i; 630 sum1 = tmp[idt]; sum2 = tmp[1+idt]; 631 sum3 = tmp[2+idt];sum4 = tmp[3+idt]; sum5 = tmp[4+idt]; 632 while (nz--) { 633 idx = 5*(*vi++); 634 x1 = tmp[idx]; x2 = tmp[1+idx]; 635 x3 = tmp[2+idx]; x4 = tmp[3+idx]; x5 = tmp[4+idx]; 636 sum1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 637 sum2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 638 sum3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 639 sum4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 640 sum5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 641 v += 25; 642 } 643 idc = 5*(*c--); 644 v = aa + 25*a->diag[i]; 645 x[idc] = tmp[idt] = v[0]*sum1+v[5]*sum2+v[10]*sum3+ 646 v[15]*sum4+v[20]*sum5; 647 x[1+idc] = tmp[1+idt] = v[1]*sum1+v[6]*sum2+v[11]*sum3+ 648 v[16]*sum4+v[21]*sum5; 649 x[2+idc] = tmp[2+idt] = v[2]*sum1+v[7]*sum2+v[12]*sum3+ 650 v[17]*sum4+v[22]*sum5; 651 x[3+idc] = tmp[3+idt] = v[3]*sum1+v[8]*sum2+v[13]*sum3+ 652 v[18]*sum4+v[23]*sum5; 653 x[4+idc] = tmp[4+idt] = v[4]*sum1+v[9]*sum2+v[14]*sum3+ 654 v[19]*sum4+v[24]*sum5; 655 } 656 break; 657 default: { 658 /* forward solve the lower triangular */ 659 PetscMemcpy(tmp,b + bs*(*r++), bs*sizeof(Scalar)); 660 for ( i=1; i<n; i++ ) { 661 v = aa + bs2*ai[i]; 662 vi = aj + ai[i]; 663 nz = a->diag[i] - ai[i]; 664 sum = tmp + bs*i; 665 PetscMemcpy(sum,b+bs*(*r++),bs*sizeof(Scalar)); 666 while (nz--) { 667 LAgemv_("N",&bs,&bs,&_DMOne,v,&bs,tmp+bs*(*vi++),&_One,&_DOne,sum,&_One); 668 v += bs2; 669 } 670 } 671 /* backward solve the upper triangular */ 672 lsum = a->solve_work + a->n; 673 for ( i=n-1; i>=0; i-- ){ 674 v = aa + bs2*(a->diag[i] + 1); 675 vi = aj + a->diag[i] + 1; 676 nz = ai[i+1] - a->diag[i] - 1; 677 PetscMemcpy(lsum,tmp+i*bs,bs*sizeof(Scalar)); 678 while (nz--) { 679 LAgemv_("N",&bs,&bs,&_DMOne,v,&bs,tmp+bs*(*vi++),&_One,&_DOne,lsum,&_One); 680 v += bs2; 681 } 682 LAgemv_("N",&bs,&bs,&_DOne,aa+bs2*a->diag[i],&bs,lsum,&_One,&_DZero, 683 tmp+i*bs,&_One); 684 PetscMemcpy(x + bs*(*c--),tmp+i*bs,bs*sizeof(Scalar)); 685 } 686 } 687 } 688 689 ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr); 690 ierr = ISRestoreIndices(iscol,&c); CHKERRQ(ierr); 691 PLogFlops(2*bs2*a->nz - a->n); 692 return 0; 693 } 694 695 /* ----------------------------------------------------------------*/ 696 /* 697 This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 698 except that the data structure of Mat_SeqAIJ is slightly different. 699 Not a good example of code reuse. 700 */ 701 int MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,double f,int levels, 702 Mat *fact) 703 { 704 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data, *b; 705 IS isicol; 706 int *r,*ic, ierr, prow, n = a->mbs, *ai = a->i, *aj = a->j; 707 int *ainew,*ajnew, jmax,*fill, *xi, nz, *im,*ajfill,*flev; 708 int *dloc, idx, row,m,fm, nzf, nzi,len, realloc = 0; 709 int incrlev,nnz,i,bs = a->bs; 710 711 if (a->m != a->n) SETERRQ(1,"MatILUFactorSymbolic_SeqBAIJ:Matrix must be square"); 712 if (!isrow) SETERRQ(1,"MatILUFactorSymbolic_SeqBAIJ:Must have row permutation"); 713 if (!iscol) SETERRQ(1,"MatILUFactorSymbolic_SeqBAIJ:Must have column permutation"); 714 715 /* special case that simply copies fill pattern */ 716 if (levels == 0 && ISIsIdentity(isrow) && ISIsIdentity(iscol)) { 717 ierr = MatConvertSameType_SeqBAIJ(A,fact,DO_NOT_COPY_VALUES); CHKERRQ(ierr); 718 (*fact)->factor = FACTOR_LU; 719 b = (Mat_SeqBAIJ *) (*fact)->data; 720 if (!b->diag) { 721 ierr = MatMarkDiag_SeqBAIJ(*fact); CHKERRQ(ierr); 722 } 723 b->row = isrow; 724 b->col = iscol; 725 b->solve_work = (Scalar *) PetscMalloc((b->m+1+b->bs)*sizeof(Scalar));CHKPTRQ(b->solve_work); 726 return 0; 727 } 728 729 ierr = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr); 730 ierr = ISGetIndices(isrow,&r); CHKERRQ(ierr); 731 ierr = ISGetIndices(isicol,&ic); CHKERRQ(ierr); 732 733 /* get new row pointers */ 734 ainew = (int *) PetscMalloc( (n+1)*sizeof(int) ); CHKPTRQ(ainew); 735 ainew[0] = 0; 736 /* don't know how many column pointers are needed so estimate */ 737 jmax = (int) (f*ai[n] + 1); 738 ajnew = (int *) PetscMalloc( (jmax)*sizeof(int) ); CHKPTRQ(ajnew); 739 /* ajfill is level of fill for each fill entry */ 740 ajfill = (int *) PetscMalloc( (jmax)*sizeof(int) ); CHKPTRQ(ajfill); 741 /* fill is a linked list of nonzeros in active row */ 742 fill = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(fill); 743 /* im is level for each filled value */ 744 im = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(im); 745 /* dloc is location of diagonal in factor */ 746 dloc = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(dloc); 747 dloc[0] = 0; 748 for ( prow=0; prow<n; prow++ ) { 749 /* first copy previous fill into linked list */ 750 nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 751 xi = aj + ai[r[prow]]; 752 fill[n] = n; 753 while (nz--) { 754 fm = n; 755 idx = ic[*xi++]; 756 do { 757 m = fm; 758 fm = fill[m]; 759 } while (fm < idx); 760 fill[m] = idx; 761 fill[idx] = fm; 762 im[idx] = 0; 763 } 764 nzi = 0; 765 row = fill[n]; 766 while ( row < prow ) { 767 incrlev = im[row] + 1; 768 nz = dloc[row]; 769 xi = ajnew + ainew[row] + nz; 770 flev = ajfill + ainew[row] + nz + 1; 771 nnz = ainew[row+1] - ainew[row] - nz - 1; 772 if (*xi++ != row) { 773 SETERRQ(1,"MatILUFactorSymbolic_SeqBAIJ:zero pivot"); 774 } 775 fm = row; 776 while (nnz-- > 0) { 777 idx = *xi++; 778 if (*flev + incrlev > levels) { 779 flev++; 780 continue; 781 } 782 do { 783 m = fm; 784 fm = fill[m]; 785 } while (fm < idx); 786 if (fm != idx) { 787 im[idx] = *flev + incrlev; 788 fill[m] = idx; 789 fill[idx] = fm; 790 fm = idx; 791 nzf++; 792 } 793 else { 794 if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 795 } 796 flev++; 797 } 798 row = fill[row]; 799 nzi++; 800 } 801 /* copy new filled row into permanent storage */ 802 ainew[prow+1] = ainew[prow] + nzf; 803 if (ainew[prow+1] > jmax) { 804 /* allocate a longer ajnew */ 805 int maxadd; 806 maxadd = (int) (((f*ai[n]+1)*(n-prow+5))/n); 807 if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 808 jmax += maxadd; 809 xi = (int *) PetscMalloc( jmax*sizeof(int) );CHKPTRQ(xi); 810 PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int)); 811 PetscFree(ajnew); 812 ajnew = xi; 813 /* allocate a longer ajfill */ 814 xi = (int *) PetscMalloc( jmax*sizeof(int) );CHKPTRQ(xi); 815 PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int)); 816 PetscFree(ajfill); 817 ajfill = xi; 818 realloc++; 819 } 820 xi = ajnew + ainew[prow]; 821 flev = ajfill + ainew[prow]; 822 dloc[prow] = nzi; 823 fm = fill[n]; 824 while (nzf--) { 825 *xi++ = fm; 826 *flev++ = im[fm]; 827 fm = fill[fm]; 828 } 829 } 830 PetscFree(ajfill); 831 ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr); 832 ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr); 833 ierr = ISDestroy(isicol); CHKERRQ(ierr); 834 PetscFree(fill); PetscFree(im); 835 836 PLogInfo((PetscObject)A, 837 "Info:MatILUFactorSymbolic_SeqBAIJ:Realloc %d Fill ratio:given %g needed %g\n", 838 realloc,f,((double)ainew[n])/((double)ai[prow])); 839 840 /* put together the new matrix */ 841 ierr = MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,fact);CHKERRQ(ierr); 842 b = (Mat_SeqBAIJ *) (*fact)->data; 843 PetscFree(b->imax); 844 b->singlemalloc = 0; 845 len = bs*bs*ainew[n]*sizeof(Scalar); 846 /* the next line frees the default space generated by the Create() */ 847 PetscFree(b->a); PetscFree(b->ilen); 848 b->a = (Scalar *) PetscMalloc( len ); CHKPTRQ(b->a); 849 b->j = ajnew; 850 b->i = ainew; 851 for ( i=0; i<n; i++ ) dloc[i] += ainew[i]; 852 b->diag = dloc; 853 b->ilen = 0; 854 b->imax = 0; 855 b->row = isrow; 856 b->col = iscol; 857 b->solve_work = (Scalar *) PetscMalloc( (bs*n+bs)*sizeof(Scalar)); 858 CHKPTRQ(b->solve_work); 859 /* In b structure: Free imax, ilen, old a, old j. 860 Allocate dloc, solve_work, new a, new j */ 861 PLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs*bs*ainew[n]*sizeof(Scalar)); 862 b->maxnz = b->nz = ainew[n]; 863 (*fact)->factor = FACTOR_LU; 864 return 0; 865 } 866 867 868 869 870