xref: /petsc/src/mat/impls/baij/seq/baijfact.c (revision 4eeb42bc6e4cc6494252ded34f42f219207a425b)
15d517e7eSBarry Smith #ifndef lint
2*4eeb42bcSBarry Smith static char vcid[] = "$Id: baijfact.c,v 1.4 1996/02/18 00:40:22 bsmith Exp bsmith $";
35d517e7eSBarry Smith #endif
45d517e7eSBarry Smith /*
5ec3a8b7bSBarry Smith     Factorization code for BAIJ format.
65d517e7eSBarry Smith */
75d517e7eSBarry Smith 
8ec3a8b7bSBarry Smith #include "baij.h"
9ec3a8b7bSBarry Smith 
10ec3a8b7bSBarry Smith /*
11ec3a8b7bSBarry Smith     The symbolic factorization code is identical to that for AIJ format,
12ec3a8b7bSBarry Smith   except for very small changes since this is now a SeqBAIJ datastructure.
13ec3a8b7bSBarry Smith   NOT good code reuse.
14ec3a8b7bSBarry Smith */
15ec3a8b7bSBarry Smith int MatLUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,double f,Mat *B)
165d517e7eSBarry Smith {
17ec3a8b7bSBarry Smith   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data, *b;
185d517e7eSBarry Smith   IS          isicol;
19ec3a8b7bSBarry Smith   int         *r,*ic, ierr, i, n = a->mbs, *ai = a->i, *aj = a->j;
20de6a44a3SBarry Smith   int         *ainew,*ajnew, jmax,*fill, *ajtmp, nz, bs = a->bs;
215d517e7eSBarry Smith   int         *idnew, idx, row,m,fm, nnz, nzi,len, realloc = 0,nzbd,*im;
225d517e7eSBarry Smith 
23ec3a8b7bSBarry Smith   if (a->m != a->n) SETERRQ(1,"MatLUFactorSymbolic_SeqBAIJ:Must be square");
24ec3a8b7bSBarry Smith   if (!isrow) SETERRQ(1,"MatLUFactorSymbolic_SeqBAIJ:Must have row permutation");
25ec3a8b7bSBarry Smith   if (!iscol) SETERRQ(1,"MatLUFactorSymbolic_SeqBAIJ:Must have col. permutation");
265d517e7eSBarry Smith 
275d517e7eSBarry Smith   ierr = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr);
285d517e7eSBarry Smith   ISGetIndices(isrow,&r); ISGetIndices(isicol,&ic);
295d517e7eSBarry Smith 
305d517e7eSBarry Smith   /* get new row pointers */
315d517e7eSBarry Smith   ainew = (int *) PetscMalloc( (n+1)*sizeof(int) ); CHKPTRQ(ainew);
32de6a44a3SBarry Smith   ainew[0] = 0;
335d517e7eSBarry Smith   /* don't know how many column pointers are needed so estimate */
34de6a44a3SBarry Smith   jmax = (int) (f*ai[n] + 1);
355d517e7eSBarry Smith   ajnew = (int *) PetscMalloc( (jmax)*sizeof(int) ); CHKPTRQ(ajnew);
365d517e7eSBarry Smith   /* fill is a linked list of nonzeros in active row */
375d517e7eSBarry Smith   fill = (int *) PetscMalloc( (2*n+1)*sizeof(int)); CHKPTRQ(fill);
385d517e7eSBarry Smith   im = fill + n + 1;
395d517e7eSBarry Smith   /* idnew is location of diagonal in factor */
405d517e7eSBarry Smith   idnew = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(idnew);
41de6a44a3SBarry Smith   idnew[0] = 0;
425d517e7eSBarry Smith 
435d517e7eSBarry Smith   for ( i=0; i<n; i++ ) {
445d517e7eSBarry Smith     /* first copy previous fill into linked list */
455d517e7eSBarry Smith     nnz     = nz    = ai[r[i]+1] - ai[r[i]];
46de6a44a3SBarry Smith     ajtmp   = aj + ai[r[i]];
475d517e7eSBarry Smith     fill[n] = n;
485d517e7eSBarry Smith     while (nz--) {
495d517e7eSBarry Smith       fm  = n;
50de6a44a3SBarry Smith       idx = ic[*ajtmp++];
515d517e7eSBarry Smith       do {
525d517e7eSBarry Smith         m  = fm;
535d517e7eSBarry Smith         fm = fill[m];
545d517e7eSBarry Smith       } while (fm < idx);
555d517e7eSBarry Smith       fill[m]   = idx;
565d517e7eSBarry Smith       fill[idx] = fm;
575d517e7eSBarry Smith     }
585d517e7eSBarry Smith     row = fill[n];
595d517e7eSBarry Smith     while ( row < i ) {
60de6a44a3SBarry Smith       ajtmp = ajnew + idnew[row] + 1;
615d517e7eSBarry Smith       nzbd  = 1 + idnew[row] - ainew[row];
625d517e7eSBarry Smith       nz    = im[row] - nzbd;
635d517e7eSBarry Smith       fm    = row;
645d517e7eSBarry Smith       while (nz-- > 0) {
65de6a44a3SBarry Smith         idx = *ajtmp++;
665d517e7eSBarry Smith         nzbd++;
675d517e7eSBarry Smith         if (idx == i) im[row] = nzbd;
685d517e7eSBarry Smith         do {
695d517e7eSBarry Smith           m  = fm;
705d517e7eSBarry Smith           fm = fill[m];
715d517e7eSBarry Smith         } while (fm < idx);
725d517e7eSBarry Smith         if (fm != idx) {
735d517e7eSBarry Smith           fill[m]   = idx;
745d517e7eSBarry Smith           fill[idx] = fm;
755d517e7eSBarry Smith           fm        = idx;
765d517e7eSBarry Smith           nnz++;
775d517e7eSBarry Smith         }
785d517e7eSBarry Smith       }
795d517e7eSBarry Smith       row = fill[row];
805d517e7eSBarry Smith     }
815d517e7eSBarry Smith     /* copy new filled row into permanent storage */
825d517e7eSBarry Smith     ainew[i+1] = ainew[i] + nnz;
835d517e7eSBarry Smith     if (ainew[i+1] > jmax+1) {
845d517e7eSBarry Smith       /* allocate a longer ajnew */
855d517e7eSBarry Smith       int maxadd;
86de6a44a3SBarry Smith       maxadd = (int) ((f*(ai[n]+1)*(n-i+5))/n);
875d517e7eSBarry Smith       if (maxadd < nnz) maxadd = (n-i)*(nnz+1);
885d517e7eSBarry Smith       jmax += maxadd;
895d517e7eSBarry Smith       ajtmp = (int *) PetscMalloc( jmax*sizeof(int) );CHKPTRQ(ajtmp);
90de6a44a3SBarry Smith       PetscMemcpy(ajtmp,ajnew,ainew[i]*sizeof(int));
915d517e7eSBarry Smith       PetscFree(ajnew);
925d517e7eSBarry Smith       ajnew = ajtmp;
935d517e7eSBarry Smith       realloc++; /* count how many times we realloc */
945d517e7eSBarry Smith     }
95de6a44a3SBarry Smith     ajtmp = ajnew + ainew[i];
965d517e7eSBarry Smith     fm    = fill[n];
975d517e7eSBarry Smith     nzi   = 0;
985d517e7eSBarry Smith     im[i] = nnz;
995d517e7eSBarry Smith     while (nnz--) {
1005d517e7eSBarry Smith       if (fm < i) nzi++;
101de6a44a3SBarry Smith       *ajtmp++ = fm;
1025d517e7eSBarry Smith       fm       = fill[fm];
1035d517e7eSBarry Smith     }
1045d517e7eSBarry Smith     idnew[i] = ainew[i] + nzi;
1055d517e7eSBarry Smith   }
1065d517e7eSBarry Smith 
1075d517e7eSBarry Smith   PLogInfo((PetscObject)A,
108ec3a8b7bSBarry Smith     "Info:MatLUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",
1095d517e7eSBarry Smith                              realloc,f,((double)ainew[n])/((double)ai[i]));
1105d517e7eSBarry Smith 
1115d517e7eSBarry Smith   ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr);
1125d517e7eSBarry Smith   ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr);
1135d517e7eSBarry Smith 
1145d517e7eSBarry Smith   PetscFree(fill);
1155d517e7eSBarry Smith 
1165d517e7eSBarry Smith   /* put together the new matrix */
117ec3a8b7bSBarry Smith   ierr = MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,B); CHKERRQ(ierr);
1185d517e7eSBarry Smith   PLogObjectParent(*B,isicol);
1195d517e7eSBarry Smith   ierr = ISDestroy(isicol); CHKERRQ(ierr);
120ec3a8b7bSBarry Smith   b = (Mat_SeqBAIJ *) (*B)->data;
1215d517e7eSBarry Smith   PetscFree(b->imax);
1225d517e7eSBarry Smith   b->singlemalloc = 0;
123de6a44a3SBarry Smith   len             = ainew[n]*sizeof(Scalar);
1245d517e7eSBarry Smith   /* the next line frees the default space generated by the Create() */
1255d517e7eSBarry Smith   PetscFree(b->a); PetscFree(b->ilen);
126ec3a8b7bSBarry Smith   b->a          = (Scalar *) PetscMalloc( len*bs*bs ); CHKPTRQ(b->a);
1275d517e7eSBarry Smith   b->j          = ajnew;
1285d517e7eSBarry Smith   b->i          = ainew;
1295d517e7eSBarry Smith   b->diag       = idnew;
1305d517e7eSBarry Smith   b->ilen       = 0;
1315d517e7eSBarry Smith   b->imax       = 0;
1325d517e7eSBarry Smith   b->row        = isrow;
1335d517e7eSBarry Smith   b->col        = iscol;
134de6a44a3SBarry Smith   b->solve_work = (Scalar *) PetscMalloc( (bs*n+bs)*sizeof(Scalar));
1355d517e7eSBarry Smith   CHKPTRQ(b->solve_work);
1365d517e7eSBarry Smith   /* In b structure:  Free imax, ilen, old a, old j.
1375d517e7eSBarry Smith      Allocate idnew, solve_work, new a, new j */
138de6a44a3SBarry Smith   PLogObjectMemory(*B,(ainew[n]-n)*(sizeof(int)+sizeof(Scalar)));
139de6a44a3SBarry Smith   b->maxnz = b->nz = ainew[n];
1405d517e7eSBarry Smith 
1415d517e7eSBarry Smith   return 0;
1425d517e7eSBarry Smith }
1435d517e7eSBarry Smith 
144ec3a8b7bSBarry Smith #include "pinclude/plapack.h"
145ec3a8b7bSBarry Smith /* ----------------------------------------------------------- */
1467fc0212eSBarry Smith int MatLUFactorNumeric_SeqBAIJ_N(Mat A,Mat *B)
1475d517e7eSBarry Smith {
1485d517e7eSBarry Smith   Mat             C = *B;
149ec3a8b7bSBarry Smith   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data,*b = (Mat_SeqBAIJ *)C->data;
1505d517e7eSBarry Smith   IS              iscol = b->col, isrow = b->row, isicol;
151ec3a8b7bSBarry Smith   int             *r,*ic, ierr, i, j, n = a->mbs, *ai = b->i, *aj = b->j;
152*4eeb42bcSBarry Smith   int             *ajtmpold, *ajtmp, nz, row, bslog,info;
153ec3a8b7bSBarry Smith   int             *diag_offset=b->diag,diag,bs=a->bs,bs2 = bs*bs,*v_pivots;
154*4eeb42bcSBarry Smith   register Scalar *pv,*v,*rtmp,*multiplier,*v_work,*pc,*w;
155*4eeb42bcSBarry Smith   Scalar          one = 1.0, zero = 0.0, mone = -1.0;
1565d517e7eSBarry Smith   register int    *pj;
1575d517e7eSBarry Smith 
1585d517e7eSBarry Smith   ierr  = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr);
1595d517e7eSBarry Smith   PLogObjectParent(*B,isicol);
1605d517e7eSBarry Smith   ierr  = ISGetIndices(isrow,&r); CHKERRQ(ierr);
1615d517e7eSBarry Smith   ierr  = ISGetIndices(isicol,&ic); CHKERRQ(ierr);
162ec3a8b7bSBarry Smith   rtmp  = (Scalar *) PetscMalloc(bs2*(n+1)*sizeof(Scalar));CHKPTRQ(rtmp);
1635d517e7eSBarry Smith 
164ec3a8b7bSBarry Smith   /* generate work space needed by dense LU factorization */
165de6a44a3SBarry Smith   v_work     = (Scalar *) PetscMalloc((bs+2*bs2)*sizeof(Scalar));CHKPTRQ(v_work);
166de6a44a3SBarry Smith   multiplier = v_work + bs2;
167de6a44a3SBarry Smith   v_pivots   = (int *) (multiplier + bs2);
168de6a44a3SBarry Smith 
169de6a44a3SBarry Smith   /* flops in while loop */
170*4eeb42bcSBarry Smith   bslog = 2*bs*bs2;
1715d517e7eSBarry Smith 
1725d517e7eSBarry Smith   for ( i=0; i<n; i++ ) {
1735d517e7eSBarry Smith     nz    = ai[i+1] - ai[i];
174de6a44a3SBarry Smith     ajtmp = aj + ai[i];
175*4eeb42bcSBarry Smith     for  ( j=0; j<nz; j++ ) {
176*4eeb42bcSBarry Smith       PetscMemzero(rtmp+bs2*ajtmp[j],bs2*sizeof(Scalar));
177*4eeb42bcSBarry Smith     }
1785d517e7eSBarry Smith     /* load in initial (unfactored row) */
1795d517e7eSBarry Smith     nz       = a->i[r[i]+1] - a->i[r[i]];
180de6a44a3SBarry Smith     ajtmpold = a->j + a->i[r[i]];
181de6a44a3SBarry Smith     v        = a->a + bs2*a->i[r[i]];
182*4eeb42bcSBarry Smith     for ( j=0; j<nz; j++ ) {
183*4eeb42bcSBarry Smith       PetscMemcpy(rtmp+bs2*ic[ajtmpold[j]],v+bs2*j,bs2*sizeof(Scalar));
184*4eeb42bcSBarry Smith     }
185de6a44a3SBarry Smith     row = *ajtmp++;
1865d517e7eSBarry Smith     while (row < i) {
187de6a44a3SBarry Smith       pc = rtmp + bs2*row;
188de6a44a3SBarry Smith /*      if (*pc) { */
189de6a44a3SBarry Smith         pv = b->a + bs2*diag_offset[row];
190de6a44a3SBarry Smith         pj = b->j + diag_offset[row] + 1;
191*4eeb42bcSBarry Smith         BLgemm_("N","N",&bs,&bs,&bs,&one,pc,&bs,pv,&bs,&zero,
192*4eeb42bcSBarry Smith                 multiplier,&bs);
193*4eeb42bcSBarry Smith         PetscMemcpy(pc,multiplier,bs2*sizeof(Scalar));
1945d517e7eSBarry Smith         nz = ai[row+1] - diag_offset[row] - 1;
195ec3a8b7bSBarry Smith         pv += bs2;
196*4eeb42bcSBarry Smith         for (j=0; j<nz; j++) {
197*4eeb42bcSBarry Smith           BLgemm_("N","N",&bs,&bs,&bs,&mone,multiplier,&bs,pv+bs2*j,&bs,
198*4eeb42bcSBarry Smith                   &one,rtmp+bs2*pj[j],&bs);
199*4eeb42bcSBarry Smith         }
200*4eeb42bcSBarry Smith         PLogFlops(bslog*(nz+1)-bs);
201de6a44a3SBarry Smith /*      } */
202de6a44a3SBarry Smith         row = *ajtmp++;
2035d517e7eSBarry Smith     }
2045d517e7eSBarry Smith     /* finished row so stick it into b->a */
205de6a44a3SBarry Smith     pv = b->a + bs2*ai[i];
206de6a44a3SBarry Smith     pj = b->j + ai[i];
2075d517e7eSBarry Smith     nz = ai[i+1] - ai[i];
208*4eeb42bcSBarry Smith     for ( j=0; j<nz; j++ ) {
209*4eeb42bcSBarry Smith       PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(Scalar));
210*4eeb42bcSBarry Smith     }
2115d517e7eSBarry Smith     diag = diag_offset[i] - ai[i];
212ec3a8b7bSBarry Smith     /* invert diagonal block */
213*4eeb42bcSBarry Smith     w = pv + bs2*diag;
214*4eeb42bcSBarry Smith     LAgetrf_(&bs,&bs,w,&bs,v_pivots,&info); CHKERRQ(info);
215*4eeb42bcSBarry Smith     PetscMemzero(v_work,bs2*sizeof(Scalar));
216*4eeb42bcSBarry Smith     for ( j=0; j<bs; j++ ) { v_work[j + bs*j] = 1.0; }
217*4eeb42bcSBarry Smith     LAgetrs_("N",&bs,&bs,w,&bs,v_pivots,v_work,&bs, &info);CHKERRQ(info);
218*4eeb42bcSBarry Smith     PetscMemcpy(w,v_work,bs2*sizeof(Scalar));
2195d517e7eSBarry Smith   }
2205d517e7eSBarry Smith 
221de6a44a3SBarry Smith   PetscFree(rtmp); PetscFree(v_work);
2225d517e7eSBarry Smith   ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr);
2235d517e7eSBarry Smith   ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr);
2245d517e7eSBarry Smith   ierr = ISDestroy(isicol); CHKERRQ(ierr);
2255d517e7eSBarry Smith   C->factor = FACTOR_LU;
2265d517e7eSBarry Smith   C->assembled = PETSC_TRUE;
227de6a44a3SBarry Smith   PLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
2285d517e7eSBarry Smith   return 0;
2295d517e7eSBarry Smith }
230*4eeb42bcSBarry Smith /* ------------------------------------------------------------*/
231*4eeb42bcSBarry Smith /*
232*4eeb42bcSBarry Smith       Version for when blocks are 2 by 2
233*4eeb42bcSBarry Smith */
234*4eeb42bcSBarry Smith int MatLUFactorNumeric_SeqBAIJ_2(Mat A,Mat *B)
235*4eeb42bcSBarry Smith {
236*4eeb42bcSBarry Smith   Mat             C = *B;
237*4eeb42bcSBarry Smith   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data,*b = (Mat_SeqBAIJ *)C->data;
238*4eeb42bcSBarry Smith   IS              iscol = b->col, isrow = b->row, isicol;
239*4eeb42bcSBarry Smith   int             *r,*ic, ierr, i, j, n = a->mbs, *ai = b->i, *aj = b->j;
240*4eeb42bcSBarry Smith   int             *ajtmpold, *ajtmp, nz, row, info;
241*4eeb42bcSBarry Smith   int             *diag_offset=b->diag,*v_pivots,bs = 2,idx;
242*4eeb42bcSBarry Smith   register Scalar *pv,*v,*rtmp,m1,m2,m3,m4,*v_work,*pc,*w,*x,x1,x2,x3,x4;
243*4eeb42bcSBarry Smith   Scalar          p1,p2,p3,p4;
244*4eeb42bcSBarry Smith   register int    *pj;
245*4eeb42bcSBarry Smith 
246*4eeb42bcSBarry Smith   ierr  = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr);
247*4eeb42bcSBarry Smith   PLogObjectParent(*B,isicol);
248*4eeb42bcSBarry Smith   ierr  = ISGetIndices(isrow,&r); CHKERRQ(ierr);
249*4eeb42bcSBarry Smith   ierr  = ISGetIndices(isicol,&ic); CHKERRQ(ierr);
250*4eeb42bcSBarry Smith   rtmp  = (Scalar *) PetscMalloc(4*(n+1)*sizeof(Scalar));CHKPTRQ(rtmp);
251*4eeb42bcSBarry Smith 
252*4eeb42bcSBarry Smith   /* generate work space needed by dense LU factorization */
253*4eeb42bcSBarry Smith   v_work     = (Scalar *) PetscMalloc(6*sizeof(Scalar));CHKPTRQ(v_work);
254*4eeb42bcSBarry Smith   v_pivots   = (int *) (v_work + 4);
255*4eeb42bcSBarry Smith 
256*4eeb42bcSBarry Smith 
257*4eeb42bcSBarry Smith   for ( i=0; i<n; i++ ) {
258*4eeb42bcSBarry Smith     nz    = ai[i+1] - ai[i];
259*4eeb42bcSBarry Smith     ajtmp = aj + ai[i];
260*4eeb42bcSBarry Smith     for  ( j=0; j<nz; j++ ) {
261*4eeb42bcSBarry Smith       x = rtmp+4*ajtmp[j]; x[0] = x[1] = x[2] = x[3] = 0.0;
262*4eeb42bcSBarry Smith     }
263*4eeb42bcSBarry Smith     /* load in initial (unfactored row) */
264*4eeb42bcSBarry Smith     idx      = r[i];
265*4eeb42bcSBarry Smith     nz       = a->i[idx+1] - a->i[idx];
266*4eeb42bcSBarry Smith     ajtmpold = a->j + a->i[idx];
267*4eeb42bcSBarry Smith     v        = a->a + 4*a->i[idx];
268*4eeb42bcSBarry Smith     for ( j=0; j<nz; j++ ) {
269*4eeb42bcSBarry Smith       x    = rtmp+4*ic[ajtmpold[j]];
270*4eeb42bcSBarry Smith       x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3];
271*4eeb42bcSBarry Smith       v    += 4;
272*4eeb42bcSBarry Smith     }
273*4eeb42bcSBarry Smith     row = *ajtmp++;
274*4eeb42bcSBarry Smith     while (row < i) {
275*4eeb42bcSBarry Smith       pc = rtmp + 4*row;
276*4eeb42bcSBarry Smith       p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3];
277*4eeb42bcSBarry Smith       if (p1 || p2 || p3 || p4) {
278*4eeb42bcSBarry Smith         pv = b->a + 4*diag_offset[row];
279*4eeb42bcSBarry Smith         pj = b->j + diag_offset[row] + 1;
280*4eeb42bcSBarry Smith         x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
281*4eeb42bcSBarry Smith         pc[0] = m1 = p1*x1 + p3*x2;
282*4eeb42bcSBarry Smith         pc[1] = m2 = p2*x1 + p4*x2;
283*4eeb42bcSBarry Smith         pc[2] = m3 = p1*x3 + p3*x4;
284*4eeb42bcSBarry Smith         pc[3] = m4 = p2*x3 + p4*x4;
285*4eeb42bcSBarry Smith         nz = ai[row+1] - diag_offset[row] - 1;
286*4eeb42bcSBarry Smith         pv += 4;
287*4eeb42bcSBarry Smith         for (j=0; j<nz; j++) {
288*4eeb42bcSBarry Smith           x1   = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
289*4eeb42bcSBarry Smith           x    = rtmp + 4*pj[j];
290*4eeb42bcSBarry Smith           x[0] -= m1*x1 + m3*x2;
291*4eeb42bcSBarry Smith           x[1] -= m2*x1 + m4*x2;
292*4eeb42bcSBarry Smith           x[2] -= m1*x3 + m3*x4;
293*4eeb42bcSBarry Smith           x[3] -= m2*x3 + m4*x4;
294*4eeb42bcSBarry Smith           pv   += 4;
295*4eeb42bcSBarry Smith         }
296*4eeb42bcSBarry Smith         PLogFlops(16*nz+12);
297*4eeb42bcSBarry Smith       }
298*4eeb42bcSBarry Smith       row = *ajtmp++;
299*4eeb42bcSBarry Smith     }
300*4eeb42bcSBarry Smith     /* finished row so stick it into b->a */
301*4eeb42bcSBarry Smith     pv = b->a + 4*ai[i];
302*4eeb42bcSBarry Smith     pj = b->j + ai[i];
303*4eeb42bcSBarry Smith     nz = ai[i+1] - ai[i];
304*4eeb42bcSBarry Smith     for ( j=0; j<nz; j++ ) {
305*4eeb42bcSBarry Smith       x     = rtmp+4*pj[j];
306*4eeb42bcSBarry Smith       pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3];
307*4eeb42bcSBarry Smith       pv   += 4;
308*4eeb42bcSBarry Smith     }
309*4eeb42bcSBarry Smith     /* invert diagonal block */
310*4eeb42bcSBarry Smith     w = b->a + 4*diag_offset[i];
311*4eeb42bcSBarry Smith     LAgetrf_(&bs,&bs,w,&bs,v_pivots,&info); CHKERRQ(info);
312*4eeb42bcSBarry Smith     v_work[0] = 1.0; v_work[1] = 0.0; v_work[2] = 0.0; v_work[3] = 1.0;
313*4eeb42bcSBarry Smith     LAgetrs_("N",&bs,&bs,w,&bs,v_pivots,v_work,&bs, &info);CHKERRQ(info);
314*4eeb42bcSBarry Smith     w[0] = v_work[0]; w[1] = v_work[1]; w[2] = v_work[2]; w[3] = v_work[3];
315*4eeb42bcSBarry Smith   }
316*4eeb42bcSBarry Smith 
317*4eeb42bcSBarry Smith   PetscFree(rtmp); PetscFree(v_work);
318*4eeb42bcSBarry Smith   ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr);
319*4eeb42bcSBarry Smith   ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr);
320*4eeb42bcSBarry Smith   ierr = ISDestroy(isicol); CHKERRQ(ierr);
321*4eeb42bcSBarry Smith   C->factor = FACTOR_LU;
322*4eeb42bcSBarry Smith   C->assembled = PETSC_TRUE;
323*4eeb42bcSBarry Smith   PLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
324*4eeb42bcSBarry Smith   return 0;
325*4eeb42bcSBarry Smith }
3267fc0212eSBarry Smith 
3277fc0212eSBarry Smith /* ----------------------------------------------------------- */
3287fc0212eSBarry Smith /*
3297fc0212eSBarry Smith      Version for when blocks are 1 by 1.
3307fc0212eSBarry Smith */
3317fc0212eSBarry Smith int MatLUFactorNumeric_SeqBAIJ_1(Mat A,Mat *B)
3327fc0212eSBarry Smith {
3337fc0212eSBarry Smith   Mat         C = *B;
3347fc0212eSBarry Smith   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data, *b = (Mat_SeqBAIJ *)C->data;
3357fc0212eSBarry Smith   IS          iscol = b->col, isrow = b->row, isicol;
3367fc0212eSBarry Smith   int         *r,*ic, ierr, i, j, n = a->mbs, *ai = b->i, *aj = b->j;
3377fc0212eSBarry Smith   int         *ajtmpold, *ajtmp, nz, row;
3387fc0212eSBarry Smith   int         *diag_offset = b->diag,diag;
3397fc0212eSBarry Smith   register Scalar *pv,*v,*rtmp,multiplier,*pc;
3407fc0212eSBarry Smith   register int    *pj;
3417fc0212eSBarry Smith 
3427fc0212eSBarry Smith   ierr  = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr);
3437fc0212eSBarry Smith   PLogObjectParent(*B,isicol);
3447fc0212eSBarry Smith   ierr  = ISGetIndices(isrow,&r); CHKERRQ(ierr);
3457fc0212eSBarry Smith   ierr  = ISGetIndices(isicol,&ic); CHKERRQ(ierr);
3467fc0212eSBarry Smith   rtmp  = (Scalar *) PetscMalloc((n+1)*sizeof(Scalar));CHKPTRQ(rtmp);
3477fc0212eSBarry Smith 
3487fc0212eSBarry Smith   for ( i=0; i<n; i++ ) {
3497fc0212eSBarry Smith     nz    = ai[i+1] - ai[i];
3507fc0212eSBarry Smith     ajtmp = aj + ai[i];
3517fc0212eSBarry Smith     for  ( j=0; j<nz; j++ ) rtmp[ajtmp[j]] = 0.0;
3527fc0212eSBarry Smith 
3537fc0212eSBarry Smith     /* load in initial (unfactored row) */
3547fc0212eSBarry Smith     nz       = a->i[r[i]+1] - a->i[r[i]];
3557fc0212eSBarry Smith     ajtmpold = a->j + a->i[r[i]];
3567fc0212eSBarry Smith     v        = a->a + a->i[r[i]];
3577fc0212eSBarry Smith     for ( j=0; j<nz; j++ ) rtmp[ic[ajtmpold[j]]] =  v[j];
3587fc0212eSBarry Smith 
3597fc0212eSBarry Smith     row = *ajtmp++;
3607fc0212eSBarry Smith     while (row < i) {
3617fc0212eSBarry Smith       pc = rtmp + row;
3627fc0212eSBarry Smith       if (*pc != 0.0) {
3637fc0212eSBarry Smith         pv         = b->a + diag_offset[row];
3647fc0212eSBarry Smith         pj         = b->j + diag_offset[row] + 1;
3657fc0212eSBarry Smith         multiplier = *pc * *pv++;
3667fc0212eSBarry Smith         *pc        = multiplier;
3677fc0212eSBarry Smith         nz         = ai[row+1] - diag_offset[row] - 1;
3687fc0212eSBarry Smith         for (j=0; j<nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
369*4eeb42bcSBarry Smith         PLogFlops(1+2*nz);
3707fc0212eSBarry Smith       }
3717fc0212eSBarry Smith       row = *ajtmp++;
3727fc0212eSBarry Smith     }
3737fc0212eSBarry Smith     /* finished row so stick it into b->a */
3747fc0212eSBarry Smith     pv = b->a + ai[i];
3757fc0212eSBarry Smith     pj = b->j + ai[i];
3767fc0212eSBarry Smith     nz = ai[i+1] - ai[i];
3777fc0212eSBarry Smith     for ( j=0; j<nz; j++ ) {pv[j] = rtmp[pj[j]];}
3787fc0212eSBarry Smith     diag = diag_offset[i] - ai[i];
3797fc0212eSBarry Smith     /* check pivot entry for current row */
3807fc0212eSBarry Smith     if (pv[diag] == 0.0) {
3817fc0212eSBarry Smith       SETERRQ(1,"MatLUFactorNumeric_SeqAIJ:Zero pivot");
3827fc0212eSBarry Smith     }
3837fc0212eSBarry Smith     pv[diag] = 1.0/pv[diag];
3847fc0212eSBarry Smith   }
3857fc0212eSBarry Smith 
3867fc0212eSBarry Smith   PetscFree(rtmp);
3877fc0212eSBarry Smith   ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr);
3887fc0212eSBarry Smith   ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr);
3897fc0212eSBarry Smith   ierr = ISDestroy(isicol); CHKERRQ(ierr);
3907fc0212eSBarry Smith   C->factor    = FACTOR_LU;
3917fc0212eSBarry Smith   C->assembled = PETSC_TRUE;
3927fc0212eSBarry Smith   PLogFlops(b->n);
3937fc0212eSBarry Smith   return 0;
3947fc0212eSBarry Smith }
3957fc0212eSBarry Smith 
3965d517e7eSBarry Smith /* ----------------------------------------------------------- */
397ec3a8b7bSBarry Smith int MatLUFactor_SeqBAIJ(Mat A,IS row,IS col,double f)
3985d517e7eSBarry Smith {
399ec3a8b7bSBarry Smith   Mat_SeqBAIJ *mat = (Mat_SeqBAIJ *) A->data;
4005d517e7eSBarry Smith   int         ierr;
4015d517e7eSBarry Smith   Mat         C;
4025d517e7eSBarry Smith 
403ec3a8b7bSBarry Smith   ierr = MatLUFactorSymbolic_SeqBAIJ(A,row,col,f,&C); CHKERRQ(ierr);
4047fc0212eSBarry Smith   ierr = MatLUFactorNumeric(A,&C); CHKERRQ(ierr);
4055d517e7eSBarry Smith 
4065d517e7eSBarry Smith   /* free all the data structures from mat */
4075d517e7eSBarry Smith   PetscFree(mat->a);
4085d517e7eSBarry Smith   if (!mat->singlemalloc) {PetscFree(mat->i); PetscFree(mat->j);}
4095d517e7eSBarry Smith   if (mat->diag) PetscFree(mat->diag);
4105d517e7eSBarry Smith   if (mat->ilen) PetscFree(mat->ilen);
4115d517e7eSBarry Smith   if (mat->imax) PetscFree(mat->imax);
4125d517e7eSBarry Smith   if (mat->solve_work) PetscFree(mat->solve_work);
4135d517e7eSBarry Smith   PetscFree(mat);
4145d517e7eSBarry Smith 
4155d517e7eSBarry Smith   PetscMemcpy(A,C,sizeof(struct _Mat));
4165d517e7eSBarry Smith   PetscHeaderDestroy(C);
4175d517e7eSBarry Smith   return 0;
4185d517e7eSBarry Smith }
4195d517e7eSBarry Smith /* ----------------------------------------------------------- */
420ec3a8b7bSBarry Smith int MatSolve_SeqBAIJ(Mat A,Vec bb, Vec xx)
4215d517e7eSBarry Smith {
422ec3a8b7bSBarry Smith   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
4235d517e7eSBarry Smith   IS          iscol = a->col, isrow = a->row;
424de6a44a3SBarry Smith   int         *r,*c, ierr, i,  n = a->mbs, *vi, *ai = a->i, *aj = a->j;
425de6a44a3SBarry Smith   int         nz,bs = a->bs,bs2 = bs*bs,idx,idt,idc, _One = 1;
426de6a44a3SBarry Smith   Scalar      *xa,*ba,*aa = a->a, *sum, _DOne = 1.0, _DMOne = -1.0;
427de6a44a3SBarry Smith   Scalar      _DZero = 0.0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
428de6a44a3SBarry Smith   register Scalar *x, *b, *lsum, *tmp, *v;
4295d517e7eSBarry Smith 
430ec3a8b7bSBarry Smith   if (A->factor != FACTOR_LU) SETERRQ(1,"MatSolve_SeqBAIJ:Not for unfactored matrix");
4315d517e7eSBarry Smith 
432de6a44a3SBarry Smith   ierr = VecGetArray(bb,&ba); CHKERRQ(ierr); b = ba;
433de6a44a3SBarry Smith   ierr = VecGetArray(xx,&xa); CHKERRQ(ierr); x = xa;
4345d517e7eSBarry Smith   tmp  = a->solve_work;
4355d517e7eSBarry Smith 
4365d517e7eSBarry Smith   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
4375d517e7eSBarry Smith   ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); c = c + (n-1);
4385d517e7eSBarry Smith 
439de6a44a3SBarry Smith   switch (bs) {
440de6a44a3SBarry Smith     case 1:
4415d517e7eSBarry Smith       /* forward solve the lower triangular */
4425d517e7eSBarry Smith       tmp[0] = b[*r++];
4435d517e7eSBarry Smith       for ( i=1; i<n; i++ ) {
444de6a44a3SBarry Smith         v     = aa + ai[i];
445de6a44a3SBarry Smith         vi    = aj + ai[i];
4465d517e7eSBarry Smith         nz    = a->diag[i] - ai[i];
447de6a44a3SBarry Smith         sum1  = b[*r++];
448de6a44a3SBarry Smith         while (nz--) {
449de6a44a3SBarry Smith           sum1 -= (*v++)*tmp[*vi++];
4505d517e7eSBarry Smith         }
451de6a44a3SBarry Smith         tmp[i] = sum1;
452de6a44a3SBarry Smith       }
4535d517e7eSBarry Smith       /* backward solve the upper triangular */
4545d517e7eSBarry Smith       for ( i=n-1; i>=0; i-- ){
455de6a44a3SBarry Smith         v    = aa + a->diag[i] + 1;
456de6a44a3SBarry Smith         vi   = aj + a->diag[i] + 1;
4575d517e7eSBarry Smith         nz   = ai[i+1] - a->diag[i] - 1;
458de6a44a3SBarry Smith         sum1 = tmp[i];
459de6a44a3SBarry Smith         while (nz--) {
460de6a44a3SBarry Smith           sum1 -= (*v++)*tmp[*vi++];
461de6a44a3SBarry Smith         }
462de6a44a3SBarry Smith         x[*c--] = tmp[i] = aa[a->diag[i]]*sum1;
463de6a44a3SBarry Smith       }
464de6a44a3SBarry Smith       break;
465de6a44a3SBarry Smith     case 2:
466de6a44a3SBarry Smith       /* forward solve the lower triangular */
467de6a44a3SBarry Smith       idx    = 2*(*r++);
468de6a44a3SBarry Smith       tmp[0] = b[idx]; tmp[1] = b[1+idx];
469de6a44a3SBarry Smith       for ( i=1; i<n; i++ ) {
470de6a44a3SBarry Smith         v     = aa + 4*ai[i];
471de6a44a3SBarry Smith         vi    = aj + ai[i];
472de6a44a3SBarry Smith         nz    = a->diag[i] - ai[i];
473de6a44a3SBarry Smith         idx   = 2*(*r++);
474de6a44a3SBarry Smith         sum1  = b[idx]; sum2 = b[1+idx];
475de6a44a3SBarry Smith         while (nz--) {
476de6a44a3SBarry Smith           idx   = 2*(*vi++);
477de6a44a3SBarry Smith           x1    = tmp[idx]; x2 = tmp[1+idx];
478de6a44a3SBarry Smith           sum1 -= v[0]*x1 + v[2]*x2;
479de6a44a3SBarry Smith           sum2 -= v[1]*x1 + v[3]*x2;
480de6a44a3SBarry Smith           v += 4;
481de6a44a3SBarry Smith         }
482de6a44a3SBarry Smith         idx = 2*i;
483de6a44a3SBarry Smith         tmp[idx] = sum1; tmp[1+idx] = sum2;
484de6a44a3SBarry Smith       }
485de6a44a3SBarry Smith       /* backward solve the upper triangular */
486de6a44a3SBarry Smith       for ( i=n-1; i>=0; i-- ){
487de6a44a3SBarry Smith         v    = aa + 4*a->diag[i] + 4;
488de6a44a3SBarry Smith         vi   = aj + a->diag[i] + 1;
489de6a44a3SBarry Smith         nz   = ai[i+1] - a->diag[i] - 1;
490de6a44a3SBarry Smith         idt  = 2*i;
491de6a44a3SBarry Smith         sum1 = tmp[idt]; sum2 = tmp[1+idt];
492de6a44a3SBarry Smith         while (nz--) {
493de6a44a3SBarry Smith           idx   = 2*(*vi++);
494de6a44a3SBarry Smith           x1    = tmp[idx]; x2 = tmp[1+idx];
495de6a44a3SBarry Smith           sum1 -= v[0]*x1 + v[2]*x2;
496de6a44a3SBarry Smith           sum2 -= v[1]*x1 + v[3]*x2;
497de6a44a3SBarry Smith           v += 4;
498de6a44a3SBarry Smith         }
499de6a44a3SBarry Smith         idc = 2*(*c--);
500de6a44a3SBarry Smith         v   = aa + 4*a->diag[i];
501de6a44a3SBarry Smith         x[idc]   = tmp[idt]   = v[0]*sum1 + v[2]*sum2;
502de6a44a3SBarry Smith         x[1+idc] = tmp[1+idt] = v[1]*sum1 + v[3]*sum2;
503de6a44a3SBarry Smith       }
504de6a44a3SBarry Smith       break;
505de6a44a3SBarry Smith     case 3:
506de6a44a3SBarry Smith       /* forward solve the lower triangular */
507de6a44a3SBarry Smith       idx    = 3*(*r++);
508de6a44a3SBarry Smith       tmp[0] = b[idx]; tmp[1] = b[1+idx]; tmp[2] = b[2+idx];
509de6a44a3SBarry Smith       for ( i=1; i<n; i++ ) {
510de6a44a3SBarry Smith         v     = aa + 9*ai[i];
511de6a44a3SBarry Smith         vi    = aj + ai[i];
512de6a44a3SBarry Smith         nz    = a->diag[i] - ai[i];
513de6a44a3SBarry Smith         idx   = 3*(*r++);
514de6a44a3SBarry Smith         sum1  = b[idx]; sum2 = b[1+idx]; sum3 = b[2+idx];
515de6a44a3SBarry Smith         while (nz--) {
516de6a44a3SBarry Smith           idx   = 3*(*vi++);
517de6a44a3SBarry Smith           x1    = tmp[idx]; x2 = tmp[1+idx]; x3 = tmp[2+idx];
518de6a44a3SBarry Smith           sum1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
519de6a44a3SBarry Smith           sum2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
520de6a44a3SBarry Smith           sum3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
521de6a44a3SBarry Smith           v += 9;
522de6a44a3SBarry Smith         }
523de6a44a3SBarry Smith         idx = 3*i;
524de6a44a3SBarry Smith         tmp[idx] = sum1; tmp[1+idx] = sum2; tmp[2+idx] = sum3;
525de6a44a3SBarry Smith       }
526de6a44a3SBarry Smith       /* backward solve the upper triangular */
527de6a44a3SBarry Smith       for ( i=n-1; i>=0; i-- ){
528de6a44a3SBarry Smith         v    = aa + 9*a->diag[i] + 9;
529de6a44a3SBarry Smith         vi   = aj + a->diag[i] + 1;
530de6a44a3SBarry Smith         nz   = ai[i+1] - a->diag[i] - 1;
531de6a44a3SBarry Smith         idt  = 3*i;
532de6a44a3SBarry Smith         sum1 = tmp[idt]; sum2 = tmp[1+idt]; sum3 = tmp[2+idt];
533de6a44a3SBarry Smith         while (nz--) {
534de6a44a3SBarry Smith           idx   = 3*(*vi++);
535de6a44a3SBarry Smith           x1    = tmp[idx]; x2 = tmp[1+idx]; x3 = tmp[2+idx];
536de6a44a3SBarry Smith           sum1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
537de6a44a3SBarry Smith           sum2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
538de6a44a3SBarry Smith           sum3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
539de6a44a3SBarry Smith           v += 9;
540de6a44a3SBarry Smith         }
541de6a44a3SBarry Smith         idc = 3*(*c--);
542de6a44a3SBarry Smith         v   = aa + 9*a->diag[i];
543de6a44a3SBarry Smith         x[idc]   = tmp[idt]   = v[0]*sum1 + v[3]*sum2 + v[6]*sum3;
544de6a44a3SBarry Smith         x[1+idc] = tmp[1+idt] = v[1]*sum1 + v[4]*sum2 + v[7]*sum3;
545de6a44a3SBarry Smith         x[2+idc] = tmp[2+idt] = v[2]*sum1 + v[5]*sum2 + v[8]*sum3;
546de6a44a3SBarry Smith       }
547de6a44a3SBarry Smith       break;
548de6a44a3SBarry Smith     case 4:
549de6a44a3SBarry Smith       /* forward solve the lower triangular */
550de6a44a3SBarry Smith       idx    = 4*(*r++);
551de6a44a3SBarry Smith       tmp[0] = b[idx];   tmp[1] = b[1+idx];
552de6a44a3SBarry Smith       tmp[2] = b[2+idx]; tmp[3] = b[3+idx];
553de6a44a3SBarry Smith       for ( i=1; i<n; i++ ) {
554de6a44a3SBarry Smith         v     = aa + 16*ai[i];
555de6a44a3SBarry Smith         vi    = aj + ai[i];
556de6a44a3SBarry Smith         nz    = a->diag[i] - ai[i];
557de6a44a3SBarry Smith         idx   = 4*(*r++);
558de6a44a3SBarry Smith         sum1  = b[idx];sum2 = b[1+idx];sum3 = b[2+idx];sum4 = b[3+idx];
559de6a44a3SBarry Smith         while (nz--) {
560de6a44a3SBarry Smith           idx   = 4*(*vi++);
561de6a44a3SBarry Smith           x1    = tmp[idx];x2 = tmp[1+idx];x3 = tmp[2+idx];x4 = tmp[3+idx];
562de6a44a3SBarry Smith           sum1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
563de6a44a3SBarry Smith           sum2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
564de6a44a3SBarry Smith           sum3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
565de6a44a3SBarry Smith           sum4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
566de6a44a3SBarry Smith           v += 16;
567de6a44a3SBarry Smith         }
568de6a44a3SBarry Smith         idx = 4*i;
569de6a44a3SBarry Smith         tmp[idx]   = sum1;tmp[1+idx] = sum2;
570de6a44a3SBarry Smith         tmp[2+idx] = sum3;tmp[3+idx] = sum4;
571de6a44a3SBarry Smith       }
572de6a44a3SBarry Smith       /* backward solve the upper triangular */
573de6a44a3SBarry Smith       for ( i=n-1; i>=0; i-- ){
574de6a44a3SBarry Smith         v    = aa + 16*a->diag[i] + 16;
575de6a44a3SBarry Smith         vi   = aj + a->diag[i] + 1;
576de6a44a3SBarry Smith         nz   = ai[i+1] - a->diag[i] - 1;
577de6a44a3SBarry Smith         idt  = 4*i;
578de6a44a3SBarry Smith         sum1 = tmp[idt];  sum2 = tmp[1+idt];
579de6a44a3SBarry Smith         sum3 = tmp[2+idt];sum4 = tmp[3+idt];
580de6a44a3SBarry Smith         while (nz--) {
581de6a44a3SBarry Smith           idx   = 4*(*vi++);
582de6a44a3SBarry Smith           x1    = tmp[idx];   x2 = tmp[1+idx];
583de6a44a3SBarry Smith           x3    = tmp[2+idx]; x4 = tmp[3+idx];
584de6a44a3SBarry Smith           sum1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
585de6a44a3SBarry Smith           sum2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
586de6a44a3SBarry Smith           sum3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
587de6a44a3SBarry Smith           sum4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
588de6a44a3SBarry Smith           v += 16;
589de6a44a3SBarry Smith         }
590de6a44a3SBarry Smith         idc = 4*(*c--);
591de6a44a3SBarry Smith         v   = aa + 16*a->diag[i];
592de6a44a3SBarry Smith         x[idc]   = tmp[idt]   = v[0]*sum1+v[4]*sum2+v[8]*sum3+v[12]*sum4;
593de6a44a3SBarry Smith         x[1+idc] = tmp[1+idt] = v[1]*sum1+v[5]*sum2+v[9]*sum3+v[13]*sum4;
594de6a44a3SBarry Smith         x[2+idc] = tmp[2+idt] = v[2]*sum1+v[6]*sum2+v[10]*sum3+v[14]*sum4;
595de6a44a3SBarry Smith         x[3+idc] = tmp[3+idt] = v[3]*sum1+v[7]*sum2+v[11]*sum3+v[15]*sum4;
596de6a44a3SBarry Smith       }
597de6a44a3SBarry Smith       break;
598de6a44a3SBarry Smith     case 5:
599de6a44a3SBarry Smith       /* forward solve the lower triangular */
600de6a44a3SBarry Smith       idx    = 5*(*r++);
601de6a44a3SBarry Smith       tmp[0] = b[idx];   tmp[1] = b[1+idx];
602de6a44a3SBarry Smith       tmp[2] = b[2+idx]; tmp[3] = b[3+idx]; tmp[4] = b[4+idx];
603de6a44a3SBarry Smith       for ( i=1; i<n; i++ ) {
604de6a44a3SBarry Smith         v     = aa + 25*ai[i];
605de6a44a3SBarry Smith         vi    = aj + ai[i];
606de6a44a3SBarry Smith         nz    = a->diag[i] - ai[i];
607de6a44a3SBarry Smith         idx   = 5*(*r++);
608de6a44a3SBarry Smith         sum1  = b[idx];sum2 = b[1+idx];sum3 = b[2+idx];sum4 = b[3+idx];
609de6a44a3SBarry Smith         sum5  = b[4+idx];
610de6a44a3SBarry Smith         while (nz--) {
611de6a44a3SBarry Smith           idx   = 5*(*vi++);
612de6a44a3SBarry Smith           x1    = tmp[idx];  x2 = tmp[1+idx];x3 = tmp[2+idx];
613de6a44a3SBarry Smith           x4    = tmp[3+idx];x5 = tmp[4+idx];
614de6a44a3SBarry Smith           sum1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
615de6a44a3SBarry Smith           sum2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
616de6a44a3SBarry Smith           sum3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
617de6a44a3SBarry Smith           sum4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
618de6a44a3SBarry Smith           sum5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
619de6a44a3SBarry Smith           v += 25;
620de6a44a3SBarry Smith         }
621de6a44a3SBarry Smith         idx = 5*i;
622de6a44a3SBarry Smith         tmp[idx]   = sum1;tmp[1+idx] = sum2;
623de6a44a3SBarry Smith         tmp[2+idx] = sum3;tmp[3+idx] = sum4; tmp[4+idx] = sum5;
624de6a44a3SBarry Smith       }
625de6a44a3SBarry Smith       /* backward solve the upper triangular */
626de6a44a3SBarry Smith       for ( i=n-1; i>=0; i-- ){
627de6a44a3SBarry Smith         v    = aa + 25*a->diag[i] + 25;
628de6a44a3SBarry Smith         vi   = aj + a->diag[i] + 1;
629de6a44a3SBarry Smith         nz   = ai[i+1] - a->diag[i] - 1;
630de6a44a3SBarry Smith         idt  = 5*i;
631de6a44a3SBarry Smith         sum1 = tmp[idt];  sum2 = tmp[1+idt];
632de6a44a3SBarry Smith         sum3 = tmp[2+idt];sum4 = tmp[3+idt]; sum5 = tmp[4+idt];
633de6a44a3SBarry Smith         while (nz--) {
634de6a44a3SBarry Smith           idx   = 5*(*vi++);
635de6a44a3SBarry Smith           x1    = tmp[idx];   x2 = tmp[1+idx];
636de6a44a3SBarry Smith           x3    = tmp[2+idx]; x4 = tmp[3+idx]; x5 = tmp[4+idx];
637de6a44a3SBarry Smith           sum1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
638de6a44a3SBarry Smith           sum2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
639de6a44a3SBarry Smith           sum3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
640de6a44a3SBarry Smith           sum4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
641de6a44a3SBarry Smith           sum5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
642de6a44a3SBarry Smith           v += 25;
643de6a44a3SBarry Smith         }
644de6a44a3SBarry Smith         idc = 5*(*c--);
645de6a44a3SBarry Smith         v   = aa + 25*a->diag[i];
646de6a44a3SBarry Smith         x[idc]   = tmp[idt]   = v[0]*sum1+v[5]*sum2+v[10]*sum3+
647de6a44a3SBarry Smith                                 v[15]*sum4+v[20]*sum5;
648de6a44a3SBarry Smith         x[1+idc] = tmp[1+idt] = v[1]*sum1+v[6]*sum2+v[11]*sum3+
649de6a44a3SBarry Smith                                 v[16]*sum4+v[21]*sum5;
650de6a44a3SBarry Smith         x[2+idc] = tmp[2+idt] = v[2]*sum1+v[7]*sum2+v[12]*sum3+
651de6a44a3SBarry Smith                                 v[17]*sum4+v[22]*sum5;
652de6a44a3SBarry Smith         x[3+idc] = tmp[3+idt] = v[3]*sum1+v[8]*sum2+v[13]*sum3+
653de6a44a3SBarry Smith                                 v[18]*sum4+v[23]*sum5;
654de6a44a3SBarry Smith         x[4+idc] = tmp[4+idt] = v[4]*sum1+v[9]*sum2+v[14]*sum3+
655de6a44a3SBarry Smith                                 v[19]*sum4+v[24]*sum5;
656de6a44a3SBarry Smith       }
657de6a44a3SBarry Smith       break;
658de6a44a3SBarry Smith     default: {
659de6a44a3SBarry Smith       /* forward solve the lower triangular */
660de6a44a3SBarry Smith       PetscMemcpy(tmp,b + bs*(*r++), bs*sizeof(Scalar));
661de6a44a3SBarry Smith       for ( i=1; i<n; i++ ) {
662de6a44a3SBarry Smith         v   = aa + bs2*ai[i];
663de6a44a3SBarry Smith         vi  = aj + ai[i];
664de6a44a3SBarry Smith         nz  = a->diag[i] - ai[i];
665de6a44a3SBarry Smith         sum = tmp + bs*i;
666de6a44a3SBarry Smith         PetscMemcpy(sum,b+bs*(*r++),bs*sizeof(Scalar));
667de6a44a3SBarry Smith         while (nz--) {
668de6a44a3SBarry Smith           LAgemv_("N",&bs,&bs,&_DMOne,v,&bs,tmp+bs*(*vi++),&_One,&_DOne,sum,&_One);
669de6a44a3SBarry Smith           v += bs2;
670de6a44a3SBarry Smith         }
671de6a44a3SBarry Smith       }
672de6a44a3SBarry Smith       /* backward solve the upper triangular */
673de6a44a3SBarry Smith       lsum = a->solve_work + a->n;
674de6a44a3SBarry Smith       for ( i=n-1; i>=0; i-- ){
675de6a44a3SBarry Smith         v   = aa + bs2*(a->diag[i] + 1);
676de6a44a3SBarry Smith         vi  = aj + a->diag[i] + 1;
677de6a44a3SBarry Smith         nz  = ai[i+1] - a->diag[i] - 1;
678de6a44a3SBarry Smith         PetscMemcpy(lsum,tmp+i*bs,bs*sizeof(Scalar));
679de6a44a3SBarry Smith         while (nz--) {
680de6a44a3SBarry Smith           LAgemv_("N",&bs,&bs,&_DMOne,v,&bs,tmp+bs*(*vi++),&_One,&_DOne,lsum,&_One);
681de6a44a3SBarry Smith           v += bs2;
682de6a44a3SBarry Smith         }
683de6a44a3SBarry Smith         LAgemv_("N",&bs,&bs,&_DOne,aa+bs2*a->diag[i],&bs,lsum,&_One,&_DZero,
684de6a44a3SBarry Smith                 tmp+i*bs,&_One);
685de6a44a3SBarry Smith         PetscMemcpy(x + bs*(*c--),tmp+i*bs,bs*sizeof(Scalar));
686de6a44a3SBarry Smith       }
687de6a44a3SBarry Smith     }
6885d517e7eSBarry Smith   }
6895d517e7eSBarry Smith 
6905d517e7eSBarry Smith   ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr);
6915d517e7eSBarry Smith   ierr = ISRestoreIndices(iscol,&c); CHKERRQ(ierr);
692de6a44a3SBarry Smith   PLogFlops(2*bs2*a->nz - a->n);
6935d517e7eSBarry Smith   return 0;
6945d517e7eSBarry Smith }
6955d517e7eSBarry Smith 
6965d517e7eSBarry Smith /* ----------------------------------------------------------------*/
697ec3a8b7bSBarry Smith /*
698ec3a8b7bSBarry Smith      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
699ec3a8b7bSBarry Smith    except that the data structure of Mat_SeqAIJ is slightly different.
700ec3a8b7bSBarry Smith    Not a good example of code reuse.
701ec3a8b7bSBarry Smith    */
702ec3a8b7bSBarry Smith int MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,double f,int levels,
703ec3a8b7bSBarry Smith                                  Mat *fact)
7045d517e7eSBarry Smith {
705ec3a8b7bSBarry Smith   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data, *b;
7065d517e7eSBarry Smith   IS          isicol;
707ec3a8b7bSBarry Smith   int         *r,*ic, ierr, prow, n = a->mbs, *ai = a->i, *aj = a->j;
7085d517e7eSBarry Smith   int         *ainew,*ajnew, jmax,*fill, *xi, nz, *im,*ajfill,*flev;
7095d517e7eSBarry Smith   int         *dloc, idx, row,m,fm, nzf, nzi,len,  realloc = 0;
710de6a44a3SBarry Smith   int         incrlev,nnz,i,bs = a->bs;
7115d517e7eSBarry Smith 
712ec3a8b7bSBarry Smith   if (a->m != a->n) SETERRQ(1,"MatILUFactorSymbolic_SeqBAIJ:Matrix must be square");
713ec3a8b7bSBarry Smith   if (!isrow) SETERRQ(1,"MatILUFactorSymbolic_SeqBAIJ:Must have row permutation");
714ec3a8b7bSBarry Smith   if (!iscol) SETERRQ(1,"MatILUFactorSymbolic_SeqBAIJ:Must have column permutation");
7155d517e7eSBarry Smith 
7165d517e7eSBarry Smith   /* special case that simply copies fill pattern */
7175d517e7eSBarry Smith   if (levels == 0 && ISIsIdentity(isrow) && ISIsIdentity(iscol)) {
718ec3a8b7bSBarry Smith     ierr = MatConvertSameType_SeqBAIJ(A,fact,DO_NOT_COPY_VALUES); CHKERRQ(ierr);
7195d517e7eSBarry Smith     (*fact)->factor = FACTOR_LU;
720ec3a8b7bSBarry Smith     b               = (Mat_SeqBAIJ *) (*fact)->data;
7215d517e7eSBarry Smith     if (!b->diag) {
722ec3a8b7bSBarry Smith       ierr = MatMarkDiag_SeqBAIJ(*fact); CHKERRQ(ierr);
7235d517e7eSBarry Smith     }
7245d517e7eSBarry Smith     b->row        = isrow;
7255d517e7eSBarry Smith     b->col        = iscol;
726de6a44a3SBarry Smith     b->solve_work = (Scalar *) PetscMalloc((b->m+1+b->bs)*sizeof(Scalar));CHKPTRQ(b->solve_work);
7275d517e7eSBarry Smith     return 0;
7285d517e7eSBarry Smith   }
7295d517e7eSBarry Smith 
7305d517e7eSBarry Smith   ierr = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr);
7315d517e7eSBarry Smith   ierr = ISGetIndices(isrow,&r); CHKERRQ(ierr);
7325d517e7eSBarry Smith   ierr = ISGetIndices(isicol,&ic); CHKERRQ(ierr);
7335d517e7eSBarry Smith 
7345d517e7eSBarry Smith   /* get new row pointers */
7355d517e7eSBarry Smith   ainew = (int *) PetscMalloc( (n+1)*sizeof(int) ); CHKPTRQ(ainew);
736de6a44a3SBarry Smith   ainew[0] = 0;
7375d517e7eSBarry Smith   /* don't know how many column pointers are needed so estimate */
738de6a44a3SBarry Smith   jmax = (int) (f*ai[n] + 1);
7395d517e7eSBarry Smith   ajnew = (int *) PetscMalloc( (jmax)*sizeof(int) ); CHKPTRQ(ajnew);
7405d517e7eSBarry Smith   /* ajfill is level of fill for each fill entry */
7415d517e7eSBarry Smith   ajfill = (int *) PetscMalloc( (jmax)*sizeof(int) ); CHKPTRQ(ajfill);
7425d517e7eSBarry Smith   /* fill is a linked list of nonzeros in active row */
7435d517e7eSBarry Smith   fill = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(fill);
7445d517e7eSBarry Smith   /* im is level for each filled value */
7455d517e7eSBarry Smith   im = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(im);
7465d517e7eSBarry Smith   /* dloc is location of diagonal in factor */
7475d517e7eSBarry Smith   dloc = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(dloc);
7485d517e7eSBarry Smith   dloc[0]  = 0;
7495d517e7eSBarry Smith   for ( prow=0; prow<n; prow++ ) {
7505d517e7eSBarry Smith     /* first copy previous fill into linked list */
7515d517e7eSBarry Smith     nzf     = nz  = ai[r[prow]+1] - ai[r[prow]];
752de6a44a3SBarry Smith     xi      = aj + ai[r[prow]];
7535d517e7eSBarry Smith     fill[n] = n;
7545d517e7eSBarry Smith     while (nz--) {
7555d517e7eSBarry Smith       fm  = n;
756de6a44a3SBarry Smith       idx = ic[*xi++];
7575d517e7eSBarry Smith       do {
7585d517e7eSBarry Smith         m  = fm;
7595d517e7eSBarry Smith         fm = fill[m];
7605d517e7eSBarry Smith       } while (fm < idx);
7615d517e7eSBarry Smith       fill[m]   = idx;
7625d517e7eSBarry Smith       fill[idx] = fm;
7635d517e7eSBarry Smith       im[idx]   = 0;
7645d517e7eSBarry Smith     }
7655d517e7eSBarry Smith     nzi = 0;
7665d517e7eSBarry Smith     row = fill[n];
7675d517e7eSBarry Smith     while ( row < prow ) {
7685d517e7eSBarry Smith       incrlev = im[row] + 1;
7695d517e7eSBarry Smith       nz      = dloc[row];
770de6a44a3SBarry Smith       xi      = ajnew  + ainew[row] + nz;
771de6a44a3SBarry Smith       flev    = ajfill + ainew[row] + nz + 1;
7725d517e7eSBarry Smith       nnz     = ainew[row+1] - ainew[row] - nz - 1;
773de6a44a3SBarry Smith       if (*xi++ != row) {
774ec3a8b7bSBarry Smith         SETERRQ(1,"MatILUFactorSymbolic_SeqBAIJ:zero pivot");
7755d517e7eSBarry Smith       }
7765d517e7eSBarry Smith       fm      = row;
7775d517e7eSBarry Smith       while (nnz-- > 0) {
778de6a44a3SBarry Smith         idx = *xi++;
7795d517e7eSBarry Smith         if (*flev + incrlev > levels) {
7805d517e7eSBarry Smith           flev++;
7815d517e7eSBarry Smith           continue;
7825d517e7eSBarry Smith         }
7835d517e7eSBarry Smith         do {
7845d517e7eSBarry Smith           m  = fm;
7855d517e7eSBarry Smith           fm = fill[m];
7865d517e7eSBarry Smith         } while (fm < idx);
7875d517e7eSBarry Smith         if (fm != idx) {
7885d517e7eSBarry Smith           im[idx]   = *flev + incrlev;
7895d517e7eSBarry Smith           fill[m]   = idx;
7905d517e7eSBarry Smith           fill[idx] = fm;
7915d517e7eSBarry Smith           fm        = idx;
7925d517e7eSBarry Smith           nzf++;
7935d517e7eSBarry Smith         }
7945d517e7eSBarry Smith         else {
7955d517e7eSBarry Smith           if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
7965d517e7eSBarry Smith         }
7975d517e7eSBarry Smith         flev++;
7985d517e7eSBarry Smith       }
7995d517e7eSBarry Smith       row = fill[row];
8005d517e7eSBarry Smith       nzi++;
8015d517e7eSBarry Smith     }
8025d517e7eSBarry Smith     /* copy new filled row into permanent storage */
8035d517e7eSBarry Smith     ainew[prow+1] = ainew[prow] + nzf;
804de6a44a3SBarry Smith     if (ainew[prow+1] > jmax) {
8055d517e7eSBarry Smith       /* allocate a longer ajnew */
8065d517e7eSBarry Smith       int maxadd;
807de6a44a3SBarry Smith       maxadd = (int) (((f*ai[n]+1)*(n-prow+5))/n);
8085d517e7eSBarry Smith       if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
8095d517e7eSBarry Smith       jmax += maxadd;
8105d517e7eSBarry Smith       xi = (int *) PetscMalloc( jmax*sizeof(int) );CHKPTRQ(xi);
811de6a44a3SBarry Smith       PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));
8125d517e7eSBarry Smith       PetscFree(ajnew);
8135d517e7eSBarry Smith       ajnew = xi;
8145d517e7eSBarry Smith       /* allocate a longer ajfill */
8155d517e7eSBarry Smith       xi = (int *) PetscMalloc( jmax*sizeof(int) );CHKPTRQ(xi);
816de6a44a3SBarry Smith       PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));
8175d517e7eSBarry Smith       PetscFree(ajfill);
8185d517e7eSBarry Smith       ajfill = xi;
8195d517e7eSBarry Smith       realloc++;
8205d517e7eSBarry Smith     }
821de6a44a3SBarry Smith     xi          = ajnew + ainew[prow];
822de6a44a3SBarry Smith     flev        = ajfill + ainew[prow];
8235d517e7eSBarry Smith     dloc[prow]  = nzi;
8245d517e7eSBarry Smith     fm          = fill[n];
8255d517e7eSBarry Smith     while (nzf--) {
826de6a44a3SBarry Smith       *xi++   = fm;
8275d517e7eSBarry Smith       *flev++ = im[fm];
8285d517e7eSBarry Smith       fm      = fill[fm];
8295d517e7eSBarry Smith     }
8305d517e7eSBarry Smith   }
8315d517e7eSBarry Smith   PetscFree(ajfill);
8325d517e7eSBarry Smith   ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr);
8335d517e7eSBarry Smith   ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr);
8345d517e7eSBarry Smith   ierr = ISDestroy(isicol); CHKERRQ(ierr);
8355d517e7eSBarry Smith   PetscFree(fill); PetscFree(im);
8365d517e7eSBarry Smith 
8375d517e7eSBarry Smith   PLogInfo((PetscObject)A,
838ec3a8b7bSBarry Smith     "Info:MatILUFactorSymbolic_SeqBAIJ:Realloc %d Fill ratio:given %g needed %g\n",
8395d517e7eSBarry Smith                              realloc,f,((double)ainew[n])/((double)ai[prow]));
8405d517e7eSBarry Smith 
8415d517e7eSBarry Smith   /* put together the new matrix */
842ec3a8b7bSBarry Smith   ierr = MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,fact);CHKERRQ(ierr);
843ec3a8b7bSBarry Smith   b = (Mat_SeqBAIJ *) (*fact)->data;
8445d517e7eSBarry Smith   PetscFree(b->imax);
8455d517e7eSBarry Smith   b->singlemalloc = 0;
846de6a44a3SBarry Smith   len = bs*bs*ainew[n]*sizeof(Scalar);
8475d517e7eSBarry Smith   /* the next line frees the default space generated by the Create() */
8485d517e7eSBarry Smith   PetscFree(b->a); PetscFree(b->ilen);
8495d517e7eSBarry Smith   b->a          = (Scalar *) PetscMalloc( len ); CHKPTRQ(b->a);
8505d517e7eSBarry Smith   b->j          = ajnew;
8515d517e7eSBarry Smith   b->i          = ainew;
8525d517e7eSBarry Smith   for ( i=0; i<n; i++ ) dloc[i] += ainew[i];
8535d517e7eSBarry Smith   b->diag       = dloc;
8545d517e7eSBarry Smith   b->ilen       = 0;
8555d517e7eSBarry Smith   b->imax       = 0;
8565d517e7eSBarry Smith   b->row        = isrow;
8575d517e7eSBarry Smith   b->col        = iscol;
858de6a44a3SBarry Smith   b->solve_work = (Scalar *) PetscMalloc( (bs*n+bs)*sizeof(Scalar));
8595d517e7eSBarry Smith   CHKPTRQ(b->solve_work);
8605d517e7eSBarry Smith   /* In b structure:  Free imax, ilen, old a, old j.
8615d517e7eSBarry Smith      Allocate dloc, solve_work, new a, new j */
862de6a44a3SBarry Smith   PLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs*bs*ainew[n]*sizeof(Scalar));
863de6a44a3SBarry Smith   b->maxnz          = b->nz = ainew[n];
8645d517e7eSBarry Smith   (*fact)->factor   = FACTOR_LU;
8655d517e7eSBarry Smith   return 0;
8665d517e7eSBarry Smith }
8675d517e7eSBarry Smith 
8685d517e7eSBarry Smith 
8695d517e7eSBarry Smith 
8705d517e7eSBarry Smith 
871