xref: /petsc/src/mat/impls/baij/seq/baijfact.c (revision bbf0e169f9fbcb77d596644d33cfb942ebffc0f0)
1 #ifndef lint
2 static char vcid[] = "$Id: baijfact.c,v 1.29 1996/08/22 20:07:06 curfman Exp bsmith $";
3 #endif
4 /*
5     Factorization code for BAIJ format.
6 */
7 
8 #include "src/mat/impls/baij/seq/baij.h"
9 #include "src/vec/vecimpl.h"
10 #include "src/inline/ilu.h"
11 
12 
13 /*
14     The symbolic factorization code is identical to that for AIJ format,
15   except for very small changes since this is now a SeqBAIJ datastructure.
16   NOT good code reuse.
17 */
18 int MatLUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,double f,Mat *B)
19 {
20   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data, *b;
21   IS          isicol;
22   int         *r,*ic, ierr, i, n = a->mbs, *ai = a->i, *aj = a->j;
23   int         *ainew,*ajnew, jmax,*fill, *ajtmp, nz, bs = a->bs, bs2=a->bs2;
24   int         *idnew, idx, row,m,fm, nnz, nzi,len, realloc = 0,nzbd,*im;
25 
26   PetscValidHeaderSpecific(isrow,IS_COOKIE);
27   PetscValidHeaderSpecific(iscol,IS_COOKIE);
28   ierr = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr);
29   ISGetIndices(isrow,&r); ISGetIndices(isicol,&ic);
30 
31   /* get new row pointers */
32   ainew = (int *) PetscMalloc( (n+1)*sizeof(int) ); CHKPTRQ(ainew);
33   ainew[0] = 0;
34   /* don't know how many column pointers are needed so estimate */
35   jmax = (int) (f*ai[n] + 1);
36   ajnew = (int *) PetscMalloc( (jmax)*sizeof(int) ); CHKPTRQ(ajnew);
37   /* fill is a linked list of nonzeros in active row */
38   fill = (int *) PetscMalloc( (2*n+1)*sizeof(int)); CHKPTRQ(fill);
39   im = fill + n + 1;
40   /* idnew is location of diagonal in factor */
41   idnew = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(idnew);
42   idnew[0] = 0;
43 
44   for ( i=0; i<n; i++ ) {
45     /* first copy previous fill into linked list */
46     nnz     = nz    = ai[r[i]+1] - ai[r[i]];
47     ajtmp   = aj + ai[r[i]];
48     fill[n] = n;
49     while (nz--) {
50       fm  = n;
51       idx = ic[*ajtmp++];
52       do {
53         m  = fm;
54         fm = fill[m];
55       } while (fm < idx);
56       fill[m]   = idx;
57       fill[idx] = fm;
58     }
59     row = fill[n];
60     while ( row < i ) {
61       ajtmp = ajnew + idnew[row] + 1;
62       nzbd  = 1 + idnew[row] - ainew[row];
63       nz    = im[row] - nzbd;
64       fm    = row;
65       while (nz-- > 0) {
66         idx = *ajtmp++;
67         nzbd++;
68         if (idx == i) im[row] = nzbd;
69         do {
70           m  = fm;
71           fm = fill[m];
72         } while (fm < idx);
73         if (fm != idx) {
74           fill[m]   = idx;
75           fill[idx] = fm;
76           fm        = idx;
77           nnz++;
78         }
79       }
80       row = fill[row];
81     }
82     /* copy new filled row into permanent storage */
83     ainew[i+1] = ainew[i] + nnz;
84     if (ainew[i+1] > jmax) {
85       /* allocate a longer ajnew */
86       int maxadd;
87       maxadd = (int) ((f*(ai[n]+1)*(n-i+5))/n);
88       if (maxadd < nnz) maxadd = (n-i)*(nnz+1);
89       jmax += maxadd;
90       ajtmp = (int *) PetscMalloc( jmax*sizeof(int) );CHKPTRQ(ajtmp);
91       PetscMemcpy(ajtmp,ajnew,ainew[i]*sizeof(int));
92       PetscFree(ajnew);
93       ajnew = ajtmp;
94       realloc++; /* count how many times we realloc */
95     }
96     ajtmp = ajnew + ainew[i];
97     fm    = fill[n];
98     nzi   = 0;
99     im[i] = nnz;
100     while (nnz--) {
101       if (fm < i) nzi++;
102       *ajtmp++ = fm;
103       fm       = fill[fm];
104     }
105     idnew[i] = ainew[i] + nzi;
106   }
107 
108   PLogInfo(A,
109     "Info:MatLUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",
110                              realloc,f,((double)ainew[n])/((double)ai[i]));
111 
112   ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr);
113   ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr);
114 
115   PetscFree(fill);
116 
117   /* put together the new matrix */
118   ierr = MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,B); CHKERRQ(ierr);
119   PLogObjectParent(*B,isicol);
120   ierr = ISDestroy(isicol); CHKERRQ(ierr);
121   b = (Mat_SeqBAIJ *) (*B)->data;
122   PetscFree(b->imax);
123   b->singlemalloc = 0;
124   len             = ainew[n]*sizeof(Scalar);
125   /* the next line frees the default space generated by the Create() */
126   PetscFree(b->a); PetscFree(b->ilen);
127   b->a          = (Scalar *) PetscMalloc( len*bs2 ); CHKPTRQ(b->a);
128   b->j          = ajnew;
129   b->i          = ainew;
130   b->diag       = idnew;
131   b->ilen       = 0;
132   b->imax       = 0;
133   b->row        = isrow;
134   b->col        = iscol;
135   b->solve_work = (Scalar *) PetscMalloc( (bs*n+bs)*sizeof(Scalar));
136   CHKPTRQ(b->solve_work);
137   /* In b structure:  Free imax, ilen, old a, old j.
138      Allocate idnew, solve_work, new a, new j */
139   PLogObjectMemory(*B,(ainew[n]-n)*(sizeof(int)+sizeof(Scalar)));
140   b->maxnz = b->nz = ainew[n];
141 
142   (*B)->info.factor_mallocs    = realloc;
143   (*B)->info.fill_ratio_given  = f;
144   (*B)->info.fill_ratio_needed = ((double)ainew[n])/((double)ai[i]);
145 
146   return 0;
147 }
148 
149 /* ----------------------------------------------------------- */
150 int MatLUFactorNumeric_SeqBAIJ_N(Mat A,Mat *B)
151 {
152   Mat             C = *B;
153   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data,*b = (Mat_SeqBAIJ *)C->data;
154   IS              iscol = b->col, isrow = b->row, isicol;
155   int             *r,*ic, ierr, i, j, n = a->mbs, *bi = b->i, *bj = b->j;
156   int             *ajtmpold, *ajtmp, nz, row, bslog,*ai=a->i,*aj=a->j;
157   int             *diag_offset=b->diag,diag,bs=a->bs,bs2 = bs*bs,*v_pivots;
158   Scalar          *ba = b->a,*aa = a->a;
159   register Scalar *pv,*v,*rtmp,*multiplier,*v_work,*pc,*w;
160   register int    *pj;
161 
162   ierr  = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr);
163   PLogObjectParent(*B,isicol);
164   ierr  = ISGetIndices(isrow,&r); CHKERRQ(ierr);
165   ierr  = ISGetIndices(isicol,&ic); CHKERRQ(ierr);
166   rtmp  = (Scalar *) PetscMalloc(bs2*(n+1)*sizeof(Scalar));CHKPTRQ(rtmp);
167   PetscMemzero(rtmp,bs2*(n+1)*sizeof(Scalar));
168   /* generate work space needed by dense LU factorization */
169   v_work     = (Scalar *) PetscMalloc(bs*sizeof(int) + (bs+bs2)*sizeof(Scalar));
170                CHKPTRQ(v_work);
171   multiplier = v_work + bs;
172   v_pivots   = (int *) (multiplier + bs2);
173 
174   /* flops in while loop */
175   bslog = 2*bs*bs2;
176 
177   for ( i=0; i<n; i++ ) {
178     nz    = bi[i+1] - bi[i];
179     ajtmp = bj + bi[i];
180     for  ( j=0; j<nz; j++ ) {
181       PetscMemzero(rtmp+bs2*ajtmp[j],bs2*sizeof(Scalar));
182     }
183     /* load in initial (unfactored row) */
184     nz       = ai[r[i]+1] - ai[r[i]];
185     ajtmpold = aj + ai[r[i]];
186     v        = aa + bs2*ai[r[i]];
187     for ( j=0; j<nz; j++ ) {
188       PetscMemcpy(rtmp+bs2*ic[ajtmpold[j]],v+bs2*j,bs2*sizeof(Scalar));
189     }
190     row = *ajtmp++;
191     while (row < i) {
192       pc = rtmp + bs2*row;
193 /*      if (*pc) { */
194         pv = ba + bs2*diag_offset[row];
195         pj = bj + diag_offset[row] + 1;
196         Kernel_A_gets_A_times_B(bs,pc,pv,multiplier);
197         nz = bi[row+1] - diag_offset[row] - 1;
198         pv += bs2;
199         for (j=0; j<nz; j++) {
200           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
201         }
202         PLogFlops(bslog*(nz+1)-bs);
203 /*      } */
204         row = *ajtmp++;
205     }
206     /* finished row so stick it into b->a */
207     pv = ba + bs2*bi[i];
208     pj = bj + bi[i];
209     nz = bi[i+1] - bi[i];
210     for ( j=0; j<nz; j++ ) {
211       PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(Scalar));
212     }
213     diag = diag_offset[i] - bi[i];
214     /* invert diagonal block */
215     w = pv + bs2*diag;
216     Kernel_A_gets_inverse_A(bs,w,v_pivots,v_work);
217   }
218 
219   PetscFree(rtmp); PetscFree(v_work);
220   ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr);
221   ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr);
222   ierr = ISDestroy(isicol); CHKERRQ(ierr);
223   C->factor = FACTOR_LU;
224   C->assembled = PETSC_TRUE;
225   PLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
226   return 0;
227 }
228 /* ------------------------------------------------------------*/
229 /*
230       Version for when blocks are 5 by 5
231 */
232 int MatLUFactorNumeric_SeqBAIJ_5(Mat A,Mat *B)
233 {
234   Mat             C = *B;
235   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data,*b = (Mat_SeqBAIJ *)C->data;
236   IS              iscol = b->col, isrow = b->row, isicol;
237   int             *r,*ic, ierr, i, j, n = a->mbs, *bi = b->i, *bj = b->j;
238   int             *ajtmpold, *ajtmp, nz, row, v_pivots[5];
239   int             *diag_offset = b->diag,bs = 5,idx,*ai=a->i,*aj=a->j;
240   register Scalar *pv,*v,*rtmp,*pc,*w,*x;
241   Scalar          p1,p2,p3,p4,v_work[5],m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
242   Scalar          p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
243   Scalar          x17,x18,x19,x20,x21,x22,x23,x24,x25,p10,p11,p12,p13,p14;
244   Scalar          p15,p16,p17,p18,p19,p20,p21,p22,p23,p24,p25,m10,m11,m12;
245   Scalar          m13,m14,m15,m16,m17,m18,m19,m20,m21,m22,m23,m24,m25;
246   Scalar          *ba = b->a,*aa = a->a;
247   register int    *pj;
248 
249   ierr  = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr);
250   PLogObjectParent(*B,isicol);
251   ierr  = ISGetIndices(isrow,&r); CHKERRQ(ierr);
252   ierr  = ISGetIndices(isicol,&ic); CHKERRQ(ierr);
253   rtmp  = (Scalar *) PetscMalloc(25*(n+1)*sizeof(Scalar));CHKPTRQ(rtmp);
254 
255   for ( i=0; i<n; i++ ) {
256     nz    = bi[i+1] - bi[i];
257     ajtmp = bj + bi[i];
258     for  ( j=0; j<nz; j++ ) {
259       x = rtmp+25*ajtmp[j];
260       x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
261       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = x[16] = x[17] = 0.0;
262       x[18] = x[19] = x[20] = x[21] = x[22] = x[23] = x[24] = 0.0;
263     }
264     /* load in initial (unfactored row) */
265     idx      = r[i];
266     nz       = ai[idx+1] - ai[idx];
267     ajtmpold = aj + ai[idx];
268     v        = aa + 25*ai[idx];
269     for ( j=0; j<nz; j++ ) {
270       x    = rtmp+25*ic[ajtmpold[j]];
271       x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3];
272       x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8];
273       x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
274       x[14] = v[14]; x[15] = v[15]; x[16] = v[16]; x[17] = v[17];
275       x[18] = v[18]; x[19] = v[19]; x[20] = v[20]; x[21] = v[21];
276       x[22] = v[22]; x[23] = v[23]; x[24] = v[24];
277       v    += 25;
278     }
279     row = *ajtmp++;
280     while (row < i) {
281       pc = rtmp + 25*row;
282       p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3];
283       p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8];
284       p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
285       p15 = pc[14]; p16 = pc[15]; p17 = pc[16]; p18 = pc[17]; p19 = pc[18];
286       p20 = pc[19]; p21 = pc[20]; p22 = pc[21]; p23 = pc[22]; p24 = pc[23];
287       p25 = pc[24];
288       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
289           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
290           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
291           || p16 != 0.0 || p17 != 0.0 || p18 != 0.0 || p19 != 0.0 ||
292           p20 != 0.0 || p21 != 0.0 || p22 != 0.0 || p23 != 0.0 ||
293           p24 != 0.0 || p25 != 0.0) {
294         pv = ba + 25*diag_offset[row];
295         pj = bj + diag_offset[row] + 1;
296         x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
297         x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
298         x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
299         x15 = pv[14]; x16 = pv[15]; x17 = pv[16]; x18 = pv[17];
300         x19 = pv[18]; x20 = pv[19]; x21 = pv[20]; x22 = pv[21];
301         x23 = pv[22]; x24 = pv[23]; x25 = pv[24];
302         pc[0] = m1 = p1*x1 + p6*x2  + p11*x3 + p16*x4 + p21*x5;
303         pc[1] = m2 = p2*x1 + p7*x2  + p12*x3 + p17*x4 + p22*x5;
304         pc[2] = m3 = p3*x1 + p8*x2  + p13*x3 + p18*x4 + p23*x5;
305         pc[3] = m4 = p4*x1 + p9*x2  + p14*x3 + p19*x4 + p24*x5;
306         pc[4] = m5 = p5*x1 + p10*x2 + p15*x3 + p20*x4 + p25*x5;
307 
308         pc[5] = m6 = p1*x6 + p6*x7  + p11*x8 + p16*x9 + p21*x10;
309         pc[6] = m7 = p2*x6 + p7*x7  + p12*x8 + p17*x9 + p22*x10;
310         pc[7] = m8 = p3*x6 + p8*x7  + p13*x8 + p18*x9 + p23*x10;
311         pc[8] = m9 = p4*x6 + p9*x7  + p14*x8 + p19*x9 + p24*x10;
312         pc[9] = m10 = p5*x6 + p10*x7 + p15*x8 + p20*x9 + p25*x10;
313 
314         pc[10] = m11 = p1*x11 + p6*x12  + p11*x13 + p16*x14 + p21*x15;
315         pc[11] = m12 = p2*x11 + p7*x12  + p12*x13 + p17*x14 + p22*x15;
316         pc[12] = m13 = p3*x11 + p8*x12  + p13*x13 + p18*x14 + p23*x15;
317         pc[13] = m14 = p4*x11 + p9*x12  + p14*x13 + p19*x14 + p24*x15;
318         pc[14] = m15 = p5*x11 + p10*x12 + p15*x13 + p20*x14 + p25*x15;
319 
320         pc[15] = m16 = p1*x16 + p6*x17  + p11*x18 + p16*x19 + p21*x20;
321         pc[16] = m17 = p2*x16 + p7*x17  + p12*x18 + p17*x19 + p22*x20;
322         pc[17] = m18 = p3*x16 + p8*x17  + p13*x18 + p18*x19 + p23*x20;
323         pc[18] = m19 = p4*x16 + p9*x17  + p14*x18 + p19*x19 + p24*x20;
324         pc[19] = m20 = p5*x16 + p10*x17 + p15*x18 + p20*x19 + p25*x20;
325 
326         pc[20] = m21 = p1*x21 + p6*x22  + p11*x23 + p16*x24 + p21*x25;
327         pc[21] = m22 = p2*x21 + p7*x22  + p12*x23 + p17*x24 + p22*x25;
328         pc[22] = m23 = p3*x21 + p8*x22  + p13*x23 + p18*x24 + p23*x25;
329         pc[23] = m24 = p4*x21 + p9*x22  + p14*x23 + p19*x24 + p24*x25;
330         pc[24] = m25 = p5*x21 + p10*x22 + p15*x23 + p20*x24 + p25*x25;
331 
332         nz = bi[row+1] - diag_offset[row] - 1;
333         pv += 25;
334         for (j=0; j<nz; j++) {
335           x1   = pv[0];  x2 = pv[1];   x3  = pv[2];  x4  = pv[3];
336           x5   = pv[4];  x6 = pv[5];   x7  = pv[6];  x8  = pv[7]; x9 = pv[8];
337           x10  = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
338           x14  = pv[13]; x15 = pv[14]; x16 = pv[15]; x17 = pv[16];
339           x18  = pv[17]; x19 = pv[18]; x20 = pv[19]; x21 = pv[20];
340           x22  = pv[21]; x23 = pv[22]; x24 = pv[23]; x25 = pv[24];
341           x    = rtmp + 25*pj[j];
342           x[0] -= m1*x1 + m6*x2  + m11*x3 + m16*x4 + m21*x5;
343           x[1] -= m2*x1 + m7*x2  + m12*x3 + m17*x4 + m22*x5;
344           x[2] -= m3*x1 + m8*x2  + m13*x3 + m18*x4 + m23*x5;
345           x[3] -= m4*x1 + m9*x2  + m14*x3 + m19*x4 + m24*x5;
346           x[4] -= m5*x1 + m10*x2 + m15*x3 + m20*x4 + m25*x5;
347 
348           x[5] -= m1*x6 + m6*x7  + m11*x8 + m16*x9 + m21*x10;
349           x[6] -= m2*x6 + m7*x7  + m12*x8 + m17*x9 + m22*x10;
350           x[7] -= m3*x6 + m8*x7  + m13*x8 + m18*x9 + m23*x10;
351           x[8] -= m4*x6 + m9*x7  + m14*x8 + m19*x9 + m24*x10;
352           x[9] -= m5*x6 + m10*x7 + m15*x8 + m20*x9 + m25*x10;
353 
354           x[10] -= m1*x11 + m6*x12  + m11*x13 + m16*x14 + m21*x15;
355           x[11] -= m2*x11 + m7*x12  + m12*x13 + m17*x14 + m22*x15;
356           x[12] -= m3*x11 + m8*x12  + m13*x13 + m18*x14 + m23*x15;
357           x[13] -= m4*x11 + m9*x12  + m14*x13 + m19*x14 + m24*x15;
358           x[14] -= m5*x11 + m10*x12 + m15*x13 + m20*x14 + m25*x15;
359 
360           x[15] -= m1*x16 + m6*x17  + m11*x18 + m16*x19 + m21*x20;
361           x[16] -= m2*x16 + m7*x17  + m12*x18 + m17*x19 + m22*x20;
362           x[17] -= m3*x16 + m8*x17  + m13*x18 + m18*x19 + m23*x20;
363           x[18] -= m4*x16 + m9*x17  + m14*x18 + m19*x19 + m24*x20;
364           x[19] -= m5*x16 + m10*x17 + m15*x18 + m20*x19 + m25*x20;
365 
366           x[20] -= m1*x21 + m6*x22  + m11*x23 + m16*x24 + m21*x25;
367           x[21] -= m2*x21 + m7*x22  + m12*x23 + m17*x24 + m22*x25;
368           x[22] -= m3*x21 + m8*x22  + m13*x23 + m18*x24 + m23*x25;
369           x[23] -= m4*x21 + m9*x22  + m14*x23 + m19*x24 + m24*x25;
370           x[24] -= m5*x21 + m10*x22 + m15*x23 + m20*x24 + m25*x25;
371 
372           pv   += 25;
373         }
374         PLogFlops(250*nz+225);
375       }
376       row = *ajtmp++;
377     }
378     /* finished row so stick it into b->a */
379     pv = ba + 25*bi[i];
380     pj = bj + bi[i];
381     nz = bi[i+1] - bi[i];
382     for ( j=0; j<nz; j++ ) {
383       x     = rtmp+25*pj[j];
384       pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3];
385       pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8];
386       pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
387       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; pv[16] = x[16];
388       pv[17] = x[17]; pv[18] = x[18]; pv[19] = x[19]; pv[20] = x[20];
389       pv[21] = x[21]; pv[22] = x[22]; pv[23] = x[23]; pv[24] = x[24];
390       pv   += 25;
391     }
392     /* invert diagonal block */
393     w = ba + 25*diag_offset[i];
394     Kernel_A_gets_inverse_A(bs,w,v_pivots,v_work);
395   }
396 
397   PetscFree(rtmp);
398   ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr);
399   ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr);
400   ierr = ISDestroy(isicol); CHKERRQ(ierr);
401   C->factor = FACTOR_LU;
402   C->assembled = PETSC_TRUE;
403   PLogFlops(1.3333*125*b->mbs); /* from inverting diagonal blocks */
404   return 0;
405 }
406 
407 /* ------------------------------------------------------------*/
408 /*
409       Version for when blocks are 4 by 4
410 */
411 int MatLUFactorNumeric_SeqBAIJ_4(Mat A,Mat *B)
412 {
413   Mat             C = *B;
414   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data,*b = (Mat_SeqBAIJ *)C->data;
415   IS              iscol = b->col, isrow = b->row, isicol;
416   int             *r,*ic, ierr, i, j, n = a->mbs, *bi = b->i, *bj = b->j;
417   int             *ajtmpold, *ajtmp, nz, row, v_pivots[4];
418   int             *diag_offset = b->diag,bs = 4,idx,*ai=a->i,*aj=a->j;
419   register Scalar *pv,*v,*rtmp,*pc,*w,*x;
420   Scalar          p1,p2,p3,p4,v_work[4],m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
421   Scalar          p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
422   Scalar          p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
423   Scalar          m13,m14,m15,m16;
424   Scalar          *ba = b->a,*aa = a->a;
425   register int    *pj;
426 
427   ierr  = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr);
428   PLogObjectParent(*B,isicol);
429   ierr  = ISGetIndices(isrow,&r); CHKERRQ(ierr);
430   ierr  = ISGetIndices(isicol,&ic); CHKERRQ(ierr);
431   rtmp  = (Scalar *) PetscMalloc(16*(n+1)*sizeof(Scalar));CHKPTRQ(rtmp);
432 
433   for ( i=0; i<n; i++ ) {
434     nz    = bi[i+1] - bi[i];
435     ajtmp = bj + bi[i];
436     for  ( j=0; j<nz; j++ ) {
437       x = rtmp+16*ajtmp[j];
438       x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
439       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
440     }
441     /* load in initial (unfactored row) */
442     idx      = r[i];
443     nz       = ai[idx+1] - ai[idx];
444     ajtmpold = aj + ai[idx];
445     v        = aa + 16*ai[idx];
446     for ( j=0; j<nz; j++ ) {
447       x    = rtmp+16*ic[ajtmpold[j]];
448       x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3];
449       x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8];
450       x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
451       x[14] = v[14]; x[15] = v[15];
452       v    += 16;
453     }
454     row = *ajtmp++;
455     while (row < i) {
456       pc = rtmp + 16*row;
457       p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3];
458       p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8];
459       p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
460       p15 = pc[14]; p16 = pc[15];
461       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
462           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
463           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
464           || p16 != 0.0) {
465         pv = ba + 16*diag_offset[row];
466         pj = bj + diag_offset[row] + 1;
467         x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
468         x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
469         x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
470         x15 = pv[14]; x16 = pv[15];
471         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
472         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
473         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
474         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;
475 
476         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
477         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
478         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
479         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;
480 
481         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
482         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
483         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
484         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;
485 
486         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
487         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
488         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
489         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
490 
491         nz = bi[row+1] - diag_offset[row] - 1;
492         pv += 16;
493         for (j=0; j<nz; j++) {
494           x1   = pv[0];  x2 = pv[1];   x3  = pv[2];  x4  = pv[3];
495           x5   = pv[4];  x6 = pv[5];   x7  = pv[6];  x8  = pv[7]; x9 = pv[8];
496           x10  = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
497           x14  = pv[13]; x15 = pv[14]; x16 = pv[15];
498           x    = rtmp + 16*pj[j];
499           x[0] -= m1*x1 + m5*x2  + m9*x3 + m13*x4;
500           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
501           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
502           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;
503 
504           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
505           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
506           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
507           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;
508 
509           x[8] -= m1*x9  + m5*x10  + m9*x11 + m13*x12;
510           x[9] -= m2*x9  + m6*x10  + m10*x11 + m14*x12;
511           x[10] -= m3*x9 + m7*x10  + m11*x11 + m15*x12;
512           x[11] -= m4*x9 + m8*x10  + m12*x11 + m16*x12;
513 
514           x[12] -= m1*x13 + m5*x14  + m9*x15 + m13*x16;
515           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
516           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
517           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;
518 
519           pv   += 16;
520         }
521         PLogFlops(128*nz+112);
522       }
523       row = *ajtmp++;
524     }
525     /* finished row so stick it into b->a */
526     pv = ba + 16*bi[i];
527     pj = bj + bi[i];
528     nz = bi[i+1] - bi[i];
529     for ( j=0; j<nz; j++ ) {
530       x     = rtmp+16*pj[j];
531       pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3];
532       pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8];
533       pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
534       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
535       pv   += 16;
536     }
537     /* invert diagonal block */
538     w = ba + 16*diag_offset[i];
539     Kernel_A_gets_inverse_A(bs,w,v_pivots,v_work);
540   }
541 
542   PetscFree(rtmp);
543   ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr);
544   ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr);
545   ierr = ISDestroy(isicol); CHKERRQ(ierr);
546   C->factor = FACTOR_LU;
547   C->assembled = PETSC_TRUE;
548   PLogFlops(1.3333*64*b->mbs); /* from inverting diagonal blocks */
549   return 0;
550 }
551 /* ------------------------------------------------------------*/
552 /*
553       Version for when blocks are 3 by 3
554 */
555 int MatLUFactorNumeric_SeqBAIJ_3(Mat A,Mat *B)
556 {
557   Mat             C = *B;
558   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data,*b = (Mat_SeqBAIJ *)C->data;
559   IS              iscol = b->col, isrow = b->row, isicol;
560   int             *r,*ic, ierr, i, j, n = a->mbs, *bi = b->i, *bj = b->j;
561   int             *ajtmpold, *ajtmp, nz, row, *ai=a->i,*aj=a->j;
562   int             *diag_offset = b->diag,idx;
563   register Scalar *pv,*v,*rtmp,*pc,*w,*x;
564   Scalar          p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
565   Scalar          p5,p6,p7,p8,p9,x5,x6,x7,x8,x9;
566   Scalar          *ba = b->a,*aa = a->a;
567   register int    *pj;
568 
569   ierr  = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr);
570   PLogObjectParent(*B,isicol);
571   ierr  = ISGetIndices(isrow,&r); CHKERRQ(ierr);
572   ierr  = ISGetIndices(isicol,&ic); CHKERRQ(ierr);
573   rtmp  = (Scalar *) PetscMalloc(9*(n+1)*sizeof(Scalar));CHKPTRQ(rtmp);
574 
575   for ( i=0; i<n; i++ ) {
576     nz    = bi[i+1] - bi[i];
577     ajtmp = bj + bi[i];
578     for  ( j=0; j<nz; j++ ) {
579       x = rtmp + 9*ajtmp[j];
580       x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
581     }
582     /* load in initial (unfactored row) */
583     idx      = r[i];
584     nz       = ai[idx+1] - ai[idx];
585     ajtmpold = aj + ai[idx];
586     v        = aa + 9*ai[idx];
587     for ( j=0; j<nz; j++ ) {
588       x    = rtmp + 9*ic[ajtmpold[j]];
589       x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3];
590       x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8];
591       v    += 9;
592     }
593     row = *ajtmp++;
594     while (row < i) {
595       pc = rtmp + 9*row;
596       p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3];
597       p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8];
598       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
599           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0) {
600         pv = ba + 9*diag_offset[row];
601         pj = bj + diag_offset[row] + 1;
602         x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
603         x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
604         pc[0] = m1 = p1*x1 + p4*x2 + p7*x3;
605         pc[1] = m2 = p2*x1 + p5*x2 + p8*x3;
606         pc[2] = m3 = p3*x1 + p6*x2 + p9*x3;
607 
608         pc[3] = m4 = p1*x4 + p4*x5 + p7*x6;
609         pc[4] = m5 = p2*x4 + p5*x5 + p8*x6;
610         pc[5] = m6 = p3*x4 + p6*x5 + p9*x6;
611 
612         pc[6] = m7 = p1*x7 + p4*x8 + p7*x9;
613         pc[7] = m8 = p2*x7 + p5*x8 + p8*x9;
614         pc[8] = m9 = p3*x7 + p6*x8 + p9*x9;
615         nz = bi[row+1] - diag_offset[row] - 1;
616         pv += 9;
617         for (j=0; j<nz; j++) {
618           x1   = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
619           x5   = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
620           x    = rtmp + 9*pj[j];
621           x[0] -= m1*x1 + m4*x2 + m7*x3;
622           x[1] -= m2*x1 + m5*x2 + m8*x3;
623           x[2] -= m3*x1 + m6*x2 + m9*x3;
624 
625           x[3] -= m1*x4 + m4*x5 + m7*x6;
626           x[4] -= m2*x4 + m5*x5 + m8*x6;
627           x[5] -= m3*x4 + m6*x5 + m9*x6;
628 
629           x[6] -= m1*x7 + m4*x8 + m7*x9;
630           x[7] -= m2*x7 + m5*x8 + m8*x9;
631           x[8] -= m3*x7 + m6*x8 + m9*x9;
632           pv   += 9;
633         }
634         PLogFlops(54*nz+36);
635       }
636       row = *ajtmp++;
637     }
638     /* finished row so stick it into b->a */
639     pv = ba + 9*bi[i];
640     pj = bj + bi[i];
641     nz = bi[i+1] - bi[i];
642     for ( j=0; j<nz; j++ ) {
643       x     = rtmp + 9*pj[j];
644       pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3];
645       pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8];
646       pv   += 9;
647     }
648     /* invert diagonal block */
649     w = ba + 9*diag_offset[i];
650     ierr = Kernel_A_gets_inverse_A_3(w); CHKERRQ(ierr);
651   }
652 
653   PetscFree(rtmp);
654   ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr);
655   ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr);
656   ierr = ISDestroy(isicol); CHKERRQ(ierr);
657   C->factor = FACTOR_LU;
658   C->assembled = PETSC_TRUE;
659   PLogFlops(1.3333*27*b->mbs); /* from inverting diagonal blocks */
660   return 0;
661 }
662 
663 /* ------------------------------------------------------------*/
664 /*
665       Version for when blocks are 2 by 2
666 */
667 int MatLUFactorNumeric_SeqBAIJ_2(Mat A,Mat *B)
668 {
669   Mat             C = *B;
670   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data,*b = (Mat_SeqBAIJ *)C->data;
671   IS              iscol = b->col, isrow = b->row, isicol;
672   int             *r,*ic, ierr, i, j, n = a->mbs, *bi = b->i, *bj = b->j;
673   int             *ajtmpold, *ajtmp, nz, row, v_pivots[2];
674   int             *diag_offset=b->diag,bs = 2,idx,*ai=a->i,*aj=a->j;
675   register Scalar *pv,*v,*rtmp,m1,m2,m3,m4,*pc,*w,*x,x1,x2,x3,x4;
676   Scalar          p1,p2,p3,p4,v_work[2];
677   Scalar          *ba = b->a,*aa = a->a;
678   register int    *pj;
679 
680   ierr  = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr);
681   PLogObjectParent(*B,isicol);
682   ierr  = ISGetIndices(isrow,&r); CHKERRQ(ierr);
683   ierr  = ISGetIndices(isicol,&ic); CHKERRQ(ierr);
684   rtmp  = (Scalar *) PetscMalloc(4*(n+1)*sizeof(Scalar));CHKPTRQ(rtmp);
685 
686   for ( i=0; i<n; i++ ) {
687     nz    = bi[i+1] - bi[i];
688     ajtmp = bj + bi[i];
689     for  ( j=0; j<nz; j++ ) {
690       x = rtmp+4*ajtmp[j]; x[0] = x[1] = x[2] = x[3] = 0.0;
691     }
692     /* load in initial (unfactored row) */
693     idx      = r[i];
694     nz       = ai[idx+1] - ai[idx];
695     ajtmpold = aj + ai[idx];
696     v        = aa + 4*ai[idx];
697     for ( j=0; j<nz; j++ ) {
698       x    = rtmp+4*ic[ajtmpold[j]];
699       x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3];
700       v    += 4;
701     }
702     row = *ajtmp++;
703     while (row < i) {
704       pc = rtmp + 4*row;
705       p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3];
706       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0) {
707         pv = ba + 4*diag_offset[row];
708         pj = bj + diag_offset[row] + 1;
709         x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
710         pc[0] = m1 = p1*x1 + p3*x2;
711         pc[1] = m2 = p2*x1 + p4*x2;
712         pc[2] = m3 = p1*x3 + p3*x4;
713         pc[3] = m4 = p2*x3 + p4*x4;
714         nz = bi[row+1] - diag_offset[row] - 1;
715         pv += 4;
716         for (j=0; j<nz; j++) {
717           x1   = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
718           x    = rtmp + 4*pj[j];
719           x[0] -= m1*x1 + m3*x2;
720           x[1] -= m2*x1 + m4*x2;
721           x[2] -= m1*x3 + m3*x4;
722           x[3] -= m2*x3 + m4*x4;
723           pv   += 4;
724         }
725         PLogFlops(16*nz+12);
726       }
727       row = *ajtmp++;
728     }
729     /* finished row so stick it into b->a */
730     pv = ba + 4*bi[i];
731     pj = bj + bi[i];
732     nz = bi[i+1] - bi[i];
733     for ( j=0; j<nz; j++ ) {
734       x     = rtmp+4*pj[j];
735       pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3];
736       pv   += 4;
737     }
738     /* invert diagonal block */
739     w = ba + 4*diag_offset[i];
740     Kernel_A_gets_inverse_A(bs,w,v_pivots,v_work);
741   }
742 
743   PetscFree(rtmp);
744   ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr);
745   ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr);
746   ierr = ISDestroy(isicol); CHKERRQ(ierr);
747   C->factor = FACTOR_LU;
748   C->assembled = PETSC_TRUE;
749   PLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
750   return 0;
751 }
752 
753 /* ----------------------------------------------------------- */
754 /*
755      Version for when blocks are 1 by 1.
756 */
757 int MatLUFactorNumeric_SeqBAIJ_1(Mat A,Mat *B)
758 {
759   Mat             C = *B;
760   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data, *b = (Mat_SeqBAIJ *)C->data;
761   IS              iscol = b->col, isrow = b->row, isicol;
762   int             *r,*ic, ierr, i, j, n = a->mbs, *bi = b->i, *bj = b->j;
763   int             *ajtmpold, *ajtmp, nz, row,*ai = a->i,*aj = a->j;
764   int             *diag_offset = b->diag,diag;
765   register Scalar *pv,*v,*rtmp,multiplier,*pc;
766   Scalar          *ba = b->a,*aa = a->a;
767   register int    *pj;
768 
769   ierr  = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr);
770   PLogObjectParent(*B,isicol);
771   ierr  = ISGetIndices(isrow,&r); CHKERRQ(ierr);
772   ierr  = ISGetIndices(isicol,&ic); CHKERRQ(ierr);
773   rtmp  = (Scalar *) PetscMalloc((n+1)*sizeof(Scalar));CHKPTRQ(rtmp);
774 
775   for ( i=0; i<n; i++ ) {
776     nz    = bi[i+1] - bi[i];
777     ajtmp = bj + bi[i];
778     for  ( j=0; j<nz; j++ ) rtmp[ajtmp[j]] = 0.0;
779 
780     /* load in initial (unfactored row) */
781     nz       = ai[r[i]+1] - ai[r[i]];
782     ajtmpold = aj + ai[r[i]];
783     v        = aa + ai[r[i]];
784     for ( j=0; j<nz; j++ ) rtmp[ic[ajtmpold[j]]] =  v[j];
785 
786     row = *ajtmp++;
787     while (row < i) {
788       pc = rtmp + row;
789       if (*pc != 0.0) {
790         pv         = ba + diag_offset[row];
791         pj         = bj + diag_offset[row] + 1;
792         multiplier = *pc * *pv++;
793         *pc        = multiplier;
794         nz         = bi[row+1] - diag_offset[row] - 1;
795         for (j=0; j<nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
796         PLogFlops(1+2*nz);
797       }
798       row = *ajtmp++;
799     }
800     /* finished row so stick it into b->a */
801     pv = ba + bi[i];
802     pj = bj + bi[i];
803     nz = bi[i+1] - bi[i];
804     for ( j=0; j<nz; j++ ) {pv[j] = rtmp[pj[j]];}
805     diag = diag_offset[i] - bi[i];
806     /* check pivot entry for current row */
807     if (pv[diag] == 0.0) {
808       SETERRQ(1,"MatLUFactorNumeric_SeqBAIJ_1:Zero pivot");
809     }
810     pv[diag] = 1.0/pv[diag];
811   }
812 
813   PetscFree(rtmp);
814   ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr);
815   ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr);
816   ierr = ISDestroy(isicol); CHKERRQ(ierr);
817   C->factor    = FACTOR_LU;
818   C->assembled = PETSC_TRUE;
819   PLogFlops(b->n);
820   return 0;
821 }
822 
823 /* ----------------------------------------------------------- */
824 int MatLUFactor_SeqBAIJ(Mat A,IS row,IS col,double f)
825 {
826   Mat_SeqBAIJ *mat = (Mat_SeqBAIJ *) A->data;
827   int         ierr;
828   Mat         C;
829 
830   ierr = MatLUFactorSymbolic_SeqBAIJ(A,row,col,f,&C); CHKERRQ(ierr);
831   ierr = MatLUFactorNumeric(A,&C); CHKERRQ(ierr);
832 
833   /* free all the data structures from mat */
834   PetscFree(mat->a);
835   if (!mat->singlemalloc) {PetscFree(mat->i); PetscFree(mat->j);}
836   if (mat->diag) PetscFree(mat->diag);
837   if (mat->ilen) PetscFree(mat->ilen);
838   if (mat->imax) PetscFree(mat->imax);
839   if (mat->solve_work) PetscFree(mat->solve_work);
840   if (mat->mult_work) PetscFree(mat->mult_work);
841   PetscFree(mat);
842 
843   PetscMemcpy(A,C,sizeof(struct _Mat));
844   PetscHeaderDestroy(C);
845   return 0;
846 }
847 /* ----------------------------------------------------------- */
848 int MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
849 {
850   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
851   IS              iscol=a->col,isrow=a->row;
852   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
853   int             nz,bs=a->bs,bs2=a->bs2,*rout,*cout;
854   Scalar          *aa=a->a,*sum;
855   register Scalar *x,*b,*lsum,*tmp,*v;
856 
857   VecGetArray_Fast(bb,b);
858   VecGetArray_Fast(xx,x);
859   tmp  = a->solve_work;
860 
861   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
862   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
863 
864   /* forward solve the lower triangular */
865   PetscMemcpy(tmp,b + bs*(*r++), bs*sizeof(Scalar));
866   for ( i=1; i<n; i++ ) {
867     v   = aa + bs2*ai[i];
868     vi  = aj + ai[i];
869     nz  = a->diag[i] - ai[i];
870     sum = tmp + bs*i;
871     PetscMemcpy(sum,b+bs*(*r++),bs*sizeof(Scalar));
872     while (nz--) {
873       Kernel_v_gets_v_minus_A_times_w(bs,sum,v,tmp+bs*(*vi++));
874       v += bs2;
875     }
876   }
877   /* backward solve the upper triangular */
878   lsum = a->solve_work + a->n;
879   for ( i=n-1; i>=0; i-- ){
880     v   = aa + bs2*(a->diag[i] + 1);
881     vi  = aj + a->diag[i] + 1;
882     nz  = ai[i+1] - a->diag[i] - 1;
883     PetscMemcpy(lsum,tmp+i*bs,bs*sizeof(Scalar));
884     while (nz--) {
885       Kernel_v_gets_v_minus_A_times_w(bs,lsum,v,tmp+bs*(*vi++));
886       v += bs2;
887     }
888     Kernel_w_gets_A_times_v(bs,lsum,aa+bs2*a->diag[i],tmp+i*bs);
889     PetscMemcpy(x + bs*(*c--),tmp+i*bs,bs*sizeof(Scalar));
890   }
891 
892   ierr = ISRestoreIndices(isrow,&rout); CHKERRQ(ierr);
893   ierr = ISRestoreIndices(iscol,&cout); CHKERRQ(ierr);
894   VecRestoreArray_Fast(bb,b);
895   VecRestoreArray_Fast(xx,x);
896   PLogFlops(2*(a->bs2)*(a->nz) - a->n);
897   return 0;
898 }
899 
900 int MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
901 {
902   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
903   IS              iscol=a->col,isrow=a->row;
904   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
905   Scalar          *aa=a->a,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
906   register Scalar *x,*b,*tmp,*v;
907 
908   VecGetArray_Fast(bb,b);
909   VecGetArray_Fast(xx,x);
910   tmp  = a->solve_work;
911 
912   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
913   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
914 
915   /* forward solve the lower triangular */
916   idx    = 5*(*r++);
917   tmp[0] = b[idx];   tmp[1] = b[1+idx];
918   tmp[2] = b[2+idx]; tmp[3] = b[3+idx]; tmp[4] = b[4+idx];
919   for ( i=1; i<n; i++ ) {
920     v     = aa + 25*ai[i];
921     vi    = aj + ai[i];
922     nz    = a->diag[i] - ai[i];
923     idx   = 5*(*r++);
924     sum1  = b[idx];sum2 = b[1+idx];sum3 = b[2+idx];sum4 = b[3+idx];
925     sum5  = b[4+idx];
926     while (nz--) {
927       idx   = 5*(*vi++);
928       x1    = tmp[idx];  x2 = tmp[1+idx];x3 = tmp[2+idx];
929       x4    = tmp[3+idx];x5 = tmp[4+idx];
930       sum1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
931       sum2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
932       sum3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
933       sum4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
934       sum5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
935       v += 25;
936     }
937     idx = 5*i;
938     tmp[idx]   = sum1;tmp[1+idx] = sum2;
939     tmp[2+idx] = sum3;tmp[3+idx] = sum4; tmp[4+idx] = sum5;
940   }
941   /* backward solve the upper triangular */
942   for ( i=n-1; i>=0; i-- ){
943     v    = aa + 25*a->diag[i] + 25;
944     vi   = aj + a->diag[i] + 1;
945     nz   = ai[i+1] - a->diag[i] - 1;
946     idt  = 5*i;
947     sum1 = tmp[idt];  sum2 = tmp[1+idt];
948     sum3 = tmp[2+idt];sum4 = tmp[3+idt]; sum5 = tmp[4+idt];
949     while (nz--) {
950       idx   = 5*(*vi++);
951       x1    = tmp[idx];   x2 = tmp[1+idx];
952       x3    = tmp[2+idx]; x4 = tmp[3+idx]; x5 = tmp[4+idx];
953       sum1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
954       sum2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
955       sum3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
956       sum4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
957       sum5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
958       v += 25;
959     }
960     idc = 5*(*c--);
961     v   = aa + 25*a->diag[i];
962     x[idc]   = tmp[idt]   = v[0]*sum1+v[5]*sum2+v[10]*sum3+
963                                  v[15]*sum4+v[20]*sum5;
964     x[1+idc] = tmp[1+idt] = v[1]*sum1+v[6]*sum2+v[11]*sum3+
965                                  v[16]*sum4+v[21]*sum5;
966     x[2+idc] = tmp[2+idt] = v[2]*sum1+v[7]*sum2+v[12]*sum3+
967                                  v[17]*sum4+v[22]*sum5;
968     x[3+idc] = tmp[3+idt] = v[3]*sum1+v[8]*sum2+v[13]*sum3+
969                                  v[18]*sum4+v[23]*sum5;
970     x[4+idc] = tmp[4+idt] = v[4]*sum1+v[9]*sum2+v[14]*sum3+
971                                  v[19]*sum4+v[24]*sum5;
972   }
973 
974   ierr = ISRestoreIndices(isrow,&rout); CHKERRQ(ierr);
975   ierr = ISRestoreIndices(iscol,&cout); CHKERRQ(ierr);
976   VecRestoreArray_Fast(bb,b);
977   VecRestoreArray_Fast(xx,x);
978   PLogFlops(2*25*(a->nz) - a->n);
979   return 0;
980 }
981 
982 int MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
983 {
984   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
985   IS              iscol=a->col,isrow=a->row;
986   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
987   Scalar          *aa=a->a,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
988   register Scalar *x,*b,*tmp,*v;
989 
990   VecGetArray_Fast(bb,b);
991   VecGetArray_Fast(xx,x);
992   tmp  = a->solve_work;
993 
994   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
995   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
996 
997   /* forward solve the lower triangular */
998   idx    = 4*(*r++);
999   tmp[0] = b[idx];   tmp[1] = b[1+idx];
1000   tmp[2] = b[2+idx]; tmp[3] = b[3+idx];
1001   for ( i=1; i<n; i++ ) {
1002     v     = aa + 16*ai[i];
1003     vi    = aj + ai[i];
1004     nz    = a->diag[i] - ai[i];
1005     idx   = 4*(*r++);
1006     sum1  = b[idx];sum2 = b[1+idx];sum3 = b[2+idx];sum4 = b[3+idx];
1007     while (nz--) {
1008       idx   = 4*(*vi++);
1009       x1    = tmp[idx];x2 = tmp[1+idx];x3 = tmp[2+idx];x4 = tmp[3+idx];
1010       sum1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1011       sum2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1012       sum3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1013       sum4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1014       v += 16;
1015     }
1016     idx = 4*i;
1017     tmp[idx]   = sum1;tmp[1+idx] = sum2;
1018     tmp[2+idx] = sum3;tmp[3+idx] = sum4;
1019   }
1020   /* backward solve the upper triangular */
1021   for ( i=n-1; i>=0; i-- ){
1022     v    = aa + 16*a->diag[i] + 16;
1023     vi   = aj + a->diag[i] + 1;
1024     nz   = ai[i+1] - a->diag[i] - 1;
1025     idt  = 4*i;
1026     sum1 = tmp[idt];  sum2 = tmp[1+idt];
1027     sum3 = tmp[2+idt];sum4 = tmp[3+idt];
1028     while (nz--) {
1029       idx   = 4*(*vi++);
1030       x1    = tmp[idx];   x2 = tmp[1+idx];
1031       x3    = tmp[2+idx]; x4 = tmp[3+idx];
1032       sum1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1033       sum2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1034       sum3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1035       sum4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1036       v += 16;
1037     }
1038     idc = 4*(*c--);
1039     v   = aa + 16*a->diag[i];
1040     x[idc]   = tmp[idt]   = v[0]*sum1+v[4]*sum2+v[8]*sum3+v[12]*sum4;
1041     x[1+idc] = tmp[1+idt] = v[1]*sum1+v[5]*sum2+v[9]*sum3+v[13]*sum4;
1042     x[2+idc] = tmp[2+idt] = v[2]*sum1+v[6]*sum2+v[10]*sum3+v[14]*sum4;
1043     x[3+idc] = tmp[3+idt] = v[3]*sum1+v[7]*sum2+v[11]*sum3+v[15]*sum4;
1044   }
1045 
1046   ierr = ISRestoreIndices(isrow,&rout); CHKERRQ(ierr);
1047   ierr = ISRestoreIndices(iscol,&cout); CHKERRQ(ierr);
1048   VecRestoreArray_Fast(bb,b);
1049   VecRestoreArray_Fast(xx,x);
1050   PLogFlops(2*16*(a->nz) - a->n);
1051   return 0;
1052 }
1053 
1054 
1055 int MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
1056 {
1057   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1058   IS              iscol=a->col,isrow=a->row;
1059   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1060   Scalar          *aa=a->a,sum1,sum2,sum3,x1,x2,x3;
1061   register Scalar *x,*b,*tmp,*v;
1062 
1063   VecGetArray_Fast(bb,b);
1064   VecGetArray_Fast(xx,x);
1065   tmp  = a->solve_work;
1066 
1067   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1068   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1069 
1070   /* forward solve the lower triangular */
1071   idx    = 3*(*r++);
1072   tmp[0] = b[idx]; tmp[1] = b[1+idx]; tmp[2] = b[2+idx];
1073   for ( i=1; i<n; i++ ) {
1074     v     = aa + 9*ai[i];
1075     vi    = aj + ai[i];
1076     nz    = a->diag[i] - ai[i];
1077     idx   = 3*(*r++);
1078     sum1  = b[idx]; sum2 = b[1+idx]; sum3 = b[2+idx];
1079     while (nz--) {
1080       idx   = 3*(*vi++);
1081       x1    = tmp[idx]; x2 = tmp[1+idx]; x3 = tmp[2+idx];
1082       sum1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1083       sum2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1084       sum3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1085       v += 9;
1086     }
1087     idx = 3*i;
1088     tmp[idx] = sum1; tmp[1+idx] = sum2; tmp[2+idx] = sum3;
1089   }
1090   /* backward solve the upper triangular */
1091   for ( i=n-1; i>=0; i-- ){
1092     v    = aa + 9*a->diag[i] + 9;
1093     vi   = aj + a->diag[i] + 1;
1094     nz   = ai[i+1] - a->diag[i] - 1;
1095     idt  = 3*i;
1096     sum1 = tmp[idt]; sum2 = tmp[1+idt]; sum3 = tmp[2+idt];
1097     while (nz--) {
1098       idx   = 3*(*vi++);
1099       x1    = tmp[idx]; x2 = tmp[1+idx]; x3 = tmp[2+idx];
1100       sum1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1101       sum2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1102       sum3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1103       v += 9;
1104     }
1105     idc = 3*(*c--);
1106     v   = aa + 9*a->diag[i];
1107     x[idc]   = tmp[idt]   = v[0]*sum1 + v[3]*sum2 + v[6]*sum3;
1108     x[1+idc] = tmp[1+idt] = v[1]*sum1 + v[4]*sum2 + v[7]*sum3;
1109     x[2+idc] = tmp[2+idt] = v[2]*sum1 + v[5]*sum2 + v[8]*sum3;
1110   }
1111   ierr = ISRestoreIndices(isrow,&rout); CHKERRQ(ierr);
1112   ierr = ISRestoreIndices(iscol,&cout); CHKERRQ(ierr);
1113   VecRestoreArray_Fast(bb,b);
1114   VecRestoreArray_Fast(xx,x);
1115   PLogFlops(2*9*(a->nz) - a->n);
1116   return 0;
1117 }
1118 
1119 int MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
1120 {
1121   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1122   IS              iscol=a->col,isrow=a->row;
1123   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1124   Scalar          *aa=a->a,sum1,sum2,x1,x2;
1125   register Scalar *x,*b,*tmp,*v;
1126 
1127   VecGetArray_Fast(bb,b);
1128   VecGetArray_Fast(xx,x);
1129   tmp  = a->solve_work;
1130 
1131   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1132   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1133 
1134   /* forward solve the lower triangular */
1135   idx    = 2*(*r++);
1136   tmp[0] = b[idx]; tmp[1] = b[1+idx];
1137   for ( i=1; i<n; i++ ) {
1138     v     = aa + 4*ai[i];
1139     vi    = aj + ai[i];
1140     nz    = a->diag[i] - ai[i];
1141     idx   = 2*(*r++);
1142     sum1  = b[idx]; sum2 = b[1+idx];
1143     while (nz--) {
1144       idx   = 2*(*vi++);
1145       x1    = tmp[idx]; x2 = tmp[1+idx];
1146       sum1 -= v[0]*x1 + v[2]*x2;
1147       sum2 -= v[1]*x1 + v[3]*x2;
1148       v += 4;
1149     }
1150     idx = 2*i;
1151     tmp[idx] = sum1; tmp[1+idx] = sum2;
1152   }
1153   /* backward solve the upper triangular */
1154   for ( i=n-1; i>=0; i-- ){
1155     v    = aa + 4*a->diag[i] + 4;
1156     vi   = aj + a->diag[i] + 1;
1157     nz   = ai[i+1] - a->diag[i] - 1;
1158     idt  = 2*i;
1159     sum1 = tmp[idt]; sum2 = tmp[1+idt];
1160     while (nz--) {
1161       idx   = 2*(*vi++);
1162       x1    = tmp[idx]; x2 = tmp[1+idx];
1163       sum1 -= v[0]*x1 + v[2]*x2;
1164       sum2 -= v[1]*x1 + v[3]*x2;
1165       v += 4;
1166     }
1167     idc = 2*(*c--);
1168     v   = aa + 4*a->diag[i];
1169     x[idc]   = tmp[idt]   = v[0]*sum1 + v[2]*sum2;
1170     x[1+idc] = tmp[1+idt] = v[1]*sum1 + v[3]*sum2;
1171   }
1172   ierr = ISRestoreIndices(isrow,&rout); CHKERRQ(ierr);
1173   ierr = ISRestoreIndices(iscol,&cout); CHKERRQ(ierr);
1174   VecRestoreArray_Fast(bb,b);
1175   VecRestoreArray_Fast(xx,x);
1176   PLogFlops(2*4*(a->nz) - a->n);
1177   return 0;
1178 }
1179 
1180 
1181 int MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
1182 {
1183   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1184   IS              iscol=a->col,isrow=a->row;
1185   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
1186   Scalar          *aa=a->a,sum1;
1187   register Scalar *x,*b,*tmp,*v;
1188 
1189   VecGetArray_Fast(bb,b);
1190   VecGetArray_Fast(xx,x);
1191   tmp  = a->solve_work;
1192 
1193   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1194   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1195 
1196   /* forward solve the lower triangular */
1197   tmp[0] = b[*r++];
1198   for ( i=1; i<n; i++ ) {
1199     v     = aa + ai[i];
1200     vi    = aj + ai[i];
1201     nz    = a->diag[i] - ai[i];
1202     sum1  = b[*r++];
1203     while (nz--) {
1204       sum1 -= (*v++)*tmp[*vi++];
1205     }
1206     tmp[i] = sum1;
1207   }
1208   /* backward solve the upper triangular */
1209   for ( i=n-1; i>=0; i-- ){
1210     v    = aa + a->diag[i] + 1;
1211     vi   = aj + a->diag[i] + 1;
1212     nz   = ai[i+1] - a->diag[i] - 1;
1213     sum1 = tmp[i];
1214     while (nz--) {
1215       sum1 -= (*v++)*tmp[*vi++];
1216     }
1217     x[*c--] = tmp[i] = aa[a->diag[i]]*sum1;
1218   }
1219 
1220   ierr = ISRestoreIndices(isrow,&rout); CHKERRQ(ierr);
1221   ierr = ISRestoreIndices(iscol,&cout); CHKERRQ(ierr);
1222   VecRestoreArray_Fast(bb,b);
1223   VecRestoreArray_Fast(xx,x);
1224   PLogFlops(2*1*(a->nz) - a->n);
1225   return 0;
1226 }
1227 
1228 /* ----------------------------------------------------------------*/
1229 /*
1230      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
1231    except that the data structure of Mat_SeqAIJ is slightly different.
1232    Not a good example of code reuse.
1233 */
1234 int MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,double f,int levels,
1235                                  Mat *fact)
1236 {
1237   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data, *b;
1238   IS          isicol;
1239   int         *r,*ic, ierr, prow, n = a->mbs, *ai = a->i, *aj = a->j;
1240   int         *ainew,*ajnew, jmax,*fill, *xi, nz, *im,*ajfill,*flev;
1241   int         *dloc, idx, row,m,fm, nzf, nzi,len,  realloc = 0;
1242   int         incrlev,nnz,i,bs = a->bs,bs2 = a->bs2;
1243   PetscTruth  col_identity, row_identity;
1244 
1245   /* special case that simply copies fill pattern */
1246   PetscValidHeaderSpecific(isrow,IS_COOKIE);
1247   PetscValidHeaderSpecific(iscol,IS_COOKIE);
1248   ISIdentity(isrow,&row_identity); ISIdentity(iscol,&col_identity);
1249   if (levels == 0 && row_identity && col_identity) {
1250     ierr = MatConvertSameType_SeqBAIJ(A,fact,DO_NOT_COPY_VALUES); CHKERRQ(ierr);
1251     (*fact)->factor = FACTOR_LU;
1252     b               = (Mat_SeqBAIJ *) (*fact)->data;
1253     if (!b->diag) {
1254       ierr = MatMarkDiag_SeqBAIJ(*fact); CHKERRQ(ierr);
1255     }
1256     b->row        = isrow;
1257     b->col        = iscol;
1258     b->solve_work = (Scalar *) PetscMalloc((b->m+1+b->bs)*sizeof(Scalar));CHKPTRQ(b->solve_work);
1259     return 0;
1260   }
1261 
1262   ierr = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr);
1263   ierr = ISGetIndices(isrow,&r); CHKERRQ(ierr);
1264   ierr = ISGetIndices(isicol,&ic); CHKERRQ(ierr);
1265 
1266   /* get new row pointers */
1267   ainew = (int *) PetscMalloc( (n+1)*sizeof(int) ); CHKPTRQ(ainew);
1268   ainew[0] = 0;
1269   /* don't know how many column pointers are needed so estimate */
1270   jmax = (int) (f*ai[n] + 1);
1271   ajnew = (int *) PetscMalloc( (jmax)*sizeof(int) ); CHKPTRQ(ajnew);
1272   /* ajfill is level of fill for each fill entry */
1273   ajfill = (int *) PetscMalloc( (jmax)*sizeof(int) ); CHKPTRQ(ajfill);
1274   /* fill is a linked list of nonzeros in active row */
1275   fill = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(fill);
1276   /* im is level for each filled value */
1277   im = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(im);
1278   /* dloc is location of diagonal in factor */
1279   dloc = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(dloc);
1280   dloc[0]  = 0;
1281   for ( prow=0; prow<n; prow++ ) {
1282     /* first copy previous fill into linked list */
1283     nzf     = nz  = ai[r[prow]+1] - ai[r[prow]];
1284     xi      = aj + ai[r[prow]];
1285     fill[n] = n;
1286     while (nz--) {
1287       fm  = n;
1288       idx = ic[*xi++];
1289       do {
1290         m  = fm;
1291         fm = fill[m];
1292       } while (fm < idx);
1293       fill[m]   = idx;
1294       fill[idx] = fm;
1295       im[idx]   = 0;
1296     }
1297     nzi = 0;
1298     row = fill[n];
1299     while ( row < prow ) {
1300       incrlev = im[row] + 1;
1301       nz      = dloc[row];
1302       xi      = ajnew  + ainew[row] + nz;
1303       flev    = ajfill + ainew[row] + nz + 1;
1304       nnz     = ainew[row+1] - ainew[row] - nz - 1;
1305       if (*xi++ != row) {
1306         SETERRQ(1,"MatILUFactorSymbolic_SeqBAIJ:zero pivot");
1307       }
1308       fm      = row;
1309       while (nnz-- > 0) {
1310         idx = *xi++;
1311         if (*flev + incrlev > levels) {
1312           flev++;
1313           continue;
1314         }
1315         do {
1316           m  = fm;
1317           fm = fill[m];
1318         } while (fm < idx);
1319         if (fm != idx) {
1320           im[idx]   = *flev + incrlev;
1321           fill[m]   = idx;
1322           fill[idx] = fm;
1323           fm        = idx;
1324           nzf++;
1325         }
1326         else {
1327           if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
1328         }
1329         flev++;
1330       }
1331       row = fill[row];
1332       nzi++;
1333     }
1334     /* copy new filled row into permanent storage */
1335     ainew[prow+1] = ainew[prow] + nzf;
1336     if (ainew[prow+1] > jmax) {
1337       /* allocate a longer ajnew */
1338       int maxadd;
1339       maxadd = (int) (((f*ai[n]+1)*(n-prow+5))/n);
1340       if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
1341       jmax += maxadd;
1342       xi = (int *) PetscMalloc( jmax*sizeof(int) );CHKPTRQ(xi);
1343       PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));
1344       PetscFree(ajnew);
1345       ajnew = xi;
1346       /* allocate a longer ajfill */
1347       xi = (int *) PetscMalloc( jmax*sizeof(int) );CHKPTRQ(xi);
1348       PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));
1349       PetscFree(ajfill);
1350       ajfill = xi;
1351       realloc++;
1352     }
1353     xi          = ajnew + ainew[prow];
1354     flev        = ajfill + ainew[prow];
1355     dloc[prow]  = nzi;
1356     fm          = fill[n];
1357     while (nzf--) {
1358       *xi++   = fm;
1359       *flev++ = im[fm];
1360       fm      = fill[fm];
1361     }
1362   }
1363   PetscFree(ajfill);
1364   ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr);
1365   ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr);
1366   ierr = ISDestroy(isicol); CHKERRQ(ierr);
1367   PetscFree(fill); PetscFree(im);
1368 
1369   PLogInfo(A,
1370     "Info:MatILUFactorSymbolic_SeqBAIJ:Realloc %d Fill ratio:given %g needed %g\n",
1371                              realloc,f,((double)ainew[n])/((double)ai[prow]));
1372 
1373   /* put together the new matrix */
1374   ierr = MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,fact);CHKERRQ(ierr);
1375   b = (Mat_SeqBAIJ *) (*fact)->data;
1376   PetscFree(b->imax);
1377   b->singlemalloc = 0;
1378   len = bs2*ainew[n]*sizeof(Scalar);
1379   /* the next line frees the default space generated by the Create() */
1380   PetscFree(b->a); PetscFree(b->ilen);
1381   b->a          = (Scalar *) PetscMalloc( len ); CHKPTRQ(b->a);
1382   b->j          = ajnew;
1383   b->i          = ainew;
1384   for ( i=0; i<n; i++ ) dloc[i] += ainew[i];
1385   b->diag       = dloc;
1386   b->ilen       = 0;
1387   b->imax       = 0;
1388   b->row        = isrow;
1389   b->col        = iscol;
1390   b->solve_work = (Scalar *) PetscMalloc( (bs*n+bs)*sizeof(Scalar));
1391   CHKPTRQ(b->solve_work);
1392   /* In b structure:  Free imax, ilen, old a, old j.
1393      Allocate dloc, solve_work, new a, new j */
1394   PLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs2*ainew[n]*sizeof(Scalar));
1395   b->maxnz          = b->nz = ainew[n];
1396   (*fact)->factor   = FACTOR_LU;
1397 
1398   (*fact)->info.factor_mallocs    = realloc;
1399   (*fact)->info.fill_ratio_given  = f;
1400   (*fact)->info.fill_ratio_needed = ((double)ainew[n])/((double)ai[prow]);
1401 
1402   return 0;
1403 }
1404 
1405 
1406 
1407 
1408