xref: /petsc/src/mat/impls/baij/seq/baijfact.c (revision b2b2dd246975d7f9c8a1571def503d28e659d8b1)
1 #define PETSCMAT_DLL
2 
3 /*
4     Factorization code for BAIJ format.
5 */
6 #include "../src/mat/impls/baij/seq/baij.h"
7 #include "../src/mat/blockinvert.h"
8 
9 /* ------------------------------------------------------------*/
10 /*
11       Version for when blocks are 2 by 2
12 */
13 /* MatLUFactorNumeric_SeqBAIJ_2_newdatastruct -
14      copied from MatLUFactorNumeric_SeqBAIJ_N_newdatastruct() and manually re-implemented
15        Kernel_A_gets_A_times_B()
16        Kernel_A_gets_A_minus_B_times_C()
17        Kernel_A_gets_inverse_A()
18 */
19 #undef __FUNCT__
20 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_2_newdatastruct"
21 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
22 {
23   Mat            C=B;
24   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
25   IS             isrow = b->row,isicol = b->icol;
26   PetscErrorCode ierr;
27   const PetscInt *r,*ic,*ics;
28   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
29   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
30   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
31   PetscInt       bs2 = a->bs2,flg;
32   PetscReal      shift = info->shiftinblocks;
33 
34   PetscFunctionBegin;
35   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
36   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
37 
38   /* generate work space needed by the factorization */
39   ierr = PetscMalloc((bs2*n+bs2+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
40   mwork = rtmp + bs2*n;
41   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
42   ics  = ic;
43 
44   for (i=0; i<n; i++){
45     /* zero rtmp */
46     /* L part */
47     nz    = bi[i+1] - bi[i];
48     bjtmp = bj + bi[i];
49     for  (j=0; j<nz; j++){
50       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
51     }
52 
53     /* U part */
54     nz = bi[2*n-i+1] - bi[2*n-i];
55     bjtmp = bj + bi[2*n-i];
56     for  (j=0; j<nz; j++){
57       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
58     }
59 
60     /* load in initial (unfactored row) */
61     nz    = ai[r[i]+1] - ai[r[i]];
62     ajtmp = aj + ai[r[i]];
63     v     = aa + bs2*ai[r[i]];
64     for (j=0; j<nz; j++) {
65       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
66     }
67 
68     /* elimination */
69     bjtmp = bj + bi[i];
70     nzL   = bi[i+1] - bi[i];
71     for(k=0;k < nzL;k++) {
72       row = bjtmp[k];
73       pc = rtmp + bs2*row;
74       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
75       if (flg) {
76         pv = b->a + bs2*bdiag[row];
77         /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
78         ierr = Kernel_A_gets_A_times_B_2(pc,pv,mwork);CHKERRQ(ierr);
79 
80         pj = b->j + bi[2*n-row]; /* begining of U(row,:) */
81         pv = b->a + bs2*bi[2*n-row];
82         nz = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries inU(row,:), excluding diag */
83         for (j=0; j<nz; j++) {
84           /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
85           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
86           v    = rtmp + 4*pj[j];
87           ierr = Kernel_A_gets_A_minus_B_times_C_2(v,pc,pv);CHKERRQ(ierr);
88           pv  += 4;
89         }
90         ierr = PetscLogFlops(16*nz+12);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
91       }
92     }
93 
94     /* finished row so stick it into b->a */
95     /* L part */
96     pv   = b->a + bs2*bi[i] ;
97     pj   = b->j + bi[i] ;
98     nz   = bi[i+1] - bi[i];
99     for (j=0; j<nz; j++) {
100       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
101     }
102 
103     /* Mark diagonal and invert diagonal for simplier triangular solves */
104     pv   = b->a + bs2*bdiag[i];
105     pj   = b->j + bdiag[i];
106     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
107     /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */
108     ierr = Kernel_A_gets_inverse_A_2(pv,shift);CHKERRQ(ierr);
109 
110     /* U part */
111     pv = b->a + bs2*bi[2*n-i];
112     pj = b->j + bi[2*n-i];
113     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
114     for (j=0; j<nz; j++){
115       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
116     }
117   }
118 
119   ierr = PetscFree(rtmp);CHKERRQ(ierr);
120   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
121   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
122 
123   C->assembled = PETSC_TRUE;
124   ierr = PetscLogFlops(1.3333*bs2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
125   PetscFunctionReturn(0);
126 }
127 
128 /*
129   MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_newdatastruct -
130     copied from MatLUFactorNumeric_SeqBAIJ_2_newdatastruct(), then remove ordering
131 */
132 #undef __FUNCT__
133 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_newdatastruct"
134 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
135 {
136   Mat            C=B;
137   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
138   PetscErrorCode ierr;
139   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
140   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
141   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
142   PetscInt       bs2 = a->bs2,flg;
143   PetscReal      shift = info->shiftinblocks;
144 
145   PetscFunctionBegin;
146   /* generate work space needed by the factorization */
147   ierr = PetscMalloc((bs2*n+bs2+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
148   mwork = rtmp + bs2*n;
149   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
150 
151   for (i=0; i<n; i++){
152     /* zero rtmp */
153     /* L part */
154     nz    = bi[i+1] - bi[i];
155     bjtmp = bj + bi[i];
156     for  (j=0; j<nz; j++){
157       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
158     }
159 
160     /* U part */
161     nz = bi[2*n-i+1] - bi[2*n-i];
162     bjtmp = bj + bi[2*n-i];
163     for  (j=0; j<nz; j++){
164       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
165     }
166 
167     /* load in initial (unfactored row) */
168     nz    = ai[i+1] - ai[i];
169     ajtmp = aj + ai[i];
170     v     = aa + bs2*ai[i];
171     for (j=0; j<nz; j++) {
172       ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
173     }
174 
175     /* elimination */
176     bjtmp = bj + bi[i];
177     nzL   = bi[i+1] - bi[i];
178     for(k=0;k < nzL;k++) {
179       row = bjtmp[k];
180       pc = rtmp + bs2*row;
181       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
182       if (flg) {
183         pv = b->a + bs2*bdiag[row];
184         /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
185         ierr = Kernel_A_gets_A_times_B_2(pc,pv,mwork);CHKERRQ(ierr);
186 
187         pj = b->j + bi[2*n-row]; /* begining of U(row,:) */
188         pv = b->a + bs2*bi[2*n-row];
189         nz = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries inU(row,:), excluding diag */
190         for (j=0; j<nz; j++) {
191           /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
192           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
193           v    = rtmp + 4*pj[j];
194           ierr = Kernel_A_gets_A_minus_B_times_C_2(v,pc,pv);CHKERRQ(ierr);
195           pv  += 4;
196         }
197         ierr = PetscLogFlops(16*nz+12);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
198       }
199     }
200 
201     /* finished row so stick it into b->a */
202     /* L part */
203     pv   = b->a + bs2*bi[i] ;
204     pj   = b->j + bi[i] ;
205     nz   = bi[i+1] - bi[i];
206     for (j=0; j<nz; j++) {
207       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
208     }
209 
210     /* Mark diagonal and invert diagonal for simplier triangular solves */
211     pv   = b->a + bs2*bdiag[i];
212     pj   = b->j + bdiag[i];
213     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
214     /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */
215     ierr = Kernel_A_gets_inverse_A_2(pv,shift);CHKERRQ(ierr);
216 
217     /* U part */
218     pv = b->a + bs2*bi[2*n-i];
219     pj = b->j + bi[2*n-i];
220     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
221     for (j=0; j<nz; j++){
222       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
223     }
224   }
225 
226   ierr = PetscFree(rtmp);CHKERRQ(ierr);
227   C->assembled = PETSC_TRUE;
228   ierr = PetscLogFlops(1.3333*bs2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
229   PetscFunctionReturn(0);
230 }
231 
232 #undef __FUNCT__
233 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2"
234 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2(Mat B,Mat A,const MatFactorInfo *info)
235 {
236   Mat            C=B;
237   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
238   PetscErrorCode ierr;
239   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
240   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
241   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
242   PetscInt       bs2 = a->bs2,flg;
243   PetscReal      shift = info->shiftinblocks;
244 
245   PetscFunctionBegin;
246   /* generate work space needed by the factorization */
247   ierr = PetscMalloc((bs2*n+bs2+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
248   mwork = rtmp + bs2*n;
249   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
250 
251   for (i=0; i<n; i++){
252     /* zero rtmp */
253     /* L part */
254     nz    = bi[i+1] - bi[i];
255     bjtmp = bj + bi[i];
256     for  (j=0; j<nz; j++){
257       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
258     }
259 
260     /* U part */
261     nz = bdiag[i] - bdiag[i+1];
262     bjtmp = bj + bdiag[i+1]+1;
263     for  (j=0; j<nz; j++){
264       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
265     }
266 
267     /* load in initial (unfactored row) */
268     nz    = ai[i+1] - ai[i];
269     ajtmp = aj + ai[i];
270     v     = aa + bs2*ai[i];
271     for (j=0; j<nz; j++) {
272       ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
273     }
274 
275     /* elimination */
276     bjtmp = bj + bi[i];
277     nzL   = bi[i+1] - bi[i];
278     for(k=0;k < nzL;k++) {
279       row = bjtmp[k];
280       pc = rtmp + bs2*row;
281       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
282       if (flg) {
283         pv = b->a + bs2*bdiag[row];
284         /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
285         ierr = Kernel_A_gets_A_times_B_2(pc,pv,mwork);CHKERRQ(ierr);
286 
287         pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
288 	pv = b->a + bs2*(bdiag[row+1]+1);
289 	nz = bdiag[row]-bdiag[row+1] - 1; /* num of entries in U(row,:) excluding diag */
290         for (j=0; j<nz; j++) {
291           /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
292           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
293           v    = rtmp + 4*pj[j];
294           ierr = Kernel_A_gets_A_minus_B_times_C_2(v,pc,pv);CHKERRQ(ierr);
295           pv  += 4;
296         }
297         ierr = PetscLogFlops(16*nz+12);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
298       }
299     }
300 
301     /* finished row so stick it into b->a */
302     /* L part */
303     pv   = b->a + bs2*bi[i] ;
304     pj   = b->j + bi[i] ;
305     nz   = bi[i+1] - bi[i];
306     for (j=0; j<nz; j++) {
307       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
308     }
309 
310     /* Mark diagonal and invert diagonal for simplier triangular solves */
311     pv   = b->a + bs2*bdiag[i];
312     pj   = b->j + bdiag[i];
313     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
314     /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */
315     ierr = Kernel_A_gets_inverse_A_2(pv,shift);CHKERRQ(ierr);
316 
317     /* U part */
318     /*
319     pv = b->a + bs2*bi[2*n-i];
320     pj = b->j + bi[2*n-i];
321     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
322     */
323     pv = b->a + bs2*(bdiag[i+1]+1);
324     pj = b->j + bdiag[i+1]+1;
325     nz = bdiag[i] - bdiag[i+1] - 1;
326     for (j=0; j<nz; j++){
327       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
328     }
329   }
330 
331   ierr = PetscFree(rtmp);CHKERRQ(ierr);
332   C->assembled = PETSC_TRUE;
333   ierr = PetscLogFlops(1.3333*bs2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
334   PetscFunctionReturn(0);
335 }
336 
337 #undef __FUNCT__
338 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_2"
339 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2(Mat B,Mat A,const MatFactorInfo *info)
340 {
341   Mat            C = B;
342   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
343   IS             isrow = b->row,isicol = b->icol;
344   PetscErrorCode ierr;
345   const PetscInt *r,*ic;
346   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
347   PetscInt       *ajtmpold,*ajtmp,nz,row;
348   PetscInt       *diag_offset=b->diag,idx,*ai=a->i,*aj=a->j,*pj;
349   MatScalar      *pv,*v,*rtmp,m1,m2,m3,m4,*pc,*w,*x,x1,x2,x3,x4;
350   MatScalar      p1,p2,p3,p4;
351   MatScalar      *ba = b->a,*aa = a->a;
352   PetscReal      shift = info->shiftinblocks;
353 
354   PetscFunctionBegin;
355   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
356   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
357   ierr  = PetscMalloc(4*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
358 
359   for (i=0; i<n; i++) {
360     nz    = bi[i+1] - bi[i];
361     ajtmp = bj + bi[i];
362     for  (j=0; j<nz; j++) {
363       x = rtmp+4*ajtmp[j]; x[0] = x[1] = x[2] = x[3] = 0.0;
364     }
365     /* load in initial (unfactored row) */
366     idx      = r[i];
367     nz       = ai[idx+1] - ai[idx];
368     ajtmpold = aj + ai[idx];
369     v        = aa + 4*ai[idx];
370     for (j=0; j<nz; j++) {
371       x    = rtmp+4*ic[ajtmpold[j]];
372       x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3];
373       v    += 4;
374     }
375     row = *ajtmp++;
376     while (row < i) {
377       pc = rtmp + 4*row;
378       p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3];
379       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0) {
380         pv = ba + 4*diag_offset[row];
381         pj = bj + diag_offset[row] + 1;
382         x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
383         pc[0] = m1 = p1*x1 + p3*x2;
384         pc[1] = m2 = p2*x1 + p4*x2;
385         pc[2] = m3 = p1*x3 + p3*x4;
386         pc[3] = m4 = p2*x3 + p4*x4;
387         nz = bi[row+1] - diag_offset[row] - 1;
388         pv += 4;
389         for (j=0; j<nz; j++) {
390           x1   = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
391           x    = rtmp + 4*pj[j];
392           x[0] -= m1*x1 + m3*x2;
393           x[1] -= m2*x1 + m4*x2;
394           x[2] -= m1*x3 + m3*x4;
395           x[3] -= m2*x3 + m4*x4;
396           pv   += 4;
397         }
398         ierr = PetscLogFlops(16.0*nz+12.0);CHKERRQ(ierr);
399       }
400       row = *ajtmp++;
401     }
402     /* finished row so stick it into b->a */
403     pv = ba + 4*bi[i];
404     pj = bj + bi[i];
405     nz = bi[i+1] - bi[i];
406     for (j=0; j<nz; j++) {
407       x     = rtmp+4*pj[j];
408       pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3];
409       pv   += 4;
410     }
411     /* invert diagonal block */
412     w = ba + 4*diag_offset[i];
413     ierr = Kernel_A_gets_inverse_A_2(w,shift);CHKERRQ(ierr);
414   }
415 
416   ierr = PetscFree(rtmp);CHKERRQ(ierr);
417   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
418   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
419   C->ops->solve          = MatSolve_SeqBAIJ_2;
420   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2;
421   C->assembled = PETSC_TRUE;
422   ierr = PetscLogFlops(1.3333*8*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
423   PetscFunctionReturn(0);
424 }
425 /*
426       Version for when blocks are 2 by 2 Using natural ordering
427 */
428 #undef __FUNCT__
429 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering"
430 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering(Mat C,Mat A,const MatFactorInfo *info)
431 {
432   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
433   PetscErrorCode ierr;
434   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
435   PetscInt       *ajtmpold,*ajtmp,nz,row;
436   PetscInt       *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
437   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
438   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,x1,x2,x3,x4;
439   MatScalar      *ba = b->a,*aa = a->a;
440   PetscReal      shift = info->shiftinblocks;
441 
442   PetscFunctionBegin;
443   ierr = PetscMalloc(4*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
444   for (i=0; i<n; i++) {
445     nz    = bi[i+1] - bi[i];
446     ajtmp = bj + bi[i];
447     for  (j=0; j<nz; j++) {
448       x = rtmp+4*ajtmp[j];
449       x[0]  = x[1]  = x[2]  = x[3]  = 0.0;
450     }
451     /* load in initial (unfactored row) */
452     nz       = ai[i+1] - ai[i];
453     ajtmpold = aj + ai[i];
454     v        = aa + 4*ai[i];
455     for (j=0; j<nz; j++) {
456       x    = rtmp+4*ajtmpold[j];
457       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
458       v    += 4;
459     }
460     row = *ajtmp++;
461     while (row < i) {
462       pc  = rtmp + 4*row;
463       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
464       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0) {
465         pv = ba + 4*diag_offset[row];
466         pj = bj + diag_offset[row] + 1;
467         x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
468         pc[0] = m1 = p1*x1 + p3*x2;
469         pc[1] = m2 = p2*x1 + p4*x2;
470         pc[2] = m3 = p1*x3 + p3*x4;
471         pc[3] = m4 = p2*x3 + p4*x4;
472         nz = bi[row+1] - diag_offset[row] - 1;
473         pv += 4;
474         for (j=0; j<nz; j++) {
475           x1   = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
476           x    = rtmp + 4*pj[j];
477           x[0] -= m1*x1 + m3*x2;
478           x[1] -= m2*x1 + m4*x2;
479           x[2] -= m1*x3 + m3*x4;
480           x[3] -= m2*x3 + m4*x4;
481           pv   += 4;
482         }
483         ierr = PetscLogFlops(16.0*nz+12.0);CHKERRQ(ierr);
484       }
485       row = *ajtmp++;
486     }
487     /* finished row so stick it into b->a */
488     pv = ba + 4*bi[i];
489     pj = bj + bi[i];
490     nz = bi[i+1] - bi[i];
491     for (j=0; j<nz; j++) {
492       x      = rtmp+4*pj[j];
493       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
494       /*
495       printf(" col %d:",pj[j]);
496       PetscInt j1;
497       for (j1=0; j1<4; j1++) printf(" %g,",*(pv+j1));
498       printf("\n");
499       */
500       pv   += 4;
501     }
502     /* invert diagonal block */
503     w = ba + 4*diag_offset[i];
504     /*
505     printf(" \n%d -th: diag: ",i);
506     for (j=0; j<4; j++){
507       printf(" %g,",w[j]);
508     }
509     printf("\n----------------------------\n");
510     */
511     ierr = Kernel_A_gets_inverse_A_2(w,shift);CHKERRQ(ierr);
512   }
513 
514   ierr = PetscFree(rtmp);CHKERRQ(ierr);
515   C->ops->solve          = MatSolve_SeqBAIJ_2_NaturalOrdering;
516   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
517   C->assembled = PETSC_TRUE;
518   ierr = PetscLogFlops(1.3333*8*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
519   PetscFunctionReturn(0);
520 }
521 
522 /* ----------------------------------------------------------- */
523 /*
524      Version for when blocks are 1 by 1.
525 */
526 #undef __FUNCT__
527 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_1"
528 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1(Mat C,Mat A,const MatFactorInfo *info)
529 {
530   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
531   IS             isrow = b->row,isicol = b->icol;
532   PetscErrorCode ierr;
533   const PetscInt *r,*ic;
534   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
535   PetscInt       *ajtmpold,*ajtmp,nz,row,*ai = a->i,*aj = a->j;
536   PetscInt       *diag_offset = b->diag,diag,*pj;
537   MatScalar      *pv,*v,*rtmp,multiplier,*pc;
538   MatScalar      *ba = b->a,*aa = a->a;
539   PetscTruth     row_identity, col_identity;
540 
541   PetscFunctionBegin;
542   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
543   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
544   ierr  = PetscMalloc((n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
545 
546   for (i=0; i<n; i++) {
547     nz    = bi[i+1] - bi[i];
548     ajtmp = bj + bi[i];
549     for  (j=0; j<nz; j++) rtmp[ajtmp[j]] = 0.0;
550 
551     /* load in initial (unfactored row) */
552     nz       = ai[r[i]+1] - ai[r[i]];
553     ajtmpold = aj + ai[r[i]];
554     v        = aa + ai[r[i]];
555     for (j=0; j<nz; j++) rtmp[ic[ajtmpold[j]]] =  v[j];
556 
557     row = *ajtmp++;
558     while (row < i) {
559       pc = rtmp + row;
560       if (*pc != 0.0) {
561         pv         = ba + diag_offset[row];
562         pj         = bj + diag_offset[row] + 1;
563         multiplier = *pc * *pv++;
564         *pc        = multiplier;
565         nz         = bi[row+1] - diag_offset[row] - 1;
566         for (j=0; j<nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
567         ierr = PetscLogFlops(1.0+2.0*nz);CHKERRQ(ierr);
568       }
569       row = *ajtmp++;
570     }
571     /* finished row so stick it into b->a */
572     pv = ba + bi[i];
573     pj = bj + bi[i];
574     nz = bi[i+1] - bi[i];
575     for (j=0; j<nz; j++) {pv[j] = rtmp[pj[j]];}
576     diag = diag_offset[i] - bi[i];
577     /* check pivot entry for current row */
578     if (pv[diag] == 0.0) {
579       SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot: row in original ordering %D in permuted ordering %D",r[i],i);
580     }
581     pv[diag] = 1.0/pv[diag];
582   }
583 
584   ierr = PetscFree(rtmp);CHKERRQ(ierr);
585   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
586   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
587   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
588   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
589   if (row_identity && col_identity) {
590     C->ops->solve          = MatSolve_SeqBAIJ_1_NaturalOrdering;
591     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
592   } else {
593     C->ops->solve          = MatSolve_SeqBAIJ_1;
594     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1;
595   }
596   C->assembled = PETSC_TRUE;
597   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
598   PetscFunctionReturn(0);
599 }
600 
601 EXTERN_C_BEGIN
602 #undef __FUNCT__
603 #define __FUNCT__ "MatGetFactor_seqbaij_petsc"
604 PetscErrorCode MatGetFactor_seqbaij_petsc(Mat A,MatFactorType ftype,Mat *B)
605 {
606   PetscInt           n = A->rmap->n;
607   PetscErrorCode     ierr;
608 
609   PetscFunctionBegin;
610   ierr = MatCreate(((PetscObject)A)->comm,B);CHKERRQ(ierr);
611   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
612   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
613     ierr = MatSetType(*B,MATSEQBAIJ);CHKERRQ(ierr);
614     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqBAIJ;
615     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqBAIJ;
616     (*B)->ops->iludtfactor       = MatILUDTFactor_SeqBAIJ;
617   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
618     ierr = MatSetType(*B,MATSEQSBAIJ);CHKERRQ(ierr);
619     ierr = MatSeqSBAIJSetPreallocation(*B,1,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
620     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqBAIJ;
621     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqBAIJ;
622   } else SETERRQ(PETSC_ERR_SUP,"Factor type not supported");
623   (*B)->factor = ftype;
624   PetscFunctionReturn(0);
625 }
626 EXTERN_C_END
627 
628 EXTERN_C_BEGIN
629 #undef __FUNCT__
630 #define __FUNCT__ "MatGetFactorAvailable_seqaij_petsc"
631 PetscErrorCode MatGetFactorAvailable_seqbaij_petsc(Mat A,MatFactorType ftype,PetscTruth *flg)
632 {
633   PetscFunctionBegin;
634   *flg = PETSC_TRUE;
635   PetscFunctionReturn(0);
636 }
637 EXTERN_C_END
638 
639 /* ----------------------------------------------------------- */
640 #undef __FUNCT__
641 #define __FUNCT__ "MatLUFactor_SeqBAIJ"
642 PetscErrorCode MatLUFactor_SeqBAIJ(Mat A,IS row,IS col,const MatFactorInfo *info)
643 {
644   PetscErrorCode ierr;
645   Mat            C;
646 
647   PetscFunctionBegin;
648   ierr = MatGetFactor(A,MAT_SOLVER_PETSC,MAT_FACTOR_LU,&C);CHKERRQ(ierr);
649   ierr = MatLUFactorSymbolic(C,A,row,col,info);CHKERRQ(ierr);
650   ierr = MatLUFactorNumeric(C,A,info);CHKERRQ(ierr);
651   A->ops->solve            = C->ops->solve;
652   A->ops->solvetranspose   = C->ops->solvetranspose;
653   ierr = MatHeaderCopy(A,C);CHKERRQ(ierr);
654   ierr = PetscLogObjectParent(A,((Mat_SeqBAIJ*)(A->data))->icol);CHKERRQ(ierr);
655   PetscFunctionReturn(0);
656 }
657 
658 #include "../src/mat/impls/sbaij/seq/sbaij.h"
659 #undef __FUNCT__
660 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqBAIJ_N"
661 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N(Mat C,Mat A,const MatFactorInfo *info)
662 {
663   PetscErrorCode ierr;
664   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data;
665   Mat_SeqSBAIJ   *b=(Mat_SeqSBAIJ*)C->data;
666   IS             ip=b->row;
667   const PetscInt *rip;
668   PetscInt       i,j,mbs=a->mbs,bs=A->rmap->bs,*bi=b->i,*bj=b->j,*bcol;
669   PetscInt       *ai=a->i,*aj=a->j;
670   PetscInt       k,jmin,jmax,*jl,*il,col,nexti,ili,nz;
671   MatScalar      *rtmp,*ba=b->a,*bval,*aa=a->a,dk,uikdi;
672   PetscReal      zeropivot,rs,shiftnz;
673   PetscReal      shiftpd;
674   ChShift_Ctx    sctx;
675   PetscInt       newshift;
676 
677   PetscFunctionBegin;
678   if (bs > 1) {
679     if (!a->sbaijMat){
680       ierr = MatConvert(A,MATSEQSBAIJ,MAT_INITIAL_MATRIX,&a->sbaijMat);CHKERRQ(ierr);
681     }
682     ierr = (a->sbaijMat)->ops->choleskyfactornumeric(C,a->sbaijMat,info);CHKERRQ(ierr);
683     ierr = MatDestroy(a->sbaijMat);CHKERRQ(ierr);
684     a->sbaijMat = PETSC_NULL;
685     PetscFunctionReturn(0);
686   }
687 
688   /* initialization */
689   shiftnz   = info->shiftnz;
690   shiftpd   = info->shiftpd;
691   zeropivot = info->zeropivot;
692 
693   ierr  = ISGetIndices(ip,&rip);CHKERRQ(ierr);
694   nz   = (2*mbs+1)*sizeof(PetscInt)+mbs*sizeof(MatScalar);
695   ierr = PetscMalloc(nz,&il);CHKERRQ(ierr);
696   jl   = il + mbs;
697   rtmp = (MatScalar*)(jl + mbs);
698 
699   sctx.shift_amount = 0;
700   sctx.nshift       = 0;
701   do {
702     sctx.chshift = PETSC_FALSE;
703     for (i=0; i<mbs; i++) {
704       rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
705     }
706 
707     for (k = 0; k<mbs; k++){
708       bval = ba + bi[k];
709       /* initialize k-th row by the perm[k]-th row of A */
710       jmin = ai[rip[k]]; jmax = ai[rip[k]+1];
711       for (j = jmin; j < jmax; j++){
712         col = rip[aj[j]];
713         if (col >= k){ /* only take upper triangular entry */
714           rtmp[col] = aa[j];
715           *bval++  = 0.0; /* for in-place factorization */
716         }
717       }
718 
719       /* shift the diagonal of the matrix */
720       if (sctx.nshift) rtmp[k] += sctx.shift_amount;
721 
722       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
723       dk = rtmp[k];
724       i = jl[k]; /* first row to be added to k_th row  */
725 
726       while (i < k){
727         nexti = jl[i]; /* next row to be added to k_th row */
728 
729         /* compute multiplier, update diag(k) and U(i,k) */
730         ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
731         uikdi = - ba[ili]*ba[bi[i]];  /* diagonal(k) */
732         dk += uikdi*ba[ili];
733         ba[ili] = uikdi; /* -U(i,k) */
734 
735         /* add multiple of row i to k-th row */
736         jmin = ili + 1; jmax = bi[i+1];
737         if (jmin < jmax){
738           for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j];
739           /* update il and jl for row i */
740           il[i] = jmin;
741           j = bj[jmin]; jl[i] = jl[j]; jl[j] = i;
742         }
743         i = nexti;
744       }
745 
746       /* shift the diagonals when zero pivot is detected */
747       /* compute rs=sum of abs(off-diagonal) */
748       rs   = 0.0;
749       jmin = bi[k]+1;
750       nz   = bi[k+1] - jmin;
751       if (nz){
752         bcol = bj + jmin;
753         while (nz--){
754           rs += PetscAbsScalar(rtmp[*bcol]);
755           bcol++;
756         }
757       }
758 
759       sctx.rs = rs;
760       sctx.pv = dk;
761       ierr = MatCholeskyCheckShift_inline(info,sctx,k,newshift);CHKERRQ(ierr);
762       if (newshift == 1) break;
763 
764       /* copy data into U(k,:) */
765       ba[bi[k]] = 1.0/dk; /* U(k,k) */
766       jmin = bi[k]+1; jmax = bi[k+1];
767       if (jmin < jmax) {
768         for (j=jmin; j<jmax; j++){
769           col = bj[j]; ba[j] = rtmp[col]; rtmp[col] = 0.0;
770         }
771         /* add the k-th row into il and jl */
772         il[k] = jmin;
773         i = bj[jmin]; jl[k] = jl[i]; jl[i] = k;
774       }
775     }
776   } while (sctx.chshift);
777   ierr = PetscFree(il);CHKERRQ(ierr);
778 
779   ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
780   C->assembled    = PETSC_TRUE;
781   C->preallocated = PETSC_TRUE;
782   ierr = PetscLogFlops(C->rmap->N);CHKERRQ(ierr);
783   if (sctx.nshift){
784     if (shiftpd) {
785       ierr = PetscInfo2(A,"number of shiftpd tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
786     } else if (shiftnz) {
787       ierr = PetscInfo2(A,"number of shiftnz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
788     }
789   }
790   PetscFunctionReturn(0);
791 }
792 
793 #undef __FUNCT__
794 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering"
795 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering(Mat C,Mat A,const MatFactorInfo *info)
796 {
797   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data;
798   Mat_SeqSBAIJ   *b=(Mat_SeqSBAIJ*)C->data;
799   PetscErrorCode ierr;
800   PetscInt       i,j,am=a->mbs;
801   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
802   PetscInt       k,jmin,*jl,*il,nexti,ili,*acol,*bcol,nz;
803   MatScalar      *rtmp,*ba=b->a,*aa=a->a,dk,uikdi,*aval,*bval;
804   PetscReal      zeropivot,rs,shiftnz;
805   PetscReal      shiftpd;
806   ChShift_Ctx    sctx;
807   PetscInt       newshift;
808 
809   PetscFunctionBegin;
810   /* initialization */
811   shiftnz   = info->shiftnz;
812   shiftpd   = info->shiftpd;
813   zeropivot = info->zeropivot;
814 
815   nz   = (2*am+1)*sizeof(PetscInt)+am*sizeof(MatScalar);
816   ierr = PetscMalloc(nz,&il);CHKERRQ(ierr);
817   jl   = il + am;
818   rtmp = (MatScalar*)(jl + am);
819 
820   sctx.shift_amount = 0;
821   sctx.nshift       = 0;
822   do {
823     sctx.chshift = PETSC_FALSE;
824     for (i=0; i<am; i++) {
825       rtmp[i] = 0.0; jl[i] = am; il[0] = 0;
826     }
827 
828     for (k = 0; k<am; k++){
829     /* initialize k-th row with elements nonzero in row perm(k) of A */
830       nz   = ai[k+1] - ai[k];
831       acol = aj + ai[k];
832       aval = aa + ai[k];
833       bval = ba + bi[k];
834       while (nz -- ){
835         if (*acol < k) { /* skip lower triangular entries */
836           acol++; aval++;
837         } else {
838           rtmp[*acol++] = *aval++;
839           *bval++       = 0.0; /* for in-place factorization */
840         }
841       }
842 
843       /* shift the diagonal of the matrix */
844       if (sctx.nshift) rtmp[k] += sctx.shift_amount;
845 
846       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
847       dk = rtmp[k];
848       i  = jl[k]; /* first row to be added to k_th row  */
849 
850       while (i < k){
851         nexti = jl[i]; /* next row to be added to k_th row */
852         /* compute multiplier, update D(k) and U(i,k) */
853         ili   = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
854         uikdi = - ba[ili]*ba[bi[i]];
855         dk   += uikdi*ba[ili];
856         ba[ili] = uikdi; /* -U(i,k) */
857 
858         /* add multiple of row i to k-th row ... */
859         jmin = ili + 1;
860         nz   = bi[i+1] - jmin;
861         if (nz > 0){
862           bcol = bj + jmin;
863           bval = ba + jmin;
864           while (nz --) rtmp[*bcol++] += uikdi*(*bval++);
865           /* update il and jl for i-th row */
866           il[i] = jmin;
867           j = bj[jmin]; jl[i] = jl[j]; jl[j] = i;
868         }
869         i = nexti;
870       }
871 
872       /* shift the diagonals when zero pivot is detected */
873       /* compute rs=sum of abs(off-diagonal) */
874       rs   = 0.0;
875       jmin = bi[k]+1;
876       nz   = bi[k+1] - jmin;
877       if (nz){
878         bcol = bj + jmin;
879         while (nz--){
880           rs += PetscAbsScalar(rtmp[*bcol]);
881           bcol++;
882         }
883       }
884 
885       sctx.rs = rs;
886       sctx.pv = dk;
887       ierr = MatCholeskyCheckShift_inline(info,sctx,k,newshift);CHKERRQ(ierr);
888       if (newshift == 1) break;    /* sctx.shift_amount is updated */
889 
890       /* copy data into U(k,:) */
891       ba[bi[k]] = 1.0/dk;
892       jmin      = bi[k]+1;
893       nz        = bi[k+1] - jmin;
894       if (nz){
895         bcol = bj + jmin;
896         bval = ba + jmin;
897         while (nz--){
898           *bval++       = rtmp[*bcol];
899           rtmp[*bcol++] = 0.0;
900         }
901         /* add k-th row into il and jl */
902         il[k] = jmin;
903         i = bj[jmin]; jl[k] = jl[i]; jl[i] = k;
904       }
905     }
906   } while (sctx.chshift);
907   ierr = PetscFree(il);CHKERRQ(ierr);
908 
909   C->ops->solve                 = MatSolve_SeqSBAIJ_1_NaturalOrdering;
910   C->ops->solvetranspose        = MatSolve_SeqSBAIJ_1_NaturalOrdering;
911   C->assembled    = PETSC_TRUE;
912   C->preallocated = PETSC_TRUE;
913   ierr = PetscLogFlops(C->rmap->N);CHKERRQ(ierr);
914     if (sctx.nshift){
915     if (shiftnz) {
916       ierr = PetscInfo2(A,"number of shiftnz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
917     } else if (shiftpd) {
918       ierr = PetscInfo2(A,"number of shiftpd tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
919     }
920   }
921   PetscFunctionReturn(0);
922 }
923 
924 #include "petscbt.h"
925 #include "../src/mat/utils/freespace.h"
926 #undef __FUNCT__
927 #define __FUNCT__ "MatICCFactorSymbolic_SeqBAIJ"
928 PetscErrorCode MatICCFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
929 {
930   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data;
931   Mat_SeqSBAIJ       *b;
932   Mat                B;
933   PetscErrorCode     ierr;
934   PetscTruth         perm_identity;
935   PetscInt           reallocs=0,i,*ai=a->i,*aj=a->j,am=a->mbs,bs=A->rmap->bs,*ui;
936   const PetscInt     *rip;
937   PetscInt           jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
938   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL,ncols,ncols_upper,*cols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr;
939   PetscReal          fill=info->fill,levels=info->levels;
940   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
941   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
942   PetscBT            lnkbt;
943 
944   PetscFunctionBegin;
945   if (bs > 1){
946     if (!a->sbaijMat){
947       ierr = MatConvert(A,MATSEQSBAIJ,MAT_INITIAL_MATRIX,&a->sbaijMat);CHKERRQ(ierr);
948     }
949     (fact)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqSBAIJ;  /* undue the change made in MatGetFactor_seqbaij_petsc */
950     ierr = MatICCFactorSymbolic(fact,a->sbaijMat,perm,info);CHKERRQ(ierr);
951     PetscFunctionReturn(0);
952   }
953 
954   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
955   ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
956 
957   /* special case that simply copies fill pattern */
958   if (!levels && perm_identity) {
959     ierr = MatMarkDiagonal_SeqBAIJ(A);CHKERRQ(ierr);
960     ierr = PetscMalloc((am+1)*sizeof(PetscInt),&ui);CHKERRQ(ierr);
961     for (i=0; i<am; i++) {
962       ui[i] = ai[i+1] - a->diag[i]; /* ui: rowlengths - changes when !perm_identity */
963     }
964     B = fact;
965     ierr = MatSeqSBAIJSetPreallocation(B,1,0,ui);CHKERRQ(ierr);
966 
967 
968     b  = (Mat_SeqSBAIJ*)B->data;
969     uj = b->j;
970     for (i=0; i<am; i++) {
971       aj = a->j + a->diag[i];
972       for (j=0; j<ui[i]; j++){
973         *uj++ = *aj++;
974       }
975       b->ilen[i] = ui[i];
976     }
977     ierr = PetscFree(ui);CHKERRQ(ierr);
978     B->factor = MAT_FACTOR_NONE;
979     ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
980     ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
981     B->factor = MAT_FACTOR_ICC;
982 
983     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
984     PetscFunctionReturn(0);
985   }
986 
987   /* initialization */
988   ierr  = PetscMalloc((am+1)*sizeof(PetscInt),&ui);CHKERRQ(ierr);
989   ui[0] = 0;
990   ierr  = PetscMalloc((2*am+1)*sizeof(PetscInt),&cols_lvl);CHKERRQ(ierr);
991 
992   /* jl: linked list for storing indices of the pivot rows
993      il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
994   ierr = PetscMalloc((2*am+1)*sizeof(PetscInt)+2*am*sizeof(PetscInt*),&jl);CHKERRQ(ierr);
995   il         = jl + am;
996   uj_ptr     = (PetscInt**)(il + am);
997   uj_lvl_ptr = (PetscInt**)(uj_ptr + am);
998   for (i=0; i<am; i++){
999     jl[i] = am; il[i] = 0;
1000   }
1001 
1002   /* create and initialize a linked list for storing column indices of the active row k */
1003   nlnk = am + 1;
1004   ierr = PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
1005 
1006   /* initial FreeSpace size is fill*(ai[am]+1) */
1007   ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space);CHKERRQ(ierr);
1008   current_space = free_space;
1009   ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space_lvl);CHKERRQ(ierr);
1010   current_space_lvl = free_space_lvl;
1011 
1012   for (k=0; k<am; k++){  /* for each active row k */
1013     /* initialize lnk by the column indices of row rip[k] of A */
1014     nzk   = 0;
1015     ncols = ai[rip[k]+1] - ai[rip[k]];
1016     ncols_upper = 0;
1017     cols        = cols_lvl + am;
1018     for (j=0; j<ncols; j++){
1019       i = rip[*(aj + ai[rip[k]] + j)];
1020       if (i >= k){ /* only take upper triangular entry */
1021         cols[ncols_upper] = i;
1022         cols_lvl[ncols_upper] = -1;  /* initialize level for nonzero entries */
1023         ncols_upper++;
1024       }
1025     }
1026     ierr = PetscIncompleteLLAdd(ncols_upper,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
1027     nzk += nlnk;
1028 
1029     /* update lnk by computing fill-in for each pivot row to be merged in */
1030     prow = jl[k]; /* 1st pivot row */
1031 
1032     while (prow < k){
1033       nextprow = jl[prow];
1034 
1035       /* merge prow into k-th row */
1036       jmin = il[prow] + 1;  /* index of the 2nd nzero entry in U(prow,k:am-1) */
1037       jmax = ui[prow+1];
1038       ncols = jmax-jmin;
1039       i     = jmin - ui[prow];
1040       cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
1041       for (j=0; j<ncols; j++) cols_lvl[j] = *(uj_lvl_ptr[prow] + i + j);
1042       ierr = PetscIncompleteLLAddSorted(ncols,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
1043       nzk += nlnk;
1044 
1045       /* update il and jl for prow */
1046       if (jmin < jmax){
1047         il[prow] = jmin;
1048         j = *cols; jl[prow] = jl[j]; jl[j] = prow;
1049       }
1050       prow = nextprow;
1051     }
1052 
1053     /* if free space is not available, make more free space */
1054     if (current_space->local_remaining<nzk) {
1055       i = am - k + 1; /* num of unfactored rows */
1056       i = PetscMin(i*nzk, i*(i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1057       ierr = PetscFreeSpaceGet(i,&current_space);CHKERRQ(ierr);
1058       ierr = PetscFreeSpaceGet(i,&current_space_lvl);CHKERRQ(ierr);
1059       reallocs++;
1060     }
1061 
1062     /* copy data into free_space and free_space_lvl, then initialize lnk */
1063     ierr = PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
1064 
1065     /* add the k-th row into il and jl */
1066     if (nzk-1 > 0){
1067       i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
1068       jl[k] = jl[i]; jl[i] = k;
1069       il[k] = ui[k] + 1;
1070     }
1071     uj_ptr[k]     = current_space->array;
1072     uj_lvl_ptr[k] = current_space_lvl->array;
1073 
1074     current_space->array           += nzk;
1075     current_space->local_used      += nzk;
1076     current_space->local_remaining -= nzk;
1077 
1078     current_space_lvl->array           += nzk;
1079     current_space_lvl->local_used      += nzk;
1080     current_space_lvl->local_remaining -= nzk;
1081 
1082     ui[k+1] = ui[k] + nzk;
1083   }
1084 
1085 #if defined(PETSC_USE_INFO)
1086   if (ai[am] != 0) {
1087     PetscReal af = ((PetscReal)(2*ui[am]-am))/((PetscReal)ai[am]);
1088     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,fill,af);CHKERRQ(ierr);
1089     ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
1090     ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G) for best performance.\n",af);CHKERRQ(ierr);
1091   } else {
1092     ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr);
1093   }
1094 #endif
1095 
1096   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
1097   ierr = PetscFree(jl);CHKERRQ(ierr);
1098   ierr = PetscFree(cols_lvl);CHKERRQ(ierr);
1099 
1100   /* destroy list of free space and other temporary array(s) */
1101   ierr = PetscMalloc((ui[am]+1)*sizeof(PetscInt),&uj);CHKERRQ(ierr);
1102   ierr = PetscFreeSpaceContiguous(&free_space,uj);CHKERRQ(ierr);
1103   ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
1104   ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
1105 
1106   /* put together the new matrix in MATSEQSBAIJ format */
1107   B = fact;
1108   ierr = MatSeqSBAIJSetPreallocation(B,1,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
1109 
1110   b = (Mat_SeqSBAIJ*)B->data;
1111   b->singlemalloc = PETSC_FALSE;
1112   b->free_a       = PETSC_TRUE;
1113   b->free_ij       = PETSC_TRUE;
1114   ierr = PetscMalloc((ui[am]+1)*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
1115   b->j    = uj;
1116   b->i    = ui;
1117   b->diag = 0;
1118   b->ilen = 0;
1119   b->imax = 0;
1120   b->row  = perm;
1121   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1122   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
1123   b->icol = perm;
1124   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
1125   ierr    = PetscMalloc((am+1)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
1126   ierr    = PetscLogObjectMemory(B,(ui[am]-am)*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr);
1127   b->maxnz = b->nz = ui[am];
1128 
1129   B->info.factor_mallocs    = reallocs;
1130   B->info.fill_ratio_given  = fill;
1131   if (ai[am] != 0) {
1132     B->info.fill_ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]);
1133   } else {
1134     B->info.fill_ratio_needed = 0.0;
1135   }
1136   if (perm_identity){
1137     B->ops->solve           = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1138     B->ops->solvetranspose  = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1139     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1140   } else {
1141     (fact)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1142   }
1143   PetscFunctionReturn(0);
1144 }
1145 
1146 #undef __FUNCT__
1147 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqBAIJ"
1148 PetscErrorCode MatCholeskyFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
1149 {
1150   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data;
1151   Mat_SeqSBAIJ       *b;
1152   Mat                B;
1153   PetscErrorCode     ierr;
1154   PetscTruth         perm_identity;
1155   PetscReal          fill = info->fill;
1156   const PetscInt     *rip;
1157   PetscInt           i,mbs=a->mbs,bs=A->rmap->bs,*ai=a->i,*aj=a->j,reallocs=0,prow;
1158   PetscInt           *jl,jmin,jmax,nzk,*ui,k,j,*il,nextprow;
1159   PetscInt           nlnk,*lnk,ncols,ncols_upper,*cols,*uj,**ui_ptr,*uj_ptr;
1160   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
1161   PetscBT            lnkbt;
1162 
1163   PetscFunctionBegin;
1164   if (bs > 1) { /* convert to seqsbaij */
1165     if (!a->sbaijMat){
1166       ierr = MatConvert(A,MATSEQSBAIJ,MAT_INITIAL_MATRIX,&a->sbaijMat);CHKERRQ(ierr);
1167     }
1168     (fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */
1169     ierr = MatCholeskyFactorSymbolic(fact,a->sbaijMat,perm,info);CHKERRQ(ierr);
1170     PetscFunctionReturn(0);
1171   }
1172 
1173   /* check whether perm is the identity mapping */
1174   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
1175   if (!perm_identity) SETERRQ(PETSC_ERR_SUP,"Matrix reordering is not supported");
1176   ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
1177 
1178   /* initialization */
1179   ierr  = PetscMalloc((mbs+1)*sizeof(PetscInt),&ui);CHKERRQ(ierr);
1180   ui[0] = 0;
1181 
1182   /* jl: linked list for storing indices of the pivot rows
1183      il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
1184   ierr = PetscMalloc((3*mbs+1)*sizeof(PetscInt)+mbs*sizeof(PetscInt*),&jl);CHKERRQ(ierr);
1185   il     = jl + mbs;
1186   cols   = il + mbs;
1187   ui_ptr = (PetscInt**)(cols + mbs);
1188   for (i=0; i<mbs; i++){
1189     jl[i] = mbs; il[i] = 0;
1190   }
1191 
1192   /* create and initialize a linked list for storing column indices of the active row k */
1193   nlnk = mbs + 1;
1194   ierr = PetscLLCreate(mbs,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1195 
1196   /* initial FreeSpace size is fill*(ai[mbs]+1) */
1197   ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[mbs]+1)),&free_space);CHKERRQ(ierr);
1198   current_space = free_space;
1199 
1200   for (k=0; k<mbs; k++){  /* for each active row k */
1201     /* initialize lnk by the column indices of row rip[k] of A */
1202     nzk   = 0;
1203     ncols = ai[rip[k]+1] - ai[rip[k]];
1204     ncols_upper = 0;
1205     for (j=0; j<ncols; j++){
1206       i = rip[*(aj + ai[rip[k]] + j)];
1207       if (i >= k){ /* only take upper triangular entry */
1208         cols[ncols_upper] = i;
1209         ncols_upper++;
1210       }
1211     }
1212     ierr = PetscLLAdd(ncols_upper,cols,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1213     nzk += nlnk;
1214 
1215     /* update lnk by computing fill-in for each pivot row to be merged in */
1216     prow = jl[k]; /* 1st pivot row */
1217 
1218     while (prow < k){
1219       nextprow = jl[prow];
1220       /* merge prow into k-th row */
1221       jmin = il[prow] + 1;  /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
1222       jmax = ui[prow+1];
1223       ncols = jmax-jmin;
1224       uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
1225       ierr = PetscLLAddSorted(ncols,uj_ptr,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1226       nzk += nlnk;
1227 
1228       /* update il and jl for prow */
1229       if (jmin < jmax){
1230         il[prow] = jmin;
1231         j = *uj_ptr; jl[prow] = jl[j]; jl[j] = prow;
1232       }
1233       prow = nextprow;
1234     }
1235 
1236     /* if free space is not available, make more free space */
1237     if (current_space->local_remaining<nzk) {
1238       i = mbs - k + 1; /* num of unfactored rows */
1239       i = PetscMin(i*nzk, i*(i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1240       ierr = PetscFreeSpaceGet(i,&current_space);CHKERRQ(ierr);
1241       reallocs++;
1242     }
1243 
1244     /* copy data into free space, then initialize lnk */
1245     ierr = PetscLLClean(mbs,mbs,nzk,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
1246 
1247     /* add the k-th row into il and jl */
1248     if (nzk-1 > 0){
1249       i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
1250       jl[k] = jl[i]; jl[i] = k;
1251       il[k] = ui[k] + 1;
1252     }
1253     ui_ptr[k] = current_space->array;
1254     current_space->array           += nzk;
1255     current_space->local_used      += nzk;
1256     current_space->local_remaining -= nzk;
1257 
1258     ui[k+1] = ui[k] + nzk;
1259   }
1260 
1261 #if defined(PETSC_USE_INFO)
1262   if (ai[mbs] != 0) {
1263     PetscReal af = ((PetscReal)ui[mbs])/((PetscReal)ai[mbs]);
1264     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,fill,af);CHKERRQ(ierr);
1265     ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
1266     ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G) for best performance.\n",af);CHKERRQ(ierr);
1267   } else {
1268     ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr);
1269   }
1270 #endif
1271 
1272   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
1273   ierr = PetscFree(jl);CHKERRQ(ierr);
1274 
1275   /* destroy list of free space and other temporary array(s) */
1276   ierr = PetscMalloc((ui[mbs]+1)*sizeof(PetscInt),&uj);CHKERRQ(ierr);
1277   ierr = PetscFreeSpaceContiguous(&free_space,uj);CHKERRQ(ierr);
1278   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
1279 
1280   /* put together the new matrix in MATSEQSBAIJ format */
1281   B    = fact;
1282   ierr = MatSeqSBAIJSetPreallocation(B,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
1283 
1284   b = (Mat_SeqSBAIJ*)B->data;
1285   b->singlemalloc = PETSC_FALSE;
1286   b->free_a       = PETSC_TRUE;
1287   b->free_ij      = PETSC_TRUE;
1288   ierr = PetscMalloc((ui[mbs]+1)*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
1289   b->j    = uj;
1290   b->i    = ui;
1291   b->diag = 0;
1292   b->ilen = 0;
1293   b->imax = 0;
1294   b->row  = perm;
1295   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1296   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
1297   b->icol = perm;
1298   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
1299   ierr    = PetscMalloc((mbs+1)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
1300   ierr    = PetscLogObjectMemory(B,(ui[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr);
1301   b->maxnz = b->nz = ui[mbs];
1302 
1303   B->info.factor_mallocs    = reallocs;
1304   B->info.fill_ratio_given  = fill;
1305   if (ai[mbs] != 0) {
1306     B->info.fill_ratio_needed = ((PetscReal)ui[mbs])/((PetscReal)ai[mbs]);
1307   } else {
1308     B->info.fill_ratio_needed = 0.0;
1309   }
1310   if (perm_identity){
1311     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1312   } else {
1313     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1314   }
1315   PetscFunctionReturn(0);
1316 }
1317 
1318 /* --------------------------------------------------------- */
1319 #undef __FUNCT__
1320 #define __FUNCT__ "MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct"
1321 PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
1322 {
1323   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1324   PetscErrorCode ierr;
1325   const PetscInt *ai=a->i,*aj=a->j,*vi;
1326   PetscInt       i,k,n=a->mbs;
1327   PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2,*adiag=a->diag;
1328   MatScalar      *aa=a->a,*v;
1329   PetscScalar    *x,*b,*s,*t,*ls;
1330 
1331   PetscFunctionBegin;
1332   /* printf("MatSolve_SeqBAIJ_NaturalOrdering_iludt..bs %d\n",bs); */
1333   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1334   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1335   t  = a->solve_work;
1336 
1337   /* forward solve the lower triangular */
1338   ierr = PetscMemcpy(t,b,bs*sizeof(PetscScalar));CHKERRQ(ierr); /* copy 1st block of b to t */
1339 
1340   for (i=1; i<n; i++) {
1341     v   = aa + bs2*ai[i];
1342     vi  = aj + ai[i];
1343     nz = ai[i+1] - ai[i];
1344     s = t + bs*i;
1345     ierr = PetscMemcpy(s,b+bs*i,bs*sizeof(PetscScalar));CHKERRQ(ierr); /* copy i_th block of b to t */
1346     for(k=0;k<nz;k++){
1347       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*vi[k]);
1348       v += bs2;
1349     }
1350   }
1351 
1352   /* backward solve the upper triangular */
1353   ls = a->solve_work + A->cmap->n;
1354   for (i=n-1; i>=0; i--){
1355     v  = aa + bs2*ai[2*n-i];
1356     vi = aj + ai[2*n-i];
1357     nz = ai[2*n-i +1] - ai[2*n-i]-1;
1358     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1359     for(k=0;k<nz;k++){
1360       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*vi[k]);
1361       v += bs2;
1362     }
1363     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*adiag[i],t+i*bs); /* *inv(diagonal[i]) */
1364     ierr = PetscMemcpy(x+i*bs,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1365   }
1366 
1367   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1368   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1369   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
1370   PetscFunctionReturn(0);
1371 }
1372 
1373 #undef __FUNCT__
1374 #define __FUNCT__ "MatSolve_SeqBAIJ_N_newdatastruct"
1375 PetscErrorCode MatSolve_SeqBAIJ_N_newdatastruct(Mat A,Vec bb,Vec xx)
1376 {
1377   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1378   IS             iscol=a->col,isrow=a->row;
1379   PetscErrorCode ierr;
1380   const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
1381   PetscInt       i,m,n=a->mbs;
1382   PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2,k;
1383   MatScalar      *aa=a->a,*v;
1384   PetscScalar    *x,*b,*s,*t,*ls;
1385 
1386   PetscFunctionBegin;
1387   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1388   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1389   t  = a->solve_work;
1390 
1391   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1392   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1393 
1394   /* forward solve the lower triangular */
1395   ierr = PetscMemcpy(t,b+bs*r[0],bs*sizeof(PetscScalar));CHKERRQ(ierr);
1396   for (i=1; i<n; i++) {
1397     v   = aa + bs2*ai[i];
1398     vi  = aj + ai[i];
1399     nz = ai[i+1] - ai[i];
1400     s = t + bs*i;
1401     ierr = PetscMemcpy(s,b+bs*r[i],bs*sizeof(PetscScalar));CHKERRQ(ierr);
1402     for(m=0;m<nz;m++){
1403       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*vi[m]);
1404       v += bs2;
1405     }
1406   }
1407 
1408   /* backward solve the upper triangular */
1409   ls = a->solve_work + A->cmap->n;
1410   for (i=n-1; i>=0; i--){
1411     k  = 2*n-i;
1412     v  = aa + bs2*ai[k];
1413     vi = aj + ai[k];
1414     nz = ai[k+1] - ai[k] - 1;
1415     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1416     for(m=0;m<nz;m++){
1417       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*vi[m]);
1418       v += bs2;
1419     }
1420     Kernel_w_gets_A_times_v(bs,ls,v,t+i*bs); /* *inv(diagonal[i]) */
1421     ierr = PetscMemcpy(x + bs*c[i],t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1422   }
1423   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1424   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1425   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1426   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1427   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
1428   PetscFunctionReturn(0);
1429 }
1430 
1431 #undef __FUNCT__
1432 #define __FUNCT__ "BlockAbs_privat"
1433 PetscErrorCode BlockAbs_private(PetscInt nbs,PetscInt bs2,PetscScalar *blockarray,PetscReal *absarray)
1434 {
1435   PetscErrorCode     ierr;
1436   PetscInt           i,j;
1437   PetscFunctionBegin;
1438   ierr = PetscMemzero(absarray,(nbs+1)*sizeof(PetscReal));CHKERRQ(ierr);
1439   for (i=0; i<nbs; i++){
1440     for (j=0; j<bs2; j++){
1441       if (absarray[i] < PetscAbsScalar(blockarray[i*nbs+j])) absarray[i] = PetscAbsScalar(blockarray[i*nbs+j]);
1442     }
1443   }
1444   PetscFunctionReturn(0);
1445 }
1446 
1447 #undef __FUNCT__
1448 #define __FUNCT__ "MatILUDTFactor_SeqBAIJ"
1449 PetscErrorCode MatILUDTFactor_SeqBAIJ(Mat A,IS isrow,IS iscol,const MatFactorInfo *info,Mat *fact)
1450 {
1451   Mat                B = *fact;
1452   Mat_SeqBAIJ        *a=(Mat_SeqBAIJ*)A->data,*b;
1453   IS                 isicol;
1454   PetscErrorCode     ierr;
1455   const PetscInt     *r,*ic;
1456   PetscInt           i,mbs=a->mbs,bs=A->rmap->bs,bs2=a->bs2,*ai=a->i,*aj=a->j,*ajtmp,*adiag;
1457   PetscInt           *bi,*bj,*bdiag;
1458 
1459   PetscInt           row,nzi,nzi_bl,nzi_bu,*im,dtcount,nzi_al,nzi_au;
1460   PetscInt           nlnk,*lnk;
1461   PetscBT            lnkbt;
1462   PetscTruth         row_identity,icol_identity,both_identity;
1463   MatScalar          *aatmp,*pv,*batmp,*ba,*rtmp,*pc,*multiplier,*vtmp;
1464   const PetscInt     *ics;
1465   PetscInt           j,nz,*pj,*bjtmp,k,ncut,*jtmp;
1466 
1467   PetscReal          dt=info->dt; /* shift=info->shiftinblocks; */
1468   PetscInt           nnz_max;
1469   PetscTruth         missing;
1470   PetscReal          *vtmp_abs;
1471   MatScalar          *v_work;
1472   PetscInt           *v_pivots;
1473 
1474   PetscFunctionBegin;
1475   /* ------- symbolic factorization, can be reused ---------*/
1476   ierr = MatMissingDiagonal(A,&missing,&i);CHKERRQ(ierr);
1477   if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",i);
1478   adiag=a->diag;
1479 
1480   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
1481 
1482   /* bdiag is location of diagonal in factor */
1483   ierr = PetscMalloc((mbs+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr);
1484 
1485   /* allocate row pointers bi */
1486   ierr = PetscMalloc((2*mbs+2)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
1487 
1488   /* allocate bj and ba; max num of nonzero entries is (ai[n]+2*n*dtcount+2) */
1489   dtcount = (PetscInt)info->dtcount;
1490   if (dtcount > mbs-1) dtcount = mbs-1;
1491   nnz_max  = ai[mbs]+2*mbs*dtcount +2;
1492   /* printf("MatILUDTFactor_SeqBAIJ, bs %d, ai[mbs] %d, nnz_max  %d, dtcount %d\n",bs,ai[mbs],nnz_max,dtcount); */
1493   ierr = PetscMalloc(nnz_max*sizeof(PetscInt),&bj);CHKERRQ(ierr);
1494   nnz_max = nnz_max*bs2;
1495   ierr = PetscMalloc(nnz_max*sizeof(MatScalar),&ba);CHKERRQ(ierr);
1496 
1497   /* put together the new matrix */
1498   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(B,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
1499   ierr = PetscLogObjectParent(B,isicol);CHKERRQ(ierr);
1500   b    = (Mat_SeqBAIJ*)(B)->data;
1501   b->free_a       = PETSC_TRUE;
1502   b->free_ij      = PETSC_TRUE;
1503   b->singlemalloc = PETSC_FALSE;
1504   b->a          = ba;
1505   b->j          = bj;
1506   b->i          = bi;
1507   b->diag       = bdiag;
1508   b->ilen       = 0;
1509   b->imax       = 0;
1510   b->row        = isrow;
1511   b->col        = iscol;
1512   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
1513   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
1514   b->icol       = isicol;
1515   ierr = PetscMalloc((bs*(mbs+1))*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
1516 
1517   ierr = PetscLogObjectMemory(B,nnz_max*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr);
1518   b->maxnz = nnz_max/bs2;
1519 
1520   (B)->factor                = MAT_FACTOR_ILUDT;
1521   (B)->info.factor_mallocs   = 0;
1522   (B)->info.fill_ratio_given = ((PetscReal)nnz_max)/((PetscReal)(ai[mbs]*bs2));
1523   CHKMEMQ;
1524   /* ------- end of symbolic factorization ---------*/
1525   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1526   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1527   ics  = ic;
1528 
1529   /* linked list for storing column indices of the active row */
1530   nlnk = mbs + 1;
1531   ierr = PetscLLCreate(mbs,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1532 
1533   /* im: used by PetscLLAddSortedLU(); jtmp: working array for column indices of active row */
1534   ierr = PetscMalloc((2*mbs+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
1535   jtmp = im + mbs;
1536   /* rtmp, vtmp: working arrays for sparse and contiguous row entries of active row */
1537   ierr = PetscMalloc((2*mbs*bs2+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
1538   vtmp = rtmp + bs2*mbs;
1539   ierr = PetscMalloc((mbs+1)*sizeof(PetscReal),&vtmp_abs);CHKERRQ(ierr);
1540 
1541   ierr       = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr);
1542   multiplier = v_work + bs;
1543   v_pivots   = (PetscInt*)(multiplier + bs2);
1544 
1545   bi[0]    = 0;
1546   bdiag[0] = (nnz_max/bs2)-1; /* location of diagonal in factor B */
1547   bi[2*mbs+1] = bdiag[0]+1; /* endof bj and ba array */
1548   for (i=0; i<mbs; i++) {
1549     /* copy initial fill into linked list */
1550     nzi = 0; /* nonzeros for active row i */
1551     nzi = ai[r[i]+1] - ai[r[i]];
1552     if (!nzi) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
1553     nzi_al = adiag[r[i]] - ai[r[i]];
1554     nzi_au = ai[r[i]+1] - adiag[r[i]] -1;
1555     /* printf("row %d, nzi_al/au %d %d\n",i,nzi_al,nzi_au); */
1556 
1557     /* load in initial unfactored row */
1558     ajtmp = aj + ai[r[i]];
1559     ierr = PetscLLAddPerm(nzi,ajtmp,ic,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1560     ierr = PetscMemzero(rtmp,(mbs*bs2+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1561     aatmp = a->a + bs2*ai[r[i]];
1562     for (j=0; j<nzi; j++) {
1563       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],aatmp+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
1564     }
1565 
1566     /* add pivot rows into linked list */
1567     row = lnk[mbs];
1568     while (row < i) {
1569       nzi_bl = bi[row+1] - bi[row] + 1;
1570       bjtmp = bj + bdiag[row+1]+1; /* points to 1st column next to the diagonal in U */
1571       ierr  = PetscLLAddSortedLU(bjtmp,row,nlnk,lnk,lnkbt,i,nzi_bl,im);CHKERRQ(ierr);
1572       nzi  += nlnk;
1573       row   = lnk[row];
1574     }
1575 
1576     /* copy data from lnk into jtmp, then initialize lnk */
1577     ierr = PetscLLClean(mbs,mbs,nzi,lnk,jtmp,lnkbt);CHKERRQ(ierr);
1578 
1579     /* numerical factorization */
1580     bjtmp = jtmp;
1581     row   = *bjtmp++; /* 1st pivot row */
1582 
1583     while  (row < i) {
1584       pc = rtmp + bs2*row;
1585       pv = ba + bs2*bdiag[row]; /* inv(diag) of the pivot row */
1586       Kernel_A_gets_A_times_B(bs,pc,pv,multiplier); /* pc= multiplier = pc*inv(diag[row]) */
1587       ierr = BlockAbs_private(1,bs2,pc,vtmp_abs);CHKERRQ(ierr);
1588       if (vtmp_abs[0] > dt){ /* apply tolerance dropping rule */
1589         pj         = bj + bdiag[row+1] + 1; /* point to 1st entry of U(row,:) */
1590         pv         = ba + bs2*(bdiag[row+1] + 1);
1591         nz         = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diagonal */
1592         for (j=0; j<nz; j++){
1593           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
1594         }
1595         /* ierr = PetscLogFlops(bslog*(nz+1.0)-bs);CHKERRQ(ierr); */
1596       }
1597       row = *bjtmp++;
1598     }
1599 
1600     /* copy sparse rtmp into contiguous vtmp; separate L and U part */
1601     nzi_bl = 0; j = 0;
1602     while (jtmp[j] < i){ /* L-part. Note: jtmp is sorted */
1603       ierr = PetscMemcpy(vtmp+bs2*j,rtmp+bs2*jtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
1604       nzi_bl++; j++;
1605     }
1606     nzi_bu = nzi - nzi_bl -1;
1607     /* printf("nzi %d, nzi_bl %d, nzi_bu %d\n",nzi,nzi_bl,nzi_bu); */
1608 
1609     while (j < nzi){ /* U-part */
1610       ierr = PetscMemcpy(vtmp+bs2*j,rtmp+bs2*jtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
1611       /*
1612       printf(" col %d: ",jtmp[j]);
1613       for (j1=0; j1<bs2; j1++) printf(" %g",*(vtmp+bs2*j+j1));
1614       printf(" \n");
1615       */
1616       j++;
1617     }
1618 
1619     ierr = BlockAbs_private(nzi,bs2,vtmp,vtmp_abs);CHKERRQ(ierr);
1620     /*
1621     printf(" row %d, nzi %d, vtmp_abs\n",i,nzi);
1622     for (j1=0; j1<nzi; j1++) printf(" (%d %g),",jtmp[j1],vtmp_abs[j1]);
1623     printf(" \n");
1624     */
1625     bjtmp = bj + bi[i];
1626     batmp = ba + bs2*bi[i];
1627     /* apply level dropping rule to L part */
1628     ncut = nzi_al + dtcount;
1629     if (ncut < nzi_bl){
1630       ierr = PetscSortSplitReal(ncut,nzi_bl,vtmp_abs,jtmp);CHKERRQ(ierr);
1631       ierr = PetscSortIntWithScalarArray(ncut,jtmp,vtmp);CHKERRQ(ierr);
1632     } else {
1633       ncut = nzi_bl;
1634     }
1635     for (j=0; j<ncut; j++){
1636       bjtmp[j] = jtmp[j];
1637       ierr = PetscMemcpy(batmp+bs2*j,rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
1638       /*
1639       printf(" col %d: ",bjtmp[j]);
1640       for (j1=0; j1<bs2; j1++) printf(" %g,",*(batmp+bs2*j+j1));
1641       printf("\n");
1642       */
1643     }
1644     bi[i+1] = bi[i] + ncut;
1645     nzi = ncut + 1;
1646 
1647     /* apply level dropping rule to U part */
1648     ncut = nzi_au + dtcount;
1649     if (ncut < nzi_bu){
1650       ierr = PetscSortSplitReal(ncut,nzi_bu,vtmp_abs+nzi_bl+1,jtmp+nzi_bl+1);CHKERRQ(ierr);
1651       ierr = PetscSortIntWithScalarArray(ncut,jtmp+nzi_bl+1,vtmp+nzi_bl+1);CHKERRQ(ierr);
1652     } else {
1653       ncut = nzi_bu;
1654     }
1655     nzi += ncut;
1656 
1657     /* mark bdiagonal */
1658     bdiag[i+1]    = bdiag[i] - (ncut + 1);
1659     bi[2*mbs - i] = bi[2*mbs - i +1] - (ncut + 1);
1660 
1661     bjtmp = bj + bdiag[i];
1662     batmp = ba + bs2*bdiag[i];
1663     ierr = PetscMemcpy(batmp,rtmp+bs2*i,bs2*sizeof(MatScalar));CHKERRQ(ierr);
1664     *bjtmp = i;
1665     /*
1666     printf(" diag %d: ",*bjtmp);
1667     for (j=0; j<bs2; j++){
1668       printf(" %g,",batmp[j]);
1669     }
1670     printf("\n");
1671     */
1672     bjtmp = bj + bdiag[i+1]+1;
1673     batmp = ba + (bdiag[i+1]+1)*bs2;
1674 
1675     for (k=0; k<ncut; k++){
1676       bjtmp[k] = jtmp[nzi_bl+1+k];
1677       ierr = PetscMemcpy(batmp+bs2*k,rtmp+bs2*bjtmp[k],bs2*sizeof(MatScalar));CHKERRQ(ierr);
1678       /*
1679       printf(" col %d:",bjtmp[k]);
1680       for (j1=0; j1<bs2; j1++) printf(" %g,",*(batmp+bs2*k+j1));
1681       printf("\n");
1682       */
1683     }
1684 
1685     im[i] = nzi; /* used by PetscLLAddSortedLU() */
1686 
1687     /* invert diagonal block for simplier triangular solves - add shift??? */
1688     batmp = ba + bs2*bdiag[i];
1689     ierr = Kernel_A_gets_inverse_A(bs,batmp,v_pivots,v_work);CHKERRQ(ierr);
1690   } /* for (i=0; i<mbs; i++) */
1691 
1692   /* printf("end of L %d, beginning of U %d\n",bi[mbs],bdiag[mbs]); */
1693   if (bi[mbs] >= bdiag[mbs]) SETERRQ2(PETSC_ERR_ARG_SIZ,"end of L array %d cannot >= the beginning of U array %d",bi[mbs],bdiag[mbs]);
1694 
1695   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1696   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1697 
1698   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
1699 
1700   ierr = PetscFree(im);CHKERRQ(ierr);
1701   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1702 
1703   ierr = PetscLogFlops(bs2*B->cmap->n);CHKERRQ(ierr);
1704   b->maxnz = b->nz = bi[mbs] + bdiag[0] - bdiag[mbs];
1705 
1706   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
1707   ierr = ISIdentity(isicol,&icol_identity);CHKERRQ(ierr);
1708   both_identity = (PetscTruth) (row_identity && icol_identity);
1709   if (row_identity && icol_identity) {
1710     B->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct;
1711   } else {
1712     B->ops->solve = MatSolve_SeqBAIJ_N_newdatastruct;
1713   }
1714 
1715   B->ops->solveadd          = 0;
1716   B->ops->solvetranspose    = 0;
1717   B->ops->solvetransposeadd = 0;
1718   B->ops->matsolve          = 0;
1719   B->assembled              = PETSC_TRUE;
1720   B->preallocated           = PETSC_TRUE;
1721   PetscFunctionReturn(0);
1722 }
1723